From ae09f92b1977b4a28d9340dcb89f4904d9f18aa0 Mon Sep 17 00:00:00 2001 From: wyy Date: Sun, 3 Nov 2013 07:47:09 -0800 Subject: [PATCH] remove keywordext.cpp/h out of src --- src/cppjieba/CMakeLists.txt | 4 +- src/cppjieba/KeyWordExt.cpp | 360 ----------------------------------- src/cppjieba/KeyWordExt.h | 68 ------- src/keywordext.cpp | 56 ------ {src => test}/testlines.utf8 | 0 5 files changed, 2 insertions(+), 486 deletions(-) delete mode 100644 src/cppjieba/KeyWordExt.cpp delete mode 100644 src/cppjieba/KeyWordExt.h delete mode 100644 src/keywordext.cpp rename {src => test}/testlines.utf8 (100%) diff --git a/src/cppjieba/CMakeLists.txt b/src/cppjieba/CMakeLists.txt index 897d1e2..ec98370 100644 --- a/src/cppjieba/CMakeLists.txt +++ b/src/cppjieba/CMakeLists.txt @@ -1,4 +1,4 @@ -SET(LIBCPPJIEBA_SRC HMMSegment.cpp KeyWordExt.cpp MixSegment.cpp MPSegment.cpp Trie.cpp) +SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp Trie.cpp) INCLUDE_DIRECTORIES(../limonp) @@ -7,4 +7,4 @@ ADD_LIBRARY(cppjieba SHARED ${LIBCPPJIEBA_SRC}) SET_TARGET_PROPERTIES(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1) INSTALL(TARGETS cppjieba LIBRARY DESTINATION lib/CppJieba) -INSTALL(FILES ChineseFilter.hpp HMMSegment.h KeyWordExt.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) +INSTALL(FILES ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) diff --git a/src/cppjieba/KeyWordExt.cpp b/src/cppjieba/KeyWordExt.cpp deleted file mode 100644 index c0ab565..0000000 --- a/src/cppjieba/KeyWordExt.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com -************************************/ -#include "KeyWordExt.h" - - -namespace CppJieba -{ - - KeyWordExt::KeyWordExt() - { - } - - KeyWordExt::~KeyWordExt() - { - } - - bool KeyWordExt::init(const char* const segDictFile) - { - LogInfo("KeyWordExt init start ..."); - if(!_segment.init(segDictFile)) - { - LogError("_segment.init failed."); - return false; - } - return true; - } - - bool KeyWordExt::loadStopWords(const char * const filePath) - { - - LogInfo("_loadStopWords(%s) start", filePath); - if(!_stopWords.empty()) - { - LogError("_stopWords has been loaded before! "); - return false; - } - if(!checkFileExist(filePath)) - { - LogError("cann't find file[%s].",filePath); - return false; - } - - ifstream ifile(filePath); - string line; - Unicode word; - while(getline(ifile, line)) - { - if(!TransCode::decode(line, word)) - { - LogError("decode failed ."); - return false; - } - _stopWords.insert(word); - } - LogInfo("load stopwords[%d] finished.", _stopWords.size()); - - return true; - } - - bool KeyWordExt::dispose() - { - _segment.dispose(); - return true; - } - - bool KeyWordExt::_wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b) - { - return a.weight > b.weight; - } - - bool KeyWordExt::_sortWLIDF(vector& wordInfos) - { - for(uint i = 0; i < wordInfos.size(); i++) - { - KeyWordInfo& wInfo = wordInfos[i]; - wInfo.idf = - wInfo.logFreq; - wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf; - } - sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare); - return true; - } - - bool KeyWordExt::_extTopN(vector& wordInfos, uint topN) - { - int dis = wordInfos.size() - topN; - if(dis <= 0) - { - return true; - } - - if(uint(dis) <= topN) - { - for(int i = 0; i< dis; i++) - { - wordInfos.pop_back(); - } - } - else// in case that topN << size; - { - - vector tmp(wordInfos.begin(), wordInfos.begin() + topN); - wordInfos.swap(tmp); - } - return true; - } - - - bool KeyWordExt::extract(const vector& words, vector& keyWordInfos, uint topN) - { - if(words.empty()) - { - return false; - } - - keyWordInfos.clear(); - for(uint i = 0; i < words.size(); i++) - { - Unicode uniWord; - if(!TransCode::decode(words[i], uniWord)) - { - LogError("decode failed"); - return false; - } - keyWordInfos.push_back(uniWord); - } - - return _extract(keyWordInfos, topN); - } - - bool KeyWordExt::extract(const string& title, vector& keyWordInfos, uint topN) - { - if(title.empty()) - { - return false; - } - - vector trieNodeInfos; - Unicode unico; - if(!TransCode::decode(title, unico)) - { - return false; - } - _segment.cut(unico.begin(), unico.end(), trieNodeInfos); - - keyWordInfos.clear(); - for(uint i = 0; i < trieNodeInfos.size(); i++) - { - keyWordInfos.push_back(trieNodeInfos[i]); - } - return _extract(keyWordInfos, topN); - } - - bool KeyWordExt::_extract(vector& keyWordInfos, uint topN) - { - if(!_filter(keyWordInfos)) - { - LogError("_filter failed."); - return false; - } - - if(!_sortWLIDF(keyWordInfos)) - { - LogError("_sortWLIDF failed."); - return false; - } - - if(!_extTopN(keyWordInfos, topN)) - { - LogError("_extTopN failed."); - return false; - } - - return true; - } - - bool KeyWordExt::_filter(vector& wordInfos) - { - if(!_filterDuplicate(wordInfos)) - { - LogError("_filterDuplicate failed."); - return false; - } - - if(!_filterSingleWord(wordInfos)) - { - LogError("_filterSingleWord failed."); - return false; - } - - if(!_filterStopWords(wordInfos)) - { - LogError("_filterStopWords failed."); - return false; - } - - if(!_filterSubstr(wordInfos)) - { - LogError("_filterSubstr failed."); - return false; - } - - return true; - } - - bool KeyWordExt::_filterStopWords(vector& wordInfos) - { - if(_stopWords.empty()) - { - return true; - } - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end();) - { - if(_stopWords.find(it->word) != _stopWords.end()) - { - it = wordInfos.erase(it); - } - else - { - it ++; - } - } - return true; - } - - - bool KeyWordExt::_filterDuplicate(vector& wordInfos) - { - set st; - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) - { - if(st.find(it->word) != st.end()) - { - it = wordInfos.erase(it); - } - else - { - st.insert(it->word); - it++; - } - } - return true; - } - - bool KeyWordExt::_filterSingleWord(vector& wordInfos) - { - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end();) - { - - // filter single word - if(1 == it->word.size()) - { - it = wordInfos.erase(it); - } - else - { - it++; - } - } - return true; - } - - bool KeyWordExt::_filterSubstr(vector& wordInfos) - { - vector tmp ; - for(uint i = 0; i < wordInfos.size(); i++) - { - tmp.push_back(wordInfos[i].word); - } - - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) - { - if(_isSubIn(tmp, it->word)) - { - it = wordInfos.erase(it); - } - else - { - it++; - } - } - - return true; - } - - //bool KeyWordExt::_isContainSubWords(const string& word) - //{ - // for(uint i = 0; i < _priorSubWords.size(); i++) - // { - // if(string::npos != word.find(_priorSubWords[i])) - // { - // return true; - // } - // } - // return false; - //} - - //bool KeyWordExt::_prioritizeSubWords(vector& wordInfos) - //{ - // if(2 > wordInfos.size()) - // { - // return true; - // } - - // KeyWordInfo prior; - // bool flag = false; - // for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) - // { - // if(_isContainSubWords(it->word)) - // { - // prior = *it; - // it = wordInfos.erase(it); - // flag = true; - // break; - // } - // else - // { - // it ++; - // } - // } - // if(flag) - // { - // wordInfos.insert(wordInfos.begin(), prior); - // } - // return true; - //} -} - - -#ifdef KEYWORDEXT_UT - -using namespace CppJieba; - -int main() -{ - KeyWordExt ext; - ext.init(); - if(!ext.loadSegDict("../dicts/segdict.gbk.v2.1")) - { - return 1; - } - ext._loadStopWords("../dicts/stopwords.gbk.v1.0"); - - ifstream ifile("testtitle.gbk"); - vector res; - string line; - while(getline(ifile, line)) - { - cout< -#include "MPSegment.h" -#include "structs.h" - -namespace CppJieba -{ - - class KeyWordExt - { - private: - MPSegment _segment; - //vector _priorSubWords; - set _stopWords; - public: - KeyWordExt(); - ~KeyWordExt(); - bool init(const char* const segDictFile); - bool dispose(); - bool loadStopWords(const char * const filePath); - private: - //bool _loadPriorSubWords(const char * const filePath); - - - public: - bool extract(const string& title, vector& keyWordInfos, uint topN); - bool extract(const vector& words, vector& keyWordInfos, uint topN); - private: - static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b); - private: - bool _extract(vector& keyWordInfos, uint topN); - bool _extTopN(vector& wordInfos, uint topN); - private: - //sort by word len - idf - bool _sortWLIDF(vector& wordInfos); - private: - bool _filter(vector& ); - bool _filterDuplicate(vector& ); - bool _filterSingleWord(vector& ); - bool _filterSubstr(vector& ); - bool _filterStopWords(vector& ); - private: - inline bool _isSubIn(const vector& words, const Unicode& word)const - { - - for(uint j = 0; j < words.size(); j++) - { - if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end())) - { - return true; - } - } - return false; - } - //bool _prioritizeSubWords(vector& wordInfos); - //bool _isContainSubWords(const string& word); - - }; - -} - -#endif diff --git a/src/keywordext.cpp b/src/keywordext.cpp deleted file mode 100644 index b7713d2..0000000 --- a/src/keywordext.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include -#include -#include -#include "../cppjieba/KeyWordExt.h" - -using namespace CppJieba; - - -void testKeyWordExt(const char * dictPath, const char * filePath) -{ - KeyWordExt ext; - if(!ext.init(dictPath)) - { - return; - } - - ifstream ifile(filePath); - vector res; - string line; - while(getline(ifile, line)) - { - res.clear(); - if(!line.empty()) - { - ext.extract(line, res, 20); - cout< argc) - { - cout<<"usage: \n\t"<\n" - <<"options:\n" - <<"\t--dictpath\tIf not specified, the default is "<