diff --git a/src/KeyWordExt.cpp b/src/KeyWordExt.cpp index dda52bc..be3840b 100644 --- a/src/KeyWordExt.cpp +++ b/src/KeyWordExt.cpp @@ -26,21 +26,27 @@ namespace CppJieba } - bool KeyWordExt::loadPriorWordPrefixes(const char * const filePath) + bool KeyWordExt::loadPriorSubWord(const char * const filePath) { - LogInfo(string_format("loadPriorWordPrefixes(%s) start", filePath)); + LogInfo(string_format("loadPriorSubWord(%s) start", filePath)); if(!checkFileExist(filePath)) { LogError(string_format("cann't find file[%s].",filePath)); return false; } - bool retFlag = _priorPrefixTrie.init(filePath); - if(!retFlag) + if(!_priorSubWords.empty()) { - LogError("_priorPrefixTrie.init return false."); + LogError("_priorSubWords has been initted before"); return false; } - LogInfo(string_format("loadPriorWordPrefixes(%s) end", filePath)); + ifstream infile(filePath); + string subword; + while(getline(infile, subword)) + { + _priorSubWords.push_back(subword); + } + LogInfo(string_format("loadPriorSubWord(%s) end", filePath)); + infile.close(); return true; } @@ -73,7 +79,6 @@ namespace CppJieba bool KeyWordExt::destroy() { _segment.destroy(); - _priorPrefixTrie.destroy(); return true; } @@ -132,8 +137,8 @@ namespace CppJieba _sortWLIDF(wordInfos); LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str())); - _priorWordPrefixes(wordInfos); - LogDebug(string_format("_priorWordPrefixes res:\n%s", joinWordInfos(wordInfos).c_str())); + _prioritizeSubWords(wordInfos); + LogDebug(string_format("_prioritizeSubWords res:\n%s", joinWordInfos(wordInfos).c_str())); //extract TopN for(uint i = 0; i < topN && i < wordInfos.size(); i++) { @@ -308,7 +313,19 @@ namespace CppJieba return true; } - bool KeyWordExt::_priorWordPrefixes(vector& wordInfos) + bool KeyWordExt::_isContainSubWords(const string& word) + { + for(uint i = 0; i < _priorSubWords.size(); i++) + { + if(string::npos != word.find(_priorSubWords[i])) + { + return true; + } + } + return false; + } + + bool KeyWordExt::_prioritizeSubWords(vector& wordInfos) { if(2 > wordInfos.size()) { @@ -319,7 +336,7 @@ namespace CppJieba bool flag = false; for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) { - if(NULL != _priorPrefixTrie.findPrefix(it->word)) + if(_isContainSubWords(it->word)) { prior = *it; it = wordInfos.erase(it); @@ -353,7 +370,7 @@ int main() } ext.loadStopWords("stopwords.tmp"); - if(!ext.loadPriorWordPrefixes("prior.utf8")) + if(!ext.loadPriorSubWord("prior.utf8")) { cerr<<"err"< _priorSubWords; set _stopWords; public: KeyWordExt(); @@ -50,7 +50,7 @@ namespace CppJieba bool loadStopWords(const char * const filePath); //load prior words' prefix - bool loadPriorWordPrefixes( const char * const filePath); + bool loadPriorSubWord( const char * const filePath); bool destroy(); @@ -70,7 +70,8 @@ namespace CppJieba bool _filterSubstr(vector& utf8Strs); bool _filterStopWords(vector& utf8Strs); private: - bool _priorWordPrefixes(vector& wordInfos); + bool _prioritizeSubWords(vector& wordInfos); + bool _isContainSubWords(const string& word); }; diff --git a/src/Trie.h b/src/Trie.h index f3611bd..9602a28 100644 --- a/src/Trie.h +++ b/src/Trie.h @@ -31,7 +31,7 @@ namespace CppJieba { string word;// utf8 string word size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 . - unsigned int count; + size_t count; string tag; double weight; TrieNodeInfo() diff --git a/src/globals.h b/src/globals.h index 41f6385..94d1d40 100644 --- a/src/globals.h +++ b/src/globals.h @@ -7,8 +7,6 @@ namespace CppJieba { - //file path - //const char * const DICT_FILE_PATH = "dict.txt"; //typedefs typedef uint16_t ChUnicode;