diff --git a/src/KeyWordExt.cpp b/src/KeyWordExt.cpp
index 6827883..dda52bc 100644
--- a/src/KeyWordExt.cpp
+++ b/src/KeyWordExt.cpp
@@ -79,7 +79,49 @@ namespace CppJieba
 
     bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b)
     {
-        return a.weight < b.weight;
+        // descending order: entries with a larger weight sort first
+        return a.weight > b.weight;
+    }
+
+    // Fill wLen/idf/weight for every entry, then sort by weight (descending).
+    // weight = wLen / sum(wLen) * idf, where idf = -getUtf8WordWeight(word).
+    // Returns false if getUtf8WordLen() reports 0 for any word.
+    bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
+    {
+        if(wordInfos.empty())
+        {
+            // nothing to weigh or sort; keeps empty input off the wLenSum == 0 failure path
+            return true;
+        }
+
+        size_t wLenSum = 0;
+        for(uint i = 0; i < wordInfos.size(); i++)
+        {
+            wordInfos[i].wLen = getUtf8WordLen(wordInfos[i].word);
+            if(0 == wordInfos[i].wLen)
+            {
+                LogFatal("wLen is 0");
+                return false;
+            }
+            wLenSum += wordInfos[i].wLen;
+        }
+
+        if(0 == wLenSum)
+        {
+            LogFatal("wLenSum == 0.");
+            return false;
+        }
+
+        for(uint i = 0; i < wordInfos.size(); i++)
+        {
+            WordInfo& wInfo = wordInfos[i];
+            double logWordFreq = _segment.getUtf8WordWeight(wInfo.word);
+            wInfo.idf = -logWordFreq;
+            // wInfo.wLen was stored and checked to be non-zero in the first pass
+            wInfo.weight = 1.0 * wInfo.wLen / wLenSum * wInfo.idf;
+        }
+        sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
+        return true;
     }
 
     bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
@@ -88,14 +130,17 @@ namespace CppJieba
         vector<WordInfo> wordInfos;
         for(uint i = 0; i < words.size(); i++)
         {
-            double w = _segment.getUtf8WordWeight(words[i]);
             WordInfo wInfo;
             wInfo.word = words[i];
-            wInfo.weight = w;
-            wInfo.idf = w;
             wordInfos.push_back(wInfo);
         }
-        sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
+
+        // weights and ordering both come from _sortWLIDF; stop if it failed
+        if(!_sortWLIDF(wordInfos))
+        {
+            LogFatal("_sortWLIDF failed.");
+            return false;
+        }
         LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
 
         _priorWordPrefixes(wordInfos);
diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h
index aeba697..8fee39f 100644
--- a/src/KeyWordExt.h
+++ b/src/KeyWordExt.h
@@ -9,17 +9,19 @@ namespace CppJieba
     struct WordInfo
     {
         string word;
+        size_t wLen; // length of word as returned by getUtf8WordLen()
         double weight;
         double idf;
         WordInfo()
         {
             word = "";
+            wLen = 0;
             weight = 0.0;
             idf = 0.0;
         }
         string getInfoStr() const
         {
-            return string_format("{word:%s, weight:%lf, idf:%lf}", word.c_str(), weight, idf);
+            return string_format("{word:%s,wLen:%zu weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
         }
     };
 
@@ -58,6 +60,10 @@ namespace CppJieba
         static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
     private:
         bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
+    private:
+        // compute wLen/idf/weight for each entry, then sort by weight (descending)
+        bool _sortWLIDF(vector<WordInfo>& wordInfos);
+    private:
         bool _filter(vector<string>& utf8Strs);
         bool _filterDuplicate(vector<string>& utf8Strs);
         bool _filterSingleWord(vector<string>& utf8Strs);
diff --git a/src/Makefile b/src/Makefile
index 603b0c5..efc92b1 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -38,7 +38,7 @@ $(CMLIB):
 
 #unit test
 Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
-	$(CC) -o $@ $< -DTRIE_UT $(CMLIB)
+	$(CC) -o $@ $< -DTRIE_UT $(CMLIB) -liconv
 
 Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
 	$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv
diff --git a/src/Trie.cpp b/src/Trie.cpp
index 575fe41..c3f1751 100644
--- a/src/Trie.cpp
+++ b/src/Trie.cpp
@@ -1,3 +1,7 @@
+/*
+ * file encoding: utf-8
+ * author: wuyanyi09@gmail.com
+ */
 #include "Trie.h"
 
 namespace CppJieba
@@ -72,6 +76,13 @@ namespace CppJieba
             //insert node
             TrieNodeInfo nodeInfo;
             nodeInfo.word = chWord;
+            size_t wLen = getUtf8WordLen(chWord);
+            if(0 == wLen)
+            {
+                LogFatal(string_format("getUtf8WordLen(%s) return 0", chWord.c_str()));
+                return false;
+            }
+            nodeInfo.wLen = wLen;
             nodeInfo.count = count;
             nodeInfo.tag = tag;
 
diff --git a/src/Trie.h b/src/Trie.h
index bead95b..f3611bd 100644
--- a/src/Trie.h
+++ b/src/Trie.h
@@ -1,3 +1,7 @@
+/*
+ * file encoding: utf-8
+ * author: wuyanyi09@gmail.com
+ */
 #ifndef CPPJIEBA_TRIE_H
 #define CPPJIEBA_TRIE_H
 
@@ -25,12 +29,13 @@ namespace CppJieba
 
     struct TrieNodeInfo
     {
-        string word;
+        string word;// the word as a utf8 string
+        size_t wLen;// the word's length in characters, not string.size(), e.g. "我是中国人" has wLen = 5
         unsigned int count;
         string tag;
         double weight;
-        TrieNodeInfo():word(),count(0),tag(),weight(0.0)
+        TrieNodeInfo():word(),wLen(0),count(0),tag(),weight(0.0)
         {
         }
     };
 