add wordlen-idf sort into ext

2025-07-18 00:00:12 +08:00 · 2013-07-18 13:08:08 +08:00 · 2013-07-18 13:08:08 +08:00 · 089a63bf2c
commit 089a63bf2c
parent 7aadcb2990
5 changed files with 70 additions and 9 deletions
--- a/src/KeyWordExt.cpp
+++ b/src/KeyWordExt.cpp
@ -79,7 +79,43 @@ namespace CppJieba
 	// Ordering predicate handed to sort(): after this change it reports true
 	// when a carries the larger weight, so entries end up ordered from the
 	// highest weight to the lowest.
 	bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b)
 	{
-		return a.weight < b.weight;
+		return a.weight > b.weight;
 	}
 	// Scores every entry by its share of the total word length multiplied by
 	// its idf, then sorts wordInfos with _wordInfoCompare.
 	//   idf    = -_segment.getUtf8WordWeight(word)
 	//   weight = wLen / (sum of all wLen) * idf
 	// wLen, idf and weight of each element are written in place.
 	// Returns false (after logging) when getUtf8WordLen reports 0 for any word,
 	// or when wordInfos is empty (the length sum is then 0); true otherwise.
 	bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
 	{
 		size_t wLenSum = 0;
 		for(uint i = 0; i < wordInfos.size(); i++)
 		{
 			WordInfo& wInfo = wordInfos[i];
 			wInfo.wLen = getUtf8WordLen(wInfo.word);
 			if(0 == wInfo.wLen)
 			{
 				// Name the offending word, matching the check done in Trie.cpp.
 				LogFatal(string_format("getUtf8WordLen(%s) return 0", wInfo.word.c_str()));
 				return false;
 			}
 			wLenSum += wInfo.wLen;
 		}
 		if(0 == wLenSum)
 		{
 			LogFatal("wLenSum == 0.");
 			return false;
 		}
 		for(uint i = 0; i < wordInfos.size(); i++)
 		{
 			WordInfo& wInfo = wordInfos[i];
 			double logWordFreq = _segment.getUtf8WordWeight(wInfo.word);
 			wInfo.idf = -logWordFreq;
 			// wLen was validated as non-zero in the first pass, so it is reused
 			// here instead of being measured (and checked) a second time.
 			wInfo.weight = 1.0 * wInfo.wLen / wLenSum * wInfo.idf;
 		}
 		sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
 		return true;
 	}
 	bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
@ -88,14 +124,12 @@ namespace CppJieba
 		vector<WordInfo> wordInfos;
 		for(uint i = 0; i < words.size(); i++)
 		{
 			double w = _segment.getUtf8WordWeight(words[i]);
 			WordInfo wInfo;
 			wInfo.word = words[i];
 			wInfo.weight = w;
 			wInfo.idf = w;
 			wordInfos.push_back(wInfo);
 		}
-		sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
+		
 		_sortWLIDF(wordInfos);
 		LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
 		_priorWordPrefixes(wordInfos);
--- a/src/KeyWordExt.h
+++ b/src/KeyWordExt.h
@ -9,17 +9,19 @@ namespace CppJieba
 	// One candidate word together with the numbers used to rank it.
 	struct WordInfo
 	{
 		string word;
 		size_t wLen;   // length reported by getUtf8WordLen(word); 0 until it is filled in
 		double weight; // value the entries are sorted by
 		double idf;
 		// Starts with an empty word and all numeric fields at zero.
 		WordInfo()
 		{
 			word = "";
 			wLen = 0;
 			weight = 0.0;
 			idf = 0.0;
 		}
 		// Renders the entry as a single-line string for log output.
 		string getInfoStr() const
 		{
-			return string_format("{word:%s, weight:%lf, idf:%lf}", word.c_str(), weight, idf);
+			// wLen is a size_t: cast it so the argument really matches the %lu
+			// conversion (passing a size_t to %d is a varargs type mismatch).
+			return string_format("{word:%s,wLen:%lu weight:%lf, idf:%lf}", word.c_str(), static_cast<unsigned long>(wLen), weight, idf);
 		}
 	};
@ -58,6 +60,10 @@ namespace CppJieba
 			static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
 		private:
 			bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
 		private:
 			// Fills in wLen, idf and weight for every entry
 			// (weight = wLen / sum of all wLen * idf), then sorts the entries
 			// with _wordInfoCompare. Returns false when a word length or the
 			// length sum is 0, true otherwise.
 			bool _sortWLIDF(vector<WordInfo>& wordInfos);
 		private:
 			bool _filter(vector<string>& utf8Strs);
 			bool _filterDuplicate(vector<string>& utf8Strs);
 			bool _filterSingleWord(vector<string>& utf8Strs);
--- a/src/Makefile
+++ b/src/Makefile
@ -38,7 +38,7 @@ $(CMLIB):
 #unit test
 # Trie test binary: compiles the first prerequisite (Trie.cpp) with TRIE_UT
 # defined and links it against $(CMLIB); the updated recipe also links iconv.
 Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
-	$(CC) -o $@ $< -DTRIE_UT $(CMLIB) 
+	$(CC) -o $@ $< -DTRIE_UT $(CMLIB)  -liconv
 Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
 	$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv
--- a/src/Trie.cpp
+++ b/src/Trie.cpp
@ -1,3 +1,7 @@
 /*
 * file encoding: utf-8
 * author: wuyanyi09@gmail.com
 */
 #include "Trie.h"
 namespace CppJieba
@ -72,6 +76,13 @@ namespace CppJieba
 			//insert node
 			TrieNodeInfo nodeInfo;
 			nodeInfo.word = chWord;
 			size_t wLen = getUtf8WordLen(chWord);
 			if(0 == wLen)
 			{
 				LogFatal(string_format("getUtf8WordLen(%s) return 0", chWord.c_str()));
 				return false;
 			}
 			nodeInfo.wLen = wLen;
 			nodeInfo.count = count;
 			nodeInfo.tag = tag;
--- a/src/Trie.h
+++ b/src/Trie.h
@ -1,3 +1,7 @@
 /*
 * file encoding: utf-8
 * author: wuyanyi09@gmail.com
 */
 #ifndef CPPJIEBA_TRIE_H
 #define CPPJIEBA_TRIE_H
@ -25,12 +29,18 @@ namespace CppJieba
 	// Information kept about a single word.
 	struct TrieNodeInfo
 	{
-		string word;
+		string word;// utf8 string word
 		size_t wLen;// length of the word in characters, not bytes (string.size()); e.g. "我是中国人" has wLen = 5.
 		unsigned int count; // NOTE(review): presumably how often the word occurs in the dictionary - confirm
 		string tag; // NOTE(review): presumably a part-of-speech label - confirm
 		double weight; // NOTE(review): how this is derived is not visible here - confirm
 		// Starts every field at an empty string or zero.
-		TrieNodeInfo():word(),count(0),tag(),weight(0.0)
+		TrieNodeInfo()
 		{
 			word = "";
 			wLen = 0;
 			count = 0;
 			tag = "";
 			weight = 0.0;
 		}
 	};