Add wordlen-idf sort (weight = wLen / wLenSum * idf) to the keyword extractor (KeyWordExt)

2025-07-18 00:00:12 +08:00 · 2013-07-18 13:08:08 +08:00 · 2013-07-18 13:08:08 +08:00 · 089a63bf2c
commit 089a63bf2c
parent 7aadcb2990
5 changed files with 70 additions and 9 deletions
--- a/src/KeyWordExt.cpp
+++ b/src/KeyWordExt.cpp
@ -79,7 +79,43 @@ namespace CppJieba

 	bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b)
 	{
-		return a.weight < b.weight;
+		return a.weight > b.weight; // a sorts before b when it is heavier, so sort() yields descending weight
+	}
+
+	// Scores each word as (wLen / wLenSum) * idf and sorts wordInfos by descending weight.
+	// Fills wLen, idf and weight in place; returns false (leaving wordInfos unsorted)
+	// when a word has length 0 or the total length is 0, e.g. for an empty vector.
+	bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
+	{
+		size_t wLenSum = 0;
+		for(uint i = 0; i < wordInfos.size(); i++)
+		{
+			wordInfos[i].wLen = getUtf8WordLen(wordInfos[i].word);
+			if(0 == wordInfos[i].wLen)
+			{
+				LogFatal(string_format("getUtf8WordLen(%s) return 0", wordInfos[i].word.c_str()));
+				return false;
+			}
+			wLenSum += wordInfos[i].wLen;
+		}
+
+		if(0 == wLenSum)
+		{
+			LogFatal("wLenSum == 0.");
+			return false;
+		}
+
+		for(uint i = 0; i < wordInfos.size(); i++)
+		{
+			WordInfo& wInfo = wordInfos[i];
+			// getUtf8WordWeight's result is treated as a log frequency; negating it gives the idf.
+			wInfo.idf = -_segment.getUtf8WordWeight(wInfo.word);
+			// wLen was cached and checked to be non-zero by the first pass.
+			wInfo.weight = 1.0 * wInfo.wLen / wLenSum * wInfo.idf;
+		}
+		// _wordInfoCompare puts the heavier entry first.
+		sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
+		return true;
 	}

 	bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
@ -88,14 +124,12 @@ namespace CppJieba
 		vector<WordInfo> wordInfos;
 		for(uint i = 0; i < words.size(); i++)
 		{
-			double w = _segment.getUtf8WordWeight(words[i]);
 			WordInfo wInfo;
 			wInfo.word = words[i];
-			wInfo.weight = w;
-			wInfo.idf = w;
 			wordInfos.push_back(wInfo);
 		}
-		sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
+		
+		_sortWLIDF(wordInfos);
 		LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
 		
 		_priorWordPrefixes(wordInfos);
--- a/src/KeyWordExt.h
+++ b/src/KeyWordExt.h
@ -9,17 +9,19 @@ namespace CppJieba
 	struct WordInfo
 	{
 		string word;
+		size_t wLen; // value of getUtf8WordLen(word), set by KeyWordExt::_sortWLIDF; 0 until then
 		double weight;
 		double idf;
 		WordInfo()
 		{
 			word = "";
+			wLen = 0;
 			weight = 0.0;
 			idf = 0.0;
 		}
 		string getInfoStr() const
 		{
-			return string_format("{word:%s, weight:%lf, idf:%lf}", word.c_str(), weight, idf);
+			return string_format("{word:%s,wLen:%zu weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf); // %zu: wLen is a size_t, %d would mismatch
 		}
 	};

@ -58,6 +60,10 @@ namespace CppJieba
 			static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
 		private:
 			bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
+		private:
+			// Weight each word by (word length / total word length) * idf, then sort by descending weight.
+			bool _sortWLIDF(vector<WordInfo>& wordInfos);
+		private:
 			bool _filter(vector<string>& utf8Strs);
 			bool _filterDuplicate(vector<string>& utf8Strs);
 			bool _filterSingleWord(vector<string>& utf8Strs);
--- a/src/Makefile
+++ b/src/Makefile
@ -38,7 +38,7 @@ $(CMLIB):

 #unit test
 Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
-	$(CC) -o $@ $< -DTRIE_UT $(CMLIB) 
+	$(CC) -o $@ $< -DTRIE_UT $(CMLIB)  -liconv

 Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
 	$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv
--- a/src/Trie.cpp
+++ b/src/Trie.cpp
@ -1,3 +1,7 @@
+/*
+ * file encoding: utf-8
+ * author: wuyanyi09@gmail.com
+ */
 #include "Trie.h"

 namespace CppJieba
@ -72,6 +76,13 @@ namespace CppJieba
 			//insert node
 			TrieNodeInfo nodeInfo;
 			nodeInfo.word = chWord;
+			size_t wLen = getUtf8WordLen(chWord);
+			if(0 == wLen)
+			{
+				LogFatal(string_format("getUtf8WordLen(%s) return 0", chWord.c_str()));
+				return false;
+			}
+			nodeInfo.wLen = wLen;
 			nodeInfo.count = count;
 			nodeInfo.tag = tag;

--- a/src/Trie.h
+++ b/src/Trie.h
@ -1,3 +1,7 @@
+/*
+ * file encoding: utf-8
+ * author: wuyanyi09@gmail.com
+ */
 #ifndef CPPJIEBA_TRIE_H
 #define CPPJIEBA_TRIE_H

@ -25,12 +29,18 @@ namespace CppJieba

 	struct TrieNodeInfo
 	{
-		string word;
+		string word;// the word, stored as a UTF-8 encoded string
+		size_t wLen;// length of the word in characters, not bytes (string.size()), e.g. "我是中国人" has wLen = 5.
 		unsigned int count;
 		string tag;
 		double weight;
-		TrieNodeInfo():word(),count(0),tag(),weight(0.0)
+		// Creates an empty record: empty word and tag; wLen, count and weight are 0.
+		// All fields are set in the member-initializer list, in declaration order
+		// (word, wLen, count, tag, weight), so the two strings are constructed
+		// once instead of being default-constructed and then assigned "" in the
+		// constructor body.
+		TrieNodeInfo():word(),wLen(0),count(0),tag(),weight(0.0)
 		{
 		}
 	};