add wordlen-idf sort into ext

This commit is contained in:
gwdwyy 2013-07-18 13:08:08 +08:00
parent 7aadcb2990
commit 089a63bf2c
5 changed files with 70 additions and 9 deletions

View File

@ -79,7 +79,43 @@ namespace CppJieba
// Ordering predicate over WordInfo::weight; it is the comparator handed to
// sort() when keyword candidates are ranked.
// NOTE(review): two return statements appear back to back below ('<' then '>').
// As written only the first one can execute and the second is unreachable.
// This looks like the removed and the added line of a diff shown together --
// confirm which single return the real file contains ('>' would rank the
// highest weight first).
bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b)
{
return a.weight < b.weight;
return a.weight > b.weight;
}
// Re-weights every entry of wordInfos by "word length x idf" and sorts the
// vector with _wordInfoCompare.
//
// For each entry:
//   wLen   = getUtf8WordLen(word)
//   idf    = -(_segment.getUtf8WordWeight(word))
//   weight = wLen / (sum of all wLen) * idf
//
// @param wordInfos  candidates to re-weight; wLen, idf and weight are
//                   overwritten and the vector is reordered in place.
// @return true on success. false (after LogFatal) when any word has a length
//         of 0 or when the summed length is 0, which includes an empty
//         vector. On failure the vector is left unsorted and the entries
//         visited so far keep their freshly written wLen.
bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
{
    // Pass 1: measure each word once and accumulate the normalisation term.
    size_t wLenSum = 0;
    for(size_t i = 0; i < wordInfos.size(); i++)
    {
        WordInfo& wInfo = wordInfos[i];
        wInfo.wLen = getUtf8WordLen(wInfo.word);
        if(0 == wInfo.wLen)
        {
            // Name the offending word so the failure can be traced.
            LogFatal(string_format("getUtf8WordLen(%s) return 0", wInfo.word.c_str()));
            return false;
        }
        wLenSum += wInfo.wLen;
    }
    if(0 == wLenSum)
    {
        // Only reachable for an empty vector; also guards the division below.
        LogFatal("wLenSum == 0.");
        return false;
    }

    // Pass 2: every wLen is known to be non-zero here, so the stored value is
    // reused instead of measuring (and re-validating) the word a second time.
    for(size_t i = 0; i < wordInfos.size(); i++)
    {
        WordInfo& wInfo = wordInfos[i];
        wInfo.idf = -_segment.getUtf8WordWeight(wInfo.word);
        wInfo.weight = 1.0 * wInfo.wLen / wLenSum * wInfo.idf;
    }

    sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
    return true;
}
bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
@ -88,14 +124,12 @@ namespace CppJieba
vector<WordInfo> wordInfos;
for(uint i = 0; i < words.size(); i++)
{
double w = _segment.getUtf8WordWeight(words[i]);
WordInfo wInfo;
wInfo.word = words[i];
wInfo.weight = w;
wInfo.idf = w;
wordInfos.push_back(wInfo);
}
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
_sortWLIDF(wordInfos);
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
_priorWordPrefixes(wordInfos);

View File

@ -9,17 +9,19 @@ namespace CppJieba
// One keyword candidate together with the figures used to rank it.
// The default constructor leaves the word empty and every number at zero.
struct WordInfo
{
// The candidate word itself.
string word;
// Length of the word as reported by getUtf8WordLen(word); filled in by
// KeyWordExt::_sortWLIDF.
size_t wLen;
// Ranking score; this is the field _wordInfoCompare compares.
double weight;
// Negated value of _segment.getUtf8WordWeight(word), as set by
// KeyWordExt::_sortWLIDF.
double idf;
WordInfo()
{
word = "";
wLen = 0;
weight = 0.0;
idf = 0.0;
}
// Renders the fields as a single brace-delimited string via string_format.
// NOTE(review): two return statements appear back to back below; only the
// first can execute as written. This looks like the removed and the added
// line of a diff shown together -- confirm which one the real file contains.
// NOTE(review): the second format string pairs "%d" with wLen, which is a
// size_t. If string_format forwards to a printf-family function that is a
// specifier/argument mismatch -- confirm and use a matching specifier or cast.
string getInfoStr() const
{
return string_format("{word:%s, weight:%lf, idf:%lf}", word.c_str(), weight, idf);
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
}
};
@ -58,6 +60,10 @@ namespace CppJieba
static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
private:
bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
private:
// Sort by word length and idf: recomputes wLen, idf and weight for every
// entry of wordInfos, then sorts the vector with _wordInfoCompare.
// Returns false (after logging) when a word's UTF-8 length is 0 or when the
// summed length is 0; returns true otherwise.
bool _sortWLIDF(vector<WordInfo>& wordInfos);
private:
bool _filter(vector<string>& utf8Strs);
bool _filterDuplicate(vector<string>& utf8Strs);
bool _filterSingleWord(vector<string>& utf8Strs);

View File

@ -38,7 +38,7 @@ $(CMLIB):
#unit test
Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
$(CC) -o $@ $< -DTRIE_UT $(CMLIB)
$(CC) -o $@ $< -DTRIE_UT $(CMLIB) -liconv
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv

View File

@ -1,3 +1,7 @@
/*
* file encoding: utf-8
* author: wuyanyi09@gmail.com
*/
#include "Trie.h"
namespace CppJieba
@ -72,6 +76,13 @@ namespace CppJieba
//insert node
TrieNodeInfo nodeInfo;
nodeInfo.word = chWord;
size_t wLen = getUtf8WordLen(chWord);
if(0 == wLen)
{
LogFatal(string_format("getUtf8WordLen(%s) return 0", chWord.c_str()));
return false;
}
nodeInfo.wLen = wLen;
nodeInfo.count = count;
nodeInfo.tag = tag;

View File

@ -1,3 +1,7 @@
/*
* file encoding: utf-8
* author: wuyanyi09@gmail.com
*/
#ifndef CPPJIEBA_TRIE_H
#define CPPJIEBA_TRIE_H
@ -25,12 +29,18 @@ namespace CppJieba
// Per-word record filled in when a word is inserted into the trie.
// The body-style constructor resets the strings to empty and the numbers to zero.
// NOTE(review): `string word;` appears twice and there are two constructor
// heads (an initializer-list form directly followed by a body form). This
// looks like the removed and the added lines of a diff shown together --
// confirm against the real header, which can only contain one of each.
struct TrieNodeInfo
{
string word;
string word;// utf8 string word
size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 .
// Assigned from the insert routine's `count` value.
unsigned int count;
// Assigned from the insert routine's `tag` value.
string tag;
// Not assigned in the visible insert code; presumably derived later -- TODO confirm.
double weight;
TrieNodeInfo():word(),count(0),tag(),weight(0.0)
TrieNodeInfo()
{
word = "";
wLen = 0;
count = 0;
tag = "";
weight = 0.0;
}
};