mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add wordlen-idf sort into ext
This commit is contained in:
parent
7aadcb2990
commit
089a63bf2c
@ -79,7 +79,43 @@ namespace CppJieba
|
||||
|
||||
bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b)
|
||||
{
|
||||
return a.weight < b.weight;
|
||||
return a.weight > b.weight;
|
||||
}
|
||||
|
||||
// Re-weights every entry by "word length share * idf" and sorts the vector
// with _wordInfoCompare.
//
// For each entry:
//   wLen   = getUtf8WordLen(word)
//   idf    = -_segment.getUtf8WordWeight(word)
//   weight = wLen / (sum of all wLen) * idf
//
// Returns false (after logging) when any word has length 0 or when the total
// length is 0; in the first case wordInfos may have been partially updated
// (wLen fields only). Returns true once the vector has been sorted.
bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
{
    // Pass 1: cache each word's length and accumulate the total.
    size_t wLenSum = 0;
    for(size_t i = 0; i < wordInfos.size(); i++)
    {
        WordInfo& wInfo = wordInfos[i];
        wInfo.wLen = getUtf8WordLen(wInfo.word);
        if(0 == wInfo.wLen)
        {
            LogFatal(string_format("getUtf8WordLen(%s) return 0", wInfo.word.c_str()));
            return false;
        }
        wLenSum += wInfo.wLen;
    }

    // Every wLen above is non-zero, so the sum is only 0 for an empty vector.
    // This guard also protects the division below.
    if(0 == wLenSum)
    {
        LogFatal("wLenSum == 0.");
        return false;
    }

    // Pass 2: compute idf and the length-weighted score, reusing the length
    // cached (and already validated) in pass 1.
    for(size_t i = 0; i < wordInfos.size(); i++)
    {
        WordInfo& wInfo = wordInfos[i];
        double logWordFreq = _segment.getUtf8WordWeight(wInfo.word);
        wInfo.idf = -logWordFreq;
        wInfo.weight = 1.0 * wInfo.wLen / wLenSum * wInfo.idf;
    }

    sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
    return true;
}
|
||||
|
||||
bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
|
||||
@ -88,14 +124,12 @@ namespace CppJieba
|
||||
vector<WordInfo> wordInfos;
|
||||
for(uint i = 0; i < words.size(); i++)
|
||||
{
|
||||
double w = _segment.getUtf8WordWeight(words[i]);
|
||||
WordInfo wInfo;
|
||||
wInfo.word = words[i];
|
||||
wInfo.weight = w;
|
||||
wInfo.idf = w;
|
||||
wordInfos.push_back(wInfo);
|
||||
}
|
||||
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
|
||||
|
||||
_sortWLIDF(wordInfos);
|
||||
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
|
||||
|
||||
_priorWordPrefixes(wordInfos);
|
||||
|
@ -9,17 +9,19 @@ namespace CppJieba
|
||||
struct WordInfo
|
||||
{
|
||||
string word;
|
||||
size_t wLen;
|
||||
double weight;
|
||||
double idf;
|
||||
WordInfo()
|
||||
{
|
||||
word = "";
|
||||
wLen = 0;
|
||||
weight = 0.0;
|
||||
idf = 0.0;
|
||||
}
|
||||
string getInfoStr() const
|
||||
{
|
||||
return string_format("{word:%s, weight:%lf, idf:%lf}", word.c_str(), weight, idf);
|
||||
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
||||
}
|
||||
};
|
||||
|
||||
@ -58,6 +60,10 @@ namespace CppJieba
|
||||
static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
|
||||
private:
|
||||
bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
|
||||
private:
|
||||
//sort by word len - idf
|
||||
bool _sortWLIDF(vector<WordInfo>& wordInfos);
|
||||
private:
|
||||
bool _filter(vector<string>& utf8Strs);
|
||||
bool _filterDuplicate(vector<string>& utf8Strs);
|
||||
bool _filterSingleWord(vector<string>& utf8Strs);
|
||||
|
@ -38,7 +38,7 @@ $(CMLIB):
|
||||
|
||||
#unit test
|
||||
Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
|
||||
$(CC) -o $@ $< -DTRIE_UT $(CMLIB)
|
||||
$(CC) -o $@ $< -DTRIE_UT $(CMLIB) -liconv
|
||||
|
||||
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
|
||||
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv
|
||||
|
11
src/Trie.cpp
11
src/Trie.cpp
@ -1,3 +1,7 @@
|
||||
/*
|
||||
* file encoding: utf-8
|
||||
* author: wuyanyi09@gmail.com
|
||||
*/
|
||||
#include "Trie.h"
|
||||
|
||||
namespace CppJieba
|
||||
@ -72,6 +76,13 @@ namespace CppJieba
|
||||
//insert node
|
||||
TrieNodeInfo nodeInfo;
|
||||
nodeInfo.word = chWord;
|
||||
size_t wLen = getUtf8WordLen(chWord);
|
||||
if(0 == wLen)
|
||||
{
|
||||
LogFatal(string_format("getUtf8WordLen(%s) return 0", chWord.c_str()));
|
||||
return false;
|
||||
}
|
||||
nodeInfo.wLen = wLen;
|
||||
nodeInfo.count = count;
|
||||
nodeInfo.tag = tag;
|
||||
|
||||
|
14
src/Trie.h
14
src/Trie.h
@ -1,3 +1,7 @@
|
||||
/*
|
||||
* file encoding: utf-8
|
||||
* author: wuyanyi09@gmail.com
|
||||
*/
|
||||
#ifndef CPPJIEBA_TRIE_H
|
||||
#define CPPJIEBA_TRIE_H
|
||||
|
||||
@ -25,12 +29,18 @@ namespace CppJieba
|
||||
|
||||
// Payload stored for a word inserted into the trie.
struct TrieNodeInfo
{
    string word;        // utf8 string word
    size_t wLen;        // the word's length in characters, not string.size();
                        // e.g. a UTF-8 string of five Chinese characters has wLen = 5
    unsigned int count; // count value supplied when the word is inserted
    string tag;         // tag value supplied when the word is inserted
    double weight;      // weight associated with the word; 0.0 until assigned

    // Starts with empty strings and all numeric fields zeroed.
    TrieNodeInfo()
    {
        word = "";
        wLen = 0;
        count = 0;
        tag = "";
        weight = 0.0;
    }
};
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user