mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add wordlen-idf sort into ext
This commit is contained in:
parent
7aadcb2990
commit
089a63bf2c
@ -79,7 +79,43 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b)
|
bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b)
|
||||||
{
|
{
|
||||||
return a.weight < b.weight;
|
return a.weight > b.weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Re-weights every entry of wordInfos by word length and idf, then sorts
 * the vector with _wordInfoCompare.
 *
 * For each entry:
 *   wLen   = getUtf8WordLen(word)
 *   idf    = -_segment.getUtf8WordWeight(word)
 *   weight = (wLen / sum of all wLen) * idf
 *
 * wordInfos: entries whose `word` is already filled in; wLen, idf and
 *            weight are overwritten.
 * Returns false (after logging) if any word has length 0 or if the total
 * length is 0 -- the latter also covers an empty wordInfos. On a false
 * return the vector is left unsorted and may be partially updated.
 */
bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
{
    // Pass 1: cache each word's length and accumulate the total.
    size_t wLenSum = 0;
    for(size_t i = 0; i < wordInfos.size(); i++)
    {
        WordInfo& wInfo = wordInfos[i];
        wInfo.wLen = getUtf8WordLen(wInfo.word);
        if(0 == wInfo.wLen)
        {
            // Report the offending word; this is the only place a zero
            // length can be observed, so no later re-check is needed.
            LogFatal(string_format("getUtf8WordLen(%s) return 0", wInfo.word.c_str()));
            return false;
        }
        wLenSum += wInfo.wLen;
    }

    // Guards the division below (reached only when wordInfos is empty).
    if(0 == wLenSum)
    {
        LogFatal("wLenSum == 0.");
        return false;
    }

    // Pass 2: derive idf and the length-scaled weight, reusing the cached wLen.
    for(size_t i = 0; i < wordInfos.size(); i++)
    {
        WordInfo& wInfo = wordInfos[i];
        // NOTE(review): the value is presumably a log word frequency (the
        // original local was named logWordFreq) -- confirm in Segment.
        double logWordFreq = _segment.getUtf8WordWeight(wInfo.word);
        wInfo.idf = -logWordFreq;
        wInfo.weight = 1.0 * wInfo.wLen / wLenSum * wInfo.idf;
    }

    sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
    return true;
}
|
||||||
|
|
||||||
bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
|
bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
|
||||||
@ -88,14 +124,12 @@ namespace CppJieba
|
|||||||
vector<WordInfo> wordInfos;
|
vector<WordInfo> wordInfos;
|
||||||
for(uint i = 0; i < words.size(); i++)
|
for(uint i = 0; i < words.size(); i++)
|
||||||
{
|
{
|
||||||
double w = _segment.getUtf8WordWeight(words[i]);
|
|
||||||
WordInfo wInfo;
|
WordInfo wInfo;
|
||||||
wInfo.word = words[i];
|
wInfo.word = words[i];
|
||||||
wInfo.weight = w;
|
|
||||||
wInfo.idf = w;
|
|
||||||
wordInfos.push_back(wInfo);
|
wordInfos.push_back(wInfo);
|
||||||
}
|
}
|
||||||
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
|
|
||||||
|
_sortWLIDF(wordInfos);
|
||||||
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
|
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
|
||||||
|
|
||||||
_priorWordPrefixes(wordInfos);
|
_priorWordPrefixes(wordInfos);
|
||||||
|
@ -9,17 +9,19 @@ namespace CppJieba
|
|||||||
struct WordInfo
|
struct WordInfo
|
||||||
{
|
{
|
||||||
string word;
|
string word;
|
||||||
|
size_t wLen;
|
||||||
double weight;
|
double weight;
|
||||||
double idf;
|
double idf;
|
||||||
WordInfo()
|
WordInfo()
|
||||||
{
|
{
|
||||||
word = "";
|
word = "";
|
||||||
|
wLen = 0;
|
||||||
weight = 0.0;
|
weight = 0.0;
|
||||||
idf = 0.0;
|
idf = 0.0;
|
||||||
}
|
}
|
||||||
string getInfoStr() const
|
string getInfoStr() const
|
||||||
{
|
{
|
||||||
return string_format("{word:%s, weight:%lf, idf:%lf}", word.c_str(), weight, idf);
|
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -58,6 +60,10 @@ namespace CppJieba
|
|||||||
static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
|
static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
|
||||||
private:
|
private:
|
||||||
bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
|
bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
|
||||||
|
private:
|
||||||
|
// Sort by word length and idf: sets each entry's wLen, idf and weight (weight = wLen / total wLen * idf), then sorts wordInfos with _wordInfoCompare. Returns false without sorting if any word has length 0 or the total length is 0.
|
||||||
|
bool _sortWLIDF(vector<WordInfo>& wordInfos);
|
||||||
|
private:
|
||||||
bool _filter(vector<string>& utf8Strs);
|
bool _filter(vector<string>& utf8Strs);
|
||||||
bool _filterDuplicate(vector<string>& utf8Strs);
|
bool _filterDuplicate(vector<string>& utf8Strs);
|
||||||
bool _filterSingleWord(vector<string>& utf8Strs);
|
bool _filterSingleWord(vector<string>& utf8Strs);
|
||||||
|
@ -38,7 +38,7 @@ $(CMLIB):
|
|||||||
|
|
||||||
#unit test
|
#unit test
|
||||||
Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
|
Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
|
||||||
$(CC) -o $@ $< -DTRIE_UT $(CMLIB)
|
$(CC) -o $@ $< -DTRIE_UT $(CMLIB) -liconv
|
||||||
|
|
||||||
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
|
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
|
||||||
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv
|
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) -liconv
|
||||||
|
11
src/Trie.cpp
11
src/Trie.cpp
@ -1,3 +1,7 @@
|
|||||||
|
/*
|
||||||
|
* file encoding: utf-8
|
||||||
|
* author: wuyanyi09@gmail.com
|
||||||
|
*/
|
||||||
#include "Trie.h"
|
#include "Trie.h"
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
@ -72,6 +76,13 @@ namespace CppJieba
|
|||||||
//insert node
|
//insert node
|
||||||
TrieNodeInfo nodeInfo;
|
TrieNodeInfo nodeInfo;
|
||||||
nodeInfo.word = chWord;
|
nodeInfo.word = chWord;
|
||||||
|
size_t wLen = getUtf8WordLen(chWord);
|
||||||
|
if(0 == wLen)
|
||||||
|
{
|
||||||
|
LogFatal(string_format("getUtf8WordLen(%s) return 0", chWord.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
nodeInfo.wLen = wLen;
|
||||||
nodeInfo.count = count;
|
nodeInfo.count = count;
|
||||||
nodeInfo.tag = tag;
|
nodeInfo.tag = tag;
|
||||||
|
|
||||||
|
14
src/Trie.h
14
src/Trie.h
@ -1,3 +1,7 @@
|
|||||||
|
/*
|
||||||
|
* file encoding: utf-8
|
||||||
|
* author: wuyanyi09@gmail.com
|
||||||
|
*/
|
||||||
#ifndef CPPJIEBA_TRIE_H
|
#ifndef CPPJIEBA_TRIE_H
|
||||||
#define CPPJIEBA_TRIE_H
|
#define CPPJIEBA_TRIE_H
|
||||||
|
|
||||||
@ -25,12 +29,18 @@ namespace CppJieba
|
|||||||
|
|
||||||
// Payload stored for one dictionary word in the trie.
struct TrieNodeInfo
{
    std::string word;    // UTF-8 encoded word
    size_t wLen;         // length of the word in characters, not string.size(); e.g. "我是中国人" has wLen == 5
    unsigned int count;  // count value supplied when the word is inserted
    std::string tag;     // tag value supplied when the word is inserted
    double weight;       // weight associated with the word; 0.0 until assigned

    // Initializes every member directly in the initializer list instead of
    // default-constructing the strings and then assigning to them.
    TrieNodeInfo() : word(), wLen(0), count(0), tag(), weight(0.0)
    {
    }
};
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user