mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
using idf.utf8 in keywordExtractor
This commit is contained in:
parent
9229fec6ca
commit
62b83a36a0
258826
dict/idf.utf8
Normal file
258826
dict/idf.utf8
Normal file
File diff suppressed because it is too large
Load Diff
@ -11,13 +11,12 @@ namespace CppJieba
|
||||
struct KeyWordInfo
|
||||
{
|
||||
string word;
|
||||
uint freq;
|
||||
double idf;
|
||||
};
|
||||
|
||||
inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
|
||||
{
|
||||
return os << keyword.word << "," << keyword.freq << "," << keyword.idf;
|
||||
return os << keyword.word << "," << keyword.idf;
|
||||
}
|
||||
|
||||
class KeywordExtractor
|
||||
@ -25,9 +24,7 @@ namespace CppJieba
|
||||
private:
|
||||
MPSegment _segment;
|
||||
private:
|
||||
unordered_map<string, const KeyWordInfo* > _wordIndex;
|
||||
vector<KeyWordInfo> _wordinfos;
|
||||
size_t _totalFreq;
|
||||
unordered_map<string, double> _idfMap;
|
||||
protected:
|
||||
bool _isInited;
|
||||
bool _getInitFlag()const{return _isInited;};
|
||||
@ -36,22 +33,19 @@ namespace CppJieba
|
||||
operator bool(){return _getInitFlag();};
|
||||
public:
|
||||
KeywordExtractor(){_setInitFlag(false);};
|
||||
explicit KeywordExtractor(const string& dictPath){_setInitFlag(init(dictPath));};
|
||||
explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
|
||||
~KeywordExtractor(){};
|
||||
public:
|
||||
bool init(const string& dictPath)
|
||||
bool init(const string& dictPath, const string& idfPath)
|
||||
{
|
||||
ifstream ifs(dictPath.c_str());
|
||||
ifstream ifs(idfPath.c_str());
|
||||
if(!ifs)
|
||||
{
|
||||
LogError("open %s failed.", dictPath.c_str());
|
||||
LogError("open %s failed.", idfPath.c_str());
|
||||
return false;
|
||||
}
|
||||
_totalFreq = 0;
|
||||
int tfreq;
|
||||
string line ;
|
||||
vector<string> buf;
|
||||
KeyWordInfo keywordInfo;
|
||||
for(uint lineno = 0; getline(ifs, line); lineno++)
|
||||
{
|
||||
buf.clear();
|
||||
@ -60,33 +54,12 @@ namespace CppJieba
|
||||
LogError("line[%d] empty. skipped.", lineno);
|
||||
continue;
|
||||
}
|
||||
if(!split(line, buf, " ") || buf.size() != 3)
|
||||
if(!split(line, buf, " ") || buf.size() != 2)
|
||||
{
|
||||
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
||||
continue;
|
||||
}
|
||||
keywordInfo.word = buf[0];
|
||||
tfreq= atoi(buf[1].c_str());
|
||||
if(tfreq <= 0)
|
||||
{
|
||||
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
||||
continue;
|
||||
}
|
||||
keywordInfo.freq = tfreq;
|
||||
_totalFreq += tfreq;
|
||||
_wordinfos.push_back(keywordInfo);
|
||||
}
|
||||
|
||||
// calculate idf & make index.
|
||||
for(uint i = 0; i < _wordinfos.size(); i++)
|
||||
{
|
||||
if(_wordinfos[i].freq <= 0)
|
||||
{
|
||||
LogFatal("freq value is not positive.");
|
||||
return false;
|
||||
}
|
||||
_wordinfos[i].idf = -log(_wordinfos[i].freq);
|
||||
_wordIndex[_wordinfos[i].word] = &(_wordinfos[i]);
|
||||
_idfMap[buf[0]] = atof(buf[1].c_str());
|
||||
}
|
||||
return _setInitFlag(_segment.init(dictPath));
|
||||
};
|
||||
@ -110,10 +83,10 @@ namespace CppJieba
|
||||
|
||||
for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
|
||||
{
|
||||
unordered_map<string, const KeyWordInfo*>::const_iterator cit = _wordIndex.find(itr->first);
|
||||
if(cit != _wordIndex.end())
|
||||
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
|
||||
if(cit != _idfMap.end())
|
||||
{
|
||||
itr->second *= cit->second->idf;
|
||||
itr->second *= cit->second;
|
||||
itr ++;
|
||||
}
|
||||
else
|
||||
|
@ -5,7 +5,7 @@ using namespace CppJieba;
|
||||
|
||||
TEST(KeywordExtractorTest, Test1)
|
||||
{
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8");
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
||||
const char* str = "我来自北京邮电大学。。。 学号 123456";
|
||||
const char* res[] = {"北京邮电大学", "来自"};
|
||||
vector<string> words;
|
||||
@ -16,9 +16,9 @@ TEST(KeywordExtractorTest, Test1)
|
||||
|
||||
TEST(KeywordExtractorTest, Test2)
|
||||
{
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8");
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
||||
const char* str = "我来自北京邮电大学。。。 学号 123456";
|
||||
const char* res[] = {"北京邮电大学", "来自", "学", "号", "我"};
|
||||
const char* res[] = {"北京邮电大学", "来自"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(extractor);
|
||||
ASSERT_TRUE(extractor.extract(str, words, 9));
|
||||
@ -31,10 +31,9 @@ TEST(KeywordExtractorTest, Test3)
|
||||
ifstream ifs("../test/testdata/weicheng.utf8");
|
||||
ASSERT_TRUE(ifs);
|
||||
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8");
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
||||
vector<string> keywords;
|
||||
string res;
|
||||
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
||||
extractor.extract(str, keywords, 5);
|
||||
res << keywords;
|
||||
ASSERT_EQ("[\"第三性\", \"多愁多病\", \"记挂着\", \"揭去\", \"贫血症\"]", res);
|
||||
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user