mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add blacklist
This commit is contained in:
parent
41a33747f4
commit
f64c11c57e
@ -1,37 +1,37 @@
|
|||||||
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||||
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
||||||
|
|
||||||
#include "MPSegment.hpp"
|
#include "MixSegment.hpp"
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <unordered_set>
|
||||||
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
|
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
|
|
||||||
//struct KeyWordInfo
|
/*utf8*/
|
||||||
//{
|
const char * BLACK_LIST[] = {"。", ",", "、", "我", "的", "”", "“", "了",
|
||||||
// string word;
|
"你", "她", "他", "它", "说", "是", ":", "不"};
|
||||||
// double tfidf;
|
|
||||||
//};
|
|
||||||
|
|
||||||
//inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
|
|
||||||
//{
|
|
||||||
// return os << keyword.word << "," << keyword.idf;
|
|
||||||
//}
|
|
||||||
|
|
||||||
class KeywordExtractor: public InitOnOff
|
class KeywordExtractor: public InitOnOff
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
MPSegment _segment;
|
MixSegment _segment;
|
||||||
private:
|
private:
|
||||||
unordered_map<string, double> _idfMap;
|
unordered_map<string, double> _idfMap;
|
||||||
|
double _idfAverage;
|
||||||
|
|
||||||
|
unordered_set<string> _blackSet;
|
||||||
public:
|
public:
|
||||||
KeywordExtractor(){_setInitFlag(false);};
|
KeywordExtractor(){_setInitFlag(false);};
|
||||||
explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
|
explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath)
|
||||||
|
{
|
||||||
|
_setInitFlag(init(dictPath, hmmFilePath, idfPath));
|
||||||
|
};
|
||||||
~KeywordExtractor(){};
|
~KeywordExtractor(){};
|
||||||
public:
|
public:
|
||||||
bool init(const string& dictPath, const string& idfPath)
|
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath)
|
||||||
{
|
{
|
||||||
ifstream ifs(idfPath.c_str());
|
ifstream ifs(idfPath.c_str());
|
||||||
if(!ifs)
|
if(!ifs)
|
||||||
@ -41,7 +41,10 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
string line ;
|
string line ;
|
||||||
vector<string> buf;
|
vector<string> buf;
|
||||||
for(uint lineno = 0; getline(ifs, line); lineno++)
|
double idf = 0.0;
|
||||||
|
double idfSum = 0.0;
|
||||||
|
size_t lineno = 0;
|
||||||
|
for(;getline(ifs, line); lineno++)
|
||||||
{
|
{
|
||||||
buf.clear();
|
buf.clear();
|
||||||
if(line.empty())
|
if(line.empty())
|
||||||
@ -54,9 +57,22 @@ namespace CppJieba
|
|||||||
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
_idfMap[buf[0]] = atof(buf[1].c_str());
|
idf = atof(buf[1].c_str());
|
||||||
|
_idfMap[buf[0]] = idf;
|
||||||
|
idfSum += idf;
|
||||||
|
|
||||||
}
|
}
|
||||||
return _setInitFlag(_segment.init(dictPath));
|
|
||||||
|
std::copy(
|
||||||
|
BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]),
|
||||||
|
std::inserter(_blackSet, _blackSet.begin()));
|
||||||
|
|
||||||
|
assert(lineno);
|
||||||
|
_idfAverage = idfSum / lineno;
|
||||||
|
|
||||||
|
assert(_idfAverage > 0.0);
|
||||||
|
|
||||||
|
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
||||||
};
|
};
|
||||||
public:
|
public:
|
||||||
|
|
||||||
@ -90,18 +106,24 @@ namespace CppJieba
|
|||||||
wordmap[ words[i] ] += 1.0;
|
wordmap[ words[i] ] += 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
|
for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
|
||||||
{
|
{
|
||||||
|
if(_blackSet.end() != _blackSet.find(itr->first))
|
||||||
|
{
|
||||||
|
itr = wordmap.erase(itr);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
|
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
|
||||||
if(cit != _idfMap.end())
|
if(cit != _idfMap.end())
|
||||||
{
|
{
|
||||||
itr->second *= cit->second;
|
itr->second *= cit->second;
|
||||||
itr ++;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
itr = wordmap.erase(itr);
|
itr->second *= _idfAverage;
|
||||||
}
|
}
|
||||||
|
itr ++;
|
||||||
}
|
}
|
||||||
|
|
||||||
keywords.resize(MIN(topN, wordmap.size()));
|
keywords.resize(MIN(topN, wordmap.size()));
|
||||||
|
@ -3,25 +3,25 @@
|
|||||||
|
|
||||||
using namespace CppJieba;
|
using namespace CppJieba;
|
||||||
|
|
||||||
|
const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456";
|
||||||
|
|
||||||
TEST(KeywordExtractorTest, Test1)
|
TEST(KeywordExtractorTest, Test1)
|
||||||
{
|
{
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||||
const char* str = "我来自北京邮电大学。。。 学号 123456";
|
const char* res[] = {"学号", "北京邮电大学"};
|
||||||
const char* res[] = {"北京邮电大学", "来自"};
|
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(extractor);
|
ASSERT_TRUE(extractor);
|
||||||
ASSERT_TRUE(extractor.extract(str, words, 2));
|
ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2));
|
||||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeywordExtractorTest, Test2)
|
TEST(KeywordExtractorTest, Test2)
|
||||||
{
|
{
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||||
const char* str = "我来自北京邮电大学。。。 学号 123456";
|
const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"};
|
||||||
const char* res[] = {"北京邮电大学", "来自"};
|
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(extractor);
|
ASSERT_TRUE(extractor);
|
||||||
ASSERT_TRUE(extractor.extract(str, words, 9));
|
ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
|
||||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -31,10 +31,13 @@ TEST(KeywordExtractorTest, Test3)
|
|||||||
ifstream ifs("../test/testdata/weicheng.utf8");
|
ifstream ifs("../test/testdata/weicheng.utf8");
|
||||||
ASSERT_TRUE(!!ifs);
|
ASSERT_TRUE(!!ifs);
|
||||||
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||||
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
||||||
vector<string> keywords;
|
vector<string> keywords;
|
||||||
extractor.extract(str, keywords, 5);
|
extractor.extract(str, keywords, 5);
|
||||||
|
print(keywords);
|
||||||
|
print(__LINE__);
|
||||||
|
exit(1);
|
||||||
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -44,7 +47,7 @@ TEST(KeywordExtractorTest, Test4)
|
|||||||
ifstream ifs("../test/testdata/weicheng.utf8");
|
ifstream ifs("../test/testdata/weicheng.utf8");
|
||||||
ASSERT_TRUE(!!ifs);
|
ASSERT_TRUE(!!ifs);
|
||||||
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||||
//const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
//const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
||||||
vector<pair<string,double> > keywords;
|
vector<pair<string,double> > keywords;
|
||||||
extractor.extract(str, keywords, 5);
|
extractor.extract(str, keywords, 5);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user