add blacklist

This commit is contained in:
wyy 2014-01-31 17:37:40 +08:00
parent 41a33747f4
commit f64c11c57e
2 changed files with 56 additions and 31 deletions

View File

@ -1,37 +1,37 @@
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
#define CPPJIEBA_KEYWORD_EXTRACTOR_H #define CPPJIEBA_KEYWORD_EXTRACTOR_H
#include "MPSegment.hpp" #include "MixSegment.hpp"
#include <cmath> #include <cmath>
#include <unordered_set>
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
namespace CppJieba namespace CppJieba
{ {
using namespace Limonp; using namespace Limonp;
//struct KeyWordInfo /*utf8*/
//{ const char * BLACK_LIST[] = {"", "", "", "", "", "", "", "",
// string word; "", "", "", "", "", "", "", ""};
// double tfidf;
//};
//inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
//{
// return os << keyword.word << "," << keyword.idf;
//}
class KeywordExtractor: public InitOnOff class KeywordExtractor: public InitOnOff
{ {
private: private:
MPSegment _segment; MixSegment _segment;
private: private:
unordered_map<string, double> _idfMap; unordered_map<string, double> _idfMap;
double _idfAverage;
unordered_set<string> _blackSet;
public: public:
KeywordExtractor(){_setInitFlag(false);}; KeywordExtractor(){_setInitFlag(false);};
explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));}; explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath)
{
_setInitFlag(init(dictPath, hmmFilePath, idfPath));
};
~KeywordExtractor(){}; ~KeywordExtractor(){};
public: public:
bool init(const string& dictPath, const string& idfPath) bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath)
{ {
ifstream ifs(idfPath.c_str()); ifstream ifs(idfPath.c_str());
if(!ifs) if(!ifs)
@ -41,7 +41,10 @@ namespace CppJieba
} }
string line ; string line ;
vector<string> buf; vector<string> buf;
for(uint lineno = 0; getline(ifs, line); lineno++) double idf = 0.0;
double idfSum = 0.0;
size_t lineno = 0;
for(;getline(ifs, line); lineno++)
{ {
buf.clear(); buf.clear();
if(line.empty()) if(line.empty())
@ -54,9 +57,22 @@ namespace CppJieba
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue; continue;
} }
_idfMap[buf[0]] = atof(buf[1].c_str()); idf = atof(buf[1].c_str());
_idfMap[buf[0]] = idf;
idfSum += idf;
} }
return _setInitFlag(_segment.init(dictPath));
std::copy(
BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]),
std::inserter(_blackSet, _blackSet.begin()));
assert(lineno);
_idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0);
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
}; };
public: public:
@ -92,16 +108,22 @@ namespace CppJieba
for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
{ {
if(_blackSet.end() != _blackSet.find(itr->first))
{
itr = wordmap.erase(itr);
continue;
}
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first); unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end()) if(cit != _idfMap.end())
{ {
itr->second *= cit->second; itr->second *= cit->second;
itr ++;
} }
else else
{ {
itr = wordmap.erase(itr); itr->second *= _idfAverage;
} }
itr ++;
} }
keywords.resize(MIN(topN, wordmap.size())); keywords.resize(MIN(topN, wordmap.size()));

View File

@ -3,25 +3,25 @@
using namespace CppJieba; using namespace CppJieba;
const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456";
TEST(KeywordExtractorTest, Test1) TEST(KeywordExtractorTest, Test1)
{ {
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
const char* str = "我来自北京邮电大学。。。 学号 123456"; const char* res[] = {"学号", "北京邮电大学"};
const char* res[] = {"北京邮电大学", "来自"};
vector<string> words; vector<string> words;
ASSERT_TRUE(extractor); ASSERT_TRUE(extractor);
ASSERT_TRUE(extractor.extract(str, words, 2)); ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
} }
TEST(KeywordExtractorTest, Test2) TEST(KeywordExtractorTest, Test2)
{ {
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
const char* str = "我来自北京邮电大学。。。 学号 123456"; const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"};
const char* res[] = {"北京邮电大学", "来自"};
vector<string> words; vector<string> words;
ASSERT_TRUE(extractor); ASSERT_TRUE(extractor);
ASSERT_TRUE(extractor.extract(str, words, 9)); ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
} }
@ -31,10 +31,13 @@ TEST(KeywordExtractorTest, Test3)
ifstream ifs("../test/testdata/weicheng.utf8"); ifstream ifs("../test/testdata/weicheng.utf8");
ASSERT_TRUE(!!ifs); ASSERT_TRUE(!!ifs);
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>())); string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
vector<string> keywords; vector<string> keywords;
extractor.extract(str, keywords, 5); extractor.extract(str, keywords, 5);
print(keywords);
print(__LINE__);
exit(1);
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
} }
@ -44,7 +47,7 @@ TEST(KeywordExtractorTest, Test4)
ifstream ifs("../test/testdata/weicheng.utf8"); ifstream ifs("../test/testdata/weicheng.utf8");
ASSERT_TRUE(!!ifs); ASSERT_TRUE(!!ifs);
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>())); string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
//const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
vector<pair<string,double> > keywords; vector<pair<string,double> > keywords;
extractor.extract(str, keywords, 5); extractor.extract(str, keywords, 5);