From f64c11c57e4b281e73b039ece989cc6f2b6d1ab7 Mon Sep 17 00:00:00 2001 From: wyy Date: Fri, 31 Jan 2014 17:37:40 +0800 Subject: [PATCH] add blacklist --- src/KeywordExtractor.hpp | 64 +++++++++++++++++++---------- test/unittest/TKeywordExtractor.cpp | 23 ++++++----- 2 files changed, 56 insertions(+), 31 deletions(-) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 295ee05..899fb32 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -1,37 +1,37 @@ #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H #define CPPJIEBA_KEYWORD_EXTRACTOR_H -#include "MPSegment.hpp" +#include "MixSegment.hpp" #include +#include #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) namespace CppJieba { using namespace Limonp; - //struct KeyWordInfo - //{ - // string word; - // double tfidf; - //}; - - //inline ostream& operator << (ostream& os, const KeyWordInfo & keyword) - //{ - // return os << keyword.word << "," << keyword.idf; - //} + /*utf8*/ + const char * BLACK_LIST[] = {"。", ",", "、", "我", "的", "”", "“", "了", + "你", "她", "他", "它", "说", "是", ":", "不"}; class KeywordExtractor: public InitOnOff { private: - MPSegment _segment; + MixSegment _segment; private: unordered_map _idfMap; + double _idfAverage; + + unordered_set _blackSet; public: KeywordExtractor(){_setInitFlag(false);}; - explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));}; + explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath) + { + _setInitFlag(init(dictPath, hmmFilePath, idfPath)); + }; ~KeywordExtractor(){}; public: - bool init(const string& dictPath, const string& idfPath) + bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath) { ifstream ifs(idfPath.c_str()); if(!ifs) @@ -41,7 +41,10 @@ namespace CppJieba } string line ; vector buf; - for(uint lineno = 0; getline(ifs, line); lineno++) + double idf = 0.0; + double idfSum = 0.0; + size_t lineno = 0; + for(;getline(ifs, line); lineno++) { buf.clear(); if(line.empty()) @@ -54,9 +57,22 @@ namespace CppJieba LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); continue; } - _idfMap[buf[0]] = atof(buf[1].c_str()); - } - return _setInitFlag(_segment.init(dictPath)); + idf = atof(buf[1].c_str()); + _idfMap[buf[0]] = idf; + idfSum += idf; + + } + + std::copy( + BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]), + std::inserter(_blackSet, _blackSet.begin())); + + assert(lineno); + _idfAverage = idfSum / lineno; + + assert(_idfAverage > 0.0); + + return _setInitFlag(_segment.init(dictPath, hmmFilePath)); }; public: @@ -90,18 +106,24 @@ namespace CppJieba wordmap[ words[i] ] += 1.0; } - for(unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end();) + for(unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { + if(_blackSet.end() != _blackSet.find(itr->first)) + { + itr = wordmap.erase(itr); + continue; + } + unordered_map::const_iterator cit = _idfMap.find(itr->first); if(cit != _idfMap.end()) { itr->second *= cit->second; - itr ++; } else { - itr = wordmap.erase(itr); + itr->second *= _idfAverage; } + itr ++; } keywords.resize(MIN(topN, wordmap.size())); diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index 2709227..355f8b2 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -3,25 +3,25 @@ using namespace CppJieba; +const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456"; + TEST(KeywordExtractorTest, Test1) { - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - const char* str = "我来自北京邮电大学。。。 学号 123456"; - const char* res[] = {"北京邮电大学", "来自"}; + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + const char* res[] = {"学号", "北京邮电大学"}; vector words; ASSERT_TRUE(extractor); - ASSERT_TRUE(extractor.extract(str, words, 2)); + ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2)); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } TEST(KeywordExtractorTest, Test2) { - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - const char* str = "我来自北京邮电大学。。。 学号 123456"; - const char* res[] = {"北京邮电大学", "来自"}; + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"}; vector words; ASSERT_TRUE(extractor); - ASSERT_TRUE(extractor.extract(str, words, 9)); + ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9)); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } @@ -31,10 +31,13 @@ TEST(KeywordExtractorTest, Test3) ifstream ifs("../test/testdata/weicheng.utf8"); ASSERT_TRUE(!!ifs); string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; vector keywords; extractor.extract(str, keywords, 5); + print(keywords); + print(__LINE__); + exit(1); ASSERT_EQ(keywords, vector(res, res + sizeof(res)/sizeof(res[0]))); } @@ -44,7 +47,7 @@ TEST(KeywordExtractorTest, Test4) ifstream ifs("../test/testdata/weicheng.utf8"); ASSERT_TRUE(!!ifs); string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; vector > keywords; extractor.extract(str, keywords, 5);