diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 899fb32..a78ea1f 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -11,8 +11,7 @@ namespace CppJieba using namespace Limonp; /*utf8*/ - const char * BLACK_LIST[] = {"。", ",", "、", "我", "的", "”", "“", "了", - "你", "她", "他", "它", "说", "是", ":", "不"}; + const char * BLACK_LIST[] = {"我们", "他们"}; class KeywordExtractor: public InitOnOff { @@ -100,6 +99,19 @@ namespace CppJieba return false; } + // filtering single word. + for(vector::iterator iter = words.begin(); iter != words.end(); ) + { + if(_isSingleWord(*iter)) + { + iter = words.erase(iter); + } + else + { + iter++; + } + } + unordered_map wordmap; for(uint i = 0; i < words.size(); i ++) { @@ -130,6 +142,15 @@ namespace CppJieba partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp); return true; } + private: + bool _isSingleWord(const string& str) const + { + Unicode unicode; + TransCode::decode(str, unicode); + if(unicode.size() == 1) + return true; + return false; + } private: static bool _cmp(const pair& lhs, const pair& rhs) diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index 355f8b2..8a84985 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -18,7 +18,7 @@ TEST(KeywordExtractorTest, Test1) TEST(KeywordExtractorTest, Test2) { KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); - const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"}; + const char* res[] = {"学号", "北京邮电大学", "123456", "来自"}; vector words; ASSERT_TRUE(extractor); ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9)); @@ -32,28 +32,34 @@ TEST(KeywordExtractorTest, Test3) ASSERT_TRUE(!!ifs); string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); - const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; + const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"}; + const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]"; vector keywords; + string resStr; + vector > keywords2; extractor.extract(str, keywords, 5); - print(keywords); - print(__LINE__); - exit(1); + extractor.extract(str, keywords2, 5); ASSERT_EQ(keywords, vector(res, res + sizeof(res)/sizeof(res[0]))); + resStr << keywords2; + ASSERT_EQ(res2, resStr); } -TEST(KeywordExtractorTest, Test4) -{ - ifstream ifs("../test/testdata/weicheng.utf8"); - ASSERT_TRUE(!!ifs); - string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); - //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; - vector > keywords; - extractor.extract(str, keywords, 5); - //print(keywords); - string res; - res << keywords; - ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]"); - -} +//TEST(KeywordExtractorTest, Test4) +//{ +// ifstream ifs("../test/testdata/weicheng.utf8"); +// ASSERT_TRUE(!!ifs); +// string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); +// KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); +// //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; +// vector > keywords; +// extractor.extract(str, keywords, 5); +// //print(keywords); +// string res; +// res << keywords; +// print(keywords); +// print(__LINE__); +// exit(1); +// ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]"); +// +//}