Add single-word filter in KeywordExtractor.

This commit is contained in:
wyy 2014-02-07 17:51:08 +08:00
parent 440b168d8b
commit 5f96dcf09a
2 changed files with 49 additions and 22 deletions

View File

@ -11,8 +11,7 @@ namespace CppJieba
using namespace Limonp; using namespace Limonp;
/*utf8*/ /*utf8*/
const char * BLACK_LIST[] = {"", "", "", "", "", "", "", "", const char * BLACK_LIST[] = {"我们", "他们"};
"", "", "", "", "", "", "", ""};
class KeywordExtractor: public InitOnOff class KeywordExtractor: public InitOnOff
{ {
@ -100,6 +99,19 @@ namespace CppJieba
return false; return false;
} }
// filtering single word.
for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
{
if(_isSingleWord(*iter))
{
iter = words.erase(iter);
}
else
{
iter++;
}
}
unordered_map<string, double> wordmap; unordered_map<string, double> wordmap;
for(uint i = 0; i < words.size(); i ++) for(uint i = 0; i < words.size(); i ++)
{ {
@ -130,6 +142,15 @@ namespace CppJieba
partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp); partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
return true; return true;
} }
private:
bool _isSingleWord(const string& str) const
{
Unicode unicode;
TransCode::decode(str, unicode);
if(unicode.size() == 1)
return true;
return false;
}
private: private:
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs) static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)

View File

@ -18,7 +18,7 @@ TEST(KeywordExtractorTest, Test1)
TEST(KeywordExtractorTest, Test2) TEST(KeywordExtractorTest, Test2)
{ {
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"}; const char* res[] = {"学号", "北京邮电大学", "123456", "来自"};
vector<string> words; vector<string> words;
ASSERT_TRUE(extractor); ASSERT_TRUE(extractor);
ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9)); ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
@ -32,28 +32,34 @@ TEST(KeywordExtractorTest, Test3)
ASSERT_TRUE(!!ifs); ASSERT_TRUE(!!ifs);
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>())); string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"};
const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]";
vector<string> keywords; vector<string> keywords;
string resStr;
vector<pair<string,double> > keywords2;
extractor.extract(str, keywords, 5); extractor.extract(str, keywords, 5);
print(keywords); extractor.extract(str, keywords2, 5);
print(__LINE__);
exit(1);
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
resStr << keywords2;
ASSERT_EQ(res2, resStr);
} }
TEST(KeywordExtractorTest, Test4) //TEST(KeywordExtractorTest, Test4)
{ //{
ifstream ifs("../test/testdata/weicheng.utf8"); // ifstream ifs("../test/testdata/weicheng.utf8");
ASSERT_TRUE(!!ifs); // ASSERT_TRUE(!!ifs);
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>())); // string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); // KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
//const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; // //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
vector<pair<string,double> > keywords; // vector<pair<string,double> > keywords;
extractor.extract(str, keywords, 5); // extractor.extract(str, keywords, 5);
// //print(keywords);
// string res;
// res << keywords;
// print(keywords); // print(keywords);
string res; // print(__LINE__);
res << keywords; // exit(1);
ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]"); // ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
//
} //}