mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Add single-word filtering in KeywordExtractor.
This commit is contained in:
parent
440b168d8b
commit
5f96dcf09a
@ -11,8 +11,7 @@ namespace CppJieba
|
|||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
|
|
||||||
/*utf8*/
|
/*utf8*/
|
||||||
const char * BLACK_LIST[] = {"。", ",", "、", "我", "的", "”", "“", "了",
|
const char * BLACK_LIST[] = {"我们", "他们"};
|
||||||
"你", "她", "他", "它", "说", "是", ":", "不"};
|
|
||||||
|
|
||||||
class KeywordExtractor: public InitOnOff
|
class KeywordExtractor: public InitOnOff
|
||||||
{
|
{
|
||||||
@ -100,6 +99,19 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// filtering single word.
|
||||||
|
for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
|
||||||
|
{
|
||||||
|
if(_isSingleWord(*iter))
|
||||||
|
{
|
||||||
|
iter = words.erase(iter);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
iter++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
unordered_map<string, double> wordmap;
|
unordered_map<string, double> wordmap;
|
||||||
for(uint i = 0; i < words.size(); i ++)
|
for(uint i = 0; i < words.size(); i ++)
|
||||||
{
|
{
|
||||||
@ -130,6 +142,15 @@ namespace CppJieba
|
|||||||
partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
|
partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
bool _isSingleWord(const string& str) const
|
||||||
|
{
|
||||||
|
Unicode unicode;
|
||||||
|
TransCode::decode(str, unicode);
|
||||||
|
if(unicode.size() == 1)
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
|
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
|
||||||
|
@ -18,7 +18,7 @@ TEST(KeywordExtractorTest, Test1)
|
|||||||
TEST(KeywordExtractorTest, Test2)
|
TEST(KeywordExtractorTest, Test2)
|
||||||
{
|
{
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||||
const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"};
|
const char* res[] = {"学号", "北京邮电大学", "123456", "来自"};
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(extractor);
|
ASSERT_TRUE(extractor);
|
||||||
ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
|
ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
|
||||||
@ -32,28 +32,34 @@ TEST(KeywordExtractorTest, Test3)
|
|||||||
ASSERT_TRUE(!!ifs);
|
ASSERT_TRUE(!!ifs);
|
||||||
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||||
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"};
|
||||||
|
const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]";
|
||||||
vector<string> keywords;
|
vector<string> keywords;
|
||||||
|
string resStr;
|
||||||
|
vector<pair<string,double> > keywords2;
|
||||||
extractor.extract(str, keywords, 5);
|
extractor.extract(str, keywords, 5);
|
||||||
print(keywords);
|
extractor.extract(str, keywords2, 5);
|
||||||
print(__LINE__);
|
|
||||||
exit(1);
|
|
||||||
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
|
resStr << keywords2;
|
||||||
|
ASSERT_EQ(res2, resStr);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(KeywordExtractorTest, Test4)
|
//TEST(KeywordExtractorTest, Test4)
|
||||||
{
|
//{
|
||||||
ifstream ifs("../test/testdata/weicheng.utf8");
|
// ifstream ifs("../test/testdata/weicheng.utf8");
|
||||||
ASSERT_TRUE(!!ifs);
|
// ASSERT_TRUE(!!ifs);
|
||||||
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
// string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
// KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||||
//const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
// //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
||||||
vector<pair<string,double> > keywords;
|
// vector<pair<string,double> > keywords;
|
||||||
extractor.extract(str, keywords, 5);
|
// extractor.extract(str, keywords, 5);
|
||||||
|
// //print(keywords);
|
||||||
|
// string res;
|
||||||
|
// res << keywords;
|
||||||
// print(keywords);
|
// print(keywords);
|
||||||
string res;
|
// print(__LINE__);
|
||||||
res << keywords;
|
// exit(1);
|
||||||
ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
|
// ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
|
||||||
|
//
|
||||||
}
|
//}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user