Add single-word filter in KeywordExtractor.

2025-07-18 00:00:12 +08:00 · 2014-02-07 17:51:08 +08:00 · 2014-02-07 17:51:08 +08:00 · 5f96dcf09a
commit 5f96dcf09a
parent 440b168d8b
2 changed files with 49 additions and 22 deletions
--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@ -11,8 +11,7 @@ namespace CppJieba
    using namespace Limonp;

    /*utf8*/
-    const char * BLACK_LIST[] = {"。", "，", "、", "我", "的", "”", "“", "了",
-        "你", "她", "他", "它", "说", "是", "：", "不"};
+    const char * BLACK_LIST[] = {"我们", "他们"};

    class KeywordExtractor: public InitOnOff
    {
@ -100,6 +99,19 @@ namespace CppJieba
                    return false;
                }

+                // filtering single word.
+                for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
+                {
+                    if(_isSingleWord(*iter))
+                    {
+                        iter = words.erase(iter);
+                    }
+                    else
+                    {
+                        iter++;
+                    }
+                }
+
                unordered_map<string, double> wordmap;
                for(uint i = 0; i < words.size(); i ++)
                {
@ -130,6 +142,15 @@ namespace CppJieba
                partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
                return true;
            }
+        private:
+            // Returns true when str, decoded through TransCode::decode, yields a
+            // sequence of exactly one element; returns false for any other length.
+            bool _isSingleWord(const string& str) const
+            {
+                Unicode decoded;
+                TransCode::decode(str, decoded);
+                return decoded.size() == 1;
+            }

        private:
            static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@ -18,7 +18,7 @@ TEST(KeywordExtractorTest, Test1)
 TEST(KeywordExtractorTest, Test2)
 {
    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"};
+    const char* res[] = {"学号", "北京邮电大学", "123456", "来自"};
    vector<string> words;
    ASSERT_TRUE(extractor);
    ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
@ -32,28 +32,34 @@ TEST(KeywordExtractorTest, Test3)
    ASSERT_TRUE(!!ifs);
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+    const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"};
+    const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]";
    vector<string> keywords;
+    string resStr;
+    vector<pair<string,double> >  keywords2;
    extractor.extract(str, keywords, 5);
-    print(keywords);
-    print(__LINE__);
-    exit(1);
+    extractor.extract(str, keywords2, 5);
    ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    resStr << keywords2;
+    ASSERT_EQ(res2, resStr);

 }

-TEST(KeywordExtractorTest, Test4)
-{
-    ifstream ifs("../test/testdata/weicheng.utf8");
-    ASSERT_TRUE(!!ifs);
-    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
-    vector<pair<string,double> >  keywords;
-    extractor.extract(str, keywords, 5);
-    //print(keywords);
-    string res;
-    res << keywords;
-    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
-
-}
+//TEST(KeywordExtractorTest, Test4)
+//{
+//    ifstream ifs("../test/testdata/weicheng.utf8");
+//    ASSERT_TRUE(!!ifs);
+//    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
+//    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
+//    //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+//    vector<pair<string,double> >  keywords;
+//    extractor.extract(str, keywords, 5);
+//    //print(keywords);
+//    string res;
+//    res << keywords;
+//    print(keywords);
+//    print(__LINE__);
+//    exit(1);
+//    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
+//
+//}