From 453d4a143fb5929c1e7ba233b7bf745cb6187cdd Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Sun, 26 Jan 2014 12:37:01 +0800
Subject: [PATCH 1/9] =?UTF-8?q?add=20=E4=BE=9D=E8=B5=96=E8=BD=AF=E4=BB=B6?=
 =?UTF-8?q?=20in=20readme?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 9a5cd30..c7c96c4 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,11 @@
 
 ## 安装与使用
 
+### 依赖
+
+* g++ (version >= 4.6);
+* cmake (version >= 2.8);
+
 ### 下载和安装
 
 ```sh

From f1093d6cbc3d755d9cd8f368fc84690e5993240c Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Wed, 29 Jan 2014 20:13:26 +0800
Subject: [PATCH 2/9] use mit license

---
 COPYRIGHT | 13 -------------
 LICENSE   | 20 ++++++++++++++++++++
 2 files changed, 20 insertions(+), 13 deletions(-)
 delete mode 100644 COPYRIGHT
 create mode 100644 LICENSE

diff --git a/COPYRIGHT b/COPYRIGHT
deleted file mode 100644
index a1c0776..0000000
--- a/COPYRIGHT
+++ /dev/null
@@ -1,13 +0,0 @@
-           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
-                    Version 2, December 2004
-
- Copyright (C) 2013 Yanyi Wu <wuyanyi09@gmail.com>
-
- Everyone is permitted to copy and distribute verbatim or modified
- copies of this license document, and changing it is allowed as long
- as the name is changed.
-
-            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. You just DO WHAT THE FUCK YOU WANT TO.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8b000fd
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 Yanyi Wu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

From 259b296b713c4c5d2337bd3d3532a9839c54616f Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Wed, 29 Jan 2014 20:20:24 +0800
Subject: [PATCH 3/9] int -> uint to avoid warning

---
 test/unittest/TMd5.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unittest/TMd5.cpp b/test/unittest/TMd5.cpp
index a3c6af9..455b125 100644
--- a/test/unittest/TMd5.cpp
+++ b/test/unittest/TMd5.cpp
@@ -19,7 +19,7 @@ TEST(Md5Test, Test1)
 {
     ASSERT_EQ(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]), sizeof(DICT_FILE_MD5)/sizeof(DICT_FILE_MD5[0]));
     string tmp;
-    for (int i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
+    for (uint i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
     {
         md5File(DICT_FILE[i], tmp);
         ASSERT_EQ(tmp, string(DICT_FILE_MD5[i]));

From d5bb4e48ece5acc679e90efc3da45c4f5ad3723b Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Wed, 29 Jan 2014 20:37:26 +0800
Subject: [PATCH 4/9] use InitOnOff

---
 src/Limonp/CMakeLists.txt |  4 +++-
 src/Limonp/InitOnOff.hpp  | 21 +++++++++++++++++++++
 src/Limonp/str_functs.hpp | 26 +++++++++++++++-----------
 src/SegmentBase.hpp       | 11 +++--------
 4 files changed, 42 insertions(+), 20 deletions(-)
 create mode 100644 src/Limonp/InitOnOff.hpp

diff --git a/src/Limonp/CMakeLists.txt b/src/Limonp/CMakeLists.txt
index 7ed15ae..51f62d7 100644
--- a/src/Limonp/CMakeLists.txt
+++ b/src/Limonp/CMakeLists.txt
@@ -1 +1,3 @@
-INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp  std_outbound.hpp DESTINATION include/CppJieba/Limonp)
+INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp
+    str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp
+    std_outbound.hpp InitOnOff.hpp DESTINATION include/CppJieba/Limonp)
diff --git a/src/Limonp/InitOnOff.hpp b/src/Limonp/InitOnOff.hpp
new file mode 100644
index 0000000..926daab
--- /dev/null
+++ b/src/Limonp/InitOnOff.hpp
@@ -0,0 +1,21 @@
+#ifndef LIMONP_INITONOFF_H
+#define LIMONP_INITONOFF_H
+
+namespace Limonp
+{
+    class InitOnOff // mixin base: remembers whether the derived object was initialised successfully
+    {
+        public:
+            InitOnOff(): _isInited(false) {} // every object starts out "not initialised"
+            ~InitOnOff() {} // NOTE(review): non-virtual, so never delete a derived object through an InitOnOff*
+        protected:
+            bool _isInited; // last value stored by _setInitFlag
+            bool _getInitFlag() const {return _isInited;}
+            bool _setInitFlag(bool flag) {return _isInited = flag;} // returns the stored flag, so `return _setInitFlag(init(...))` works
+        public:
+            // const-qualified so const objects and const references can be tested too, e.g. `if(!segment)`.
+            operator bool() const {return _getInitFlag();}
+    };
+}
+
+#endif
diff --git a/src/Limonp/str_functs.hpp b/src/Limonp/str_functs.hpp
index 7ebb6b4..e610232 100644
--- a/src/Limonp/str_functs.hpp
+++ b/src/Limonp/str_functs.hpp
@@ -100,7 +100,7 @@ namespace Limonp
 
 
 
-    inline bool split(const string& src, vector<string>& res, const string& pattern)
+    inline bool split(const string& src, vector<string>& res, const string& pattern, size_t offset = 0, size_t len = string::npos)
     {
         if(src.empty())
         {
@@ -110,20 +110,28 @@ namespace Limonp
 
         size_t start = 0;
         size_t end = 0;
-        while(start < src.size())
+        size_t cnt = 0;
+        while(start < src.size() && res.size() < len)
         {
             end = src.find_first_of(pattern, start);
             if(string::npos == end)
             {
-                res.push_back(src.substr(start));
+                if(cnt >= offset)
+                {
+                    res.push_back(src.substr(start));
+                }
                 return true;
             }
-            res.push_back(src.substr(start, end - start));
-            if(end == src.size() - 1)
+            //if(end == src.size() - 1)
+            //{
+            //    res.push_back("");
+            //    return true;
+            //}
+            if(cnt >= offset)
             {
-                res.push_back("");
-                break;
+                res.push_back(src.substr(start, end - start));
             }
+            cnt ++;
             start = end + 1;
         }
         return true;
@@ -158,12 +166,8 @@ namespace Limonp
         return ltrim(rtrim(s));
     }
 
-
-
-
     inline bool startsWith(const string& str, const string& prefix)
     {
-        //return str.substr(0, prefix.size()) == prefix;
         if(prefix.length() > str.length())
         {
             return false;
diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp
index 12938cd..dabee0c 100644
--- a/src/SegmentBase.hpp
+++ b/src/SegmentBase.hpp
@@ -3,6 +3,7 @@
 
 #include "TransCode.hpp"
 #include "Limonp/logger.hpp"
+#include "Limonp/InitOnOff.hpp"
 #include "ISegment.hpp"
 #include <cassert>
 
@@ -10,17 +11,11 @@
 namespace CppJieba
 {
     using namespace Limonp;
-    class SegmentBase: public ISegment
+    class SegmentBase: public ISegment, public InitOnOff
     {
         public:
-            SegmentBase(){_setInitFlag(false);};
+            SegmentBase(){};
             virtual ~SegmentBase(){};
-        protected:
-            bool _isInited;
-            bool _getInitFlag()const{return _isInited;};
-            bool _setInitFlag(bool flag){return _isInited = flag;};
-        public:
-            operator bool(){return _getInitFlag();};
 
         public:
             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;

From 41a33747f467d538ffecb6cf61d6433b143c09a7 Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Thu, 30 Jan 2014 01:06:32 +0800
Subject: [PATCH 5/9] use InitOnOff

---
 src/KeywordExtractor.hpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp
index 92c9d76..295ee05 100644
--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@@ -20,18 +20,12 @@ namespace CppJieba
     //    return os << keyword.word << "," << keyword.idf;
     //}
 
-    class KeywordExtractor
+    class KeywordExtractor: public InitOnOff
     {
         private:
             MPSegment _segment;
         private:
             unordered_map<string, double> _idfMap;
-        protected:
-            bool _isInited;
-            bool _getInitFlag()const{return _isInited;};
-            bool _setInitFlag(bool flag){return _isInited = flag;};
-        public:
-            operator bool(){return _getInitFlag();};
         public:
             KeywordExtractor(){_setInitFlag(false);};
             explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};

From f64c11c57e4b281e73b039ece989cc6f2b6d1ab7 Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Fri, 31 Jan 2014 17:37:40 +0800
Subject: [PATCH 6/9] add blacklist

---
 src/KeywordExtractor.hpp            | 64 +++++++++++++++++++----------
 test/unittest/TKeywordExtractor.cpp | 23 ++++++-----
 2 files changed, 56 insertions(+), 31 deletions(-)

diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp
index 295ee05..899fb32 100644
--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@@ -1,37 +1,37 @@
 #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
 #define CPPJIEBA_KEYWORD_EXTRACTOR_H
 
-#include "MPSegment.hpp"
+#include "MixSegment.hpp"
 #include <cmath>
+#include <unordered_set>
 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 
 namespace CppJieba
 {
     using namespace Limonp;
 
-    //struct KeyWordInfo
-    //{
-    //    string word;
-    //    double tfidf;
-    //};
-
-    //inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
-    //{
-    //    return os << keyword.word << "," << keyword.idf;
-    //}
+    /*utf8*/
+    const char * BLACK_LIST[] = {"。", "，", "、", "我", "的", "”", "“", "了",
+        "你", "她", "他", "它", "说", "是", "：", "不"};
 
     class KeywordExtractor: public InitOnOff
     {
         private:
-            MPSegment _segment;
+            MixSegment _segment;
         private:
             unordered_map<string, double> _idfMap;
+            double _idfAverage;
+
+            unordered_set<string> _blackSet;
         public:
             KeywordExtractor(){_setInitFlag(false);};
-            explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
+            explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath)
+            {
+                _setInitFlag(init(dictPath, hmmFilePath, idfPath));
+            };
             ~KeywordExtractor(){};
         public:
-            bool init(const string& dictPath, const string& idfPath)
+            bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath)
             {
                 ifstream ifs(idfPath.c_str());
                 if(!ifs)
@@ -41,7 +41,10 @@ namespace CppJieba
                 }
                 string line ;
                 vector<string> buf;
-                for(uint lineno = 0; getline(ifs, line); lineno++)
+                double idf = 0.0;
+                double idfSum = 0.0;
+                size_t lineno = 0;
+                for(;getline(ifs, line); lineno++)
                 {
                     buf.clear();
                     if(line.empty())
@@ -54,9 +57,22 @@ namespace CppJieba
                         LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
                         continue;
                     }
-                    _idfMap[buf[0]] = atof(buf[1].c_str());
-                }
-                return _setInitFlag(_segment.init(dictPath));
+                    idf = atof(buf[1].c_str());
+                    _idfMap[buf[0]] = idf;
+                    idfSum += idf;
+
+                } 
+
+                std::copy(
+                            BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]), 
+                            std::inserter(_blackSet, _blackSet.begin()));
+                
+                assert(lineno);
+                _idfAverage = idfSum / lineno;
+
+                assert(_idfAverage > 0.0);
+                
+                return _setInitFlag(_segment.init(dictPath, hmmFilePath));
             };
         public:
 
@@ -90,18 +106,24 @@ namespace CppJieba
                     wordmap[ words[i] ] += 1.0;
                 }
 
-                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
+                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
                 {
+                    if(_blackSet.end() != _blackSet.find(itr->first))
+                    {
+                        itr = wordmap.erase(itr);
+                        continue;
+                    }
+
                     unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
                     if(cit != _idfMap.end())
                     {
                         itr->second *= cit->second;
-                        itr ++;
                     }
                     else
                     {
-                        itr = wordmap.erase(itr);
+                        itr->second *= _idfAverage;
                     }
+                    itr ++;
                 }
 
                 keywords.resize(MIN(topN, wordmap.size()));
diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp
index 2709227..355f8b2 100644
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@@ -3,25 +3,25 @@
 
 using namespace CppJieba;
 
+const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456";
+
 TEST(KeywordExtractorTest, Test1)
 {
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
-    const char* str = "我来自北京邮电大学。。。  学号 123456";
-    const char* res[] = {"北京邮电大学", "来自"};
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
+    const char* res[] = {"学号", "北京邮电大学"};
     vector<string> words;
     ASSERT_TRUE(extractor);
-    ASSERT_TRUE(extractor.extract(str, words, 2));
+    ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2));
     ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }
 
 TEST(KeywordExtractorTest, Test2)
 {
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
-    const char* str = "我来自北京邮电大学。。。  学号 123456";
-    const char* res[] = {"北京邮电大学", "来自"};
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
+    const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"};
     vector<string> words;
     ASSERT_TRUE(extractor);
-    ASSERT_TRUE(extractor.extract(str, words, 9));
+    ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
     ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }
 
@@ -31,10 +31,13 @@ TEST(KeywordExtractorTest, Test3)
     ifstream ifs("../test/testdata/weicheng.utf8");
     ASSERT_TRUE(!!ifs);
     string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
     const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
     vector<string> keywords;
     extractor.extract(str, keywords, 5);
+    print(keywords);
+    print(__LINE__);
+    exit(1);
     ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 
 }
@@ -44,7 +47,7 @@ TEST(KeywordExtractorTest, Test4)
     ifstream ifs("../test/testdata/weicheng.utf8");
     ASSERT_TRUE(!!ifs);
     string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
     //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
     vector<pair<string,double> >  keywords;
     extractor.extract(str, keywords, 5);

From 18f73f1c30b0d0012413032afa3a89b83f65b9b9 Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Sun, 2 Feb 2014 13:14:14 +0800
Subject: [PATCH 7/9] add dict/readme.md

---
 dict/README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 dict/README.md

diff --git a/dict/README.md b/dict/README.md
new file mode 100644
index 0000000..258c634
--- /dev/null
+++ b/dict/README.md
@@ -0,0 +1,15 @@
+# CppJieba字典
+
+文件后缀名代表的是词典的编码方式。
+比如filename.utf8 是 utf8编码，filename.gbk 是 gbk编码方式。
+
+## jieba.dict.utf8/gbk
+
+作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
+
+## hmm_model.utf8/gbk
+
+作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
+
+__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
+

From 440b168d8b7e8794ed7d94aec6b5ac3cb5dfca1c Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Sun, 2 Feb 2014 13:53:58 +0800
Subject: [PATCH 8/9] ci

---
 dict/README.md | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/dict/README.md b/dict/README.md
index 258c634..614e071 100644
--- a/dict/README.md
+++ b/dict/README.md
@@ -3,13 +3,27 @@
 文件后缀名代表的是词典的编码方式。
 比如filename.utf8 是 utf8编码，filename.gbk 是 gbk编码方式。
 
-## jieba.dict.utf8/gbk
+
+## 分词
+
+### jieba.dict.utf8/gbk
 
 作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
 
-## hmm_model.utf8/gbk
+### hmm_model.utf8/gbk
 
 作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
 
 __对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
 
+
+## 关键词抽取
+
+### idf.utf8
+
+IDF(Inverse Document Frequency)
+在KeywordExtractor中，使用的是经典的TF-IDF算法，所以需要这么一个词典提供IDF信息。
+
+
+
+

From 5f96dcf09aff0cfe235545b2704395af9940b7f8 Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Fri, 7 Feb 2014 17:51:08 +0800
Subject: [PATCH 9/9] add single-word filter in KeywordExtractor.

---
 src/KeywordExtractor.hpp            | 25 ++++++++++++++--
 test/unittest/TKeywordExtractor.cpp | 46 ++++++++++++++++-------------
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp
index 899fb32..a78ea1f 100644
--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@@ -11,8 +11,7 @@ namespace CppJieba
     using namespace Limonp;
 
     /*utf8*/
-    const char * BLACK_LIST[] = {"。", "，", "、", "我", "的", "”", "“", "了",
-        "你", "她", "他", "它", "说", "是", "：", "不"};
+    const char * const BLACK_LIST[] = {"我们", "他们"}; // words never reported as keywords; const pointers give internal linkage, so this header can be included from several .cpp files without duplicate-symbol link errors
 
     class KeywordExtractor: public InitOnOff
     {
@@ -100,6 +99,19 @@ namespace CppJieba
                     return false;
                 }
 
+                // filtering single word.
+                for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
+                {
+                    if(_isSingleWord(*iter))
+                    {
+                        iter = words.erase(iter);
+                    }
+                    else
+                    {
+                        iter++;
+                    }
+                }
+
                 unordered_map<string, double> wordmap;
                 for(uint i = 0; i < words.size(); i ++)
                 {
@@ -130,6 +142,15 @@ namespace CppJieba
                 partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
                 return true;
             }
+        private:
+            // True when `str` decodes (via TransCode::decode) to exactly one Unicode element.
+            // Whatever status decode reports, if any, is not inspected here.
+            bool _isSingleWord(const string& str) const
+            {
+                Unicode decoded;
+                TransCode::decode(str, decoded);
+                return 1 == decoded.size();
+            }
 
         private:
             static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp
index 355f8b2..8a84985 100644
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@@ -18,7 +18,7 @@ TEST(KeywordExtractorTest, Test1)
 TEST(KeywordExtractorTest, Test2)
 {
     KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"};
+    const char* res[] = {"学号", "北京邮电大学", "123456", "来自"};
     vector<string> words;
     ASSERT_TRUE(extractor);
     ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
@@ -32,28 +32,34 @@ TEST(KeywordExtractorTest, Test3)
     ASSERT_TRUE(!!ifs);
     string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
     KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+    const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"};
+    const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]";
     vector<string> keywords;
+    string resStr;
+    vector<pair<string,double> >  keywords2;
     extractor.extract(str, keywords, 5);
-    print(keywords);
-    print(__LINE__);
-    exit(1);
+    extractor.extract(str, keywords2, 5);
     ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    resStr << keywords2;
+    ASSERT_EQ(res2, resStr);
 
 }
 
-TEST(KeywordExtractorTest, Test4)
-{
-    ifstream ifs("../test/testdata/weicheng.utf8");
-    ASSERT_TRUE(!!ifs);
-    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
-    vector<pair<string,double> >  keywords;
-    extractor.extract(str, keywords, 5);
-    //print(keywords);
-    string res;
-    res << keywords;
-    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
-
-}
+//TEST(KeywordExtractorTest, Test4)
+//{
+//    ifstream ifs("../test/testdata/weicheng.utf8");
+//    ASSERT_TRUE(!!ifs);
+//    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
+//    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
+//    //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+//    vector<pair<string,double> >  keywords;
+//    extractor.extract(str, keywords, 5);
+//    //print(keywords);
+//    string res;
+//    res << keywords;
+//    print(keywords);
+//    print(__LINE__);
+//    exit(1);
+//    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
+//
+//}