From 453d4a143fb5929c1e7ba233b7bf745cb6187cdd Mon Sep 17 00:00:00 2001 From: wyy Date: Sun, 26 Jan 2014 12:37:01 +0800 Subject: [PATCH 1/9] =?UTF-8?q?add=20=E4=BE=9D=E8=B5=96=E8=BD=AF=E4=BB=B6?= =?UTF-8?q?=20in=20readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 9a5cd30..c7c96c4 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,11 @@ ## 安装与使用 +### 依赖 + +* g++ (version >= 4.6); +* cmake (version >= 2.8); + ### 下载和安装 ```sh From f1093d6cbc3d755d9cd8f368fc84690e5993240c Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 29 Jan 2014 20:13:26 +0800 Subject: [PATCH 2/9] use mit license --- COPYRIGHT | 13 ------------- LICENSE | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 13 deletions(-) delete mode 100644 COPYRIGHT create mode 100644 LICENSE diff --git a/COPYRIGHT b/COPYRIGHT deleted file mode 100644 index a1c0776..0000000 --- a/COPYRIGHT +++ /dev/null @@ -1,13 +0,0 @@ - DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE - Version 2, December 2004 - - Copyright (C) 2013 Yanyi Wu - - Everyone is permitted to copy and distribute verbatim or modified - copies of this license document, and changing it is allowed as long - as the name is changed. - - DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. You just DO WHAT THE FUCK YOU WANT TO. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8b000fd --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2013 Yanyi Wu + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. From 259b296b713c4c5d2337bd3d3532a9839c54616f Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 29 Jan 2014 20:20:24 +0800 Subject: [PATCH 3/9] int -> uint for avoid warning --- test/unittest/TMd5.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unittest/TMd5.cpp b/test/unittest/TMd5.cpp index a3c6af9..455b125 100644 --- a/test/unittest/TMd5.cpp +++ b/test/unittest/TMd5.cpp @@ -19,7 +19,7 @@ TEST(Md5Test, Test1) { ASSERT_EQ(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]), sizeof(DICT_FILE_MD5)/sizeof(DICT_FILE_MD5[0])); string tmp; - for (int i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++) + for (uint i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++) { md5File(DICT_FILE[i], tmp); ASSERT_EQ(tmp, string(DICT_FILE_MD5[i])); From d5bb4e48ece5acc679e90efc3da45c4f5ad3723b Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 29 Jan 2014 20:37:26 +0800 Subject: [PATCH 4/9] use InitOnOff --- src/Limonp/CMakeLists.txt | 4 +++- src/Limonp/InitOnOff.hpp | 21 +++++++++++++++++++++ src/Limonp/str_functs.hpp | 26 +++++++++++++++----------- src/SegmentBase.hpp | 11 +++-------- 4 files changed, 42 insertions(+), 20 deletions(-) create mode 100644 src/Limonp/InitOnOff.hpp diff --git a/src/Limonp/CMakeLists.txt b/src/Limonp/CMakeLists.txt index 7ed15ae..51f62d7 100644 --- a/src/Limonp/CMakeLists.txt +++ b/src/Limonp/CMakeLists.txt @@ -1 +1,3 @@ -INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp std_outbound.hpp DESTINATION include/CppJieba/Limonp) +INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp + str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp + std_outbound.hpp InitOnOff.hpp DESTINATION include/CppJieba/Limonp) diff --git a/src/Limonp/InitOnOff.hpp b/src/Limonp/InitOnOff.hpp new file mode 100644 index 0000000..926daab --- /dev/null +++ b/src/Limonp/InitOnOff.hpp @@ -0,0 +1,21 @@ +#ifndef LIMONP_INITONOFF_H +#define LIMONP_INITONOFF_H + +namespace Limonp +{ + class InitOnOff + { + public: + InitOnOff(){_setInitFlag(false);}; + ~InitOnOff(){}; + protected: + bool _isInited; + bool _getInitFlag()const{return _isInited;}; + bool _setInitFlag(bool flag){return _isInited = flag;}; + public: + operator bool(){return _getInitFlag();}; + + }; +} + +#endif diff --git a/src/Limonp/str_functs.hpp b/src/Limonp/str_functs.hpp index 7ebb6b4..e610232 100644 --- a/src/Limonp/str_functs.hpp +++ b/src/Limonp/str_functs.hpp @@ -100,7 +100,7 @@ namespace Limonp - inline bool split(const string& src, vector& res, const string& pattern) + inline bool split(const string& src, vector& res, const string& pattern, size_t offset = 0, size_t len = string::npos) { if(src.empty()) { @@ -110,20 +110,28 @@ namespace Limonp size_t start = 0; size_t end = 0; - while(start < src.size()) + size_t cnt = 0; + while(start < src.size() && res.size() < len) { end = src.find_first_of(pattern, start); if(string::npos == end) { - res.push_back(src.substr(start)); + if(cnt >= offset) + { + res.push_back(src.substr(start)); + } return true; } - res.push_back(src.substr(start, end - start)); - if(end == src.size() - 1) + //if(end == src.size() - 1) + //{ + // res.push_back(""); + // return true; + //} + if(cnt >= offset) { - res.push_back(""); - break; + res.push_back(src.substr(start, end - start)); } + cnt ++; start = end + 1; } return true; @@ -158,12 +166,8 @@ namespace Limonp return ltrim(rtrim(s)); } - - - inline bool startsWith(const string& str, const string& prefix) { - //return str.substr(0, prefix.size()) == prefix; if(prefix.length() > str.length()) { return false; diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index 12938cd..dabee0c 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -3,6 +3,7 @@ #include "TransCode.hpp" #include "Limonp/logger.hpp" +#include "Limonp/InitOnOff.hpp" #include "ISegment.hpp" #include @@ -10,17 +11,11 @@ namespace CppJieba { using namespace Limonp; - class SegmentBase: public ISegment + class SegmentBase: public ISegment, public InitOnOff { public: - SegmentBase(){_setInitFlag(false);}; + SegmentBase(){}; virtual ~SegmentBase(){}; - protected: - bool _isInited; - bool _getInitFlag()const{return _isInited;}; - bool _setInitFlag(bool flag){return _isInited = flag;}; - public: - operator bool(){return _getInitFlag();}; public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const = 0; From 41a33747f467d538ffecb6cf61d6433b143c09a7 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 30 Jan 2014 01:06:32 +0800 Subject: [PATCH 5/9] use InitOnOff --- src/KeywordExtractor.hpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 92c9d76..295ee05 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -20,18 +20,12 @@ namespace CppJieba // return os << keyword.word << "," << keyword.idf; //} - class KeywordExtractor + class KeywordExtractor: public InitOnOff { private: MPSegment _segment; private: unordered_map _idfMap; - protected: - bool _isInited; - bool _getInitFlag()const{return _isInited;}; - bool _setInitFlag(bool flag){return _isInited = flag;}; - public: - operator bool(){return _getInitFlag();}; public: KeywordExtractor(){_setInitFlag(false);}; explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));}; From f64c11c57e4b281e73b039ece989cc6f2b6d1ab7 Mon Sep 17 00:00:00 2001 From: wyy Date: Fri, 31 Jan 2014 17:37:40 +0800 Subject: [PATCH 6/9] add blacklist --- src/KeywordExtractor.hpp | 64 +++++++++++++++++++---------- test/unittest/TKeywordExtractor.cpp | 23 ++++++----- 2 files changed, 56 insertions(+), 31 deletions(-) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 295ee05..899fb32 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -1,37 +1,37 @@ #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H #define CPPJIEBA_KEYWORD_EXTRACTOR_H -#include "MPSegment.hpp" +#include "MixSegment.hpp" #include +#include #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) namespace CppJieba { using namespace Limonp; - //struct KeyWordInfo - //{ - // string word; - // double tfidf; - //}; - - //inline ostream& operator << (ostream& os, const KeyWordInfo & keyword) - //{ - // return os << keyword.word << "," << keyword.idf; - //} + /*utf8*/ + const char * BLACK_LIST[] = {"。", ",", "、", "我", "的", "”", "“", "了", + "你", "她", "他", "它", "说", "是", ":", "不"}; class KeywordExtractor: public InitOnOff { private: - MPSegment _segment; + MixSegment _segment; private: unordered_map _idfMap; + double _idfAverage; + + unordered_set _blackSet; public: KeywordExtractor(){_setInitFlag(false);}; - explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));}; + explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath) + { + _setInitFlag(init(dictPath, hmmFilePath, idfPath)); + }; ~KeywordExtractor(){}; public: - bool init(const string& dictPath, const string& idfPath) + bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath) { ifstream ifs(idfPath.c_str()); if(!ifs) @@ -41,7 +41,10 @@ namespace CppJieba } string line ; vector buf; - for(uint lineno = 0; getline(ifs, line); lineno++) + double idf = 0.0; + double idfSum = 0.0; + size_t lineno = 0; + for(;getline(ifs, line); lineno++) { buf.clear(); if(line.empty()) @@ -54,9 +57,22 @@ namespace CppJieba LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); continue; } - _idfMap[buf[0]] = atof(buf[1].c_str()); - } - return _setInitFlag(_segment.init(dictPath)); + idf = atof(buf[1].c_str()); + _idfMap[buf[0]] = idf; + idfSum += idf; + + } + + std::copy( + BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]), + std::inserter(_blackSet, _blackSet.begin())); + + assert(lineno); + _idfAverage = idfSum / lineno; + + assert(_idfAverage > 0.0); + + return _setInitFlag(_segment.init(dictPath, hmmFilePath)); }; public: @@ -90,18 +106,24 @@ namespace CppJieba wordmap[ words[i] ] += 1.0; } - for(unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end();) + for(unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { + if(_blackSet.end() != _blackSet.find(itr->first)) + { + itr = wordmap.erase(itr); + continue; + } + unordered_map::const_iterator cit = _idfMap.find(itr->first); if(cit != _idfMap.end()) { itr->second *= cit->second; - itr ++; } else { - itr = wordmap.erase(itr); + itr->second *= _idfAverage; } + itr ++; } keywords.resize(MIN(topN, wordmap.size())); diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index 2709227..355f8b2 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -3,25 +3,25 @@ using namespace CppJieba; +const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456"; + TEST(KeywordExtractorTest, Test1) { - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - const char* str = "我来自北京邮电大学。。。 学号 123456"; - const char* res[] = {"北京邮电大学", "来自"}; + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + const char* res[] = {"学号", "北京邮电大学"}; vector words; ASSERT_TRUE(extractor); - ASSERT_TRUE(extractor.extract(str, words, 2)); + ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2)); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } TEST(KeywordExtractorTest, Test2) { - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - const char* str = "我来自北京邮电大学。。。 学号 123456"; - const char* res[] = {"北京邮电大学", "来自"}; + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"}; vector words; ASSERT_TRUE(extractor); - ASSERT_TRUE(extractor.extract(str, words, 9)); + ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9)); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } @@ -31,10 +31,13 @@ TEST(KeywordExtractorTest, Test3) ifstream ifs("../test/testdata/weicheng.utf8"); ASSERT_TRUE(!!ifs); string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; vector keywords; extractor.extract(str, keywords, 5); + print(keywords); + print(__LINE__); + exit(1); ASSERT_EQ(keywords, vector(res, res + sizeof(res)/sizeof(res[0]))); } @@ -44,7 +47,7 @@ TEST(KeywordExtractorTest, Test4) ifstream ifs("../test/testdata/weicheng.utf8"); ASSERT_TRUE(!!ifs); string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; vector > keywords; extractor.extract(str, keywords, 5); From 18f73f1c30b0d0012413032afa3a89b83f65b9b9 Mon Sep 17 00:00:00 2001 From: wyy Date: Sun, 2 Feb 2014 13:14:14 +0800 Subject: [PATCH 7/9] add dict/readme.md --- dict/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 dict/README.md diff --git a/dict/README.md b/dict/README.md new file mode 100644 index 0000000..258c634 --- /dev/null +++ b/dict/README.md @@ -0,0 +1,15 @@ +# CppJieba字典 + +文件后缀名代表的是词典的编码方式。 +比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。 + +## jieba.dict.utf8/gbk + +作为最大概率法(MPSegment: Max Probability)分词所使用的词典。 + +## hmm_model.utf8/gbk + +作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。 + +__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__ + From 440b168d8b7e8794ed7d94aec6b5ac3cb5dfca1c Mon Sep 17 00:00:00 2001 From: wyy Date: Sun, 2 Feb 2014 13:53:58 +0800 Subject: [PATCH 8/9] ci --- dict/README.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/dict/README.md b/dict/README.md index 258c634..614e071 100644 --- a/dict/README.md +++ b/dict/README.md @@ -3,13 +3,27 @@ 文件后缀名代表的是词典的编码方式。 比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。 -## jieba.dict.utf8/gbk + +## 分词 + +### jieba.dict.utf8/gbk 作为最大概率法(MPSegment: Max Probability)分词所使用的词典。 -## hmm_model.utf8/gbk +### hmm_model.utf8/gbk 作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。 __对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__ + +## 关键词抽取 + +## idf.utf8 + +IDF(Inverse Document Frequency) +在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。 + + + + From 5f96dcf09aff0cfe235545b2704395af9940b7f8 Mon Sep 17 00:00:00 2001 From: wyy Date: Fri, 7 Feb 2014 17:51:08 +0800 Subject: [PATCH 9/9] add filter singword in keywordextractor. --- src/KeywordExtractor.hpp | 25 ++++++++++++++-- test/unittest/TKeywordExtractor.cpp | 46 ++++++++++++++++------------- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 899fb32..a78ea1f 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -11,8 +11,7 @@ namespace CppJieba using namespace Limonp; /*utf8*/ - const char * BLACK_LIST[] = {"。", ",", "、", "我", "的", "”", "“", "了", - "你", "她", "他", "它", "说", "是", ":", "不"}; + const char * BLACK_LIST[] = {"我们", "他们"}; class KeywordExtractor: public InitOnOff { @@ -100,6 +99,19 @@ namespace CppJieba return false; } + // filtering single word. + for(vector::iterator iter = words.begin(); iter != words.end(); ) + { + if(_isSingleWord(*iter)) + { + iter = words.erase(iter); + } + else + { + iter++; + } + } + unordered_map wordmap; for(uint i = 0; i < words.size(); i ++) { @@ -130,6 +142,15 @@ namespace CppJieba partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp); return true; } + private: + bool _isSingleWord(const string& str) const + { + Unicode unicode; + TransCode::decode(str, unicode); + if(unicode.size() == 1) + return true; + return false; + } private: static bool _cmp(const pair& lhs, const pair& rhs) diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index 355f8b2..8a84985 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -18,7 +18,7 @@ TEST(KeywordExtractorTest, Test1) TEST(KeywordExtractorTest, Test2) { KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); - const char* res[] = {"学号", "北京邮电大学", "123456", " ", "来自"}; + const char* res[] = {"学号", "北京邮电大学", "123456", "来自"}; vector words; ASSERT_TRUE(extractor); ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9)); @@ -32,28 +32,34 @@ TEST(KeywordExtractorTest, Test3) ASSERT_TRUE(!!ifs); string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); - const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; + const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"}; + const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]"; vector keywords; + string resStr; + vector > keywords2; extractor.extract(str, keywords, 5); - print(keywords); - print(__LINE__); - exit(1); + extractor.extract(str, keywords2, 5); ASSERT_EQ(keywords, vector(res, res + sizeof(res)/sizeof(res[0]))); + resStr << keywords2; + ASSERT_EQ(res2, resStr); } -TEST(KeywordExtractorTest, Test4) -{ - ifstream ifs("../test/testdata/weicheng.utf8"); - ASSERT_TRUE(!!ifs); - string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); - //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; - vector > keywords; - extractor.extract(str, keywords, 5); - //print(keywords); - string res; - res << keywords; - ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]"); - -} +//TEST(KeywordExtractorTest, Test4) +//{ +// ifstream ifs("../test/testdata/weicheng.utf8"); +// ASSERT_TRUE(!!ifs); +// string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); +// KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); +// //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; +// vector > keywords; +// extractor.extract(str, keywords, 5); +// //print(keywords); +// string res; +// res << keywords; +// print(keywords); +// print(__LINE__); +// exit(1); +// ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]"); +// +//}