diff --git a/COPYRIGHT b/COPYRIGHT deleted file mode 100644 index a1c0776..0000000 --- a/COPYRIGHT +++ /dev/null @@ -1,13 +0,0 @@ - DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE - Version 2, December 2004 - - Copyright (C) 2013 Yanyi Wu - - Everyone is permitted to copy and distribute verbatim or modified - copies of this license document, and changing it is allowed as long - as the name is changed. - - DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. You just DO WHAT THE FUCK YOU WANT TO. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8b000fd --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2013 Yanyi Wu + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index 9a5cd30..c7c96c4 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,11 @@ ## 安装与使用 +### 依赖 + +* g++ (version >= 4.6); +* cmake (version >= 2.8); + ### 下载和安装 ```sh diff --git a/dict/README.md b/dict/README.md new file mode 100644 index 0000000..614e071 --- /dev/null +++ b/dict/README.md @@ -0,0 +1,29 @@ +# CppJieba字典 + +文件后缀名代表的是词典的编码方式。 +比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。 + + +## 分词 + +### jieba.dict.utf8/gbk + +作为最大概率法(MPSegment: Max Probability)分词所使用的词典。 + +### hmm_model.utf8/gbk + +作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。 + +__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__ + + +## 关键词抽取 + +## idf.utf8 + +IDF(Inverse Document Frequency) +在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。 + + + + diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 92c9d76..a78ea1f 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -1,43 +1,36 @@ #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H #define CPPJIEBA_KEYWORD_EXTRACTOR_H -#include "MPSegment.hpp" +#include "MixSegment.hpp" #include +#include #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) namespace CppJieba { using namespace Limonp; - //struct KeyWordInfo - //{ - // string word; - // double tfidf; - //}; + /*utf8*/ + const char * BLACK_LIST[] = {"我们", "他们"}; - //inline ostream& operator << (ostream& os, const KeyWordInfo & keyword) - //{ - // return os << keyword.word << "," << keyword.idf; - //} - - class KeywordExtractor + class KeywordExtractor: public InitOnOff { private: - MPSegment _segment; + MixSegment _segment; private: unordered_map _idfMap; - protected: - bool _isInited; - bool _getInitFlag()const{return _isInited;}; - bool _setInitFlag(bool flag){return _isInited = flag;}; - public: - operator bool(){return _getInitFlag();}; + double _idfAverage; + + unordered_set _blackSet; public: KeywordExtractor(){_setInitFlag(false);}; - explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));}; + explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath) + { + _setInitFlag(init(dictPath, hmmFilePath, idfPath)); + }; ~KeywordExtractor(){}; public: - bool init(const string& dictPath, const string& idfPath) + bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath) { ifstream ifs(idfPath.c_str()); if(!ifs) @@ -47,7 +40,10 @@ namespace CppJieba } string line ; vector buf; - for(uint lineno = 0; getline(ifs, line); lineno++) + double idf = 0.0; + double idfSum = 0.0; + size_t lineno = 0; + for(;getline(ifs, line); lineno++) { buf.clear(); if(line.empty()) @@ -60,9 +56,22 @@ namespace CppJieba LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); continue; } - _idfMap[buf[0]] = atof(buf[1].c_str()); - } - return _setInitFlag(_segment.init(dictPath)); + idf = atof(buf[1].c_str()); + _idfMap[buf[0]] = idf; + idfSum += idf; + + } + + std::copy( + BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]), + std::inserter(_blackSet, _blackSet.begin())); + + assert(lineno); + _idfAverage = idfSum / lineno; + + assert(_idfAverage > 0.0); + + return _setInitFlag(_segment.init(dictPath, hmmFilePath)); }; public: @@ -90,30 +99,58 @@ namespace CppJieba return false; } + // filtering single word. + for(vector::iterator iter = words.begin(); iter != words.end(); ) + { + if(_isSingleWord(*iter)) + { + iter = words.erase(iter); + } + else + { + iter++; + } + } + unordered_map wordmap; for(uint i = 0; i < words.size(); i ++) { wordmap[ words[i] ] += 1.0; } - for(unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end();) + for(unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { + if(_blackSet.end() != _blackSet.find(itr->first)) + { + itr = wordmap.erase(itr); + continue; + } + unordered_map::const_iterator cit = _idfMap.find(itr->first); if(cit != _idfMap.end()) { itr->second *= cit->second; - itr ++; } else { - itr = wordmap.erase(itr); + itr->second *= _idfAverage; } + itr ++; } keywords.resize(MIN(topN, wordmap.size())); partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp); return true; } + private: + bool _isSingleWord(const string& str) const + { + Unicode unicode; + TransCode::decode(str, unicode); + if(unicode.size() == 1) + return true; + return false; + } private: static bool _cmp(const pair& lhs, const pair& rhs) diff --git a/src/Limonp/CMakeLists.txt b/src/Limonp/CMakeLists.txt index 7ed15ae..51f62d7 100644 --- a/src/Limonp/CMakeLists.txt +++ b/src/Limonp/CMakeLists.txt @@ -1 +1,3 @@ -INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp std_outbound.hpp DESTINATION include/CppJieba/Limonp) +INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp + str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp + std_outbound.hpp InitOnOff.hpp DESTINATION include/CppJieba/Limonp) diff --git a/src/Limonp/InitOnOff.hpp b/src/Limonp/InitOnOff.hpp new file mode 100644 index 0000000..926daab --- /dev/null +++ b/src/Limonp/InitOnOff.hpp @@ -0,0 +1,21 @@ +#ifndef LIMONP_INITONOFF_H +#define LIMONP_INITONOFF_H + +namespace Limonp +{ + class InitOnOff + { + public: + InitOnOff(){_setInitFlag(false);}; + ~InitOnOff(){}; + protected: + bool _isInited; + bool _getInitFlag()const{return _isInited;}; + bool _setInitFlag(bool flag){return _isInited = flag;}; + public: + operator bool(){return _getInitFlag();}; + + }; +} + +#endif diff --git a/src/Limonp/str_functs.hpp b/src/Limonp/str_functs.hpp index 7ebb6b4..e610232 100644 --- a/src/Limonp/str_functs.hpp +++ b/src/Limonp/str_functs.hpp @@ -100,7 +100,7 @@ namespace Limonp - inline bool split(const string& src, vector& res, const string& pattern) + inline bool split(const string& src, vector& res, const string& pattern, size_t offset = 0, size_t len = string::npos) { if(src.empty()) { @@ -110,20 +110,28 @@ namespace Limonp size_t start = 0; size_t end = 0; - while(start < src.size()) + size_t cnt = 0; + while(start < src.size() && res.size() < len) { end = src.find_first_of(pattern, start); if(string::npos == end) { - res.push_back(src.substr(start)); + if(cnt >= offset) + { + res.push_back(src.substr(start)); + } return true; } - res.push_back(src.substr(start, end - start)); - if(end == src.size() - 1) + //if(end == src.size() - 1) + //{ + // res.push_back(""); + // return true; + //} + if(cnt >= offset) { - res.push_back(""); - break; + res.push_back(src.substr(start, end - start)); } + cnt ++; start = end + 1; } return true; @@ -158,12 +166,8 @@ namespace Limonp return ltrim(rtrim(s)); } - - - inline bool startsWith(const string& str, const string& prefix) { - //return str.substr(0, prefix.size()) == prefix; if(prefix.length() > str.length()) { return false; diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index 12938cd..dabee0c 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -3,6 +3,7 @@ #include "TransCode.hpp" #include "Limonp/logger.hpp" +#include "Limonp/InitOnOff.hpp" #include "ISegment.hpp" #include @@ -10,17 +11,11 @@ namespace CppJieba { using namespace Limonp; - class SegmentBase: public ISegment + class SegmentBase: public ISegment, public InitOnOff { public: - SegmentBase(){_setInitFlag(false);}; + SegmentBase(){}; virtual ~SegmentBase(){}; - protected: - bool _isInited; - bool _getInitFlag()const{return _isInited;}; - bool _setInitFlag(bool flag){return _isInited = flag;}; - public: - operator bool(){return _getInitFlag();}; public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const = 0; diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index 2709227..8a84985 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -3,25 +3,25 @@ using namespace CppJieba; +const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456"; + TEST(KeywordExtractorTest, Test1) { - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - const char* str = "我来自北京邮电大学。。。 学号 123456"; - const char* res[] = {"北京邮电大学", "来自"}; + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + const char* res[] = {"学号", "北京邮电大学"}; vector words; ASSERT_TRUE(extractor); - ASSERT_TRUE(extractor.extract(str, words, 2)); + ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2)); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } TEST(KeywordExtractorTest, Test2) { - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - const char* str = "我来自北京邮电大学。。。 学号 123456"; - const char* res[] = {"北京邮电大学", "来自"}; + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + const char* res[] = {"学号", "北京邮电大学", "123456", "来自"}; vector words; ASSERT_TRUE(extractor); - ASSERT_TRUE(extractor.extract(str, words, 9)); + ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9)); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } @@ -31,26 +31,35 @@ TEST(KeywordExtractorTest, Test3) ifstream ifs("../test/testdata/weicheng.utf8"); ASSERT_TRUE(!!ifs); string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"}; + const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]"; vector keywords; + string resStr; + vector > keywords2; extractor.extract(str, keywords, 5); + extractor.extract(str, keywords2, 5); ASSERT_EQ(keywords, vector(res, res + sizeof(res)/sizeof(res[0]))); + resStr << keywords2; + ASSERT_EQ(res2, resStr); } -TEST(KeywordExtractorTest, Test4) -{ - ifstream ifs("../test/testdata/weicheng.utf8"); - ASSERT_TRUE(!!ifs); - string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8"); - //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; - vector > keywords; - extractor.extract(str, keywords, 5); - //print(keywords); - string res; - res << keywords; - ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]"); - -} +//TEST(KeywordExtractorTest, Test4) +//{ +// ifstream ifs("../test/testdata/weicheng.utf8"); +// ASSERT_TRUE(!!ifs); +// string str((istreambuf_iterator(ifs)), (istreambuf_iterator())); +// KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); +// //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"}; +// vector > keywords; +// extractor.extract(str, keywords, 5); +// //print(keywords); +// string res; +// res << keywords; +// print(keywords); +// print(__LINE__); +// exit(1); +// ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]"); +// +//} diff --git a/test/unittest/TMd5.cpp b/test/unittest/TMd5.cpp index a3c6af9..455b125 100644 --- a/test/unittest/TMd5.cpp +++ b/test/unittest/TMd5.cpp @@ -19,7 +19,7 @@ TEST(Md5Test, Test1) { ASSERT_EQ(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]), sizeof(DICT_FILE_MD5)/sizeof(DICT_FILE_MD5[0])); string tmp; - for (int i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++) + for (uint i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++) { md5File(DICT_FILE[i], tmp); ASSERT_EQ(tmp, string(DICT_FILE_MD5[i]));