diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e00c1a..9a6d5f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,9 @@ PROJECT(CPPJIEBA) SET(CMAKE_INSTALL_PREFIX /usr) ADD_DEFINITIONS(-std=c++0x -O3) +IF (DEFINED ENC) + ADD_DEFINITIONS(-DCPPJIEBA_${ENC}) +ENDIF() ADD_SUBDIRECTORY(src) ADD_SUBDIRECTORY(dicts) ADD_SUBDIRECTORY(scripts) diff --git a/README.md b/README.md index 83ff2bf..3db03fd 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,6 @@ 现在支持utf8,gbk编码的分词。 -- `master`分支支持`utf8`编码 -- `gbk`分支支持`gbk`编码 - ## 安装与使用 ### 下载和安装 @@ -22,6 +19,8 @@ cd cppjieba-master mkdir build cd build cmake .. +# 默认是utf8编码,如果要使用gbk编码则使用下句cmake命令 +# cmake .. -DENC=GBK make sudo make install ``` @@ -122,14 +121,14 @@ Output: 核心目录,包含主要源代码。 #### Trie树 -Trie.cpp/Trie.h 负责载入词典的trie树,主要供Segment模块使用。 +Trie.hpp 负责载入词典的trie树,主要供Segment模块使用。 #### Segment模块 -MPSegment.cpp/MPSegment.h +MPSegment.hpp (Maximum Probability)最大概率法:负责根据Trie树构建有向无环图和进行动态规划算法,是分词算法的核心。 -HMMSegment.cpp/HMMSegment.h +HMMSegment.hpp 是根据HMM模型来进行分词,主要算法思路是根据(B,E,M,S)四个状态来代表每个字的隐藏状态。 HMM模型由dicts/下面的`hmm_model.utf8`提供。 分词算法即viterbi算法。 diff --git a/src/ChineseFilter.hpp b/src/ChineseFilter.hpp index adca186..2957828 100644 --- a/src/ChineseFilter.hpp +++ b/src/ChineseFilter.hpp @@ -6,7 +6,7 @@ namespace CppJieba { - enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1, OTHERS = 2}; + enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1}; typedef Unicode::const_iterator UniConIter; class ChineseFilter; class ChFilterIterator @@ -55,11 +55,7 @@ namespace CppJieba { return DIGIT_OR_LETTER; } - if(x >= 0x4e00 && x <= 0x9fff) - { - return CHWORD; - } - return OTHERS; + return CHWORD; } ChFilterIterator _get(UniConIter iter) { @@ -67,7 +63,7 @@ namespace CppJieba const UniConIter& _end = ptUnico->end(); if(iter == _end) { - return ChFilterIterator(ptUnico, end, end, OTHERS); + return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER); } CHAR_TYPE charType = _charType(*iter); iter ++; @@ -99,7 +95,7 @@ namespace CppJieba } iterator end() { - return iterator(&_unico, _unico.end(), _unico.end(), OTHERS); + return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER); } }; } diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 9bee642..1bc4cf2 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -15,12 +15,13 @@ namespace CppJieba { private: Trie _trie; + const string _dictPath; public: - FullSegment(){}; + FullSegment(const char* dictPath): _dictPath(dictPath){}; virtual ~FullSegment(){dispose();}; public: - bool init(const char* const filePath) + bool init() { if(_getInitFlag()) { @@ -32,8 +33,8 @@ namespace CppJieba LogError("_trie.init failed."); return false; } - LogInfo("_trie.loadDict(%s) start...", filePath); - if(!_trie.loadDict(filePath)) + LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str()); + if(!_trie.loadDict(_dictPath.c_str())) { LogError("_trie.loadDict faield."); return false; diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index f3f36c2..c9839a4 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -31,9 +31,11 @@ namespace CppJieba EmitProbMap _emitProbM; EmitProbMap _emitProbS; vector _emitProbVec; + private: + const string _hmmModelPath; public: - HMMSegment() + HMMSegment(const char * const filePath): _hmmModelPath(filePath) { memset(_startProb, 0, sizeof(_startProb)); memset(_transProb, 0, sizeof(_transProb)); @@ -51,11 +53,11 @@ namespace CppJieba dispose(); } public: - bool init(const char* const modelPath) + virtual bool init() { - return _setInitFlag(_loadModel(modelPath)); + return _setInitFlag(_loadModel(_hmmModelPath.c_str())); } - bool dispose() + virtual bool dispose() { _setInitFlag(false); return true; @@ -88,11 +90,8 @@ namespace CppJieba } return true; } - bool cut(const string& str, vector& res)const - { - return SegmentBase::cut(str, res); - } - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + public: + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { if(!_getInitFlag()) { diff --git a/src/ISegment.hpp b/src/ISegment.hpp index 48b8cd5..8821289 100644 --- a/src/ISegment.hpp +++ b/src/ISegment.hpp @@ -6,8 +6,11 @@ namespace CppJieba { class ISegment { - //public: - // virtual ~ISegment(){}; + public: + virtual ~ISegment(){}; + public: + virtual bool init() = 0; + virtual bool dispose() = 0; public: virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const = 0; virtual bool cut(const string& str, vector& res) const = 0; diff --git a/src/Limonp/str_functs.hpp b/src/Limonp/str_functs.hpp index 8ad62a3..5a480c9 100644 --- a/src/Limonp/str_functs.hpp +++ b/src/Limonp/str_functs.hpp @@ -120,7 +120,7 @@ namespace Limonp return res; } - + inline bool splitStr(const string& src, vector& res, const string& pattern) { @@ -218,41 +218,129 @@ namespace Limonp return str.find(ch) != string::npos; } - //inline void extractWords(const string& sentence, vector& words) - //{ - // bool flag = false; - // uint lhs = 0, len = 0; - // for(uint i = 0; i < sentence.size(); i++) - // { - // char x = sentence[i]; - // if((0x0030 <= x && x<= 0x0039) || (0x0041 <= x && x <= 0x005a ) || (0x0061 <= x && x <= 0x007a)) - // { - // if(flag) - // { - // len ++; - // } - // else - // { - // lhs = i; - // len = 1; - // } - // flag = true; - // } - // else - // { - // if(flag) - // { - // words.push_back(string(sentence, lhs, len)); - // } - // flag = false; - // } - // } - // if(flag) - // { - // words.push_back(string(sentence, lhs, len)); - // } - //} + inline bool utf8ToUnicode(const string& str, vector& vec) + { + char ch1, ch2; + if(str.empty()) + { + return false; + } + vec.clear(); + size_t siz = str.size(); + for(uint i = 0;i < siz;) + { + if(!(str[i] & 0x80)) // 0xxxxxxx + { + vec.push_back(str[i]); + i++; + } + else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx + { + ch1 = (str[i] >> 2) & 0x07; + ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); + vec.push_back(twocharToUint16(ch1, ch2)); + i += 2; + } + else if((unsigned char)str[i] <= 0xef && i + 2 < siz) + { + ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); + ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); + vec.push_back(twocharToUint16(ch1, ch2)); + i += 3; + } + else + { + return false; + } + } + return true; + } + inline bool unicodeToUtf8(vector::const_iterator begin, vector::const_iterator end, string& res) + { + if(begin >= end) + { + return false; + } + res.clear(); + uint16_t ui; + while(begin != end) + { + ui = *begin; + if(ui <= 0x7f) + { + res += char(ui); + } + else if(ui <= 0x7ff) + { + res += char(((ui>>6) & 0x1f) | 0xc0); + res += char((ui & 0x3f) | 0x80); + } + else + { + res += char(((ui >> 12) & 0x0f )| 0xe0); + res += char(((ui>>6) & 0x3f )| 0x80 ); + res += char((ui & 0x3f) | 0x80); + } + begin ++; + } + return true; + } + + inline bool gbkTrans(const string& str, vector& vec) + { + vec.clear(); + if(str.empty()) + { + return false; + } + uint i = 0; + while(i < str.size()) + { + if(0 == (str[i] & 0x80)) + { + vec.push_back(uint16_t(str[i])); + i++; + } + else + { + if(i + 1 < str.size()) //&& (str[i+1] & 0x80)) + { + vec.push_back(twocharToUint16(str[i], str[i + 1])); + i += 2; + } + else + { + return false; + } + } + } + return true; + } + inline bool gbkTrans(vector::const_iterator begin, vector::const_iterator end, string& res) + { + if(begin >= end) + { + return false; + } + res.clear(); + pair pa; + while(begin != end) + { + pa = uint16ToChar2(*begin); + if(pa.first & 0x80) + { + res += pa.first; + res += pa.second; + } + else + { + res += pa.second; + } + begin++; + } + return true; + } } #endif diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index ae75773..d5f39f7 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -32,12 +32,14 @@ namespace CppJieba { private: Trie _trie; + private: + const string _dictPath; public: - MPSegment(){}; + MPSegment(const char * const dictPath): _dictPath(dictPath){}; virtual ~MPSegment(){dispose();}; public: - bool init(const char* const filePath) + virtual bool init() { if(_getInitFlag()) { @@ -49,8 +51,8 @@ namespace CppJieba LogError("_trie.init failed."); return false; } - LogInfo("_trie.loadDict(%s) start...", filePath); - if(!_trie.loadDict(filePath)) + LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str()); + if(!_trie.loadDict(_dictPath.c_str())) { LogError("_trie.loadDict faield."); return false; @@ -58,7 +60,7 @@ namespace CppJieba LogInfo("_trie.loadDict end."); return _setInitFlag(true); } - bool dispose() + virtual bool dispose() { if(!_getInitFlag()) { @@ -69,12 +71,7 @@ namespace CppJieba return true; } public: - //bool cut(const string& str, vector& segWordInfos)const; - bool cut(const string& str, vector& res)const - { - return SegmentBase::cut(str, res); - } - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { if(!_getInitFlag()) { @@ -155,7 +152,6 @@ namespace CppJieba return true; } - //virtual bool cut(const string& str, vector& res)const; private: bool _calcDAG(SegmentContext& segContext)const diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 896d7b1..8914256 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -13,7 +13,7 @@ namespace CppJieba MPSegment _mpSeg; HMMSegment _hmmSeg; public: - MixSegment() + MixSegment(const char * const mpSegDict, const char * const hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict) { } virtual ~MixSegment() @@ -21,26 +21,26 @@ namespace CppJieba dispose(); } public: - bool init(const char* const mpSegDict, const char* const hmmSegDict) + virtual bool init() { if(_getInitFlag()) { LogError("inited."); return false; } - if(!_mpSeg.init(mpSegDict)) + if(!_mpSeg.init()) { LogError("_mpSeg init"); return false; } - if(!_hmmSeg.init(hmmSegDict)) + if(!_hmmSeg.init()) { LogError("_hmmSeg init"); return false; } return _setInitFlag(true); } - bool dispose() + virtual bool dispose() { if(!_getInitFlag()) { @@ -52,12 +52,9 @@ namespace CppJieba return true; } public: - //virtual bool cut(const string& str, vector& res) const; - bool cut(const string& str, vector& res)const - { - return SegmentBase::cut(str, res); - } - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + using SegmentBase::cut; + public: + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { if(!_getInitFlag()) { diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index a187686..44e4df3 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -14,12 +14,17 @@ namespace CppJieba public: SegmentBase(){_setInitFlag(false);}; virtual ~SegmentBase(){}; - private: - bool _isInited; protected: + bool _isInited; bool _getInitFlag()const{return _isInited;}; bool _setInitFlag(bool flag){return _isInited = flag;}; - bool cut(const string& str, vector& res)const + public: + virtual bool init() = 0; + virtual bool dispose() = 0; + + public: + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const = 0; + virtual bool cut(const string& str, vector& res)const { if(!_getInitFlag()) { @@ -45,7 +50,6 @@ namespace CppJieba } return true; } - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const = 0; }; } diff --git a/src/TransCode.hpp b/src/TransCode.hpp index 113aceb..e3bb891 100644 --- a/src/TransCode.hpp +++ b/src/TransCode.hpp @@ -17,76 +17,25 @@ namespace CppJieba { inline bool decode(const string& str, vector& vec) { - char ch1, ch2; - if(str.empty()) - { - return false; - } - vec.clear(); - size_t siz = str.size(); - for(uint i = 0;i < siz;) - { - if(!(str[i] & 0x80)) // 0xxxxxxx - { - vec.push_back(str[i]); - i++; - } - else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx - { - ch1 = (str[i] >> 2) & 0x07; - ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); - vec.push_back(twocharToUint16(ch1, ch2)); - i += 2; - } - else if((unsigned char)str[i] <= 0xef && i + 2 < siz) - { - ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); - ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); - vec.push_back(twocharToUint16(ch1, ch2)); - i += 3; - } - else - { - return false; - } - } - return true; +#ifdef CPPJIEBA_GBK + return gbkTrans(str, vec); +#else + return utf8ToUnicode(str, vec); +#endif } - inline bool encode(vector::const_iterator begin, vector::const_iterator end, string& res) { - if(begin >= end) - { - return false; - } - res.clear(); - uint16_t ui; - while(begin != end) - { - ui = *begin; - if(ui <= 0x7f) - { - res += char(ui); - } - else if(ui <= 0x7ff) - { - res += char(((ui>>6) & 0x1f) | 0xc0); - res += char((ui & 0x3f) | 0x80); - } - else - { - res += char(((ui >> 12) & 0x0f )| 0xe0); - res += char(((ui>>6) & 0x3f )| 0x80 ); - res += char((ui & 0x3f) | 0x80); - } - begin ++; - } - return true; +#ifdef CPPJIEBA_GBK + return gbkTrans(begin, end, res); +#else + return unicodeToUtf8(begin, end, res); +#endif } - inline bool encode(const vector& sentence, string& res) + + inline bool encode(const vector& uni, string& res) { - return encode(sentence.begin(), sentence.end(), res); + return encode(uni.begin(), uni.end(), res); } } } diff --git a/src/Trie.hpp b/src/Trie.hpp index 785de82..8282587 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -164,14 +164,6 @@ namespace CppJieba { return NULL; } - return find(uintVec); - } - const TrieNodeInfo* find(const Unicode& uintVec)const - { - if(uintVec.empty()) - { - return NULL; - } return find(uintVec.begin(), uintVec.end()); } const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const @@ -261,56 +253,11 @@ namespace CppJieba return false; } - const TrieNodeInfo* findPrefix(const string& str)const - { - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return NULL; - } - Unicode uintVec; - - if(!TransCode::decode(str, uintVec)) - { - LogError("TransCode::decode failed."); - return NULL; - } - - //find - TrieNode* p = _root; - uint pos = 0; - uint16_t chUni = 0; - const TrieNodeInfo * res = NULL; - for(uint i = 0; i < uintVec.size(); i++) - { - chUni = uintVec[i]; - if(p->isLeaf) - { - pos = p->nodeInfoVecPos; - if(pos >= _nodeInfoVec.size()) - { - LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); - return NULL; - } - res = &(_nodeInfoVec[pos]); - - } - if(p->hmap.find(chUni) == p->hmap.end()) - { - break; - } - else - { - p = p->hmap[chUni]; - } - } - return res; - } - public: double getMinLogFreq()const{return _minLogFreq;}; - bool insert(const TrieNodeInfo& nodeInfo) + private: + bool _insert(const TrieNodeInfo& nodeInfo) { if(!_getInitFlag()) { @@ -353,7 +300,7 @@ namespace CppJieba } if(p->isLeaf) { - LogError("this node already inserted"); + LogError("this node already _inserted"); return false; } @@ -391,10 +338,10 @@ namespace CppJieba nodeInfo.tag = vecBuf[2]; } - //insert node - if(!insert(nodeInfo)) + //_insert node + if(!_insert(nodeInfo)) { - LogError("insert node failed!"); + LogError("_insert node failed!"); } } return true; diff --git a/src/segment.cpp b/src/segment.cpp index ac14c67..d46dc7b 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -19,8 +19,14 @@ void cut(const ISegment * seg, const char * const filePath) { cout << line << endl; res.clear(); - seg->cut(line, res); - cout<cut(line, res)) + { + LogError("seg cut failed."); + } + else + { + print(join(res.begin(), res.end(), "/")); + } } } } @@ -35,9 +41,9 @@ int main(int argc, char ** argv) <<"\t--dictpath\tsee example\n" <<"\t--modelpath\tsee example\n" <<"example:\n" - <<"\t"<