From c8ea7610bd34cccc02a1d27bf03b16b816ff66aa Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Sat, 20 Jul 2013 17:39:19 +0800 Subject: [PATCH] bak --- src/Segment.cpp | 23 ++++--- src/Segment.h | 7 +-- src/Trie.cpp | 158 +++++++++++++++++++++++++++++------------------- src/Trie.h | 35 +++++++---- 4 files changed, 136 insertions(+), 87 deletions(-) diff --git a/src/Segment.cpp b/src/Segment.cpp index e06c828..eb86d97 100644 --- a/src/Segment.cpp +++ b/src/Segment.cpp @@ -6,15 +6,19 @@ namespace CppJieba { - Segment::Segment():_trie() + Segment::Segment() { + _encVec.push_back(Trie::UTF8); + _encVec.push_back(Trie::GBK); + //default encoding : utf8 + _encoding = Trie::UTF8; } Segment::~Segment() { } - bool Segment::init(const char* const dictFilePath) + bool Segment::init(const string& dictFilePath) { bool retFlag; LogInfo(string_format("_trie.init(%s) start...", dictFilePath)); @@ -28,11 +32,11 @@ namespace CppJieba return _trie.destroy(); } - bool Segment::cutDAG(const string& chStr, vector& res) + bool Segment::cutDAG(const string& str, vector& res) { bool retFlag; res.clear(); - string uniStr = _utf8ToUni(chStr); + string uniStr = _utf8ToUni(str; if(uniStr.empty()) { LogError("_utf8ToUni failed."); @@ -79,13 +83,11 @@ namespace CppJieba return _trie.getWeight(word); } - - string Segment::_utf8ToUni(const string& utfStr) { string uniStr = utf8ToUnicode(utfStr); - if(uniStr.empty()) + if(uniStr.empty() || uniStr.size() % 2) { LogError(string_format("utf8ToUnicode [%s] failed!", utfStr.c_str())); return ""; @@ -101,6 +103,7 @@ namespace CppJieba vec.push_back(i/2); for(uint j = i + 4; j <= uniStr.size(); j+=2) { + cout< _encVec; Trie _trie; public: Segment(); ~Segment(); public: - bool init(const char* const dictFilePath); + bool init(const string& dictFilePath); bool destroy(); public: bool cutDAG(const string& chStr, vector& res); @@ -33,9 +35,6 @@ namespace CppJieba bool _calcDP(const string& uniStr, const vector >& dag, vector >& res); bool _cutDAG(const string& uniStr, const vector >& dp, vector& res); - private: - enum {bufSize = 1024}; - }; } diff --git a/src/Trie.cpp b/src/Trie.cpp index 355c8e3..d5bf04a 100644 --- a/src/Trie.cpp +++ b/src/Trie.cpp @@ -6,6 +6,9 @@ namespace CppJieba { + const string& Trie::UTF8 = "utf-8"; + const string& Trie::GBK = "gbk"; + Trie::iterator Trie::begin() { return _nodeInfoVec.begin(); @@ -16,8 +19,16 @@ namespace CppJieba return _nodeInfoVec.end(); } - Trie::Trie():_root(NULL), _totalCount(0) + Trie::Trie() { + //encodings : utf-8, gbk + _encVec.push_back(UTF8); + _encVec.push_back(GBK); + //default encoding : utf-8 + _encoding = UTF8; + + _root = NULL; + _totalCount = 0; _minWeight = numeric_limits::max(); } @@ -25,12 +36,23 @@ namespace CppJieba { destroy(); } - - bool Trie::init(const char* const filePath) + + bool Trie::setEncoding(const string& enc) { - if(!checkFileExist(filePath)) + if(!isInVec(_encVec, enc)) { - LogError(string_format("cann't find fiel[%s].",filePath)); + LogError(string_format("%s illegal : not in [%s]", enc.c_str(), joinStr(_encVec, ",").c_str())); + return false; + } + _encoding = enc; + return true; + } + + bool Trie::init(const string& filePath) + { + if(!checkFileExist(filePath.c_str())) + { + LogError(string_format("cann't find fiel[%s].",filePath.c_str())); return false; } bool res = false; @@ -49,7 +71,7 @@ namespace CppJieba return true; } - bool Trie::_buildTree(const char* const filePath) + bool Trie::_buildTree(const string& filePath) { if(NULL != _root) { @@ -57,7 +79,8 @@ namespace CppJieba return false; } _root = new TrieNode; - ifstream ifile(filePath); + + ifstream ifile(filePath.c_str()); string line; vector vecBuf; while(getline(ifile, line)) @@ -86,7 +109,7 @@ namespace CppJieba nodeInfo.count = count; nodeInfo.tag = tag; - bool flag = _insert(nodeInfo); + bool flag = insert(nodeInfo); if(!flag) { LogError("insert node failed!"); @@ -178,29 +201,27 @@ namespace CppJieba return res; } - const TrieNodeInfo* Trie::find(const string& uniStr) + const TrieNodeInfo* Trie::find(const string& str) { - ChUnicode* pUni = new ChUnicode[uniStr.size()]; - for(uint i = 0; i < uniStr.size(); i+=2) - { - pUni[i/2] = twocharToUint16(uniStr[i], uniStr[i+1]); - } - const TrieNodeInfo* res = find(pUni, uniStr.size()/2); - delete [] pUni; - return res; + string uniStr = decode(str); + return _findUniStr(uniStr); } - const TrieNodeInfo* Trie::find(const ChUnicode* const chUniStr, size_t len) + const TrieNodeInfo* Trie::_findUniStr(const string& uniStr) { if(NULL == _root) { LogFatal("trie not initted!"); return NULL; } - TrieNode* p = _root; - for(uint i = 0; i < len; i++) + if(uniStr.empty() || uniStr.size() % 2) { - ChUnicode chUni = chUniStr[i]; + LogError("uniStr illegal"); + } + TrieNode* p = _root; + for(uint i = 0; i < uniStr.size(); i+=2) + { + ChUnicode chUni = twocharToUint16(uniStr[i], uniStr[i+1]); if(p->hmap.find(chUni) == p-> hmap.end()) { return NULL; @@ -226,6 +247,7 @@ namespace CppJieba return NULL; } + /* double Trie::getWeight(const ChUnicode* uniStr, size_t len) { const TrieNodeInfo* p = find(uniStr, len); @@ -238,10 +260,11 @@ namespace CppJieba return getMinWeight(); } } + */ double Trie::getWeight(const string& uniStr) { - const TrieNodeInfo * p = find(uniStr); + const TrieNodeInfo * p = _findUniStr(uniStr); if(NULL != p) { return p->weight; @@ -262,29 +285,6 @@ namespace CppJieba return _totalCount; } - /* - bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector >& res) - { - res.clear(); - //cout<()); - vector& vec = res[i]; - for(size_t j = i; j < len; j++) - { - if(find(chUniStr + i, j - i + 1)) - { - vec.push_back(j); - } - } - } - return true; - } - */ - - bool Trie::_destroyNode(TrieNode* node) { for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) @@ -297,21 +297,55 @@ namespace CppJieba return true; } - bool Trie::_insert(const TrieNodeInfo& nodeInfo) + string Trie::decode(const string& str) { - _nodeInfoVec.push_back(nodeInfo); - const string& word = nodeInfo.word; - ChUnicode chUniStr[bufSize]; - memset(chUniStr, 0, sizeof(chUniStr)); - size_t len = utf8ToUnicode(word.c_str(), word.size(), chUniStr); - if(0 == len) + if(_encoding == UTF8) { + return utf8ToUnicode(str); + } + if(_encoding == GBK) + { + return utf8ToUnicode(gbkToUtf8(str)); + } + LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str())); + return ""; + } + + string Trie::encode(const string& str) + { + if(_encoding == UTF8) + { + return unicodeToUtf8(str); + } + if(_encoding == GBK) + { + return utf8ToGbk(unicodeToUtf8(str)); + } + LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str())); + return ""; + } + + bool Trie::insert(const TrieNodeInfo& nodeInfo) + { + if(NULL == _root) + { + LogError("_root is NULL"); return false; } + + const string& word = nodeInfo.word; + + string uniStr = decode(word); + if(uniStr.empty() || uniStr.size() % 2) + { + LogError("decode error."); + return false; + } + TrieNode* p = _root; - for(int i = 0; i < len; i++) + for(uint i = 0; i < uniStr.size(); i+=2) { - ChUnicode cu = chUniStr[i]; + ChUnicode cu = twocharToUint16(uniStr[i], uniStr[i+1]); if(NULL == p) { return false; @@ -327,7 +361,6 @@ namespace CppJieba { return false; } - p->hmap[cu] = next; p = next; } @@ -340,15 +373,16 @@ namespace CppJieba { return false; } - p->isLeaf = true; - if(!_nodeInfoVec.empty()) - { - p->nodeInfoVecPos = _nodeInfoVec.size() - 1; - } - else + if(p->isLeaf) { + LogError("this node already inserted"); return false; } + + p->isLeaf = true; + _nodeInfoVec.push_back(nodeInfo); + p->nodeInfoVecPos = _nodeInfoVec.size() - 1; + return true; } @@ -397,7 +431,7 @@ using namespace CppJieba; int main() { Trie trie; - trie.init("dicts/segdict.utf8.v2.1"); + trie.init("../dicts/segdict.utf8.v2.1"); //trie.init("dicts/jieba.dict.utf8"); //trie.init("dict.100"); //char utf[1024] = "我来到北京清华大学3D电视"; diff --git a/src/Trie.h b/src/Trie.h index 44597cd..6942308 100644 --- a/src/Trie.h +++ b/src/Trie.h @@ -29,7 +29,7 @@ namespace CppJieba struct TrieNodeInfo { - string word;// utf8 string word + string word; size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 . size_t count; string tag; @@ -48,17 +48,20 @@ namespace CppJieba { TrieNodeMap hmap; bool isLeaf; - unsigned int nodeInfoVecPos; - + uint nodeInfoVecPos; TrieNode() - :hmap(), isLeaf(false), nodeInfoVecPos(0) { + isLeaf = false; + nodeInfoVecPos = 0; } }; class Trie { + private: + string _encoding; + vector _encVec; TrieNode* _root; vector _nodeInfoVec; @@ -75,30 +78,38 @@ namespace CppJieba public: Trie(); ~Trie(); - bool init(const char* const filePath); + bool init(const string& filePath); + bool setEncoding(const string& enc); bool destroy(); void display(); public: - const TrieNodeInfo* find(const string& uniStr); - const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len); + //const TrieNodeInfo* find(const string& uniStr); + //const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len); + const TrieNodeInfo* find(const string& str); const TrieNodeInfo* findPrefix(const string& utf8Str); public: - double getWeight(const ChUnicode* uniStr, size_t len); + //double getWeight(const ChUnicode* uniStr, size_t len); double getWeight(const string& uniStr); double getMinWeight(); int64_t getTotalCount(); + bool insert(const TrieNodeInfo& nodeInfo); + + string decode(const string& str); + string encode(const string& str); + private: - bool _buildTree(const char* const filePath); + bool _buildTree(const string& filePath); bool _countWeight(); bool _destroyNode(TrieNode* node); - bool _insert(const TrieNodeInfo& nodeInfo); + const TrieNodeInfo* _findUniStr(const string& uniStr); - private: - enum {bufSize = 1024}; + public: + static const string& UTF8; + static const string& GBK; }; }