From f2c5f571f2f4f77747dd6ef19ac946ad426963a9 Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Fri, 5 Jul 2013 17:22:01 +0800 Subject: [PATCH] add findMaxMatch and cutMM --- Segment.cpp | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- Segment.h | 1 + Trie.cpp | 29 +++++++++++++++++++++++++++++ Trie.h | 5 ++++- 4 files changed, 83 insertions(+), 3 deletions(-) diff --git a/Segment.cpp b/Segment.cpp index 55a3645..9e5efb3 100644 --- a/Segment.cpp +++ b/Segment.cpp @@ -20,6 +20,54 @@ namespace CppJieba return _trie.destroy(); } + bool Segment::cutMM(const string& chStr, vector& res) + { + res.clear(); + char logBuf[bufSize]; + char utfBuf[bufSize]; + ChUnicode uniStr[bufSize]; + memset(uniStr, 0, sizeof(uniStr)); + size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr); + + if(0 == len) + { + sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str()); + LogError(logBuf); + return false; + } + + if(sizeof(uniStr) - len <= 5) + { + sprintf(logBuf, "%s too long!", chStr.c_str()); + LogError(logBuf); + return false; + } + + int i = 0; + while(i < len) + { + cout<<__FILE__<<__LINE__<& res) { res.clear(); @@ -42,7 +90,6 @@ namespace CppJieba LogError(logBuf); return false; } - int i = len - 1; while(i >= 0) @@ -85,7 +132,7 @@ int main() segment.init("dict.utf8"); vector res; string title = "我来到北京清华大学3D电视"; - bool flag = segment.cutRMM(title, res); + bool flag = segment.cutMM(title, res); if(flag) { for(int i = 0; i < res.size(); i++) diff --git a/Segment.h b/Segment.h index da8d431..7e7dda6 100644 --- a/Segment.h +++ b/Segment.h @@ -16,6 +16,7 @@ namespace CppJieba bool init(const char* const dictFilePath); bool destroy(); public: + bool cutMM(const string& chStr, vector& res); bool cutRMM(const string& chStr, vector& res); private: diff --git a/Trie.cpp b/Trie.cpp index 0fffa2e..af956f5 100644 --- a/Trie.cpp +++ b/Trie.cpp @@ -75,6 +75,7 @@ namespace CppJieba bool Trie::find(const ChUnicode* chUniStr, size_t len) { + int res = -1; TrieNode* p = _root; for(size_t i = 0; i < len; i++) { @@ -91,6 +92,7 @@ namespace CppJieba return p->isLeaf; } + /* bool Trie::find(const vector& uniVec) { TrieNode * p = _root; @@ -108,6 +110,33 @@ namespace CppJieba } return p->isLeaf; } + */ + + int Trie::findMaxMatch(const ChUnicode* chUniStr, size_t len) + { + int res = -1; + TrieNode * p = _root; + for(int i = 0; i < len; i++) + { + ChUnicode chWord = chUniStr[i]; + TrieNodeHashMap::const_iterator iter = p->hmap.find(chWord); + if(iter != p->hmap.end()) + { + TrieNode * next = iter->second; + if(next->isLeaf) + { + res = i + 1; + } + p = next; + } + else + { + break; + } + } + cout<<__FILE__<<__LINE__< >& res) { diff --git a/Trie.h b/Trie.h index c6097da..62ebfd4 100644 --- a/Trie.h +++ b/Trie.h @@ -95,8 +95,11 @@ namespace CppJieba bool init(const char* const filepath = DICT_FILE_PATH); bool destroy(); void display(); + + public: bool find(const ChUnicode* chUniStr, size_t len); - bool find(const vector& uniVec); + //bool find(const vector& uniVec); + int findMaxMatch(const ChUnicode* chUniStr, size_t len); public: bool cut(const ChUnicode* chUniStr, size_t len, vector< vector >& res);