From 1de2635e44d5e2f1594bb2f10a1c03a0b3715cc7 Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Sat, 6 Jul 2013 14:41:08 +0800 Subject: [PATCH] bak --- Makefile | 4 ++-- Segment.cpp | 55 +++++++++++++++++++++++++++++++---------------------- Segment.h | 4 ++++ Trie.cpp | 6 +++--- Trie.h | 6 ++---- globals.h | 7 +++++++ 6 files changed, 50 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index c111257..b45de33 100644 --- a/Makefile +++ b/Makefile @@ -32,10 +32,10 @@ $(CMLIB): $(CMDIR) cd $(CMDIR) && $(MAKE) #unit test -Trie.ut: Trie.cpp Trie.h $(CMLIB) +Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB) $(CC) -o $@ $< -DTRIE_UT $(CMLIB) -Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h $(CMLIB) +Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB) $(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) diff --git a/Segment.cpp b/Segment.cpp index 56938e9..be56218 100644 --- a/Segment.cpp +++ b/Segment.cpp @@ -20,26 +20,22 @@ namespace CppJieba return _trie.destroy(); } + bool Segment::cutDAG(const string& chStr, vector& res) + { + + } + + bool Segment::cutMM(const string& chStr, vector& res) { res.clear(); - char logBuf[bufSize]; char utfBuf[bufSize]; ChUnicode uniStr[bufSize]; memset(uniStr, 0, sizeof(uniStr)); - size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr); - + size_t len = _utf8ToUni(chStr, uniStr, bufSize); if(0 == len) { - sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str()); - LogError(logBuf); - return false; - } - - if(sizeof(uniStr) - len <= 5) - { - sprintf(logBuf, "%s too long!", chStr.c_str()); - LogError(logBuf); + LogError("_utf8ToUni failed."); return false; } @@ -71,23 +67,14 @@ namespace CppJieba bool Segment::cutRMM(const string& chStr, vector& res) { res.clear(); - char logBuf[bufSize]; char utfBuf[bufSize]; ChUnicode uniStr[bufSize]; memset(uniStr, 0, sizeof(uniStr)); - size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr); + size_t len = _utf8ToUni(chStr, uniStr, bufSize); if(0 == len) { - sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str()); - LogError(logBuf); - return false; - } - - if(sizeof(uniStr) - len <= 5) - { - sprintf(logBuf, "%s too long!", chStr.c_str()); - LogError(logBuf); + LogError("_utf8ToUni failed."); return false; } @@ -120,6 +107,28 @@ namespace CppJieba } return true; } + + size_t Segment::_utf8ToUni(const string& chStr, ChUnicode* uniStr, size_t size) + { + char logBuf[bufSize]; + size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr); + + if(0 == len) + { + sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str()); + LogError(logBuf); + return 0; + } + + if(size - len <= 5) + { + sprintf(logBuf, "%s too long!", chStr.c_str()); + LogError(logBuf); + return 0; + } + return len; + + } } diff --git a/Segment.h b/Segment.h index 7e7dda6..fc9834a 100644 --- a/Segment.h +++ b/Segment.h @@ -16,9 +16,13 @@ namespace CppJieba bool init(const char* const dictFilePath); bool destroy(); public: + bool cutDAG(const string& chStr, vector& res); bool cutMM(const string& chStr, vector& res); bool cutRMM(const string& chStr, vector& res); + private: + size_t _utf8ToUni(const string& chStr, ChUnicode* uniStr, size_t size); + private: enum {bufSize = 1024}; diff --git a/Trie.cpp b/Trie.cpp index bf563dd..58d4554 100644 --- a/Trie.cpp +++ b/Trie.cpp @@ -183,7 +183,7 @@ namespace CppJieba for(int i = 0; i < len; i++) { ChUnicode chWord = chUniStr[i]; - TrieNodeHashMap::const_iterator iter = p->hmap.find(chWord); + TrieNodeMap::const_iterator iter = p->hmap.find(chWord); if(iter != p->hmap.end()) { TrieNode * next = iter->second; @@ -240,7 +240,7 @@ namespace CppJieba bool Trie::_destroyNode(TrieNode* node) { - for(TrieNodeHashMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) + for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) { TrieNode* next = it->second; _destroyNode(next); @@ -257,7 +257,7 @@ namespace CppJieba LogError("failed! node is null."); return; } - for(TrieNodeHashMap::const_iterator it = node->hmap.begin(); it != node->hmap.end(); it++) + for(TrieNodeMap::const_iterator it = node->hmap.begin(); it != node->hmap.end(); it++) { char utfBuf[8]; ChUnicode chBuf[1]; diff --git a/Trie.h b/Trie.h index 649e46c..bcec12a 100644 --- a/Trie.h +++ b/Trie.h @@ -19,9 +19,7 @@ namespace CppJieba using namespace CPPCOMMON; using namespace std; //using __gnu_cxx::hash_map; - typedef uint16_t ChUnicode; - const size_t ChUniMaxLen = 1024; - typedef map TrieNodeHashMap; + typedef map TrieNodeMap; struct TrieNodeInfo { @@ -36,7 +34,7 @@ namespace CppJieba struct TrieNode { - TrieNodeHashMap hmap; + TrieNodeMap hmap; bool isLeaf; unsigned int nodeInfoVecPos; diff --git a/globals.h b/globals.h index 70863d7..c495050 100644 --- a/globals.h +++ b/globals.h @@ -1,6 +1,13 @@ #ifndef GLOBALS_H #define GLOBALS_H +#include + +//file path const char * const DICT_FILE_PATH = "dict.txt"; +//typedefs +typedef uint16_t ChUnicode; + + #endif