This commit is contained in:
gwdwyy 2013-07-06 14:41:08 +08:00
parent 6eb200d867
commit 1de2635e44
6 changed files with 50 additions and 32 deletions

View File

@ -32,10 +32,10 @@ $(CMLIB): $(CMDIR)
cd $(CMDIR) && $(MAKE) cd $(CMDIR) && $(MAKE)
#unit test #unit test
Trie.ut: Trie.cpp Trie.h $(CMLIB) Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
$(CC) -o $@ $< -DTRIE_UT $(CMLIB) $(CC) -o $@ $< -DTRIE_UT $(CMLIB)
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h $(CMLIB) Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB) $(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB)

View File

@ -20,26 +20,22 @@ namespace CppJieba
return _trie.destroy(); return _trie.destroy();
} }
bool Segment::cutDAG(const string& chStr, vector<string>& res)
{
}
bool Segment::cutMM(const string& chStr, vector<string>& res) bool Segment::cutMM(const string& chStr, vector<string>& res)
{ {
res.clear(); res.clear();
char logBuf[bufSize];
char utfBuf[bufSize]; char utfBuf[bufSize];
ChUnicode uniStr[bufSize]; ChUnicode uniStr[bufSize];
memset(uniStr, 0, sizeof(uniStr)); memset(uniStr, 0, sizeof(uniStr));
size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr); size_t len = _utf8ToUni(chStr, uniStr, bufSize);
if(0 == len) if(0 == len)
{ {
sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str()); LogError("_utf8ToUni failed.");
LogError(logBuf);
return false;
}
if(sizeof(uniStr) - len <= 5)
{
sprintf(logBuf, "%s too long!", chStr.c_str());
LogError(logBuf);
return false; return false;
} }
@ -71,23 +67,14 @@ namespace CppJieba
bool Segment::cutRMM(const string& chStr, vector<string>& res) bool Segment::cutRMM(const string& chStr, vector<string>& res)
{ {
res.clear(); res.clear();
char logBuf[bufSize];
char utfBuf[bufSize]; char utfBuf[bufSize];
ChUnicode uniStr[bufSize]; ChUnicode uniStr[bufSize];
memset(uniStr, 0, sizeof(uniStr)); memset(uniStr, 0, sizeof(uniStr));
size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr);
size_t len = _utf8ToUni(chStr, uniStr, bufSize);
if(0 == len) if(0 == len)
{ {
sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str()); LogError("_utf8ToUni failed.");
LogError(logBuf);
return false;
}
if(sizeof(uniStr) - len <= 5)
{
sprintf(logBuf, "%s too long!", chStr.c_str());
LogError(logBuf);
return false; return false;
} }
@ -120,6 +107,28 @@ namespace CppJieba
} }
return true; return true;
} }
// Converts the UTF-8 string chStr into ChUnicode units stored in uniStr and
// validates the result against the capacity of the destination buffer.
//
// chStr:  UTF-8 encoded input.
// uniStr: destination buffer provided by the caller.
// size:   capacity of uniStr, in ChUnicode elements.
// Returns the number of units produced, or 0 if the conversion failed or the
// result leaves 5 or fewer free slots in the buffer (an error is logged in
// both cases).
//
// NOTE(review): utf8ToUnicode is not told the capacity of uniStr, so it may
// already have written past the buffer by the time the length check below
// runs -- confirm against its implementation and consider passing `size` down.
size_t Segment::_utf8ToUni(const string& chStr, ChUnicode* uniStr, size_t size)
{
    char logBuf[bufSize];
    const size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr);
    if(0 == len)
    {
        // snprintf truncates instead of overflowing logBuf on long inputs.
        snprintf(logBuf, sizeof(logBuf), "utf8ToUnicode [%s] failed!", chStr.c_str());
        LogError(logBuf);
        return 0;
    }
    // Same test as `size - len <= 5`, but written without the unsigned
    // subtraction so that len > size is rejected instead of wrapping around
    // to a huge value and being accepted.
    if(len + 5 >= size)
    {
        // This branch handles over-long input by definition, so the message
        // must be bounded: an unbounded sprintf would overflow logBuf here.
        snprintf(logBuf, sizeof(logBuf), "%s too long!", chStr.c_str());
        LogError(logBuf);
        return 0;
    }
    return len;
}
} }

View File

@ -16,9 +16,13 @@ namespace CppJieba
bool init(const char* const dictFilePath); bool init(const char* const dictFilePath);
bool destroy(); bool destroy();
public: public:
bool cutDAG(const string& chStr, vector<string>& res);
bool cutMM(const string& chStr, vector<string>& res); bool cutMM(const string& chStr, vector<string>& res);
bool cutRMM(const string& chStr, vector<string>& res); bool cutRMM(const string& chStr, vector<string>& res);
private:
size_t _utf8ToUni(const string& chStr, ChUnicode* uniStr, size_t size);
private: private:
enum {bufSize = 1024}; enum {bufSize = 1024};

View File

@ -183,7 +183,7 @@ namespace CppJieba
for(int i = 0; i < len; i++) for(int i = 0; i < len; i++)
{ {
ChUnicode chWord = chUniStr[i]; ChUnicode chWord = chUniStr[i];
TrieNodeHashMap::const_iterator iter = p->hmap.find(chWord); TrieNodeMap::const_iterator iter = p->hmap.find(chWord);
if(iter != p->hmap.end()) if(iter != p->hmap.end())
{ {
TrieNode * next = iter->second; TrieNode * next = iter->second;
@ -240,7 +240,7 @@ namespace CppJieba
bool Trie::_destroyNode(TrieNode* node) bool Trie::_destroyNode(TrieNode* node)
{ {
for(TrieNodeHashMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
{ {
TrieNode* next = it->second; TrieNode* next = it->second;
_destroyNode(next); _destroyNode(next);
@ -257,7 +257,7 @@ namespace CppJieba
LogError("failed! node is null."); LogError("failed! node is null.");
return; return;
} }
for(TrieNodeHashMap::const_iterator it = node->hmap.begin(); it != node->hmap.end(); it++) for(TrieNodeMap::const_iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
{ {
char utfBuf[8]; char utfBuf[8];
ChUnicode chBuf[1]; ChUnicode chBuf[1];

6
Trie.h
View File

@ -19,9 +19,7 @@ namespace CppJieba
using namespace CPPCOMMON; using namespace CPPCOMMON;
using namespace std; using namespace std;
//using __gnu_cxx::hash_map; //using __gnu_cxx::hash_map;
typedef uint16_t ChUnicode; typedef map<ChUnicode, struct TrieNode*> TrieNodeMap;
const size_t ChUniMaxLen = 1024;
typedef map<ChUnicode, struct TrieNode*> TrieNodeHashMap;
struct TrieNodeInfo struct TrieNodeInfo
{ {
@ -36,7 +34,7 @@ namespace CppJieba
struct TrieNode struct TrieNode
{ {
TrieNodeHashMap hmap; TrieNodeMap hmap;
bool isLeaf; bool isLeaf;
unsigned int nodeInfoVecPos; unsigned int nodeInfoVecPos;

View File

@ -1,6 +1,13 @@
#ifndef GLOBALS_H #ifndef GLOBALS_H
#define GLOBALS_H #define GLOBALS_H
#include <map>
//file path
const char * const DICT_FILE_PATH = "dict.txt"; const char * const DICT_FILE_PATH = "dict.txt";
//typedefs
typedef uint16_t ChUnicode;
#endif #endif