mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
bak
This commit is contained in:
parent
6eb200d867
commit
1de2635e44
4
Makefile
4
Makefile
@ -32,10 +32,10 @@ $(CMLIB): $(CMDIR)
|
|||||||
cd $(CMDIR) && $(MAKE)
|
cd $(CMDIR) && $(MAKE)
|
||||||
|
|
||||||
#unit test
|
#unit test
|
||||||
Trie.ut: Trie.cpp Trie.h $(CMLIB)
|
Trie.ut: Trie.cpp Trie.h globals.h $(CMLIB)
|
||||||
$(CC) -o $@ $< -DTRIE_UT $(CMLIB)
|
$(CC) -o $@ $< -DTRIE_UT $(CMLIB)
|
||||||
|
|
||||||
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h $(CMLIB)
|
Segment.ut: Segment.cpp Trie.cpp Segment.h Trie.h globals.h $(CMLIB)
|
||||||
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB)
|
$(CC) -o $@ Segment.cpp Trie.cpp -DSEGMENT_UT $(CMLIB)
|
||||||
|
|
||||||
|
|
||||||
|
55
Segment.cpp
55
Segment.cpp
@ -20,26 +20,22 @@ namespace CppJieba
|
|||||||
return _trie.destroy();
|
return _trie.destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Segment::cutDAG(const string& chStr, vector<string>& res)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
bool Segment::cutMM(const string& chStr, vector<string>& res)
|
bool Segment::cutMM(const string& chStr, vector<string>& res)
|
||||||
{
|
{
|
||||||
res.clear();
|
res.clear();
|
||||||
char logBuf[bufSize];
|
|
||||||
char utfBuf[bufSize];
|
char utfBuf[bufSize];
|
||||||
ChUnicode uniStr[bufSize];
|
ChUnicode uniStr[bufSize];
|
||||||
memset(uniStr, 0, sizeof(uniStr));
|
memset(uniStr, 0, sizeof(uniStr));
|
||||||
size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr);
|
size_t len = _utf8ToUni(chStr, uniStr, bufSize);
|
||||||
|
|
||||||
if(0 == len)
|
if(0 == len)
|
||||||
{
|
{
|
||||||
sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str());
|
LogError("_utf8ToUni failed.");
|
||||||
LogError(logBuf);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(sizeof(uniStr) - len <= 5)
|
|
||||||
{
|
|
||||||
sprintf(logBuf, "%s too long!", chStr.c_str());
|
|
||||||
LogError(logBuf);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -71,23 +67,14 @@ namespace CppJieba
|
|||||||
bool Segment::cutRMM(const string& chStr, vector<string>& res)
|
bool Segment::cutRMM(const string& chStr, vector<string>& res)
|
||||||
{
|
{
|
||||||
res.clear();
|
res.clear();
|
||||||
char logBuf[bufSize];
|
|
||||||
char utfBuf[bufSize];
|
char utfBuf[bufSize];
|
||||||
ChUnicode uniStr[bufSize];
|
ChUnicode uniStr[bufSize];
|
||||||
memset(uniStr, 0, sizeof(uniStr));
|
memset(uniStr, 0, sizeof(uniStr));
|
||||||
size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr);
|
|
||||||
|
|
||||||
|
size_t len = _utf8ToUni(chStr, uniStr, bufSize);
|
||||||
if(0 == len)
|
if(0 == len)
|
||||||
{
|
{
|
||||||
sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str());
|
LogError("_utf8ToUni failed.");
|
||||||
LogError(logBuf);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(sizeof(uniStr) - len <= 5)
|
|
||||||
{
|
|
||||||
sprintf(logBuf, "%s too long!", chStr.c_str());
|
|
||||||
LogError(logBuf);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -120,6 +107,28 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t Segment::_utf8ToUni(const string& chStr, ChUnicode* uniStr, size_t size)
|
||||||
|
{
|
||||||
|
char logBuf[bufSize];
|
||||||
|
size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr);
|
||||||
|
|
||||||
|
if(0 == len)
|
||||||
|
{
|
||||||
|
sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str());
|
||||||
|
LogError(logBuf);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(size - len <= 5)
|
||||||
|
{
|
||||||
|
sprintf(logBuf, "%s too long!", chStr.c_str());
|
||||||
|
LogError(logBuf);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -16,9 +16,13 @@ namespace CppJieba
|
|||||||
bool init(const char* const dictFilePath);
|
bool init(const char* const dictFilePath);
|
||||||
bool destroy();
|
bool destroy();
|
||||||
public:
|
public:
|
||||||
|
bool cutDAG(const string& chStr, vector<string>& res);
|
||||||
bool cutMM(const string& chStr, vector<string>& res);
|
bool cutMM(const string& chStr, vector<string>& res);
|
||||||
bool cutRMM(const string& chStr, vector<string>& res);
|
bool cutRMM(const string& chStr, vector<string>& res);
|
||||||
|
|
||||||
|
private:
|
||||||
|
size_t _utf8ToUni(const string& chStr, ChUnicode* uniStr, size_t size);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum {bufSize = 1024};
|
enum {bufSize = 1024};
|
||||||
|
|
||||||
|
6
Trie.cpp
6
Trie.cpp
@ -183,7 +183,7 @@ namespace CppJieba
|
|||||||
for(int i = 0; i < len; i++)
|
for(int i = 0; i < len; i++)
|
||||||
{
|
{
|
||||||
ChUnicode chWord = chUniStr[i];
|
ChUnicode chWord = chUniStr[i];
|
||||||
TrieNodeHashMap::const_iterator iter = p->hmap.find(chWord);
|
TrieNodeMap::const_iterator iter = p->hmap.find(chWord);
|
||||||
if(iter != p->hmap.end())
|
if(iter != p->hmap.end())
|
||||||
{
|
{
|
||||||
TrieNode * next = iter->second;
|
TrieNode * next = iter->second;
|
||||||
@ -240,7 +240,7 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool Trie::_destroyNode(TrieNode* node)
|
bool Trie::_destroyNode(TrieNode* node)
|
||||||
{
|
{
|
||||||
for(TrieNodeHashMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
|
for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
|
||||||
{
|
{
|
||||||
TrieNode* next = it->second;
|
TrieNode* next = it->second;
|
||||||
_destroyNode(next);
|
_destroyNode(next);
|
||||||
@ -257,7 +257,7 @@ namespace CppJieba
|
|||||||
LogError("failed! node is null.");
|
LogError("failed! node is null.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for(TrieNodeHashMap::const_iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
|
for(TrieNodeMap::const_iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
|
||||||
{
|
{
|
||||||
char utfBuf[8];
|
char utfBuf[8];
|
||||||
ChUnicode chBuf[1];
|
ChUnicode chBuf[1];
|
||||||
|
6
Trie.h
6
Trie.h
@ -19,9 +19,7 @@ namespace CppJieba
|
|||||||
using namespace CPPCOMMON;
|
using namespace CPPCOMMON;
|
||||||
using namespace std;
|
using namespace std;
|
||||||
//using __gnu_cxx::hash_map;
|
//using __gnu_cxx::hash_map;
|
||||||
typedef uint16_t ChUnicode;
|
typedef map<ChUnicode, struct TrieNode*> TrieNodeMap;
|
||||||
const size_t ChUniMaxLen = 1024;
|
|
||||||
typedef map<ChUnicode, struct TrieNode*> TrieNodeHashMap;
|
|
||||||
|
|
||||||
struct TrieNodeInfo
|
struct TrieNodeInfo
|
||||||
{
|
{
|
||||||
@ -36,7 +34,7 @@ namespace CppJieba
|
|||||||
|
|
||||||
struct TrieNode
|
struct TrieNode
|
||||||
{
|
{
|
||||||
TrieNodeHashMap hmap;
|
TrieNodeMap hmap;
|
||||||
bool isLeaf;
|
bool isLeaf;
|
||||||
unsigned int nodeInfoVecPos;
|
unsigned int nodeInfoVecPos;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user