add findMaxMatch and cutMM

This commit is contained in:
gwdwyy 2013-07-05 17:22:01 +08:00
parent 0200ff1d01
commit f2c5f571f2
4 changed files with 83 additions and 3 deletions

View File

@ -20,6 +20,54 @@ namespace CppJieba
return _trie.destroy();
}
/**
 * Forward maximum-matching segmentation.
 *
 * Converts chStr to a ChUnicode buffer via utf8ToUnicode, then scans it left
 * to right. At each position it asks _trie.findMaxMatch for the longest
 * match starting there, converts that span back with unicodeToUtf8 and
 * appends it to res.
 *
 * @param chStr  input text (passed to utf8ToUnicode, so presumably UTF-8).
 * @param res    output vector; cleared first, then filled with the matched
 *               pieces in left-to-right order.
 * @return false if the conversion fails, the input leaves too little room in
 *         the fixed-size buffer, or converting a match back fails;
 *         true otherwise.
 *
 * NOTE(review): positions where the trie reports no match are skipped without
 * emitting anything, so such characters are dropped from res. Confirm this is
 * the intended behaviour.
 */
bool Segment::cutMM(const string& chStr, vector<string>& res)
{
    res.clear();

    char logBuf[bufSize];
    char utfBuf[bufSize];
    ChUnicode uniStr[bufSize];
    memset(uniStr, 0, sizeof(uniStr));

    const size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr);
    if(0 == len)
    {
        // snprintf: chStr is caller-controlled and may be longer than logBuf.
        snprintf(logBuf, sizeof(logBuf), "utf8ToUnicode [%s] failed!", chStr.c_str());
        LogError(logBuf);
        return false;
    }

    // Compare against the element capacity of uniStr, not its size in bytes.
    // Written as an addition so the unsigned arithmetic cannot wrap.
    // NOTE(review): this check runs after utf8ToUnicode has already written to
    // uniStr; whether that function bounds its writes is not visible here.
    const size_t capacity = sizeof(uniStr) / sizeof(uniStr[0]);
    if(len + 5 >= capacity)
    {
        snprintf(logBuf, sizeof(logBuf), "%s too long!", chStr.c_str());
        LogError(logBuf);
        return false;
    }

    size_t i = 0;
    while(i < len)
    {
        const int pos = _trie.findMaxMatch(uniStr + i, len - i);
        if(pos > 0)
        {
            const int utfLen = unicodeToUtf8(uniStr + i, pos, utfBuf);
            if(0 == utfLen)
            {
                LogError("unicodeToUtf8 failed!");
                return false;
            }
            // Assumes unicodeToUtf8 leaves utfBuf NUL-terminated - TODO confirm.
            res.push_back(utfBuf);
            i += static_cast<size_t>(pos);
        }
        else
        {
            // No match starts here: advance by one unit.
            i++;
        }
    }
    return true;
}
bool Segment::cutRMM(const string& chStr, vector<string>& res)
{
res.clear();
@ -43,7 +91,6 @@ namespace CppJieba
return false;
}
int i = len - 1;
while(i >= 0)
{
@ -85,7 +132,7 @@ int main()
segment.init("dict.utf8");
vector<string> res;
string title = "我来到北京清华大学3D电视";
bool flag = segment.cutRMM(title, res);
bool flag = segment.cutMM(title, res);
if(flag)
{
for(int i = 0; i < res.size(); i++)

View File

@ -16,6 +16,7 @@ namespace CppJieba
bool init(const char* const dictFilePath);
bool destroy();
public:
bool cutMM(const string& chStr, vector<string>& res);
bool cutRMM(const string& chStr, vector<string>& res);
private:

View File

@ -75,6 +75,7 @@ namespace CppJieba
bool Trie::find(const ChUnicode* chUniStr, size_t len)
{
int res = -1;
TrieNode* p = _root;
for(size_t i = 0; i < len; i++)
{
@ -91,6 +92,7 @@ namespace CppJieba
return p->isLeaf;
}
/*
bool Trie::find(const vector<ChUnicode>& uniVec)
{
TrieNode * p = _root;
@ -108,6 +110,33 @@ namespace CppJieba
}
return p->isLeaf;
}
*/
/**
 * Finds the longest prefix of chUniStr[0..len) that ends on a trie node whose
 * isLeaf flag is set.
 *
 * Walks down from _root one unit at a time and stops at the first unit with
 * no child in the current node's hmap. Each time the node just entered has
 * isLeaf set, the prefix length consumed so far is recorded.
 *
 * @param chUniStr  pointer to the first unit to match.
 * @param len       number of units available at chUniStr.
 * @return length (in units) of the longest such prefix, or -1 if none exists
 *         (including when len is 0).
 */
int Trie::findMaxMatch(const ChUnicode* chUniStr, size_t len)
{
    int res = -1;
    TrieNode* p = _root;
    for(size_t i = 0; i < len; i++)
    {
        TrieNodeHashMap::const_iterator iter = p->hmap.find(chUniStr[i]);
        if(iter == p->hmap.end())
        {
            // No continuation in the trie: keep the best match found so far.
            break;
        }
        p = iter->second;
        if(p->isLeaf)
        {
            res = static_cast<int>(i + 1);
        }
    }
    return res;
}
bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res)
{

5
Trie.h
View File

@ -95,8 +95,11 @@ namespace CppJieba
bool init(const char* const filepath = DICT_FILE_PATH);
bool destroy();
void display();
public:
bool find(const ChUnicode* chUniStr, size_t len);
bool find(const vector<ChUnicode>& uniVec);
//bool find(const vector<ChUnicode>& uniVec);
int findMaxMatch(const ChUnicode* chUniStr, size_t len);
public:
bool cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res);