This commit is contained in:
gwdwyy 2013-07-20 17:39:19 +08:00
parent dc5ebf9988
commit c8ea7610bd
4 changed files with 136 additions and 87 deletions

View File

@ -6,15 +6,19 @@
namespace CppJieba namespace CppJieba
{ {
Segment::Segment():_trie() Segment::Segment()
{ {
_encVec.push_back(Trie::UTF8);
_encVec.push_back(Trie::GBK);
//default encoding : utf8
_encoding = Trie::UTF8;
} }
Segment::~Segment() Segment::~Segment()
{ {
} }
bool Segment::init(const char* const dictFilePath) bool Segment::init(const string& dictFilePath)
{ {
bool retFlag; bool retFlag;
LogInfo(string_format("_trie.init(%s) start...", dictFilePath)); LogInfo(string_format("_trie.init(%s) start...", dictFilePath));
@ -28,11 +32,11 @@ namespace CppJieba
return _trie.destroy(); return _trie.destroy();
} }
bool Segment::cutDAG(const string& chStr, vector<string>& res) bool Segment::cutDAG(const string& str, vector<string>& res)
{ {
bool retFlag; bool retFlag;
res.clear(); res.clear();
string uniStr = _utf8ToUni(chStr); string uniStr = _utf8ToUni(str;
if(uniStr.empty()) if(uniStr.empty())
{ {
LogError("_utf8ToUni failed."); LogError("_utf8ToUni failed.");
@ -79,13 +83,11 @@ namespace CppJieba
return _trie.getWeight(word); return _trie.getWeight(word);
} }
string Segment::_utf8ToUni(const string& utfStr) string Segment::_utf8ToUni(const string& utfStr)
{ {
string uniStr = utf8ToUnicode(utfStr); string uniStr = utf8ToUnicode(utfStr);
if(uniStr.empty()) if(uniStr.empty() || uniStr.size() % 2)
{ {
LogError(string_format("utf8ToUnicode [%s] failed!", utfStr.c_str())); LogError(string_format("utf8ToUnicode [%s] failed!", utfStr.c_str()));
return ""; return "";
@ -101,6 +103,7 @@ namespace CppJieba
vec.push_back(i/2); vec.push_back(i/2);
for(uint j = i + 4; j <= uniStr.size(); j+=2) for(uint j = i + 4; j <= uniStr.size(); j+=2)
{ {
cout<<uniStr.substr(i, j - i)<<endl;
if(NULL != _trie.find(uniStr.substr(i, j - i))) if(NULL != _trie.find(uniStr.substr(i, j - i)))
{ {
vec.push_back((j - 2)/2); vec.push_back((j - 2)/2);
@ -136,7 +139,6 @@ namespace CppJieba
for(int j = 0; j < dag[i/2].size(); j++) for(int j = 0; j < dag[i/2].size(); j++)
{ {
//cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl; //cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl;
//getchar();
int pos = dag[i/2][j]; int pos = dag[i/2][j];
double val = getUniWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second; double val = getUniWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
//cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl; //cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl;
@ -182,7 +184,6 @@ namespace CppJieba
return true; return true;
} }
} }
@ -205,21 +206,25 @@ int main()
res.clear(); res.clear();
segment.cutDAG(title, res); segment.cutDAG(title, res);
PRINT_VECTOR(res); PRINT_VECTOR(res);
getchar();
title = "特价camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; title = "特价camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏";
res.clear(); res.clear();
segment.cutDAG(title, res); segment.cutDAG(title, res);
PRINT_VECTOR(res); PRINT_VECTOR(res);
getchar();
title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女"; title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女";
res.clear(); res.clear();
segment.cutDAG(title, res); segment.cutDAG(title, res);
PRINT_VECTOR(res); PRINT_VECTOR(res);
getchar();
title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女"; title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女";
res.clear(); res.clear();
segment.cutDAG(title, res); segment.cutDAG(title, res);
PRINT_VECTOR(res); PRINT_VECTOR(res);
getchar();
segment.destroy(); segment.destroy();
return 0; return 0;

View File

@ -15,12 +15,14 @@ namespace CppJieba
class Segment class Segment
{ {
private: private:
string _encoding;
vector<string> _encVec;
Trie _trie; Trie _trie;
public: public:
Segment(); Segment();
~Segment(); ~Segment();
public: public:
bool init(const char* const dictFilePath); bool init(const string& dictFilePath);
bool destroy(); bool destroy();
public: public:
bool cutDAG(const string& chStr, vector<string>& res); bool cutDAG(const string& chStr, vector<string>& res);
@ -33,9 +35,6 @@ namespace CppJieba
bool _calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res); bool _calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res);
bool _cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res); bool _cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res);
private:
enum {bufSize = 1024};
}; };
} }

View File

@ -6,6 +6,9 @@
namespace CppJieba namespace CppJieba
{ {
const string& Trie::UTF8 = "utf-8";
const string& Trie::GBK = "gbk";
Trie::iterator Trie::begin() Trie::iterator Trie::begin()
{ {
return _nodeInfoVec.begin(); return _nodeInfoVec.begin();
@ -16,8 +19,16 @@ namespace CppJieba
return _nodeInfoVec.end(); return _nodeInfoVec.end();
} }
Trie::Trie():_root(NULL), _totalCount(0) Trie::Trie()
{ {
//encodings : utf-8, gbk
_encVec.push_back(UTF8);
_encVec.push_back(GBK);
//default encoding : utf-8
_encoding = UTF8;
_root = NULL;
_totalCount = 0;
_minWeight = numeric_limits<double>::max(); _minWeight = numeric_limits<double>::max();
} }
@ -26,11 +37,22 @@ namespace CppJieba
destroy(); destroy();
} }
bool Trie::init(const char* const filePath) bool Trie::setEncoding(const string& enc)
{ {
if(!checkFileExist(filePath)) if(!isInVec<string>(_encVec, enc))
{ {
LogError(string_format("cann't find fiel[%s].",filePath)); LogError(string_format("%s illegal : not in [%s]", enc.c_str(), joinStr(_encVec, ",").c_str()));
return false;
}
_encoding = enc;
return true;
}
bool Trie::init(const string& filePath)
{
if(!checkFileExist(filePath.c_str()))
{
LogError(string_format("cann't find fiel[%s].",filePath.c_str()));
return false; return false;
} }
bool res = false; bool res = false;
@ -49,7 +71,7 @@ namespace CppJieba
return true; return true;
} }
bool Trie::_buildTree(const char* const filePath) bool Trie::_buildTree(const string& filePath)
{ {
if(NULL != _root) if(NULL != _root)
{ {
@ -57,7 +79,8 @@ namespace CppJieba
return false; return false;
} }
_root = new TrieNode; _root = new TrieNode;
ifstream ifile(filePath);
ifstream ifile(filePath.c_str());
string line; string line;
vector<string> vecBuf; vector<string> vecBuf;
while(getline(ifile, line)) while(getline(ifile, line))
@ -86,7 +109,7 @@ namespace CppJieba
nodeInfo.count = count; nodeInfo.count = count;
nodeInfo.tag = tag; nodeInfo.tag = tag;
bool flag = _insert(nodeInfo); bool flag = insert(nodeInfo);
if(!flag) if(!flag)
{ {
LogError("insert node failed!"); LogError("insert node failed!");
@ -178,29 +201,27 @@ namespace CppJieba
return res; return res;
} }
const TrieNodeInfo* Trie::find(const string& uniStr) const TrieNodeInfo* Trie::find(const string& str)
{ {
ChUnicode* pUni = new ChUnicode[uniStr.size()]; string uniStr = decode(str);
for(uint i = 0; i < uniStr.size(); i+=2) return _findUniStr(uniStr);
{
pUni[i/2] = twocharToUint16(uniStr[i], uniStr[i+1]);
}
const TrieNodeInfo* res = find(pUni, uniStr.size()/2);
delete [] pUni;
return res;
} }
const TrieNodeInfo* Trie::find(const ChUnicode* const chUniStr, size_t len) const TrieNodeInfo* Trie::_findUniStr(const string& uniStr)
{ {
if(NULL == _root) if(NULL == _root)
{ {
LogFatal("trie not initted!"); LogFatal("trie not initted!");
return NULL; return NULL;
} }
TrieNode* p = _root; if(uniStr.empty() || uniStr.size() % 2)
for(uint i = 0; i < len; i++)
{ {
ChUnicode chUni = chUniStr[i]; LogError("uniStr illegal");
}
TrieNode* p = _root;
for(uint i = 0; i < uniStr.size(); i+=2)
{
ChUnicode chUni = twocharToUint16(uniStr[i], uniStr[i+1]);
if(p->hmap.find(chUni) == p-> hmap.end()) if(p->hmap.find(chUni) == p-> hmap.end())
{ {
return NULL; return NULL;
@ -226,6 +247,7 @@ namespace CppJieba
return NULL; return NULL;
} }
/*
double Trie::getWeight(const ChUnicode* uniStr, size_t len) double Trie::getWeight(const ChUnicode* uniStr, size_t len)
{ {
const TrieNodeInfo* p = find(uniStr, len); const TrieNodeInfo* p = find(uniStr, len);
@ -238,10 +260,11 @@ namespace CppJieba
return getMinWeight(); return getMinWeight();
} }
} }
*/
double Trie::getWeight(const string& uniStr) double Trie::getWeight(const string& uniStr)
{ {
const TrieNodeInfo * p = find(uniStr); const TrieNodeInfo * p = _findUniStr(uniStr);
if(NULL != p) if(NULL != p)
{ {
return p->weight; return p->weight;
@ -262,29 +285,6 @@ namespace CppJieba
return _totalCount; return _totalCount;
} }
/*
bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res)
{
res.clear();
//cout<<len<<endl;
for(size_t i = 0; i < len; i++)
{
//cout<<__LINE__<<","<<chUniStr[i]<<endl;
res.push_back(vector<size_t>());
vector<size_t>& vec = res[i];
for(size_t j = i; j < len; j++)
{
if(find(chUniStr + i, j - i + 1))
{
vec.push_back(j);
}
}
}
return true;
}
*/
bool Trie::_destroyNode(TrieNode* node) bool Trie::_destroyNode(TrieNode* node)
{ {
for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
@ -297,21 +297,55 @@ namespace CppJieba
return true; return true;
} }
bool Trie::_insert(const TrieNodeInfo& nodeInfo) string Trie::decode(const string& str)
{ {
_nodeInfoVec.push_back(nodeInfo); if(_encoding == UTF8)
const string& word = nodeInfo.word;
ChUnicode chUniStr[bufSize];
memset(chUniStr, 0, sizeof(chUniStr));
size_t len = utf8ToUnicode(word.c_str(), word.size(), chUniStr);
if(0 == len)
{ {
return utf8ToUnicode(str);
}
if(_encoding == GBK)
{
return utf8ToUnicode(gbkToUtf8(str));
}
LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str()));
return "";
}
string Trie::encode(const string& str)
{
if(_encoding == UTF8)
{
return unicodeToUtf8(str);
}
if(_encoding == GBK)
{
return utf8ToGbk(unicodeToUtf8(str));
}
LogFatal(string_format("_encoding[%s] illeage!", _encoding.c_str()));
return "";
}
bool Trie::insert(const TrieNodeInfo& nodeInfo)
{
if(NULL == _root)
{
LogError("_root is NULL");
return false; return false;
} }
const string& word = nodeInfo.word;
string uniStr = decode(word);
if(uniStr.empty() || uniStr.size() % 2)
{
LogError("decode error.");
return false;
}
TrieNode* p = _root; TrieNode* p = _root;
for(int i = 0; i < len; i++) for(uint i = 0; i < uniStr.size(); i+=2)
{ {
ChUnicode cu = chUniStr[i]; ChUnicode cu = twocharToUint16(uniStr[i], uniStr[i+1]);
if(NULL == p) if(NULL == p)
{ {
return false; return false;
@ -327,7 +361,6 @@ namespace CppJieba
{ {
return false; return false;
} }
p->hmap[cu] = next; p->hmap[cu] = next;
p = next; p = next;
} }
@ -340,15 +373,16 @@ namespace CppJieba
{ {
return false; return false;
} }
p->isLeaf = true; if(p->isLeaf)
if(!_nodeInfoVec.empty())
{
p->nodeInfoVecPos = _nodeInfoVec.size() - 1;
}
else
{ {
LogError("this node already inserted");
return false; return false;
} }
p->isLeaf = true;
_nodeInfoVec.push_back(nodeInfo);
p->nodeInfoVecPos = _nodeInfoVec.size() - 1;
return true; return true;
} }
@ -397,7 +431,7 @@ using namespace CppJieba;
int main() int main()
{ {
Trie trie; Trie trie;
trie.init("dicts/segdict.utf8.v2.1"); trie.init("../dicts/segdict.utf8.v2.1");
//trie.init("dicts/jieba.dict.utf8"); //trie.init("dicts/jieba.dict.utf8");
//trie.init("dict.100"); //trie.init("dict.100");
//char utf[1024] = "我来到北京清华大学3D电视"; //char utf[1024] = "我来到北京清华大学3D电视";

View File

@ -29,7 +29,7 @@ namespace CppJieba
struct TrieNodeInfo struct TrieNodeInfo
{ {
string word;// utf8 string word string word;
size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 . size_t wLen;// the word's len , not string.size(), eg: "我是中国人" wLen = 5 .
size_t count; size_t count;
string tag; string tag;
@ -48,17 +48,20 @@ namespace CppJieba
{ {
TrieNodeMap hmap; TrieNodeMap hmap;
bool isLeaf; bool isLeaf;
unsigned int nodeInfoVecPos; uint nodeInfoVecPos;
TrieNode() TrieNode()
:hmap(), isLeaf(false), nodeInfoVecPos(0)
{ {
isLeaf = false;
nodeInfoVecPos = 0;
} }
}; };
class Trie class Trie
{ {
private: private:
string _encoding;
vector<string> _encVec;
TrieNode* _root; TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec; vector<TrieNodeInfo> _nodeInfoVec;
@ -75,30 +78,38 @@ namespace CppJieba
public: public:
Trie(); Trie();
~Trie(); ~Trie();
bool init(const char* const filePath); bool init(const string& filePath);
bool setEncoding(const string& enc);
bool destroy(); bool destroy();
void display(); void display();
public: public:
const TrieNodeInfo* find(const string& uniStr); //const TrieNodeInfo* find(const string& uniStr);
const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len); //const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len);
const TrieNodeInfo* find(const string& str);
const TrieNodeInfo* findPrefix(const string& utf8Str); const TrieNodeInfo* findPrefix(const string& utf8Str);
public: public:
double getWeight(const ChUnicode* uniStr, size_t len); //double getWeight(const ChUnicode* uniStr, size_t len);
double getWeight(const string& uniStr); double getWeight(const string& uniStr);
double getMinWeight(); double getMinWeight();
int64_t getTotalCount(); int64_t getTotalCount();
bool insert(const TrieNodeInfo& nodeInfo);
string decode(const string& str);
string encode(const string& str);
private: private:
bool _buildTree(const char* const filePath); bool _buildTree(const string& filePath);
bool _countWeight(); bool _countWeight();
bool _destroyNode(TrieNode* node); bool _destroyNode(TrieNode* node);
bool _insert(const TrieNodeInfo& nodeInfo); const TrieNodeInfo* _findUniStr(const string& uniStr);
private: public:
enum {bufSize = 1024}; static const string& UTF8;
static const string& GBK;
}; };
} }