mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
rewriting Segment.cpp/h
This commit is contained in:
parent
d69411e998
commit
ce4f2521b7
@ -41,10 +41,12 @@ namespace CppJieba
|
||||
|
||||
bool Segment::cutDAG(const string& str, vector<string>& res)
|
||||
{
|
||||
bool retFlag;
|
||||
res.clear();
|
||||
string uniStr = gEncoding.decode(str);
|
||||
if(uniStr.empty())
|
||||
|
||||
bool retFlag;
|
||||
Unicode unicode;
|
||||
retFlag = gEncoding.decode(str, unicode);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError("gEncoding.decode failed.");
|
||||
return false;
|
||||
@ -52,7 +54,7 @@ namespace CppJieba
|
||||
|
||||
//calc DAG
|
||||
vector<vector<uint> > dag;
|
||||
retFlag = _calcDAG(uniStr, dag);
|
||||
retFlag = _calcDAG(unicode, dag);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError("_calcDAG failed.");
|
||||
@ -60,14 +62,14 @@ namespace CppJieba
|
||||
}
|
||||
|
||||
vector<pair<int, double> > dp;
|
||||
retFlag = _calcDP(uniStr, dag, dp);
|
||||
retFlag = _calcDP(unicode, dag, dp);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError("_calcDP failed.");
|
||||
return false;
|
||||
}
|
||||
|
||||
retFlag = _cutDAG(uniStr, dp, res);
|
||||
retFlag = _cutDAG(unicode, dp, res);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError("_cutDAG failed.");
|
||||
@ -77,23 +79,24 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
double Segment::getWordWeight(const string& word)
|
||||
bool Segment::_calcDAG(const Unicode& unicode, vector<vector<uint> >& dag)
|
||||
{
|
||||
return _trie.getWeight(word);
|
||||
}
|
||||
|
||||
bool Segment::_calcDAG(const string& uniStr, vector<vector<uint> >& dag)
|
||||
{
|
||||
for(uint i = 0; i < uniStr.size(); i+=2)
|
||||
if(unicode.empty())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
typedef UnicodeConstIterator UCI;
|
||||
UCI beginIter = unicode.begin();
|
||||
for(UCI iterI = unicode.begin(); iterI != unicode.end(); iterI++)
|
||||
{
|
||||
vector<uint> vec;
|
||||
vec.push_back(i/2);
|
||||
for(uint j = i + 4; j <= uniStr.size(); j+=2)
|
||||
vec.push_back(iterI - beginIter);
|
||||
for(UCI iterJ = iterI + 1; iterJ != unicode.end(); iterJ++)
|
||||
{
|
||||
//cout<<uniStr.substr(i, j - i)<<endl;
|
||||
if(NULL != _trie.find(uniStr.substr(i, j - i)))
|
||||
//care: the iterJ exceed iterEnd
|
||||
if(NULL != _trie.find(iterI, iterJ + 1))
|
||||
{
|
||||
vec.push_back((j - 2)/2);
|
||||
vec.push_back(iterJ - beginIter);
|
||||
}
|
||||
}
|
||||
dag.push_back(vec);
|
||||
@ -101,68 +104,75 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Segment::_calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res)
|
||||
bool Segment::_calcDP(const Unicode& unicode, const vector<vector<uint> >& dag, vector<pair<int, double> >& res)
|
||||
{
|
||||
if(uniStr.size() / 2 != dag.size())
|
||||
if(unicode.empty())
|
||||
{
|
||||
LogError("unicode illegal");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(unicode.size() != dag.size())
|
||||
{
|
||||
LogError("dag is illegal!");
|
||||
return false;
|
||||
}
|
||||
if(uniStr.size() < 2)
|
||||
{
|
||||
LogError("uniStr illegal");
|
||||
return false;
|
||||
}
|
||||
|
||||
res.clear();
|
||||
res.assign(uniStr.size()/2 + 1, pair<int, double>(-1, 0.0));
|
||||
res[uniStr.size()/2].first = -1;
|
||||
res[uniStr.size()/2].second = 0.0;
|
||||
for(int i = uniStr.size() - 2; i >= 0; i-=2)
|
||||
res.assign(unicode.size() + 1, pair<int, double>(-1, 0.0));
|
||||
res[unicode.size()].first = -1;
|
||||
res[unicode.size()].second = 0.0;
|
||||
|
||||
UnicodeConstIterator iterBegin = unicode.begin();
|
||||
|
||||
for(int i = unicode.size() - 1; i >= 0; i--)
|
||||
{
|
||||
// calc max
|
||||
res[i/2].first = -1;
|
||||
res[i/2].second = -(numeric_limits<double>::max());
|
||||
for(int j = 0; j < dag[i/2].size(); j++)
|
||||
res[i].first = -1;
|
||||
res[i].second = -(numeric_limits<double>::max());
|
||||
for(int j = 0; j < dag[i].size(); j++)
|
||||
{
|
||||
//cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl;
|
||||
int pos = dag[i/2][j];
|
||||
double val = getWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
|
||||
int pos = dag[i][j];
|
||||
double val = _trie.getWeight(iterBegin + i, iterBegin + pos + 1) + res[pos + 1].second;
|
||||
//double val = _trie.getWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
|
||||
//cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl;
|
||||
if(val > res[i/2].second)
|
||||
if(val > res[i].second)
|
||||
{
|
||||
res[i/2].first = pos;
|
||||
res[i/2].second = val;
|
||||
res[i].first = pos;
|
||||
res[i].second = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
res.pop_back();
|
||||
return true;
|
||||
}
|
||||
bool Segment::_cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res)
|
||||
bool Segment::_cutDAG(const Unicode& unicode, const vector<pair<int, double> >& dp, vector<string>& res)
|
||||
{
|
||||
if(dp.size() != uniStr.size()/2)
|
||||
if(dp.size() != unicode.size())
|
||||
{
|
||||
LogError("dp or uniStr illegal!");
|
||||
LogError("dp or unicode illegal!");
|
||||
return false;
|
||||
}
|
||||
|
||||
res.clear();
|
||||
|
||||
uint begin = 0;
|
||||
UnicodeConstIterator iterBegin = unicode.begin();
|
||||
for(uint i = 0; i < dp.size(); i++)
|
||||
{
|
||||
//cout<<dp[i].first<<","
|
||||
// <<dp[i].second<<endl;
|
||||
uint end = dp[i].first * 2 + 2;
|
||||
uint end = dp[i].first + 1;
|
||||
if(end <= begin)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
string tmp = unicodeToUtf8(uniStr.substr(begin, end - begin));
|
||||
//string tmp = gEncoding.encode(uniStr.substr(begin, end - begin));
|
||||
string tmp = gEncoding.encode(iterBegin + begin, iterBegin + end);
|
||||
if(tmp.empty())
|
||||
{
|
||||
LogError("unicodeToUtf8 failed.");
|
||||
LogError("gEncoding.encode failed.");
|
||||
return false;
|
||||
}
|
||||
res.push_back(tmp);
|
||||
|
@ -25,12 +25,11 @@ namespace CppJieba
|
||||
bool dispose();
|
||||
public:
|
||||
bool cutDAG(const string& chStr, vector<string>& res);
|
||||
double getWordWeight(const string& word);
|
||||
|
||||
private:
|
||||
bool _calcDAG(const string& uniStr, vector<vector<uint> >& dag);
|
||||
bool _calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res);
|
||||
bool _cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res);
|
||||
bool _calcDAG(const Unicode& unicode, vector<vector<uint> >& dag);
|
||||
bool _calcDP(const Unicode& unicode, const vector<vector<uint> >& dag, vector<pair<int, double> >& res);
|
||||
bool _cutDAG(const Unicode& unicode, const vector<pair<int, double> >& dp, vector<string>& res);
|
||||
|
||||
};
|
||||
}
|
||||
|
49
src/Trie.cpp
49
src/Trie.cpp
@ -156,7 +156,7 @@ namespace CppJieba
|
||||
LogFatal("trie not initted!");
|
||||
return NULL;
|
||||
}
|
||||
vector<uint16_t> unicode;
|
||||
Unicode unicode;
|
||||
|
||||
bool retFlag = gEncoding.decode(str, unicode);
|
||||
if(retFlag)
|
||||
@ -199,7 +199,7 @@ namespace CppJieba
|
||||
|
||||
const TrieNodeInfo* Trie::find(const string& str)
|
||||
{
|
||||
vector<uint16_t> unicode;
|
||||
Unicode unicode;
|
||||
bool retFlag = gEncoding.decode(str, unicode);
|
||||
if(!retFlag)
|
||||
{
|
||||
@ -208,7 +208,16 @@ namespace CppJieba
|
||||
return find(unicode);
|
||||
}
|
||||
|
||||
const TrieNodeInfo* Trie::find(const vector<uint16_t>& unicode)
|
||||
const TrieNodeInfo* Trie::find(const Unicode& unicode)
|
||||
{
|
||||
if(unicode.empty())
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
return find(unicode.begin(), unicode.end());
|
||||
}
|
||||
|
||||
const TrieNodeInfo* Trie::find(UnicodeConstIterator begin, UnicodeConstIterator end)
|
||||
{
|
||||
|
||||
if(!_getInitFlag())
|
||||
@ -216,15 +225,14 @@ namespace CppJieba
|
||||
LogFatal("trie not initted!");
|
||||
return NULL;
|
||||
}
|
||||
if(unicode.empty())
|
||||
if(begin >= end)
|
||||
{
|
||||
LogError("unicode empty");
|
||||
return NULL;
|
||||
}
|
||||
TrieNode* p = _root;
|
||||
for(uint i = 0; i < unicode.size(); i++)
|
||||
for(UnicodeConstIterator it = begin; it != end; it++)
|
||||
{
|
||||
uint16_t chUni = unicode[i];
|
||||
uint16_t chUni = *it;
|
||||
if(p->hmap.find(chUni) == p-> hmap.end())
|
||||
{
|
||||
return NULL;
|
||||
@ -253,8 +261,17 @@ namespace CppJieba
|
||||
double Trie::getWeight(const string& str)
|
||||
{
|
||||
|
||||
vector<uint16_t> unicode;
|
||||
Unicode unicode;
|
||||
gEncoding.decode(str, unicode);
|
||||
return getWeight(unicode);
|
||||
}
|
||||
|
||||
double Trie::getWeight(const Unicode& unicode)
|
||||
{
|
||||
if(unicode.empty())
|
||||
{
|
||||
return getMinWeight();
|
||||
}
|
||||
const TrieNodeInfo * p = find(unicode);
|
||||
if(NULL != p)
|
||||
{
|
||||
@ -264,6 +281,20 @@ namespace CppJieba
|
||||
{
|
||||
return getMinWeight();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
double Trie::getWeight(UnicodeConstIterator begin, UnicodeConstIterator end)
|
||||
{
|
||||
const TrieNodeInfo * p = find(begin, end);
|
||||
if(NULL != p)
|
||||
{
|
||||
return p->weight;
|
||||
}
|
||||
else
|
||||
{
|
||||
return getMinWeight();
|
||||
}
|
||||
}
|
||||
|
||||
double Trie::getMinWeight()
|
||||
@ -299,7 +330,7 @@ namespace CppJieba
|
||||
|
||||
const string& word = nodeInfo.word;
|
||||
|
||||
vector<uint16_t> unicode;
|
||||
Unicode unicode;
|
||||
bool retFlag = gEncoding.decode(word, unicode);
|
||||
if(!retFlag)
|
||||
{
|
||||
|
@ -88,11 +88,14 @@ namespace CppJieba
|
||||
|
||||
public:
|
||||
const TrieNodeInfo* find(const string& str);
|
||||
const TrieNodeInfo* find(const vector<uint16_t>& unicode);
|
||||
const TrieNodeInfo* find(const Unicode& unicode);
|
||||
const TrieNodeInfo* find(UnicodeConstIterator begin, UnicodeConstIterator end);
|
||||
const TrieNodeInfo* findPrefix(const string& str);
|
||||
|
||||
public:
|
||||
double getWeight(const string& str);
|
||||
double getWeight(const Unicode& unicode);
|
||||
double getWeight(UnicodeConstIterator begin, UnicodeConstIterator end);
|
||||
double getMinWeight();
|
||||
|
||||
int64_t getTotalCount();
|
||||
|
@ -38,7 +38,17 @@ namespace CPPCOMMON
|
||||
return true;
|
||||
}
|
||||
|
||||
string UnicodeEncoding::encode(const vector<uint16_t>& unicode)
|
||||
string UnicodeEncoding::encode(UnicodeConstIterator begin, UnicodeConstIterator end)
|
||||
{
|
||||
if(begin >= end)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
Unicode unicode(begin, end);
|
||||
return encode(unicode);
|
||||
}
|
||||
|
||||
string UnicodeEncoding::encode(const Unicode& unicode)
|
||||
{
|
||||
if(unicode.empty())
|
||||
{
|
||||
@ -55,7 +65,7 @@ namespace CPPCOMMON
|
||||
return "";
|
||||
}
|
||||
|
||||
bool UnicodeEncoding::decode(const string& str, vector<uint16_t>& unicode)
|
||||
bool UnicodeEncoding::decode(const string& str, Unicode& unicode)
|
||||
{
|
||||
if(str.empty())
|
||||
{
|
||||
@ -80,7 +90,7 @@ int main()
|
||||
{
|
||||
UnicodeEncoding enc(GBKENC);
|
||||
ifstream ifile("testdata/dict.gbk");
|
||||
vector<uint16_t> unicode;
|
||||
Unicode unicode;
|
||||
string line;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
|
@ -26,8 +26,9 @@ namespace CPPCOMMON
|
||||
~UnicodeEncoding();
|
||||
public:
|
||||
bool setEncoding(const string& enc);
|
||||
string encode(const vector<uint16_t>& unicode);
|
||||
bool decode(const string& str, vector<uint16_t>& unicode);
|
||||
string encode(const Unicode& unicode);
|
||||
string encode(UnicodeConstIterator begin, UnicodeConstIterator end);
|
||||
bool decode(const string& str, Unicode& unicode);
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -213,7 +213,7 @@ namespace CPPCOMMON
|
||||
return res;
|
||||
}
|
||||
|
||||
string unicodeToUtf8(const vector<uint16_t>& unicode)
|
||||
string unicodeToUtf8(const Unicode& unicode)
|
||||
{
|
||||
if(unicode.empty())
|
||||
{
|
||||
@ -283,7 +283,7 @@ namespace CPPCOMMON
|
||||
return length;
|
||||
}
|
||||
|
||||
bool utf8ToUnicode(const string& utfStr, vector<uint16_t>& unicode)
|
||||
bool utf8ToUnicode(const string& utfStr, Unicode& unicode)
|
||||
{
|
||||
unicode.clear();
|
||||
if(utfStr.empty())
|
||||
@ -376,7 +376,7 @@ namespace CPPCOMMON
|
||||
}
|
||||
|
||||
//unicode str to vec
|
||||
bool uniStrToVec(const string& str, vector<uint16_t>& vec)
|
||||
bool uniStrToVec(const string& str, Unicode& vec)
|
||||
{
|
||||
vec.clear();
|
||||
if(str.empty() || str.size() % 2)
|
||||
@ -392,7 +392,7 @@ namespace CPPCOMMON
|
||||
}
|
||||
|
||||
//unicode vec to str
|
||||
string uniVecToStr(const vector<uint16_t>& vec)
|
||||
string uniVecToStr(const Unicode& vec)
|
||||
{
|
||||
string res("");
|
||||
for(uint i = 0; i < vec.size(); i++)
|
||||
@ -451,7 +451,7 @@ int main()
|
||||
//cout<<string_format("hehe%s11asd%dasf","[here]",2);
|
||||
ifstream ifile("testdata/dict.gbk");
|
||||
string line;
|
||||
vector<uint16_t> unicode;
|
||||
Unicode unicode;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
cout<<line<<endl;
|
||||
|
@ -38,16 +38,16 @@ namespace CPPCOMMON
|
||||
|
||||
//encode
|
||||
size_t unicodeToUtf8(uint16_t *in, size_t len, char * out);
|
||||
string unicodeToUtf8(const vector<uint16_t>& unicode);
|
||||
string unicodeToUtf8(const Unicode& unicode);
|
||||
int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode);
|
||||
bool utf8ToUnicode(const string& utfStr, vector<uint16_t>& unicode);
|
||||
bool utf8ToUnicode(const string& utfStr, Unicode& unicode);
|
||||
|
||||
int code_convert(const char *from_charset,const char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen);
|
||||
string gbkToUtf8(const string& gbk);
|
||||
string utf8ToGbk(const string& utf);
|
||||
|
||||
bool uniStrToVec(const string& str, vector<uint16_t>& vec);
|
||||
string uniVecToStr(const vector<uint16_t>& vec);
|
||||
bool uniStrToVec(const string& str, Unicode& vec);
|
||||
string uniVecToStr(const Unicode& vec);
|
||||
|
||||
inline uint16_t twocharToUint16(char high, char low)
|
||||
{
|
||||
@ -62,7 +62,7 @@ namespace CPPCOMMON
|
||||
return res;
|
||||
}
|
||||
|
||||
inline void printUnicode(const vector<uint16_t>& unicode)
|
||||
inline void printUnicode(const Unicode& unicode)
|
||||
{
|
||||
cout<<uniVecToStr(unicode)<<endl;
|
||||
}
|
||||
|
@ -8,6 +8,8 @@
|
||||
namespace CPPCOMMON
|
||||
{
|
||||
typedef unsigned int uint;
|
||||
typedef std::vector<uint16_t> Unicode;
|
||||
typedef std::vector<uint16_t>::const_iterator UnicodeConstIterator;
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user