Major refactoring: replace the string `word` field with Unicode in structs.h and other files

This commit is contained in:
wyy 2013-09-09 14:22:25 +08:00
parent 70f12f2c97
commit e8a98d4e4d
11 changed files with 338 additions and 280 deletions

View File

@ -8,7 +8,7 @@ using namespace CppJieba;
void testKeyWordExt(const char * dictPath, const char * filePath) void testKeyWordExt(const char * dictPath, const char * filePath)
{ {
KeyWordExt ext; KeyWordExt ext;
if(!ext.init(dictPath, "../dicts/stopwords.gbk.v1.0")) if(!ext.init(dictPath))
{ {
return; return;
} }

View File

@ -32,7 +32,7 @@ void cut(const char * const filePath)
{ {
if(!line.empty()) if(!line.empty())
{ {
seg.cutDAG(line, res); seg.cut(line, res);
cout<<line<<"\n"<<joinStr(res,"/")<<endl; cout<<line<<"\n"<<joinStr(res,"/")<<endl;
} }
} }
@ -60,7 +60,7 @@ void cutAll(const char* const filePath)
string line; string line;
while(getline(ifs, line)) while(getline(ifs, line))
{ {
seg.cutDAG(line, res); seg.cut(line, res);
} }
} }

View File

@ -16,7 +16,7 @@ namespace CppJieba
{ {
} }
bool KeyWordExt::init(const char* const segDictFile, const char* const stopWordDictFile) bool KeyWordExt::init(const char* const segDictFile)
{ {
LogInfo("KeyWordExt init start ..."); LogInfo("KeyWordExt init start ...");
if(!_segment.init(segDictFile)) if(!_segment.init(segDictFile))
@ -24,40 +24,34 @@ namespace CppJieba
LogError("_segment.init failed."); LogError("_segment.init failed.");
return false; return false;
} }
if(!_loadStopWords(stopWordDictFile))
{
LogError("_loadStopWords failed.");
return false;
}
LogInfo("KeyWordExt init OK.");
return true; return true;
} }
// _loadPriorSubWords: reads the file at `filePath` line by line and appends
// each line to _priorSubWords.
// Returns false (after logging an error) when checkFileExist(filePath) fails
// or when _priorSubWords is already non-empty; returns true otherwise.
// NOTE(review): every line of this function repeats its statement behind `//`,
// i.e. this change comments the whole function out.
bool KeyWordExt::_loadPriorSubWords(const char * const filePath) //bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
{ //{
LogInfo(string_format("_loadPriorSubWords(%s) start", filePath)); // LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
if(!checkFileExist(filePath)) // if(!checkFileExist(filePath))
{ // {
LogError(string_format("cann't find file[%s].",filePath)); // LogError(string_format("cann't find file[%s].",filePath));
return false; // return false;
} // }
// Loading twice is rejected instead of appending to the existing list.
if(!_priorSubWords.empty()) // if(!_priorSubWords.empty())
{ // {
LogError("_priorSubWords has been initted before"); // LogError("_priorSubWords has been initted before");
return false; // return false;
} // }
ifstream infile(filePath); // ifstream infile(filePath);
string subword; // string subword;
// One list entry per input line, stored exactly as getline returns it.
while(getline(infile, subword)) // while(getline(infile, subword))
{ // {
_priorSubWords.push_back(subword); // _priorSubWords.push_back(subword);
} // }
LogInfo(string_format("_loadPriorSubWords(%s) end", filePath)); // LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
infile.close(); // infile.close();
return true; // return true;
} //}
bool KeyWordExt::_loadStopWords(const char * const filePath) bool KeyWordExt::loadStopWords(const char * const filePath)
{ {
LogInfo(string_format("_loadStopWords(%s) start", filePath)); LogInfo(string_format("_loadStopWords(%s) start", filePath));
@ -74,9 +68,15 @@ namespace CppJieba
ifstream ifile(filePath); ifstream ifile(filePath);
string line; string line;
Unicode word;
while(getline(ifile, line)) while(getline(ifile, line))
{ {
_stopWords.insert(line); if(!TransCode::strToVec(line, word))
{
LogError("strToVec failed .");
return false;
}
_stopWords.insert(word);
} }
LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size())); LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size()));
@ -100,12 +100,7 @@ namespace CppJieba
{ {
KeyWordInfo& wInfo = wordInfos[i]; KeyWordInfo& wInfo = wordInfos[i];
wInfo.idf = - wInfo.logFreq; wInfo.idf = - wInfo.logFreq;
if(0 == wInfo.wLen) wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf;
{
LogFatal("wLen is 0!");
return false;
}
wInfo.weight = log(double(wInfo.wLen + 1)) * wInfo.idf;
} }
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare); sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
return true; return true;
@ -143,14 +138,16 @@ namespace CppJieba
return false; return false;
} }
#ifdef DEBU
LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str()));
#endif
keyWordInfos.clear(); keyWordInfos.clear();
for(uint i = 0; i < words.size(); i++) for(uint i = 0; i < words.size(); i++)
{ {
keyWordInfos.push_back(words[i]); Unicode uniWord;
if(!TransCode::strToVec(words[i], uniWord))
{
LogError("strToVec failed");
return false;
}
keyWordInfos.push_back(uniWord);
} }
return _extract(keyWordInfos, topN); return _extract(keyWordInfos, topN);
@ -164,7 +161,7 @@ namespace CppJieba
} }
vector<TrieNodeInfo> trieNodeInfos; vector<TrieNodeInfo> trieNodeInfos;
_segment.cutDAG(title, trieNodeInfos); _segment.cut(title, trieNodeInfos);
keyWordInfos.clear(); keyWordInfos.clear();
for(uint i = 0; i < trieNodeInfos.size(); i++) for(uint i = 0; i < trieNodeInfos.size(); i++)
@ -249,7 +246,7 @@ namespace CppJieba
bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos) bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
{ {
set<string> st; set<Unicode> st;
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); ) for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{ {
if(st.find(it->word) != st.end()) if(st.find(it->word) != st.end())
@ -271,7 +268,7 @@ namespace CppJieba
{ {
// filter single word // filter single word
if(1 == it->wLen) if(1 == it->word.size())
{ {
it = wordInfos.erase(it); it = wordInfos.erase(it);
} }
@ -285,27 +282,15 @@ namespace CppJieba
bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos) bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
{ {
vector<string> tmp ; vector<Unicode> tmp ;
for(uint i = 0; i < wordInfos.size(); i++) for(uint i = 0; i < wordInfos.size(); i++)
{ {
tmp.push_back(wordInfos[i].word); tmp.push_back(wordInfos[i].word);
} }
set<string> subs;
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++)
{
for(uint j = 0; j < tmp.size(); j++)
{
if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0))
{
subs.insert(it->word);
}
}
}
//erase subs from strs
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); ) for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{ {
if(subs.end() != subs.find(it->word)) if(_isSubIn(tmp, it->word))
{ {
it = wordInfos.erase(it); it = wordInfos.erase(it);
} }
@ -314,50 +299,51 @@ namespace CppJieba
it++; it++;
} }
} }
return true; return true;
} }
// _isContainSubWords: returns true when at least one entry of _priorSubWords
// occurs as a substring of `word`, false otherwise.
// NOTE(review): every line repeats its statement behind `//`, i.e. this change
// comments the function out.
bool KeyWordExt::_isContainSubWords(const string& word) //bool KeyWordExt::_isContainSubWords(const string& word)
{ //{
for(uint i = 0; i < _priorSubWords.size(); i++) // for(uint i = 0; i < _priorSubWords.size(); i++)
{ // {
// string::find returns npos when the entry is absent from `word`.
if(string::npos != word.find(_priorSubWords[i])) // if(string::npos != word.find(_priorSubWords[i]))
{ // {
return true; // return true;
} // }
} // }
return false; // return false;
} //}
// _prioritizeSubWords: moves the first element of `wordInfos` for which
// _isContainSubWords(it->word) is true to the front of the vector.
// Vectors with fewer than two elements are left untouched. Always returns true.
// NOTE(review): every line repeats its statement behind `//`, i.e. this change
// comments the function out.
bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos) //bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
{ //{
if(2 > wordInfos.size()) // if(2 > wordInfos.size())
{ // {
return true; // return true;
} // }
KeyWordInfo prior; // KeyWordInfo prior;
bool flag = false; // bool flag = false;
// Find the first matching element, remember a copy of it and erase it.
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); ) // for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{ // {
if(_isContainSubWords(it->word)) // if(_isContainSubWords(it->word))
{ // {
prior = *it; // prior = *it;
it = wordInfos.erase(it); // it = wordInfos.erase(it);
flag = true; // flag = true;
break; // break;
} // }
else // else
{ // {
it ++; // it ++;
} // }
} // }
// Re-insert the remembered element at the front only if one was found.
if(flag) // if(flag)
{ // {
wordInfos.insert(wordInfos.begin(), prior); // wordInfos.insert(wordInfos.begin(), prior);
} // }
return true; // return true;
} //}
} }
@ -375,12 +361,6 @@ int main()
} }
ext._loadStopWords("../dicts/stopwords.gbk.v1.0"); ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
if(!ext._loadPriorSubWords("../dicts/prior.gbk"))
{
cerr<<"err"<<endl;
return 1;
}
ifstream ifile("testtitle.gbk"); ifstream ifile("testtitle.gbk");
vector<string> res; vector<string> res;
string line; string line;

View File

@ -15,17 +15,16 @@ namespace CppJieba
{ {
private: private:
MPSegment _segment; MPSegment _segment;
vector<string> _priorSubWords; //vector<string> _priorSubWords;
set<string> _stopWords; set<Unicode> _stopWords;
public: public:
KeyWordExt(); KeyWordExt();
~KeyWordExt(); ~KeyWordExt();
bool init(const char* const segDictFile, const char* const stopWordDictFile); bool init(const char* const segDictFile);
bool dispose(); bool dispose();
bool loadStopWords(const char * const filePath);
private: private:
bool _loadStopWords(const char * const filePath); //bool _loadPriorSubWords(const char * const filePath);
bool _loadPriorSubWords(const char * const filePath);
public: public:
@ -46,8 +45,20 @@ namespace CppJieba
bool _filterSubstr(vector<KeyWordInfo>& ); bool _filterSubstr(vector<KeyWordInfo>& );
bool _filterStopWords(vector<KeyWordInfo>& ); bool _filterStopWords(vector<KeyWordInfo>& );
private: private:
// _isSubIn (added by this change; the two removed declarations share its first
// two lines): reports whether `word` occurs as a contiguous sub-sequence of
// some entry of `words` that is not equal to `word` itself.
// Returns true on the first such entry, false when none exists.
// NOTE(review): `search` is presumably std::search brought in by a
// using-directive — confirm. With an empty `word`, std::search returns
// words[j].begin(), so an empty word matches every non-empty entry.
bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos); inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
bool _isContainSubWords(const string& word); {
for(uint j = 0; j < words.size(); j++)
{
// Entries equal to `word` are skipped so a word never counts as contained in itself.
if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
{
return true;
}
}
return false;
}
//bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
//bool _isContainSubWords(const string& word);
}; };

View File

@ -36,22 +36,22 @@ namespace CppJieba
return _trie.dispose(); return _trie.dispose();
} }
// String-returning segmentation entry point (renamed from cutDAG to cut by
// this change). Segments `str` through the vector<TrieNodeInfo> overload and
// stores one string per segment in `res`.
// Returns false, leaving `res` unmodified, when that overload fails; otherwise
// `res` is cleared, refilled and true is returned.
bool MPSegment::cutDAG(const string& str, vector<string>& res) bool MPSegment::cut(const string& str, vector<string>& res)
{ {
vector<TrieNodeInfo> segWordInfos; vector<TrieNodeInfo> segWordInfos;
if(!cutDAG(str, segWordInfos)) if(!cut(str, segWordInfos))
{ {
return false; return false;
} }
res.clear(); res.clear();
for(uint i = 0; i < segWordInfos.size(); i++) for(uint i = 0; i < segWordInfos.size(); i++)
{ {
// Post-change column: `word` is now a Unicode vector, so it is converted back
// to a string with TransCode::vecToStr before being stored.
res.push_back(segWordInfos[i].word); res.push_back(TransCode::vecToStr(segWordInfos[i].word.begin(), segWordInfos[i].word.end()));
} }
return true; return true;
} }
bool MPSegment::cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos) bool MPSegment::cut(const string& str, vector<TrieNodeInfo>& segWordInfos)
{ {
if(str.empty()) if(str.empty())
{ {
@ -59,13 +59,19 @@ namespace CppJieba
} }
segWordInfos.clear(); segWordInfos.clear();
SegmentContext segContext; SegmentContext segContext;
Unicode sentence;
if(!TransCode::strToVec(str, segContext.uintVec)) if(!TransCode::strToVec(str, sentence))
{ {
LogError("TransCode::strToVec failed."); LogError("TransCode::strToVec failed.");
return false; return false;
} }
for(uint i = 0; i < sentence.size(); i++)
{
segContext.push_back(SegmentChar(sentence[i]));
}
//calc DAG //calc DAG
if(!_calcDAG(segContext)) if(!_calcDAG(segContext))
{ {
@ -79,9 +85,9 @@ namespace CppJieba
return false; return false;
} }
if(!_cutDAG(segContext, segWordInfos)) if(!_cut(segContext, segWordInfos))
{ {
LogError("_cutDAG failed."); LogError("_cut failed.");
return false; return false;
} }
@ -90,112 +96,151 @@ namespace CppJieba
// _calcDAG: fills the per-position `dag` maps of `segContext`.
// Post-change column: for every start index i, the characters i..j are
// accumulated into `unicode` and looked up with _trie.find(unicode); each hit
// is stored as segContext[i].dag[j] = pInfo (j is the index of the last
// character of the matched word). If no entry for j == i was stored, a NULL
// entry is added so every position has at least a single-character edge.
// Returns false (after logging) for an empty context, true otherwise.
// The pre-change column built a vector of (end index, info) pairs instead and
// started its inner loop one character later; it is kept as a comment block
// after the final return.
bool MPSegment::_calcDAG(SegmentContext& segContext) bool MPSegment::_calcDAG(SegmentContext& segContext)
{ {
if(segContext.uintVec.empty()) if(segContext.empty())
{ {
LogError("segContext empty.");
return false; return false;
} }
vector<pair<uint, const TrieNodeInfo*> > vec;
Unicode::const_iterator beginIter = segContext.uintVec.begin(); Unicode unicode;
for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++) for(uint i = 0; i < segContext.size(); i++)
{ {
vec.clear(); unicode.clear();
vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL)); for(uint j = i ; j < segContext.size(); j++)
for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
{ {
//care: the iterJ exceed iterEnd unicode.push_back(segContext[j].uniCh);
const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1); const TrieNodeInfo* pInfo = _trie.find(unicode);
if(NULL != ptNodeInfo) if(pInfo)
{ {
vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo)); segContext[i].dag[j] = pInfo;
} }
} }
// Post-change column: guarantee a fallback edge i -> i when the single
// character at i was not found in the trie.
segContext.dag.push_back(vec); if(segContext[i].dag.end() == segContext[i].dag.find(i))
{
segContext[i].dag[i] = NULL;
}
} }
return true; return true;
//vector<pair<uint, const TrieNodeInfo*> > vec;
//Unicode::const_iterator beginIter = segContext.uintVec.begin();
//for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
//{
// vec.clear();
// vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
// for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
// {
// //care: the iterJ exceed iterEnd
// const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
// if(NULL != ptNodeInfo)
// {
// vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
// }
// }
// segContext.dag.push_back(vec);
//}
//return true;
} }
// _calcDP: chooses, for every position, the best-scoring edge out of the DAG.
// Post-change column: positions are visited from last to first. For each
// (nextPos, p) entry of segContext[i].dag the candidate score is
//   weight of position nextPos + 1 (0.0 when nextPos is the last position)
//   + p->logFreq, or _trie.getMinLogFreq() when p is NULL.
// The entry with the strictly greatest score is kept in segContext[i].pInfo /
// segContext[i].weight, which start at NULL / MIN_DOUBLE.
// Returns false (after logging) for an empty context, true otherwise.
// NOTE(review): DagType is an unordered_map, so the visiting order of equal
// scores — and therefore which of several tied entries wins — is unspecified.
// The pre-change column used a separate `dp` vector with one extra sentinel
// slot; it is kept as a comment block after the final return.
bool MPSegment::_calcDP(SegmentContext& segContext) bool MPSegment::_calcDP(SegmentContext& segContext)
{ {
if(segContext.uintVec.empty()) if(segContext.empty())
{ {
LogError("uintVec illegal"); LogError("segContext empty");
return false; return false;
} }
if(segContext.uintVec.size() != segContext.dag.size()) for(int i = segContext.size() - 1; i >= 0; i--)
{ {
LogError("dag is illegal!"); segContext[i].pInfo = NULL;
return false; segContext[i].weight = MIN_DOUBLE;
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
{
uint nextPos = it->first;
const TrieNodeInfo* p = it->second;
double val = 0.0;
// The score of the remainder is only added when the word does not end the sentence.
if(nextPos + 1 < segContext.size())
{
val += segContext[nextPos + 1].weight;
} }
segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0)); if(p)
segContext.dp[segContext.uintVec.size()].first = NULL;
segContext.dp[segContext.uintVec.size()].second = 0.0;
for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
{ {
// calc max val += p->logFreq;
segContext.dp[i].first = NULL;
segContext.dp[i].second = MIN_DOUBLE;
for(uint j = 0; j < segContext.dag[i].size(); j++)
{
const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
int pos = p.first;
double val = segContext.dp[pos+1].second;
if(NULL != p.second)
{
val += (p.second)->logFreq;
} }
else else
{ {
val += _trie.getMinLogFreq(); val += _trie.getMinLogFreq();
} }
if(val > segContext[i].weight)
if(val > segContext.dp[i].second)
{ {
segContext.dp[i].first = p.second; segContext[i].pInfo = p;
segContext.dp[i].second = val; segContext[i].weight = val;
} }
} }
} }
segContext.dp.pop_back();
return true; return true;
//segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
//segContext.dp[segContext.uintVec.size()].first = NULL;
//segContext.dp[segContext.uintVec.size()].second = 0.0;
//for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
//{
// // calc max
// segContext.dp[i].first = NULL;
// segContext.dp[i].second = MIN_DOUBLE;
// for(uint j = 0; j < segContext.dag[i].size(); j++)
// {
// const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
// int pos = p.first;
// double val = segContext.dp[pos+1].second;
// if(NULL != p.second)
// {
// val += (p.second)->logFreq;
// }
// else
// {
// val += _trie.getMinLogFreq();
// }
// if(val > segContext.dp[i].second)
// {
// segContext.dp[i].first = p.second;
// segContext.dp[i].second = val;
// }
// }
//}
//segContext.dp.pop_back();
//return true;
} }
// _cut (renamed from _cutDAG by this change): walks `segContext` from the first
// position and appends one TrieNodeInfo per chosen segment to `res`, which is
// cleared first.
// Post-change column: when segContext[i].pInfo is set, a copy of that entry is
// appended and i advances by p->word.size(); otherwise a one-character entry
// (freq 0, logFreq = _trie.getMinLogFreq()) is built from segContext[i].uniCh
// and i advances by one. Always returns true.
// NOTE(review): the pre-change column returned false when p->wLen was 0; the
// post-change column has no equivalent check, so an entry with an empty `word`
// would stop the loop from advancing — confirm the trie never holds one.
bool MPSegment::_cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res) bool MPSegment::_cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)
{ {
if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size()) //if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
{ //{
LogError("dp or uintVec illegal!"); // LogFatal("dp or uintVec illegal!");
return false; // return false;
} //}
res.clear(); res.clear();
Unicode::const_iterator iterBegin = segContext.uintVec.begin();
uint i = 0; uint i = 0;
while(i < segContext.dp.size()) while(i < segContext.size())
{ {
const TrieNodeInfo* p = segContext.dp[i].first; const TrieNodeInfo* p = segContext[i].pInfo;
// The branch order is swapped between the columns: the pre-change side tests
// for NULL first, the post-change side tests for a dictionary hit first.
if(NULL == p) if(p)
{
res.push_back(*p);
i += p->word.size();
}
else//single chinese word
{ {
TrieNodeInfo nodeInfo; TrieNodeInfo nodeInfo;
nodeInfo.word = TransCode::vecToStr(iterBegin + i, iterBegin + i +1); nodeInfo.word.push_back(segContext[i].uniCh);
nodeInfo.wLen = 1;
nodeInfo.freq = 0; nodeInfo.freq = 0;
nodeInfo.logFreq = _trie.getMinLogFreq(); nodeInfo.logFreq = _trie.getMinLogFreq();
res.push_back(nodeInfo); res.push_back(nodeInfo);
i++; i++;
} }
else
{
res.push_back(*p);
if(0 == p->wLen)
{
LogFatal("TrieNodeInfo's wLen is 0!");
return false;
}
i += p->wLen;
}
} }
return true; return true;
} }
@ -223,7 +268,7 @@ int main()
while(getline(ifile, line)) while(getline(ifile, line))
{ {
res.clear(); res.clear();
segment.cutDAG(line, res); segment.cut(line, res);
PRINT_VECTOR(res); PRINT_VECTOR(res);
getchar(); getchar();
} }

View File

@ -13,6 +13,9 @@
namespace CppJieba namespace CppJieba
{ {
typedef vector<SegmentChar> SegmentContext;
class MPSegment class MPSegment
{ {
private: private:
@ -25,15 +28,14 @@ namespace CppJieba
bool init(const char* const filePath); bool init(const char* const filePath);
bool dispose(); bool dispose();
public: public:
bool cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos); bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos);
bool cutDAG(const string& str, vector<string>& res); bool cut(const string& str, vector<string>& res);
private: private:
bool _calcDAG(SegmentContext& segContext); bool _calcDAG(SegmentContext& segContext);
bool _calcDP(SegmentContext& segContext); bool _calcDP(SegmentContext& segContext);
bool _cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res); bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res);
//bool _fill(const string& )
}; };
} }

View File

@ -35,12 +35,12 @@ namespace CppJieba
bool MixSegment::cut(const string& str, vector<string>& res) bool MixSegment::cut(const string& str, vector<string>& res)
{ {
vector<TrieNodeInfo> infos; vector<TrieNodeInfo> infos;
if(!_mpSeg.cutDAG(str, infos)) if(!_mpSeg.cut(str, infos))
{ {
LogError("_mpSeg cutDAG failed."); LogError("_mpSeg cutDAG failed.");
return false; return false;
} }
for(uint = 0; i < infos.size(); i++) for(uint i= 0; i < infos.size(); i++)
{ {
} }

View File

@ -109,9 +109,11 @@ namespace CppJieba
LogError(string_format("line[%s] illegal.", line.c_str())); LogError(string_format("line[%s] illegal.", line.c_str()));
return false; return false;
} }
nodeInfo.word = vecBuf[0]; if(!TransCode::strToVec(vecBuf[0], nodeInfo.word))
{
return false;
}
nodeInfo.freq = atoi(vecBuf[1].c_str()); nodeInfo.freq = atoi(vecBuf[1].c_str());
nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
if(3 == vecBuf.size()) if(3 == vecBuf.size())
{ {
nodeInfo.tag = vecBuf[2]; nodeInfo.tag = vecBuf[2];
@ -193,7 +195,7 @@ namespace CppJieba
return res; return res;
} }
const TrieNodeInfo* Trie::find(const string& str) TrieNodeInfo* Trie::find(const string& str)
{ {
Unicode uintVec; Unicode uintVec;
bool retFlag = TransCode::strToVec(str, uintVec); bool retFlag = TransCode::strToVec(str, uintVec);
@ -204,7 +206,7 @@ namespace CppJieba
return find(uintVec); return find(uintVec);
} }
const TrieNodeInfo* Trie::find(const Unicode& uintVec) TrieNodeInfo* Trie::find(const Unicode& uintVec)
{ {
if(uintVec.empty()) if(uintVec.empty())
{ {
@ -213,7 +215,7 @@ namespace CppJieba
return find(uintVec.begin(), uintVec.end()); return find(uintVec.begin(), uintVec.end());
} }
const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end) TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
{ {
if(!_getInitFlag()) if(!_getInitFlag())
@ -324,16 +326,8 @@ namespace CppJieba
return false; return false;
} }
const string& word = nodeInfo.word;
Unicode uintVec;
bool retFlag = TransCode::strToVec(word, uintVec);
if(!retFlag)
{
LogError("TransCode::strToVec error.");
return false;
}
const Unicode& uintVec = nodeInfo.word;
TrieNode* p = _root; TrieNode* p = _root;
for(uint i = 0; i < uintVec.size(); i++) for(uint i = 0; i < uintVec.size(); i++)
{ {

View File

@ -66,9 +66,9 @@ namespace CppJieba
bool _getInitFlag(); bool _getInitFlag();
public: public:
const TrieNodeInfo* find(const string& str); TrieNodeInfo* find(const string& str);
const TrieNodeInfo* find(const Unicode& uintVec); TrieNodeInfo* find(const Unicode& uintVec);
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end); TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
const TrieNodeInfo* findPrefix(const string& str); const TrieNodeInfo* findPrefix(const string& str);
public: public:

View File

@ -27,7 +27,6 @@ namespace CppJieba
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap; typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
typedef unordered_map<uint16_t, double> EmitProbMap; typedef unordered_map<uint16_t, double> EmitProbMap;
const double MIN_DOUBLE = -3.14e+100; const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100; const double MAX_DOUBLE = 3.14e+100;
} }

View File

@ -4,35 +4,63 @@
#include <limits> #include <limits>
#include "globals.h" #include "globals.h"
#include "Trie.h" #include "Trie.h"
#include "TransCode.h"
namespace CppJieba namespace CppJieba
{ {
// TrieNodeInfo: per-word record holding the word, its frequency, a tag and the
// log frequency.
// This change replaces the `string word` + `size_t wLen` pair with a single
// `Unicode word`; code elsewhere in the change uses word.size() where it used
// wLen before.
struct TrieNodeInfo struct TrieNodeInfo
{ {
string word; //string word;
size_t wLen;// the word's len , not string.length(), //size_t wLen;// the word's len , not string.length(),
Unicode word;
size_t freq; size_t freq;
string tag; string tag;
double logFreq; //logFreq = log(freq/sum(freq)); double logFreq; //logFreq = log(freq/sum(freq));
// Default constructor: zero frequency and zero log frequency.
TrieNodeInfo():wLen(0),freq(0),logFreq(0.0) TrieNodeInfo():freq(0),logFreq(0.0)
{ {
} }
// Copy constructor: member-wise copy of every field.
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), wLen(nodeInfo.wLen), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq) TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
{ {
} }
// Construct from a word alone: frequency 0 and logFreq set to MIN_DOUBLE.
TrieNodeInfo(const string& _word):word(_word),freq(0),logFreq(MIN_DOUBLE) TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
{ {
wLen = TransCode::getWordLength(_word);
} }
}; };
// DagType / SegmentChar (added by this change, replacing the members of the
// former SegmentContext struct shown in the pre-change column).
// DagType maps an index to the TrieNodeInfo found for it; in _calcDAG the key
// is the index of the last character of a word starting at this position and
// the value is NULL for the single-character fallback.
// SegmentChar holds one character (`uniCh`), its outgoing `dag` entries, and
// the entry (`pInfo`) and score (`weight`) selected for it by _calcDP.
struct SegmentContext//: public TrieNodeInfo typedef unordered_map<uint, const TrieNodeInfo*> DagType;
struct SegmentChar
{ {
vector<uint16_t> uintVec; uint16_t uniCh;
vector< vector<pair<uint, const TrieNodeInfo*> > > dag; DagType dag;
vector< pair<const TrieNodeInfo*, double> > dp; const TrieNodeInfo * pInfo;
double weight;
// Starts with no selected entry and a weight of 0.0; `dag` starts empty.
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
{
}
/*const TrieNodeInfo* pInfo;
double weight;
SegmentChar(uint16_t unich, const TrieNodeInfo* p, double w):uniCh(unich), pInfo(p), weight(w)
{
}*/
}; };
/*
struct SegmentContext
{
vector<SegmentChar> context;
bool getDA
};*/
typedef vector<SegmentChar> SegmentContext;
//struct SegmentContext
//{
// vector<SegmentChar> context;
// //vector<uint16_t> uintVec;
// //vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
// //vector< pair<const TrieNodeInfo*, double> > dp;
//};
/* /*
struct SegmentWordInfo: public TrieNodeInfo struct SegmentWordInfo: public TrieNodeInfo
@ -48,7 +76,7 @@ namespace CppJieba
KeyWordInfo():idf(0.0),weight(0.0) KeyWordInfo():idf(0.0),weight(0.0)
{ {
} }
KeyWordInfo(const string& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0) KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
{ {
} }
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo) KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
@ -56,13 +84,12 @@ namespace CppJieba
} }
string toString() const string toString() const
{ {
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf); return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::vecToStr(word.begin(), word.end()).c_str(), weight, idf);
} }
KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo) KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
{ {
word = trieNodeInfo.word; word = trieNodeInfo.word;
freq = trieNodeInfo.freq; freq = trieNodeInfo.freq;
wLen = trieNodeInfo.wLen;
tag = trieNodeInfo.tag; tag = trieNodeInfo.tag;
logFreq = trieNodeInfo.logFreq; logFreq = trieNodeInfo.logFreq;
return *this; return *this;