Major refactoring: replace string words with Unicode in structs.h and other files

This commit is contained in:
wyy 2013-09-09 14:22:25 +08:00
parent 70f12f2c97
commit e8a98d4e4d
11 changed files with 338 additions and 280 deletions

View File

@ -8,7 +8,7 @@ using namespace CppJieba;
void testKeyWordExt(const char * dictPath, const char * filePath)
{
KeyWordExt ext;
if(!ext.init(dictPath, "../dicts/stopwords.gbk.v1.0"))
if(!ext.init(dictPath))
{
return;
}

View File

@ -32,7 +32,7 @@ void cut(const char * const filePath)
{
if(!line.empty())
{
seg.cutDAG(line, res);
seg.cut(line, res);
cout<<line<<"\n"<<joinStr(res,"/")<<endl;
}
}
@ -60,7 +60,7 @@ void cutAll(const char* const filePath)
string line;
while(getline(ifs, line))
{
seg.cutDAG(line, res);
seg.cut(line, res);
}
}

View File

@ -16,7 +16,7 @@ namespace CppJieba
{
}
bool KeyWordExt::init(const char* const segDictFile, const char* const stopWordDictFile)
bool KeyWordExt::init(const char* const segDictFile)
{
LogInfo("KeyWordExt init start ...");
if(!_segment.init(segDictFile))
@ -24,40 +24,34 @@ namespace CppJieba
LogError("_segment.init failed.");
return false;
}
if(!_loadStopWords(stopWordDictFile))
{
LogError("_loadStopWords failed.");
return false;
}
LogInfo("KeyWordExt init OK.");
return true;
}
bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
{
LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
if(!checkFileExist(filePath))
{
LogError(string_format("cann't find file[%s].",filePath));
return false;
}
if(!_priorSubWords.empty())
{
LogError("_priorSubWords has been initted before");
return false;
}
ifstream infile(filePath);
string subword;
while(getline(infile, subword))
{
_priorSubWords.push_back(subword);
}
LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
infile.close();
return true;
}
//bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
//{
// LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
// if(!checkFileExist(filePath))
// {
// LogError(string_format("cann't find file[%s].",filePath));
// return false;
// }
// if(!_priorSubWords.empty())
// {
// LogError("_priorSubWords has been initted before");
// return false;
// }
// ifstream infile(filePath);
// string subword;
// while(getline(infile, subword))
// {
// _priorSubWords.push_back(subword);
// }
// LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
// infile.close();
// return true;
//}
bool KeyWordExt::_loadStopWords(const char * const filePath)
bool KeyWordExt::loadStopWords(const char * const filePath)
{
LogInfo(string_format("_loadStopWords(%s) start", filePath));
@ -74,9 +68,15 @@ namespace CppJieba
ifstream ifile(filePath);
string line;
Unicode word;
while(getline(ifile, line))
{
_stopWords.insert(line);
if(!TransCode::strToVec(line, word))
{
LogError("strToVec failed .");
return false;
}
_stopWords.insert(word);
}
LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size()));
@ -100,12 +100,7 @@ namespace CppJieba
{
KeyWordInfo& wInfo = wordInfos[i];
wInfo.idf = - wInfo.logFreq;
if(0 == wInfo.wLen)
{
LogFatal("wLen is 0!");
return false;
}
wInfo.weight = log(double(wInfo.wLen + 1)) * wInfo.idf;
wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf;
}
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
return true;
@ -143,14 +138,16 @@ namespace CppJieba
return false;
}
#ifdef DEBU
LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str()));
#endif
keyWordInfos.clear();
for(uint i = 0; i < words.size(); i++)
{
keyWordInfos.push_back(words[i]);
Unicode uniWord;
if(!TransCode::strToVec(words[i], uniWord))
{
LogError("strToVec failed");
return false;
}
keyWordInfos.push_back(uniWord);
}
return _extract(keyWordInfos, topN);
@ -164,7 +161,7 @@ namespace CppJieba
}
vector<TrieNodeInfo> trieNodeInfos;
_segment.cutDAG(title, trieNodeInfos);
_segment.cut(title, trieNodeInfos);
keyWordInfos.clear();
for(uint i = 0; i < trieNodeInfos.size(); i++)
@ -249,7 +246,7 @@ namespace CppJieba
bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
{
set<string> st;
set<Unicode> st;
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{
if(st.find(it->word) != st.end())
@ -271,7 +268,7 @@ namespace CppJieba
{
// filter single word
if(1 == it->wLen)
if(1 == it->word.size())
{
it = wordInfos.erase(it);
}
@ -285,79 +282,68 @@ namespace CppJieba
bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
{
vector<string> tmp ;
vector<Unicode> tmp ;
for(uint i = 0; i < wordInfos.size(); i++)
{
tmp.push_back(wordInfos[i].word);
}
set<string> subs;
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++)
{
for(uint j = 0; j < tmp.size(); j++)
{
if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0))
{
subs.insert(it->word);
}
}
}
//erase subs from strs
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{
if(subs.end() != subs.find(it->word))
{
it = wordInfos.erase(it);
}
else
{
it ++;
}
if(_isSubIn(tmp, it->word))
{
it = wordInfos.erase(it);
}
else
{
it++;
}
}
return true;
}
bool KeyWordExt::_isContainSubWords(const string& word)
{
for(uint i = 0; i < _priorSubWords.size(); i++)
{
if(string::npos != word.find(_priorSubWords[i]))
{
return true;
}
}
return false;
}
//bool KeyWordExt::_isContainSubWords(const string& word)
//{
// for(uint i = 0; i < _priorSubWords.size(); i++)
// {
// if(string::npos != word.find(_priorSubWords[i]))
// {
// return true;
// }
// }
// return false;
//}
bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
{
if(2 > wordInfos.size())
{
return true;
}
//bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
//{
// if(2 > wordInfos.size())
// {
// return true;
// }
KeyWordInfo prior;
bool flag = false;
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{
if(_isContainSubWords(it->word))
{
prior = *it;
it = wordInfos.erase(it);
flag = true;
break;
}
else
{
it ++;
}
}
if(flag)
{
wordInfos.insert(wordInfos.begin(), prior);
}
return true;
}
// KeyWordInfo prior;
// bool flag = false;
// for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
// {
// if(_isContainSubWords(it->word))
// {
// prior = *it;
// it = wordInfos.erase(it);
// flag = true;
// break;
// }
// else
// {
// it ++;
// }
// }
// if(flag)
// {
// wordInfos.insert(wordInfos.begin(), prior);
// }
// return true;
//}
}
@ -375,12 +361,6 @@ int main()
}
ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
if(!ext._loadPriorSubWords("../dicts/prior.gbk"))
{
cerr<<"err"<<endl;
return 1;
}
ifstream ifile("testtitle.gbk");
vector<string> res;
string line;

View File

@ -1,7 +1,7 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
************************************/
#ifndef CPPJIEBA_KEYWORDEXT_H
#define CPPJIEBA_KEYWORDEXT_H
@ -11,45 +11,56 @@
namespace CppJieba
{
class KeyWordExt
{
private:
MPSegment _segment;
vector<string> _priorSubWords;
set<string> _stopWords;
public:
KeyWordExt();
~KeyWordExt();
bool init(const char* const segDictFile, const char* const stopWordDictFile);
bool dispose();
private:
bool _loadStopWords(const char * const filePath);
bool _loadPriorSubWords(const char * const filePath);
class KeyWordExt
{
private:
MPSegment _segment;
//vector<string> _priorSubWords;
set<Unicode> _stopWords;
public:
KeyWordExt();
~KeyWordExt();
bool init(const char* const segDictFile);
bool dispose();
bool loadStopWords(const char * const filePath);
private:
//bool _loadPriorSubWords(const char * const filePath);
public:
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
private:
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
private:
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
private:
//sort by word len - idf
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
private:
bool _filter(vector<KeyWordInfo>& );
bool _filterDuplicate(vector<KeyWordInfo>& );
bool _filterSingleWord(vector<KeyWordInfo>& );
bool _filterSubstr(vector<KeyWordInfo>& );
bool _filterStopWords(vector<KeyWordInfo>& );
private:
bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
bool _isContainSubWords(const string& word);
public:
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
private:
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
private:
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
private:
//sort by word len - idf
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
private:
bool _filter(vector<KeyWordInfo>& );
bool _filterDuplicate(vector<KeyWordInfo>& );
bool _filterSingleWord(vector<KeyWordInfo>& );
bool _filterSubstr(vector<KeyWordInfo>& );
bool _filterStopWords(vector<KeyWordInfo>& );
private:
inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
{
};
for(uint j = 0; j < words.size(); j++)
{
if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
{
return true;
}
}
return false;
}
//bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
//bool _isContainSubWords(const string& word);
};
}

View File

@ -36,22 +36,22 @@ namespace CppJieba
return _trie.dispose();
}
bool MPSegment::cutDAG(const string& str, vector<string>& res)
bool MPSegment::cut(const string& str, vector<string>& res)
{
vector<TrieNodeInfo> segWordInfos;
if(!cutDAG(str, segWordInfos))
if(!cut(str, segWordInfos))
{
return false;
}
res.clear();
for(uint i = 0; i < segWordInfos.size(); i++)
{
res.push_back(segWordInfos[i].word);
res.push_back(TransCode::vecToStr(segWordInfos[i].word.begin(), segWordInfos[i].word.end()));
}
return true;
}
bool MPSegment::cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos)
bool MPSegment::cut(const string& str, vector<TrieNodeInfo>& segWordInfos)
{
if(str.empty())
{
@ -59,13 +59,19 @@ namespace CppJieba
}
segWordInfos.clear();
SegmentContext segContext;
if(!TransCode::strToVec(str, segContext.uintVec))
Unicode sentence;
if(!TransCode::strToVec(str, sentence))
{
LogError("TransCode::strToVec failed.");
return false;
}
for(uint i = 0; i < sentence.size(); i++)
{
segContext.push_back(SegmentChar(sentence[i]));
}
//calc DAG
if(!_calcDAG(segContext))
{
@ -79,9 +85,9 @@ namespace CppJieba
return false;
}
if(!_cutDAG(segContext, segWordInfos))
if(!_cut(segContext, segWordInfos))
{
LogError("_cutDAG failed.");
LogError("_cut failed.");
return false;
}
@ -90,111 +96,150 @@ namespace CppJieba
bool MPSegment::_calcDAG(SegmentContext& segContext)
{
if(segContext.uintVec.empty())
if(segContext.empty())
{
LogError("segContext empty.");
return false;
}
vector<pair<uint, const TrieNodeInfo*> > vec;
Unicode::const_iterator beginIter = segContext.uintVec.begin();
for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
{
vec.clear();
vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
{
//care: the iterJ exceed iterEnd
const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
if(NULL != ptNodeInfo)
{
vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
}
}
segContext.dag.push_back(vec);
}
return true;
Unicode unicode;
for(uint i = 0; i < segContext.size(); i++)
{
unicode.clear();
for(uint j = i ; j < segContext.size(); j++)
{
unicode.push_back(segContext[j].uniCh);
const TrieNodeInfo* pInfo = _trie.find(unicode);
if(pInfo)
{
segContext[i].dag[j] = pInfo;
}
}
if(segContext[i].dag.end() == segContext[i].dag.find(i))
{
segContext[i].dag[i] = NULL;
}
}
return true;
//vector<pair<uint, const TrieNodeInfo*> > vec;
//Unicode::const_iterator beginIter = segContext.uintVec.begin();
//for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
//{
// vec.clear();
// vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
// for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
// {
// //care: the iterJ exceed iterEnd
// const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
// if(NULL != ptNodeInfo)
// {
// vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
// }
// }
// segContext.dag.push_back(vec);
//}
//return true;
}
bool MPSegment::_calcDP(SegmentContext& segContext)
{
if(segContext.uintVec.empty())
if(segContext.empty())
{
LogError("uintVec illegal");
LogError("segContext empty");
return false;
}
for(int i = segContext.size() - 1; i >= 0; i--)
{
segContext[i].pInfo = NULL;
segContext[i].weight = MIN_DOUBLE;
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
{
uint nextPos = it->first;
const TrieNodeInfo* p = it->second;
double val = 0.0;
if(nextPos + 1 < segContext.size())
{
val += segContext[nextPos + 1].weight;
}
if(segContext.uintVec.size() != segContext.dag.size())
{
LogError("dag is illegal!");
return false;
}
segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
segContext.dp[segContext.uintVec.size()].first = NULL;
segContext.dp[segContext.uintVec.size()].second = 0.0;
for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
{
// calc max
segContext.dp[i].first = NULL;
segContext.dp[i].second = MIN_DOUBLE;
for(uint j = 0; j < segContext.dag[i].size(); j++)
{
const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
int pos = p.first;
double val = segContext.dp[pos+1].second;
if(NULL != p.second)
{
val += (p.second)->logFreq;
}
else
{
if(p)
{
val += p->logFreq;
}
else
{
val += _trie.getMinLogFreq();
}
if(val > segContext.dp[i].second)
}
if(val > segContext[i].weight)
{
segContext.dp[i].first = p.second;
segContext.dp[i].second = val;
segContext[i].pInfo = p;
segContext[i].weight = val;
}
}
}
segContext.dp.pop_back();
return true;
}
}
return true;
//segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
//segContext.dp[segContext.uintVec.size()].first = NULL;
//segContext.dp[segContext.uintVec.size()].second = 0.0;
//for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
//{
// // calc max
// segContext.dp[i].first = NULL;
// segContext.dp[i].second = MIN_DOUBLE;
// for(uint j = 0; j < segContext.dag[i].size(); j++)
// {
// const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
// int pos = p.first;
// double val = segContext.dp[pos+1].second;
// if(NULL != p.second)
// {
// val += (p.second)->logFreq;
// }
// else
// {
// val += _trie.getMinLogFreq();
// }
// if(val > segContext.dp[i].second)
// {
// segContext.dp[i].first = p.second;
// segContext.dp[i].second = val;
// }
// }
//}
//segContext.dp.pop_back();
//return true;
}
bool MPSegment::_cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res)
bool MPSegment::_cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)
{
if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
{
LogError("dp or uintVec illegal!");
return false;
}
//if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
//{
// LogFatal("dp or uintVec illegal!");
// return false;
//}
res.clear();
Unicode::const_iterator iterBegin = segContext.uintVec.begin();
uint i = 0;
while(i < segContext.dp.size())
while(i < segContext.size())
{
const TrieNodeInfo* p = segContext.dp[i].first;
if(NULL == p)
const TrieNodeInfo* p = segContext[i].pInfo;
if(p)
{
res.push_back(*p);
i += p->word.size();
}
else//single chinese word
{
TrieNodeInfo nodeInfo;
nodeInfo.word = TransCode::vecToStr(iterBegin + i, iterBegin + i +1);
nodeInfo.wLen = 1;
nodeInfo.word.push_back(segContext[i].uniCh);
nodeInfo.freq = 0;
nodeInfo.logFreq = _trie.getMinLogFreq();
res.push_back(nodeInfo);
i ++;
}
else
{
res.push_back(*p);
if(0 == p->wLen)
{
LogFatal("TrieNodeInfo's wLen is 0!");
return false;
}
i += p->wLen;
i++;
}
}
return true;
@ -223,7 +268,7 @@ int main()
while(getline(ifile, line))
{
res.clear();
segment.cutDAG(line, res);
segment.cut(line, res);
PRINT_VECTOR(res);
getchar();
}

View File

@ -13,6 +13,9 @@
namespace CppJieba
{
typedef vector<SegmentChar> SegmentContext;
class MPSegment
{
private:
@ -25,15 +28,14 @@ namespace CppJieba
bool init(const char* const filePath);
bool dispose();
public:
bool cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos);
bool cutDAG(const string& str, vector<string>& res);
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos);
bool cut(const string& str, vector<string>& res);
private:
bool _calcDAG(SegmentContext& segContext);
bool _calcDP(SegmentContext& segContext);
bool _cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res);
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res);
//bool _fill(const string& )
};
}

View File

@ -35,12 +35,12 @@ namespace CppJieba
bool MixSegment::cut(const string& str, vector<string>& res)
{
vector<TrieNodeInfo> infos;
if(!_mpSeg.cutDAG(str, infos))
if(!_mpSeg.cut(str, infos))
{
LogError("_mpSeg cutDAG failed.");
return false;
}
for(uint = 0; i < infos.size(); i++)
for(uint i= 0; i < infos.size(); i++)
{
}

View File

@ -109,9 +109,11 @@ namespace CppJieba
LogError(string_format("line[%s] illegal.", line.c_str()));
return false;
}
nodeInfo.word = vecBuf[0];
if(!TransCode::strToVec(vecBuf[0], nodeInfo.word))
{
return false;
}
nodeInfo.freq = atoi(vecBuf[1].c_str());
nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
if(3 == vecBuf.size())
{
nodeInfo.tag = vecBuf[2];
@ -193,7 +195,7 @@ namespace CppJieba
return res;
}
const TrieNodeInfo* Trie::find(const string& str)
TrieNodeInfo* Trie::find(const string& str)
{
Unicode uintVec;
bool retFlag = TransCode::strToVec(str, uintVec);
@ -204,7 +206,7 @@ namespace CppJieba
return find(uintVec);
}
const TrieNodeInfo* Trie::find(const Unicode& uintVec)
TrieNodeInfo* Trie::find(const Unicode& uintVec)
{
if(uintVec.empty())
{
@ -213,7 +215,7 @@ namespace CppJieba
return find(uintVec.begin(), uintVec.end());
}
const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
{
if(!_getInitFlag())
@ -324,16 +326,8 @@ namespace CppJieba
return false;
}
const string& word = nodeInfo.word;
Unicode uintVec;
bool retFlag = TransCode::strToVec(word, uintVec);
if(!retFlag)
{
LogError("TransCode::strToVec error.");
return false;
}
const Unicode& uintVec = nodeInfo.word;
TrieNode* p = _root;
for(uint i = 0; i < uintVec.size(); i++)
{

View File

@ -66,9 +66,9 @@ namespace CppJieba
bool _getInitFlag();
public:
const TrieNodeInfo* find(const string& str);
const TrieNodeInfo* find(const Unicode& uintVec);
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
TrieNodeInfo* find(const string& str);
TrieNodeInfo* find(const Unicode& uintVec);
TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
const TrieNodeInfo* findPrefix(const string& str);
public:

View File

@ -27,7 +27,6 @@ namespace CppJieba
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
typedef unordered_map<uint16_t, double> EmitProbMap;
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
}

View File

@ -4,35 +4,63 @@
#include <limits>
#include "globals.h"
#include "Trie.h"
#include "TransCode.h"
namespace CppJieba
{
struct TrieNodeInfo
{
string word;
size_t wLen;// the word's len , not string.length(),
//string word;
//size_t wLen;// the word's len , not string.length(),
Unicode word;
size_t freq;
string tag;
double logFreq; //logFreq = log(freq/sum(freq));
TrieNodeInfo():wLen(0),freq(0),logFreq(0.0)
TrieNodeInfo():freq(0),logFreq(0.0)
{
}
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), wLen(nodeInfo.wLen), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
{
}
TrieNodeInfo(const string& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
{
wLen = TransCode::getWordLength(_word);
}
};
// Outgoing DAG edges of one character position: the key is the index of the
// last character of a candidate word starting at that position, the value is
// the trie entry found for that word. MPSegment::_calcDAG stores NULL under
// the position's own index when no trie entry was recorded for the single
// character.
typedef unordered_map<uint, const TrieNodeInfo*> DagType;
// Per-character working state used while segmenting one sentence; a sentence
// is held as a vector of these (typedef'd as SegmentContext).
struct SegmentChar
{
// The 16-bit code unit at this position of the sentence.
uint16_t uniCh;
// Candidate words that start at this position, keyed by the index of their
// last character (filled in by MPSegment::_calcDAG).
DagType dag;
// Trie entry selected for this position by MPSegment::_calcDP. NULL means no
// entry was selected; MPSegment::_cut then emits this character as a
// one-character word.
const TrieNodeInfo * pInfo;
// Best accumulated score found by MPSegment::_calcDP for the part of the
// sentence that starts at this position.
double weight;
// Starts with an empty dag, no selected trie entry and a weight of 0.0.
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
{
}
// Alternative constructor taking the entry and weight directly; kept
// commented out.
/*const TrieNodeInfo* pInfo;
double weight;
SegmentChar(uint16_t unich, const TrieNodeInfo* p, double w):uniCh(unich), pInfo(p), weight(w)
{
}*/
};
/*
struct SegmentContext
{
vector<SegmentChar> context;
bool getDA
};*/
typedef vector<SegmentChar> SegmentContext;
struct SegmentContext//: public TrieNodeInfo
{
vector<uint16_t> uintVec;
vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
vector< pair<const TrieNodeInfo*, double> > dp;
};
//struct SegmentContext
//{
// vector<SegmentChar> context;
// //vector<uint16_t> uintVec;
// //vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
// //vector< pair<const TrieNodeInfo*, double> > dp;
//};
/*
struct SegmentWordInfo: public TrieNodeInfo
@ -48,7 +76,7 @@ namespace CppJieba
KeyWordInfo():idf(0.0),weight(0.0)
{
}
KeyWordInfo(const string& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
{
}
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
@ -56,13 +84,12 @@ namespace CppJieba
}
string toString() const
{
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::vecToStr(word.begin(), word.end()).c_str(), weight, idf);
}
KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
{
word = trieNodeInfo.word;
freq = trieNodeInfo.freq;
wLen = trieNodeInfo.wLen;
tag = trieNodeInfo.tag;
logFreq = trieNodeInfo.logFreq;
return *this;