mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
big reconstruction: replace string word with Unicode in structs.h and others
This commit is contained in:
parent
70f12f2c97
commit
e8a98d4e4d
@ -8,7 +8,7 @@ using namespace CppJieba;
|
||||
void testKeyWordExt(const char * dictPath, const char * filePath)
|
||||
{
|
||||
KeyWordExt ext;
|
||||
if(!ext.init(dictPath, "../dicts/stopwords.gbk.v1.0"))
|
||||
if(!ext.init(dictPath))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ void cut(const char * const filePath)
|
||||
{
|
||||
if(!line.empty())
|
||||
{
|
||||
seg.cutDAG(line, res);
|
||||
seg.cut(line, res);
|
||||
cout<<line<<"\n"<<joinStr(res,"/")<<endl;
|
||||
}
|
||||
}
|
||||
@ -60,7 +60,7 @@ void cutAll(const char* const filePath)
|
||||
string line;
|
||||
while(getline(ifs, line))
|
||||
{
|
||||
seg.cutDAG(line, res);
|
||||
seg.cut(line, res);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -16,7 +16,7 @@ namespace CppJieba
|
||||
{
|
||||
}
|
||||
|
||||
bool KeyWordExt::init(const char* const segDictFile, const char* const stopWordDictFile)
|
||||
bool KeyWordExt::init(const char* const segDictFile)
|
||||
{
|
||||
LogInfo("KeyWordExt init start ...");
|
||||
if(!_segment.init(segDictFile))
|
||||
@ -24,40 +24,34 @@ namespace CppJieba
|
||||
LogError("_segment.init failed.");
|
||||
return false;
|
||||
}
|
||||
if(!_loadStopWords(stopWordDictFile))
|
||||
{
|
||||
LogError("_loadStopWords failed.");
|
||||
return false;
|
||||
}
|
||||
LogInfo("KeyWordExt init OK.");
|
||||
return true;
|
||||
}
|
||||
|
||||
bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
|
||||
{
|
||||
LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
|
||||
if(!checkFileExist(filePath))
|
||||
{
|
||||
LogError(string_format("cann't find file[%s].",filePath));
|
||||
return false;
|
||||
}
|
||||
if(!_priorSubWords.empty())
|
||||
{
|
||||
LogError("_priorSubWords has been initted before");
|
||||
return false;
|
||||
}
|
||||
ifstream infile(filePath);
|
||||
string subword;
|
||||
while(getline(infile, subword))
|
||||
{
|
||||
_priorSubWords.push_back(subword);
|
||||
}
|
||||
LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
|
||||
infile.close();
|
||||
return true;
|
||||
}
|
||||
//bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
|
||||
//{
|
||||
// LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
|
||||
// if(!checkFileExist(filePath))
|
||||
// {
|
||||
// LogError(string_format("cann't find file[%s].",filePath));
|
||||
// return false;
|
||||
// }
|
||||
// if(!_priorSubWords.empty())
|
||||
// {
|
||||
// LogError("_priorSubWords has been initted before");
|
||||
// return false;
|
||||
// }
|
||||
// ifstream infile(filePath);
|
||||
// string subword;
|
||||
// while(getline(infile, subword))
|
||||
// {
|
||||
// _priorSubWords.push_back(subword);
|
||||
// }
|
||||
// LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
|
||||
// infile.close();
|
||||
// return true;
|
||||
//}
|
||||
|
||||
bool KeyWordExt::_loadStopWords(const char * const filePath)
|
||||
bool KeyWordExt::loadStopWords(const char * const filePath)
|
||||
{
|
||||
|
||||
LogInfo(string_format("_loadStopWords(%s) start", filePath));
|
||||
@ -74,9 +68,15 @@ namespace CppJieba
|
||||
|
||||
ifstream ifile(filePath);
|
||||
string line;
|
||||
Unicode word;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
_stopWords.insert(line);
|
||||
if(!TransCode::strToVec(line, word))
|
||||
{
|
||||
LogError("strToVec failed .");
|
||||
return false;
|
||||
}
|
||||
_stopWords.insert(word);
|
||||
}
|
||||
LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size()));
|
||||
|
||||
@ -100,12 +100,7 @@ namespace CppJieba
|
||||
{
|
||||
KeyWordInfo& wInfo = wordInfos[i];
|
||||
wInfo.idf = - wInfo.logFreq;
|
||||
if(0 == wInfo.wLen)
|
||||
{
|
||||
LogFatal("wLen is 0!");
|
||||
return false;
|
||||
}
|
||||
wInfo.weight = log(double(wInfo.wLen + 1)) * wInfo.idf;
|
||||
wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf;
|
||||
}
|
||||
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
|
||||
return true;
|
||||
@ -143,14 +138,16 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef DEBU
|
||||
LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str()));
|
||||
#endif
|
||||
|
||||
keyWordInfos.clear();
|
||||
for(uint i = 0; i < words.size(); i++)
|
||||
{
|
||||
keyWordInfos.push_back(words[i]);
|
||||
Unicode uniWord;
|
||||
if(!TransCode::strToVec(words[i], uniWord))
|
||||
{
|
||||
LogError("strToVec failed");
|
||||
return false;
|
||||
}
|
||||
keyWordInfos.push_back(uniWord);
|
||||
}
|
||||
|
||||
return _extract(keyWordInfos, topN);
|
||||
@ -164,7 +161,7 @@ namespace CppJieba
|
||||
}
|
||||
|
||||
vector<TrieNodeInfo> trieNodeInfos;
|
||||
_segment.cutDAG(title, trieNodeInfos);
|
||||
_segment.cut(title, trieNodeInfos);
|
||||
|
||||
keyWordInfos.clear();
|
||||
for(uint i = 0; i < trieNodeInfos.size(); i++)
|
||||
@ -249,7 +246,7 @@ namespace CppJieba
|
||||
|
||||
bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
|
||||
{
|
||||
set<string> st;
|
||||
set<Unicode> st;
|
||||
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||
{
|
||||
if(st.find(it->word) != st.end())
|
||||
@ -271,7 +268,7 @@ namespace CppJieba
|
||||
{
|
||||
|
||||
// filter single word
|
||||
if(1 == it->wLen)
|
||||
if(1 == it->word.size())
|
||||
{
|
||||
it = wordInfos.erase(it);
|
||||
}
|
||||
@ -285,79 +282,68 @@ namespace CppJieba
|
||||
|
||||
bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
|
||||
{
|
||||
vector<string> tmp ;
|
||||
vector<Unicode> tmp ;
|
||||
for(uint i = 0; i < wordInfos.size(); i++)
|
||||
{
|
||||
tmp.push_back(wordInfos[i].word);
|
||||
}
|
||||
set<string> subs;
|
||||
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++)
|
||||
{
|
||||
for(uint j = 0; j < tmp.size(); j++)
|
||||
{
|
||||
if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0))
|
||||
{
|
||||
subs.insert(it->word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//erase subs from strs
|
||||
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||
{
|
||||
if(subs.end() != subs.find(it->word))
|
||||
{
|
||||
it = wordInfos.erase(it);
|
||||
}
|
||||
else
|
||||
{
|
||||
it ++;
|
||||
}
|
||||
if(_isSubIn(tmp, it->word))
|
||||
{
|
||||
it = wordInfos.erase(it);
|
||||
}
|
||||
else
|
||||
{
|
||||
it++;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool KeyWordExt::_isContainSubWords(const string& word)
|
||||
{
|
||||
for(uint i = 0; i < _priorSubWords.size(); i++)
|
||||
{
|
||||
if(string::npos != word.find(_priorSubWords[i]))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
//bool KeyWordExt::_isContainSubWords(const string& word)
|
||||
//{
|
||||
// for(uint i = 0; i < _priorSubWords.size(); i++)
|
||||
// {
|
||||
// if(string::npos != word.find(_priorSubWords[i]))
|
||||
// {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// return false;
|
||||
//}
|
||||
|
||||
bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
|
||||
{
|
||||
if(2 > wordInfos.size())
|
||||
{
|
||||
return true;
|
||||
}
|
||||
//bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
|
||||
//{
|
||||
// if(2 > wordInfos.size())
|
||||
// {
|
||||
// return true;
|
||||
// }
|
||||
|
||||
KeyWordInfo prior;
|
||||
bool flag = false;
|
||||
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||
{
|
||||
if(_isContainSubWords(it->word))
|
||||
{
|
||||
prior = *it;
|
||||
it = wordInfos.erase(it);
|
||||
flag = true;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
it ++;
|
||||
}
|
||||
}
|
||||
if(flag)
|
||||
{
|
||||
wordInfos.insert(wordInfos.begin(), prior);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// KeyWordInfo prior;
|
||||
// bool flag = false;
|
||||
// for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||
// {
|
||||
// if(_isContainSubWords(it->word))
|
||||
// {
|
||||
// prior = *it;
|
||||
// it = wordInfos.erase(it);
|
||||
// flag = true;
|
||||
// break;
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// it ++;
|
||||
// }
|
||||
// }
|
||||
// if(flag)
|
||||
// {
|
||||
// wordInfos.insert(wordInfos.begin(), prior);
|
||||
// }
|
||||
// return true;
|
||||
//}
|
||||
}
|
||||
|
||||
|
||||
@ -375,12 +361,6 @@ int main()
|
||||
}
|
||||
ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
|
||||
|
||||
if(!ext._loadPriorSubWords("../dicts/prior.gbk"))
|
||||
{
|
||||
cerr<<"err"<<endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
ifstream ifile("testtitle.gbk");
|
||||
vector<string> res;
|
||||
string line;
|
||||
|
@ -1,7 +1,7 @@
|
||||
/************************************
|
||||
* file enc : ASCII
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
************************************/
|
||||
#ifndef CPPJIEBA_KEYWORDEXT_H
|
||||
#define CPPJIEBA_KEYWORDEXT_H
|
||||
|
||||
@ -11,45 +11,56 @@
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
class KeyWordExt
|
||||
{
|
||||
private:
|
||||
MPSegment _segment;
|
||||
vector<string> _priorSubWords;
|
||||
set<string> _stopWords;
|
||||
public:
|
||||
KeyWordExt();
|
||||
~KeyWordExt();
|
||||
bool init(const char* const segDictFile, const char* const stopWordDictFile);
|
||||
bool dispose();
|
||||
|
||||
private:
|
||||
bool _loadStopWords(const char * const filePath);
|
||||
bool _loadPriorSubWords(const char * const filePath);
|
||||
class KeyWordExt
|
||||
{
|
||||
private:
|
||||
MPSegment _segment;
|
||||
//vector<string> _priorSubWords;
|
||||
set<Unicode> _stopWords;
|
||||
public:
|
||||
KeyWordExt();
|
||||
~KeyWordExt();
|
||||
bool init(const char* const segDictFile);
|
||||
bool dispose();
|
||||
bool loadStopWords(const char * const filePath);
|
||||
private:
|
||||
//bool _loadPriorSubWords(const char * const filePath);
|
||||
|
||||
|
||||
public:
|
||||
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
private:
|
||||
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
|
||||
private:
|
||||
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
|
||||
private:
|
||||
//sort by word len - idf
|
||||
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
|
||||
private:
|
||||
bool _filter(vector<KeyWordInfo>& );
|
||||
bool _filterDuplicate(vector<KeyWordInfo>& );
|
||||
bool _filterSingleWord(vector<KeyWordInfo>& );
|
||||
bool _filterSubstr(vector<KeyWordInfo>& );
|
||||
bool _filterStopWords(vector<KeyWordInfo>& );
|
||||
private:
|
||||
bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
|
||||
bool _isContainSubWords(const string& word);
|
||||
public:
|
||||
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
private:
|
||||
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
|
||||
private:
|
||||
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
|
||||
private:
|
||||
//sort by word len - idf
|
||||
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
|
||||
private:
|
||||
bool _filter(vector<KeyWordInfo>& );
|
||||
bool _filterDuplicate(vector<KeyWordInfo>& );
|
||||
bool _filterSingleWord(vector<KeyWordInfo>& );
|
||||
bool _filterSubstr(vector<KeyWordInfo>& );
|
||||
bool _filterStopWords(vector<KeyWordInfo>& );
|
||||
private:
|
||||
inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
|
||||
{
|
||||
|
||||
};
|
||||
for(uint j = 0; j < words.size(); j++)
|
||||
{
|
||||
if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
//bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
|
||||
//bool _isContainSubWords(const string& word);
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
|
@ -36,22 +36,22 @@ namespace CppJieba
|
||||
return _trie.dispose();
|
||||
}
|
||||
|
||||
bool MPSegment::cutDAG(const string& str, vector<string>& res)
|
||||
bool MPSegment::cut(const string& str, vector<string>& res)
|
||||
{
|
||||
vector<TrieNodeInfo> segWordInfos;
|
||||
if(!cutDAG(str, segWordInfos))
|
||||
if(!cut(str, segWordInfos))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
res.clear();
|
||||
for(uint i = 0; i < segWordInfos.size(); i++)
|
||||
{
|
||||
res.push_back(segWordInfos[i].word);
|
||||
res.push_back(TransCode::vecToStr(segWordInfos[i].word.begin(), segWordInfos[i].word.end()));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MPSegment::cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos)
|
||||
bool MPSegment::cut(const string& str, vector<TrieNodeInfo>& segWordInfos)
|
||||
{
|
||||
if(str.empty())
|
||||
{
|
||||
@ -59,13 +59,19 @@ namespace CppJieba
|
||||
}
|
||||
segWordInfos.clear();
|
||||
SegmentContext segContext;
|
||||
|
||||
if(!TransCode::strToVec(str, segContext.uintVec))
|
||||
Unicode sentence;
|
||||
|
||||
if(!TransCode::strToVec(str, sentence))
|
||||
{
|
||||
LogError("TransCode::strToVec failed.");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
for(uint i = 0; i < sentence.size(); i++)
|
||||
{
|
||||
segContext.push_back(SegmentChar(sentence[i]));
|
||||
}
|
||||
|
||||
//calc DAG
|
||||
if(!_calcDAG(segContext))
|
||||
{
|
||||
@ -79,9 +85,9 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!_cutDAG(segContext, segWordInfos))
|
||||
if(!_cut(segContext, segWordInfos))
|
||||
{
|
||||
LogError("_cutDAG failed.");
|
||||
LogError("_cut failed.");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -90,111 +96,150 @@ namespace CppJieba
|
||||
|
||||
bool MPSegment::_calcDAG(SegmentContext& segContext)
|
||||
{
|
||||
if(segContext.uintVec.empty())
|
||||
if(segContext.empty())
|
||||
{
|
||||
LogError("segContext empty.");
|
||||
return false;
|
||||
}
|
||||
vector<pair<uint, const TrieNodeInfo*> > vec;
|
||||
Unicode::const_iterator beginIter = segContext.uintVec.begin();
|
||||
for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
|
||||
{
|
||||
vec.clear();
|
||||
vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
|
||||
for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
|
||||
{
|
||||
//care: the iterJ exceed iterEnd
|
||||
const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
|
||||
if(NULL != ptNodeInfo)
|
||||
{
|
||||
vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
|
||||
}
|
||||
}
|
||||
segContext.dag.push_back(vec);
|
||||
}
|
||||
return true;
|
||||
|
||||
Unicode unicode;
|
||||
for(uint i = 0; i < segContext.size(); i++)
|
||||
{
|
||||
unicode.clear();
|
||||
for(uint j = i ; j < segContext.size(); j++)
|
||||
{
|
||||
unicode.push_back(segContext[j].uniCh);
|
||||
const TrieNodeInfo* pInfo = _trie.find(unicode);
|
||||
if(pInfo)
|
||||
{
|
||||
segContext[i].dag[j] = pInfo;
|
||||
}
|
||||
}
|
||||
if(segContext[i].dag.end() == segContext[i].dag.find(i))
|
||||
{
|
||||
segContext[i].dag[i] = NULL;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
//vector<pair<uint, const TrieNodeInfo*> > vec;
|
||||
//Unicode::const_iterator beginIter = segContext.uintVec.begin();
|
||||
//for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
|
||||
//{
|
||||
// vec.clear();
|
||||
// vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
|
||||
// for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
|
||||
// {
|
||||
// //care: the iterJ exceed iterEnd
|
||||
// const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
|
||||
// if(NULL != ptNodeInfo)
|
||||
// {
|
||||
// vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
|
||||
// }
|
||||
// }
|
||||
// segContext.dag.push_back(vec);
|
||||
//}
|
||||
//return true;
|
||||
}
|
||||
|
||||
bool MPSegment::_calcDP(SegmentContext& segContext)
|
||||
{
|
||||
if(segContext.uintVec.empty())
|
||||
if(segContext.empty())
|
||||
{
|
||||
LogError("uintVec illegal");
|
||||
LogError("segContext empty");
|
||||
return false;
|
||||
}
|
||||
|
||||
for(int i = segContext.size() - 1; i >= 0; i--)
|
||||
{
|
||||
segContext[i].pInfo = NULL;
|
||||
segContext[i].weight = MIN_DOUBLE;
|
||||
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
|
||||
{
|
||||
uint nextPos = it->first;
|
||||
const TrieNodeInfo* p = it->second;
|
||||
double val = 0.0;
|
||||
if(nextPos + 1 < segContext.size())
|
||||
{
|
||||
val += segContext[nextPos + 1].weight;
|
||||
}
|
||||
|
||||
if(segContext.uintVec.size() != segContext.dag.size())
|
||||
{
|
||||
LogError("dag is illegal!");
|
||||
return false;
|
||||
}
|
||||
|
||||
segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
|
||||
segContext.dp[segContext.uintVec.size()].first = NULL;
|
||||
segContext.dp[segContext.uintVec.size()].second = 0.0;
|
||||
|
||||
for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
|
||||
{
|
||||
// calc max
|
||||
segContext.dp[i].first = NULL;
|
||||
segContext.dp[i].second = MIN_DOUBLE;
|
||||
for(uint j = 0; j < segContext.dag[i].size(); j++)
|
||||
{
|
||||
const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
|
||||
int pos = p.first;
|
||||
double val = segContext.dp[pos+1].second;
|
||||
if(NULL != p.second)
|
||||
{
|
||||
val += (p.second)->logFreq;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(p)
|
||||
{
|
||||
val += p->logFreq;
|
||||
}
|
||||
else
|
||||
{
|
||||
val += _trie.getMinLogFreq();
|
||||
}
|
||||
|
||||
if(val > segContext.dp[i].second)
|
||||
}
|
||||
if(val > segContext[i].weight)
|
||||
{
|
||||
segContext.dp[i].first = p.second;
|
||||
segContext.dp[i].second = val;
|
||||
segContext[i].pInfo = p;
|
||||
segContext[i].weight = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
segContext.dp.pop_back();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
//segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
|
||||
//segContext.dp[segContext.uintVec.size()].first = NULL;
|
||||
//segContext.dp[segContext.uintVec.size()].second = 0.0;
|
||||
|
||||
//for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
|
||||
//{
|
||||
// // calc max
|
||||
// segContext.dp[i].first = NULL;
|
||||
// segContext.dp[i].second = MIN_DOUBLE;
|
||||
// for(uint j = 0; j < segContext.dag[i].size(); j++)
|
||||
// {
|
||||
// const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
|
||||
// int pos = p.first;
|
||||
// double val = segContext.dp[pos+1].second;
|
||||
// if(NULL != p.second)
|
||||
// {
|
||||
// val += (p.second)->logFreq;
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// val += _trie.getMinLogFreq();
|
||||
// }
|
||||
|
||||
// if(val > segContext.dp[i].second)
|
||||
// {
|
||||
// segContext.dp[i].first = p.second;
|
||||
// segContext.dp[i].second = val;
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
//segContext.dp.pop_back();
|
||||
//return true;
|
||||
}
|
||||
|
||||
bool MPSegment::_cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res)
|
||||
bool MPSegment::_cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)
|
||||
{
|
||||
if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
|
||||
{
|
||||
LogError("dp or uintVec illegal!");
|
||||
return false;
|
||||
}
|
||||
//if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
|
||||
//{
|
||||
// LogFatal("dp or uintVec illegal!");
|
||||
// return false;
|
||||
//}
|
||||
res.clear();
|
||||
|
||||
Unicode::const_iterator iterBegin = segContext.uintVec.begin();
|
||||
uint i = 0;
|
||||
while(i < segContext.dp.size())
|
||||
while(i < segContext.size())
|
||||
{
|
||||
const TrieNodeInfo* p = segContext.dp[i].first;
|
||||
if(NULL == p)
|
||||
const TrieNodeInfo* p = segContext[i].pInfo;
|
||||
if(p)
|
||||
{
|
||||
res.push_back(*p);
|
||||
i += p->word.size();
|
||||
}
|
||||
else//single chinese word
|
||||
{
|
||||
TrieNodeInfo nodeInfo;
|
||||
nodeInfo.word = TransCode::vecToStr(iterBegin + i, iterBegin + i +1);
|
||||
nodeInfo.wLen = 1;
|
||||
nodeInfo.word.push_back(segContext[i].uniCh);
|
||||
nodeInfo.freq = 0;
|
||||
nodeInfo.logFreq = _trie.getMinLogFreq();
|
||||
res.push_back(nodeInfo);
|
||||
i ++;
|
||||
}
|
||||
else
|
||||
{
|
||||
res.push_back(*p);
|
||||
if(0 == p->wLen)
|
||||
{
|
||||
LogFatal("TrieNodeInfo's wLen is 0!");
|
||||
return false;
|
||||
}
|
||||
i += p->wLen;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
@ -223,7 +268,7 @@ int main()
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
res.clear();
|
||||
segment.cutDAG(line, res);
|
||||
segment.cut(line, res);
|
||||
PRINT_VECTOR(res);
|
||||
getchar();
|
||||
}
|
||||
|
@ -13,6 +13,9 @@
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
typedef vector<SegmentChar> SegmentContext;
|
||||
|
||||
class MPSegment
|
||||
{
|
||||
private:
|
||||
@ -25,15 +28,14 @@ namespace CppJieba
|
||||
bool init(const char* const filePath);
|
||||
bool dispose();
|
||||
public:
|
||||
bool cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos);
|
||||
bool cutDAG(const string& str, vector<string>& res);
|
||||
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos);
|
||||
bool cut(const string& str, vector<string>& res);
|
||||
|
||||
private:
|
||||
bool _calcDAG(SegmentContext& segContext);
|
||||
bool _calcDP(SegmentContext& segContext);
|
||||
bool _cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res);
|
||||
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res);
|
||||
|
||||
//bool _fill(const string& )
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -35,12 +35,12 @@ namespace CppJieba
|
||||
bool MixSegment::cut(const string& str, vector<string>& res)
|
||||
{
|
||||
vector<TrieNodeInfo> infos;
|
||||
if(!_mpSeg.cutDAG(str, infos))
|
||||
if(!_mpSeg.cut(str, infos))
|
||||
{
|
||||
LogError("_mpSeg cutDAG failed.");
|
||||
return false;
|
||||
}
|
||||
for(uint = 0; i < infos.size(); i++)
|
||||
for(uint i= 0; i < infos.size(); i++)
|
||||
{
|
||||
|
||||
}
|
||||
|
22
src/Trie.cpp
22
src/Trie.cpp
@ -109,9 +109,11 @@ namespace CppJieba
|
||||
LogError(string_format("line[%s] illegal.", line.c_str()));
|
||||
return false;
|
||||
}
|
||||
nodeInfo.word = vecBuf[0];
|
||||
if(!TransCode::strToVec(vecBuf[0], nodeInfo.word))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
nodeInfo.freq = atoi(vecBuf[1].c_str());
|
||||
nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
|
||||
if(3 == vecBuf.size())
|
||||
{
|
||||
nodeInfo.tag = vecBuf[2];
|
||||
@ -193,7 +195,7 @@ namespace CppJieba
|
||||
return res;
|
||||
}
|
||||
|
||||
const TrieNodeInfo* Trie::find(const string& str)
|
||||
TrieNodeInfo* Trie::find(const string& str)
|
||||
{
|
||||
Unicode uintVec;
|
||||
bool retFlag = TransCode::strToVec(str, uintVec);
|
||||
@ -204,7 +206,7 @@ namespace CppJieba
|
||||
return find(uintVec);
|
||||
}
|
||||
|
||||
const TrieNodeInfo* Trie::find(const Unicode& uintVec)
|
||||
TrieNodeInfo* Trie::find(const Unicode& uintVec)
|
||||
{
|
||||
if(uintVec.empty())
|
||||
{
|
||||
@ -213,7 +215,7 @@ namespace CppJieba
|
||||
return find(uintVec.begin(), uintVec.end());
|
||||
}
|
||||
|
||||
const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
{
|
||||
|
||||
if(!_getInitFlag())
|
||||
@ -324,16 +326,8 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
const string& word = nodeInfo.word;
|
||||
|
||||
Unicode uintVec;
|
||||
bool retFlag = TransCode::strToVec(word, uintVec);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError("TransCode::strToVec error.");
|
||||
return false;
|
||||
}
|
||||
|
||||
const Unicode& uintVec = nodeInfo.word;
|
||||
TrieNode* p = _root;
|
||||
for(uint i = 0; i < uintVec.size(); i++)
|
||||
{
|
||||
|
@ -66,9 +66,9 @@ namespace CppJieba
|
||||
bool _getInitFlag();
|
||||
|
||||
public:
|
||||
const TrieNodeInfo* find(const string& str);
|
||||
const TrieNodeInfo* find(const Unicode& uintVec);
|
||||
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
TrieNodeInfo* find(const string& str);
|
||||
TrieNodeInfo* find(const Unicode& uintVec);
|
||||
TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
const TrieNodeInfo* findPrefix(const string& str);
|
||||
|
||||
public:
|
||||
|
@ -27,7 +27,6 @@ namespace CppJieba
|
||||
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||
typedef unordered_map<uint16_t, double> EmitProbMap;
|
||||
|
||||
|
||||
const double MIN_DOUBLE = -3.14e+100;
|
||||
const double MAX_DOUBLE = 3.14e+100;
|
||||
}
|
||||
|
@ -4,35 +4,63 @@
|
||||
#include <limits>
|
||||
#include "globals.h"
|
||||
#include "Trie.h"
|
||||
#include "TransCode.h"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
struct TrieNodeInfo
|
||||
{
|
||||
string word;
|
||||
size_t wLen;// the word's len , not string.length(),
|
||||
//string word;
|
||||
//size_t wLen;// the word's len , not string.length(),
|
||||
Unicode word;
|
||||
size_t freq;
|
||||
string tag;
|
||||
double logFreq; //logFreq = log(freq/sum(freq));
|
||||
TrieNodeInfo():wLen(0),freq(0),logFreq(0.0)
|
||||
TrieNodeInfo():freq(0),logFreq(0.0)
|
||||
{
|
||||
}
|
||||
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), wLen(nodeInfo.wLen), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
|
||||
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
|
||||
{
|
||||
}
|
||||
TrieNodeInfo(const string& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
|
||||
TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
|
||||
{
|
||||
wLen = TransCode::getWordLength(_word);
|
||||
}
|
||||
};
|
||||
|
||||
typedef unordered_map<uint, const TrieNodeInfo*> DagType;
|
||||
struct SegmentChar
|
||||
{
|
||||
uint16_t uniCh;
|
||||
DagType dag;
|
||||
const TrieNodeInfo * pInfo;
|
||||
double weight;
|
||||
|
||||
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
|
||||
{
|
||||
}
|
||||
|
||||
/*const TrieNodeInfo* pInfo;
|
||||
double weight;
|
||||
SegmentChar(uint16_t unich, const TrieNodeInfo* p, double w):uniCh(unich), pInfo(p), weight(w)
|
||||
{
|
||||
}*/
|
||||
};
|
||||
/*
|
||||
struct SegmentContext
|
||||
{
|
||||
vector<SegmentChar> context;
|
||||
bool getDA
|
||||
};*/
|
||||
typedef vector<SegmentChar> SegmentContext;
|
||||
|
||||
struct SegmentContext//: public TrieNodeInfo
|
||||
{
|
||||
vector<uint16_t> uintVec;
|
||||
vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
|
||||
vector< pair<const TrieNodeInfo*, double> > dp;
|
||||
};
|
||||
//struct SegmentContext
|
||||
//{
|
||||
// vector<SegmentChar> context;
|
||||
// //vector<uint16_t> uintVec;
|
||||
// //vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
|
||||
// //vector< pair<const TrieNodeInfo*, double> > dp;
|
||||
//};
|
||||
|
||||
/*
|
||||
struct SegmentWordInfo: public TrieNodeInfo
|
||||
@ -48,7 +76,7 @@ namespace CppJieba
|
||||
KeyWordInfo():idf(0.0),weight(0.0)
|
||||
{
|
||||
}
|
||||
KeyWordInfo(const string& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
|
||||
KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
|
||||
{
|
||||
}
|
||||
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
|
||||
@ -56,13 +84,12 @@ namespace CppJieba
|
||||
}
|
||||
string toString() const
|
||||
{
|
||||
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
||||
return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::vecToStr(word.begin(), word.end()).c_str(), weight, idf);
|
||||
}
|
||||
KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
|
||||
{
|
||||
word = trieNodeInfo.word;
|
||||
freq = trieNodeInfo.freq;
|
||||
wLen = trieNodeInfo.wLen;
|
||||
tag = trieNodeInfo.tag;
|
||||
logFreq = trieNodeInfo.logFreq;
|
||||
return *this;
|
||||
|
Loading…
x
Reference in New Issue
Block a user