mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
big reconstruction: replace string word with Unicode in structs.h and others
This commit is contained in:
parent
70f12f2c97
commit
e8a98d4e4d
@ -8,7 +8,7 @@ using namespace CppJieba;
|
|||||||
void testKeyWordExt(const char * dictPath, const char * filePath)
|
void testKeyWordExt(const char * dictPath, const char * filePath)
|
||||||
{
|
{
|
||||||
KeyWordExt ext;
|
KeyWordExt ext;
|
||||||
if(!ext.init(dictPath, "../dicts/stopwords.gbk.v1.0"))
|
if(!ext.init(dictPath))
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -32,7 +32,7 @@ void cut(const char * const filePath)
|
|||||||
{
|
{
|
||||||
if(!line.empty())
|
if(!line.empty())
|
||||||
{
|
{
|
||||||
seg.cutDAG(line, res);
|
seg.cut(line, res);
|
||||||
cout<<line<<"\n"<<joinStr(res,"/")<<endl;
|
cout<<line<<"\n"<<joinStr(res,"/")<<endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -60,7 +60,7 @@ void cutAll(const char* const filePath)
|
|||||||
string line;
|
string line;
|
||||||
while(getline(ifs, line))
|
while(getline(ifs, line))
|
||||||
{
|
{
|
||||||
seg.cutDAG(line, res);
|
seg.cut(line, res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16,7 +16,7 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
bool KeyWordExt::init(const char* const segDictFile, const char* const stopWordDictFile)
|
bool KeyWordExt::init(const char* const segDictFile)
|
||||||
{
|
{
|
||||||
LogInfo("KeyWordExt init start ...");
|
LogInfo("KeyWordExt init start ...");
|
||||||
if(!_segment.init(segDictFile))
|
if(!_segment.init(segDictFile))
|
||||||
@ -24,40 +24,34 @@ namespace CppJieba
|
|||||||
LogError("_segment.init failed.");
|
LogError("_segment.init failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if(!_loadStopWords(stopWordDictFile))
|
|
||||||
{
|
|
||||||
LogError("_loadStopWords failed.");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
LogInfo("KeyWordExt init OK.");
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
|
//bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
|
||||||
{
|
//{
|
||||||
LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
|
// LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
|
||||||
if(!checkFileExist(filePath))
|
// if(!checkFileExist(filePath))
|
||||||
{
|
// {
|
||||||
LogError(string_format("cann't find file[%s].",filePath));
|
// LogError(string_format("cann't find file[%s].",filePath));
|
||||||
return false;
|
// return false;
|
||||||
}
|
// }
|
||||||
if(!_priorSubWords.empty())
|
// if(!_priorSubWords.empty())
|
||||||
{
|
// {
|
||||||
LogError("_priorSubWords has been initted before");
|
// LogError("_priorSubWords has been initted before");
|
||||||
return false;
|
// return false;
|
||||||
}
|
// }
|
||||||
ifstream infile(filePath);
|
// ifstream infile(filePath);
|
||||||
string subword;
|
// string subword;
|
||||||
while(getline(infile, subword))
|
// while(getline(infile, subword))
|
||||||
{
|
// {
|
||||||
_priorSubWords.push_back(subword);
|
// _priorSubWords.push_back(subword);
|
||||||
}
|
// }
|
||||||
LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
|
// LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
|
||||||
infile.close();
|
// infile.close();
|
||||||
return true;
|
// return true;
|
||||||
}
|
//}
|
||||||
|
|
||||||
bool KeyWordExt::_loadStopWords(const char * const filePath)
|
bool KeyWordExt::loadStopWords(const char * const filePath)
|
||||||
{
|
{
|
||||||
|
|
||||||
LogInfo(string_format("_loadStopWords(%s) start", filePath));
|
LogInfo(string_format("_loadStopWords(%s) start", filePath));
|
||||||
@ -74,9 +68,15 @@ namespace CppJieba
|
|||||||
|
|
||||||
ifstream ifile(filePath);
|
ifstream ifile(filePath);
|
||||||
string line;
|
string line;
|
||||||
|
Unicode word;
|
||||||
while(getline(ifile, line))
|
while(getline(ifile, line))
|
||||||
{
|
{
|
||||||
_stopWords.insert(line);
|
if(!TransCode::strToVec(line, word))
|
||||||
|
{
|
||||||
|
LogError("strToVec failed .");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
_stopWords.insert(word);
|
||||||
}
|
}
|
||||||
LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size()));
|
LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size()));
|
||||||
|
|
||||||
@ -100,12 +100,7 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
KeyWordInfo& wInfo = wordInfos[i];
|
KeyWordInfo& wInfo = wordInfos[i];
|
||||||
wInfo.idf = - wInfo.logFreq;
|
wInfo.idf = - wInfo.logFreq;
|
||||||
if(0 == wInfo.wLen)
|
wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf;
|
||||||
{
|
|
||||||
LogFatal("wLen is 0!");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
wInfo.weight = log(double(wInfo.wLen + 1)) * wInfo.idf;
|
|
||||||
}
|
}
|
||||||
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
|
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
|
||||||
return true;
|
return true;
|
||||||
@ -143,14 +138,16 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBU
|
|
||||||
LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str()));
|
|
||||||
#endif
|
|
||||||
|
|
||||||
keyWordInfos.clear();
|
keyWordInfos.clear();
|
||||||
for(uint i = 0; i < words.size(); i++)
|
for(uint i = 0; i < words.size(); i++)
|
||||||
{
|
{
|
||||||
keyWordInfos.push_back(words[i]);
|
Unicode uniWord;
|
||||||
|
if(!TransCode::strToVec(words[i], uniWord))
|
||||||
|
{
|
||||||
|
LogError("strToVec failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
keyWordInfos.push_back(uniWord);
|
||||||
}
|
}
|
||||||
|
|
||||||
return _extract(keyWordInfos, topN);
|
return _extract(keyWordInfos, topN);
|
||||||
@ -164,7 +161,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
vector<TrieNodeInfo> trieNodeInfos;
|
vector<TrieNodeInfo> trieNodeInfos;
|
||||||
_segment.cutDAG(title, trieNodeInfos);
|
_segment.cut(title, trieNodeInfos);
|
||||||
|
|
||||||
keyWordInfos.clear();
|
keyWordInfos.clear();
|
||||||
for(uint i = 0; i < trieNodeInfos.size(); i++)
|
for(uint i = 0; i < trieNodeInfos.size(); i++)
|
||||||
@ -249,7 +246,7 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
|
bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
|
||||||
{
|
{
|
||||||
set<string> st;
|
set<Unicode> st;
|
||||||
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||||
{
|
{
|
||||||
if(st.find(it->word) != st.end())
|
if(st.find(it->word) != st.end())
|
||||||
@ -271,7 +268,7 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
|
|
||||||
// filter single word
|
// filter single word
|
||||||
if(1 == it->wLen)
|
if(1 == it->word.size())
|
||||||
{
|
{
|
||||||
it = wordInfos.erase(it);
|
it = wordInfos.erase(it);
|
||||||
}
|
}
|
||||||
@ -285,79 +282,68 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
|
bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
|
||||||
{
|
{
|
||||||
vector<string> tmp ;
|
vector<Unicode> tmp ;
|
||||||
for(uint i = 0; i < wordInfos.size(); i++)
|
for(uint i = 0; i < wordInfos.size(); i++)
|
||||||
{
|
{
|
||||||
tmp.push_back(wordInfos[i].word);
|
tmp.push_back(wordInfos[i].word);
|
||||||
}
|
}
|
||||||
set<string> subs;
|
|
||||||
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++)
|
|
||||||
{
|
|
||||||
for(uint j = 0; j < tmp.size(); j++)
|
|
||||||
{
|
|
||||||
if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0))
|
|
||||||
{
|
|
||||||
subs.insert(it->word);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//erase subs from strs
|
|
||||||
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||||
{
|
{
|
||||||
if(subs.end() != subs.find(it->word))
|
if(_isSubIn(tmp, it->word))
|
||||||
{
|
{
|
||||||
it = wordInfos.erase(it);
|
it = wordInfos.erase(it);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
it ++;
|
it++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool KeyWordExt::_isContainSubWords(const string& word)
|
//bool KeyWordExt::_isContainSubWords(const string& word)
|
||||||
{
|
//{
|
||||||
for(uint i = 0; i < _priorSubWords.size(); i++)
|
// for(uint i = 0; i < _priorSubWords.size(); i++)
|
||||||
{
|
// {
|
||||||
if(string::npos != word.find(_priorSubWords[i]))
|
// if(string::npos != word.find(_priorSubWords[i]))
|
||||||
{
|
// {
|
||||||
return true;
|
// return true;
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
return false;
|
// return false;
|
||||||
}
|
//}
|
||||||
|
|
||||||
bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
|
//bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
|
||||||
{
|
//{
|
||||||
if(2 > wordInfos.size())
|
// if(2 > wordInfos.size())
|
||||||
{
|
// {
|
||||||
return true;
|
// return true;
|
||||||
}
|
// }
|
||||||
|
|
||||||
KeyWordInfo prior;
|
// KeyWordInfo prior;
|
||||||
bool flag = false;
|
// bool flag = false;
|
||||||
for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
// for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
|
||||||
{
|
// {
|
||||||
if(_isContainSubWords(it->word))
|
// if(_isContainSubWords(it->word))
|
||||||
{
|
// {
|
||||||
prior = *it;
|
// prior = *it;
|
||||||
it = wordInfos.erase(it);
|
// it = wordInfos.erase(it);
|
||||||
flag = true;
|
// flag = true;
|
||||||
break;
|
// break;
|
||||||
}
|
// }
|
||||||
else
|
// else
|
||||||
{
|
// {
|
||||||
it ++;
|
// it ++;
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
if(flag)
|
// if(flag)
|
||||||
{
|
// {
|
||||||
wordInfos.insert(wordInfos.begin(), prior);
|
// wordInfos.insert(wordInfos.begin(), prior);
|
||||||
}
|
// }
|
||||||
return true;
|
// return true;
|
||||||
}
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -375,12 +361,6 @@ int main()
|
|||||||
}
|
}
|
||||||
ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
|
ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
|
||||||
|
|
||||||
if(!ext._loadPriorSubWords("../dicts/prior.gbk"))
|
|
||||||
{
|
|
||||||
cerr<<"err"<<endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
ifstream ifile("testtitle.gbk");
|
ifstream ifile("testtitle.gbk");
|
||||||
vector<string> res;
|
vector<string> res;
|
||||||
string line;
|
string line;
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/************************************
|
/************************************
|
||||||
* file enc : ASCII
|
* file enc : ASCII
|
||||||
* author : wuyanyi09@gmail.com
|
* author : wuyanyi09@gmail.com
|
||||||
************************************/
|
************************************/
|
||||||
#ifndef CPPJIEBA_KEYWORDEXT_H
|
#ifndef CPPJIEBA_KEYWORDEXT_H
|
||||||
#define CPPJIEBA_KEYWORDEXT_H
|
#define CPPJIEBA_KEYWORDEXT_H
|
||||||
|
|
||||||
@ -11,45 +11,56 @@
|
|||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
|
|
||||||
class KeyWordExt
|
class KeyWordExt
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
MPSegment _segment;
|
MPSegment _segment;
|
||||||
vector<string> _priorSubWords;
|
//vector<string> _priorSubWords;
|
||||||
set<string> _stopWords;
|
set<Unicode> _stopWords;
|
||||||
public:
|
public:
|
||||||
KeyWordExt();
|
KeyWordExt();
|
||||||
~KeyWordExt();
|
~KeyWordExt();
|
||||||
bool init(const char* const segDictFile, const char* const stopWordDictFile);
|
bool init(const char* const segDictFile);
|
||||||
bool dispose();
|
bool dispose();
|
||||||
|
bool loadStopWords(const char * const filePath);
|
||||||
private:
|
private:
|
||||||
bool _loadStopWords(const char * const filePath);
|
//bool _loadPriorSubWords(const char * const filePath);
|
||||||
bool _loadPriorSubWords(const char * const filePath);
|
|
||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||||
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||||
private:
|
private:
|
||||||
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
|
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
|
||||||
private:
|
private:
|
||||||
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
|
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
|
||||||
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
|
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
|
||||||
private:
|
private:
|
||||||
//sort by word len - idf
|
//sort by word len - idf
|
||||||
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
|
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
|
||||||
private:
|
private:
|
||||||
bool _filter(vector<KeyWordInfo>& );
|
bool _filter(vector<KeyWordInfo>& );
|
||||||
bool _filterDuplicate(vector<KeyWordInfo>& );
|
bool _filterDuplicate(vector<KeyWordInfo>& );
|
||||||
bool _filterSingleWord(vector<KeyWordInfo>& );
|
bool _filterSingleWord(vector<KeyWordInfo>& );
|
||||||
bool _filterSubstr(vector<KeyWordInfo>& );
|
bool _filterSubstr(vector<KeyWordInfo>& );
|
||||||
bool _filterStopWords(vector<KeyWordInfo>& );
|
bool _filterStopWords(vector<KeyWordInfo>& );
|
||||||
private:
|
private:
|
||||||
bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
|
inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
|
||||||
bool _isContainSubWords(const string& word);
|
{
|
||||||
|
|
||||||
};
|
for(uint j = 0; j < words.size(); j++)
|
||||||
|
{
|
||||||
|
if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
//bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
|
||||||
|
//bool _isContainSubWords(const string& word);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -36,22 +36,22 @@ namespace CppJieba
|
|||||||
return _trie.dispose();
|
return _trie.dispose();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MPSegment::cutDAG(const string& str, vector<string>& res)
|
bool MPSegment::cut(const string& str, vector<string>& res)
|
||||||
{
|
{
|
||||||
vector<TrieNodeInfo> segWordInfos;
|
vector<TrieNodeInfo> segWordInfos;
|
||||||
if(!cutDAG(str, segWordInfos))
|
if(!cut(str, segWordInfos))
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
res.clear();
|
res.clear();
|
||||||
for(uint i = 0; i < segWordInfos.size(); i++)
|
for(uint i = 0; i < segWordInfos.size(); i++)
|
||||||
{
|
{
|
||||||
res.push_back(segWordInfos[i].word);
|
res.push_back(TransCode::vecToStr(segWordInfos[i].word.begin(), segWordInfos[i].word.end()));
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MPSegment::cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos)
|
bool MPSegment::cut(const string& str, vector<TrieNodeInfo>& segWordInfos)
|
||||||
{
|
{
|
||||||
if(str.empty())
|
if(str.empty())
|
||||||
{
|
{
|
||||||
@ -59,13 +59,19 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
segWordInfos.clear();
|
segWordInfos.clear();
|
||||||
SegmentContext segContext;
|
SegmentContext segContext;
|
||||||
|
Unicode sentence;
|
||||||
|
|
||||||
if(!TransCode::strToVec(str, segContext.uintVec))
|
if(!TransCode::strToVec(str, sentence))
|
||||||
{
|
{
|
||||||
LogError("TransCode::strToVec failed.");
|
LogError("TransCode::strToVec failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for(uint i = 0; i < sentence.size(); i++)
|
||||||
|
{
|
||||||
|
segContext.push_back(SegmentChar(sentence[i]));
|
||||||
|
}
|
||||||
|
|
||||||
//calc DAG
|
//calc DAG
|
||||||
if(!_calcDAG(segContext))
|
if(!_calcDAG(segContext))
|
||||||
{
|
{
|
||||||
@ -79,9 +85,9 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!_cutDAG(segContext, segWordInfos))
|
if(!_cut(segContext, segWordInfos))
|
||||||
{
|
{
|
||||||
LogError("_cutDAG failed.");
|
LogError("_cut failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,111 +96,150 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool MPSegment::_calcDAG(SegmentContext& segContext)
|
bool MPSegment::_calcDAG(SegmentContext& segContext)
|
||||||
{
|
{
|
||||||
if(segContext.uintVec.empty())
|
if(segContext.empty())
|
||||||
{
|
{
|
||||||
|
LogError("segContext empty.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
vector<pair<uint, const TrieNodeInfo*> > vec;
|
|
||||||
Unicode::const_iterator beginIter = segContext.uintVec.begin();
|
Unicode unicode;
|
||||||
for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
|
for(uint i = 0; i < segContext.size(); i++)
|
||||||
{
|
{
|
||||||
vec.clear();
|
unicode.clear();
|
||||||
vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
|
for(uint j = i ; j < segContext.size(); j++)
|
||||||
for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
|
{
|
||||||
{
|
unicode.push_back(segContext[j].uniCh);
|
||||||
//care: the iterJ exceed iterEnd
|
const TrieNodeInfo* pInfo = _trie.find(unicode);
|
||||||
const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
|
if(pInfo)
|
||||||
if(NULL != ptNodeInfo)
|
{
|
||||||
{
|
segContext[i].dag[j] = pInfo;
|
||||||
vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
|
}
|
||||||
}
|
}
|
||||||
}
|
if(segContext[i].dag.end() == segContext[i].dag.find(i))
|
||||||
segContext.dag.push_back(vec);
|
{
|
||||||
}
|
segContext[i].dag[i] = NULL;
|
||||||
return true;
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
//vector<pair<uint, const TrieNodeInfo*> > vec;
|
||||||
|
//Unicode::const_iterator beginIter = segContext.uintVec.begin();
|
||||||
|
//for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++)
|
||||||
|
//{
|
||||||
|
// vec.clear();
|
||||||
|
// vec.push_back(pair<uint, const TrieNodeInfo*>(iterI - beginIter, NULL));
|
||||||
|
// for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++)
|
||||||
|
// {
|
||||||
|
// //care: the iterJ exceed iterEnd
|
||||||
|
// const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
|
||||||
|
// if(NULL != ptNodeInfo)
|
||||||
|
// {
|
||||||
|
// vec.push_back(pair<uint, const TrieNodeInfo*>(iterJ - beginIter, ptNodeInfo));
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// segContext.dag.push_back(vec);
|
||||||
|
//}
|
||||||
|
//return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MPSegment::_calcDP(SegmentContext& segContext)
|
bool MPSegment::_calcDP(SegmentContext& segContext)
|
||||||
{
|
{
|
||||||
if(segContext.uintVec.empty())
|
if(segContext.empty())
|
||||||
{
|
{
|
||||||
LogError("uintVec illegal");
|
LogError("segContext empty");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(segContext.uintVec.size() != segContext.dag.size())
|
for(int i = segContext.size() - 1; i >= 0; i--)
|
||||||
{
|
{
|
||||||
LogError("dag is illegal!");
|
segContext[i].pInfo = NULL;
|
||||||
return false;
|
segContext[i].weight = MIN_DOUBLE;
|
||||||
}
|
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
|
||||||
|
{
|
||||||
|
uint nextPos = it->first;
|
||||||
|
const TrieNodeInfo* p = it->second;
|
||||||
|
double val = 0.0;
|
||||||
|
if(nextPos + 1 < segContext.size())
|
||||||
|
{
|
||||||
|
val += segContext[nextPos + 1].weight;
|
||||||
|
}
|
||||||
|
|
||||||
segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
|
if(p)
|
||||||
segContext.dp[segContext.uintVec.size()].first = NULL;
|
{
|
||||||
segContext.dp[segContext.uintVec.size()].second = 0.0;
|
val += p->logFreq;
|
||||||
|
}
|
||||||
for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
|
else
|
||||||
{
|
{
|
||||||
// calc max
|
|
||||||
segContext.dp[i].first = NULL;
|
|
||||||
segContext.dp[i].second = MIN_DOUBLE;
|
|
||||||
for(uint j = 0; j < segContext.dag[i].size(); j++)
|
|
||||||
{
|
|
||||||
const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
|
|
||||||
int pos = p.first;
|
|
||||||
double val = segContext.dp[pos+1].second;
|
|
||||||
if(NULL != p.second)
|
|
||||||
{
|
|
||||||
val += (p.second)->logFreq;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
val += _trie.getMinLogFreq();
|
val += _trie.getMinLogFreq();
|
||||||
}
|
}
|
||||||
|
if(val > segContext[i].weight)
|
||||||
if(val > segContext.dp[i].second)
|
|
||||||
{
|
{
|
||||||
segContext.dp[i].first = p.second;
|
segContext[i].pInfo = p;
|
||||||
segContext.dp[i].second = val;
|
segContext[i].weight = val;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
segContext.dp.pop_back();
|
return true;
|
||||||
return true;
|
|
||||||
|
//segContext.dp.assign(segContext.uintVec.size() + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));
|
||||||
|
//segContext.dp[segContext.uintVec.size()].first = NULL;
|
||||||
|
//segContext.dp[segContext.uintVec.size()].second = 0.0;
|
||||||
|
|
||||||
|
//for(int i = segContext.uintVec.size() - 1; i >= 0; i--)
|
||||||
|
//{
|
||||||
|
// // calc max
|
||||||
|
// segContext.dp[i].first = NULL;
|
||||||
|
// segContext.dp[i].second = MIN_DOUBLE;
|
||||||
|
// for(uint j = 0; j < segContext.dag[i].size(); j++)
|
||||||
|
// {
|
||||||
|
// const pair<uint , const TrieNodeInfo*>& p = segContext.dag[i][j];
|
||||||
|
// int pos = p.first;
|
||||||
|
// double val = segContext.dp[pos+1].second;
|
||||||
|
// if(NULL != p.second)
|
||||||
|
// {
|
||||||
|
// val += (p.second)->logFreq;
|
||||||
|
// }
|
||||||
|
// else
|
||||||
|
// {
|
||||||
|
// val += _trie.getMinLogFreq();
|
||||||
|
// }
|
||||||
|
|
||||||
|
// if(val > segContext.dp[i].second)
|
||||||
|
// {
|
||||||
|
// segContext.dp[i].first = p.second;
|
||||||
|
// segContext.dp[i].second = val;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
//segContext.dp.pop_back();
|
||||||
|
//return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MPSegment::_cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res)
|
bool MPSegment::_cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)
|
||||||
{
|
{
|
||||||
if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
|
//if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size())
|
||||||
{
|
//{
|
||||||
LogError("dp or uintVec illegal!");
|
// LogFatal("dp or uintVec illegal!");
|
||||||
return false;
|
// return false;
|
||||||
}
|
//}
|
||||||
res.clear();
|
res.clear();
|
||||||
|
|
||||||
Unicode::const_iterator iterBegin = segContext.uintVec.begin();
|
|
||||||
uint i = 0;
|
uint i = 0;
|
||||||
while(i < segContext.dp.size())
|
while(i < segContext.size())
|
||||||
{
|
{
|
||||||
const TrieNodeInfo* p = segContext.dp[i].first;
|
const TrieNodeInfo* p = segContext[i].pInfo;
|
||||||
if(NULL == p)
|
if(p)
|
||||||
|
{
|
||||||
|
res.push_back(*p);
|
||||||
|
i += p->word.size();
|
||||||
|
}
|
||||||
|
else//single chinese word
|
||||||
{
|
{
|
||||||
TrieNodeInfo nodeInfo;
|
TrieNodeInfo nodeInfo;
|
||||||
nodeInfo.word = TransCode::vecToStr(iterBegin + i, iterBegin + i +1);
|
nodeInfo.word.push_back(segContext[i].uniCh);
|
||||||
nodeInfo.wLen = 1;
|
|
||||||
nodeInfo.freq = 0;
|
nodeInfo.freq = 0;
|
||||||
nodeInfo.logFreq = _trie.getMinLogFreq();
|
nodeInfo.logFreq = _trie.getMinLogFreq();
|
||||||
res.push_back(nodeInfo);
|
res.push_back(nodeInfo);
|
||||||
i ++;
|
i++;
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
res.push_back(*p);
|
|
||||||
if(0 == p->wLen)
|
|
||||||
{
|
|
||||||
LogFatal("TrieNodeInfo's wLen is 0!");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
i += p->wLen;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -223,7 +268,7 @@ int main()
|
|||||||
while(getline(ifile, line))
|
while(getline(ifile, line))
|
||||||
{
|
{
|
||||||
res.clear();
|
res.clear();
|
||||||
segment.cutDAG(line, res);
|
segment.cut(line, res);
|
||||||
PRINT_VECTOR(res);
|
PRINT_VECTOR(res);
|
||||||
getchar();
|
getchar();
|
||||||
}
|
}
|
||||||
|
@ -13,6 +13,9 @@
|
|||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
|
|
||||||
|
typedef vector<SegmentChar> SegmentContext;
|
||||||
|
|
||||||
class MPSegment
|
class MPSegment
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
@ -25,15 +28,14 @@ namespace CppJieba
|
|||||||
bool init(const char* const filePath);
|
bool init(const char* const filePath);
|
||||||
bool dispose();
|
bool dispose();
|
||||||
public:
|
public:
|
||||||
bool cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos);
|
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos);
|
||||||
bool cutDAG(const string& str, vector<string>& res);
|
bool cut(const string& str, vector<string>& res);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool _calcDAG(SegmentContext& segContext);
|
bool _calcDAG(SegmentContext& segContext);
|
||||||
bool _calcDP(SegmentContext& segContext);
|
bool _calcDP(SegmentContext& segContext);
|
||||||
bool _cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res);
|
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res);
|
||||||
|
|
||||||
//bool _fill(const string& )
|
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -35,12 +35,12 @@ namespace CppJieba
|
|||||||
bool MixSegment::cut(const string& str, vector<string>& res)
|
bool MixSegment::cut(const string& str, vector<string>& res)
|
||||||
{
|
{
|
||||||
vector<TrieNodeInfo> infos;
|
vector<TrieNodeInfo> infos;
|
||||||
if(!_mpSeg.cutDAG(str, infos))
|
if(!_mpSeg.cut(str, infos))
|
||||||
{
|
{
|
||||||
LogError("_mpSeg cutDAG failed.");
|
LogError("_mpSeg cutDAG failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(uint = 0; i < infos.size(); i++)
|
for(uint i= 0; i < infos.size(); i++)
|
||||||
{
|
{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
22
src/Trie.cpp
22
src/Trie.cpp
@ -109,9 +109,11 @@ namespace CppJieba
|
|||||||
LogError(string_format("line[%s] illegal.", line.c_str()));
|
LogError(string_format("line[%s] illegal.", line.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
nodeInfo.word = vecBuf[0];
|
if(!TransCode::strToVec(vecBuf[0], nodeInfo.word))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
nodeInfo.freq = atoi(vecBuf[1].c_str());
|
nodeInfo.freq = atoi(vecBuf[1].c_str());
|
||||||
nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
|
|
||||||
if(3 == vecBuf.size())
|
if(3 == vecBuf.size())
|
||||||
{
|
{
|
||||||
nodeInfo.tag = vecBuf[2];
|
nodeInfo.tag = vecBuf[2];
|
||||||
@ -193,7 +195,7 @@ namespace CppJieba
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
const TrieNodeInfo* Trie::find(const string& str)
|
TrieNodeInfo* Trie::find(const string& str)
|
||||||
{
|
{
|
||||||
Unicode uintVec;
|
Unicode uintVec;
|
||||||
bool retFlag = TransCode::strToVec(str, uintVec);
|
bool retFlag = TransCode::strToVec(str, uintVec);
|
||||||
@ -204,7 +206,7 @@ namespace CppJieba
|
|||||||
return find(uintVec);
|
return find(uintVec);
|
||||||
}
|
}
|
||||||
|
|
||||||
const TrieNodeInfo* Trie::find(const Unicode& uintVec)
|
TrieNodeInfo* Trie::find(const Unicode& uintVec)
|
||||||
{
|
{
|
||||||
if(uintVec.empty())
|
if(uintVec.empty())
|
||||||
{
|
{
|
||||||
@ -213,7 +215,7 @@ namespace CppJieba
|
|||||||
return find(uintVec.begin(), uintVec.end());
|
return find(uintVec.begin(), uintVec.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
|
TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||||
{
|
{
|
||||||
|
|
||||||
if(!_getInitFlag())
|
if(!_getInitFlag())
|
||||||
@ -324,16 +326,8 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const string& word = nodeInfo.word;
|
|
||||||
|
|
||||||
Unicode uintVec;
|
|
||||||
bool retFlag = TransCode::strToVec(word, uintVec);
|
|
||||||
if(!retFlag)
|
|
||||||
{
|
|
||||||
LogError("TransCode::strToVec error.");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
const Unicode& uintVec = nodeInfo.word;
|
||||||
TrieNode* p = _root;
|
TrieNode* p = _root;
|
||||||
for(uint i = 0; i < uintVec.size(); i++)
|
for(uint i = 0; i < uintVec.size(); i++)
|
||||||
{
|
{
|
||||||
|
@ -66,9 +66,9 @@ namespace CppJieba
|
|||||||
bool _getInitFlag();
|
bool _getInitFlag();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
const TrieNodeInfo* find(const string& str);
|
TrieNodeInfo* find(const string& str);
|
||||||
const TrieNodeInfo* find(const Unicode& uintVec);
|
TrieNodeInfo* find(const Unicode& uintVec);
|
||||||
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
|
TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||||
const TrieNodeInfo* findPrefix(const string& str);
|
const TrieNodeInfo* findPrefix(const string& str);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -27,7 +27,6 @@ namespace CppJieba
|
|||||||
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
|
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||||
typedef unordered_map<uint16_t, double> EmitProbMap;
|
typedef unordered_map<uint16_t, double> EmitProbMap;
|
||||||
|
|
||||||
|
|
||||||
const double MIN_DOUBLE = -3.14e+100;
|
const double MIN_DOUBLE = -3.14e+100;
|
||||||
const double MAX_DOUBLE = 3.14e+100;
|
const double MAX_DOUBLE = 3.14e+100;
|
||||||
}
|
}
|
||||||
|
@ -4,35 +4,63 @@
|
|||||||
#include <limits>
|
#include <limits>
|
||||||
#include "globals.h"
|
#include "globals.h"
|
||||||
#include "Trie.h"
|
#include "Trie.h"
|
||||||
|
#include "TransCode.h"
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
|
|
||||||
struct TrieNodeInfo
|
struct TrieNodeInfo
|
||||||
{
|
{
|
||||||
string word;
|
//string word;
|
||||||
size_t wLen;// the word's len , not string.length(),
|
//size_t wLen;// the word's len , not string.length(),
|
||||||
|
Unicode word;
|
||||||
size_t freq;
|
size_t freq;
|
||||||
string tag;
|
string tag;
|
||||||
double logFreq; //logFreq = log(freq/sum(freq));
|
double logFreq; //logFreq = log(freq/sum(freq));
|
||||||
TrieNodeInfo():wLen(0),freq(0),logFreq(0.0)
|
TrieNodeInfo():freq(0),logFreq(0.0)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), wLen(nodeInfo.wLen), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
|
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
TrieNodeInfo(const string& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
|
TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
|
||||||
{
|
{
|
||||||
wLen = TransCode::getWordLength(_word);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct SegmentContext//: public TrieNodeInfo
|
typedef unordered_map<uint, const TrieNodeInfo*> DagType;
|
||||||
{
|
struct SegmentChar
|
||||||
vector<uint16_t> uintVec;
|
{
|
||||||
vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
|
uint16_t uniCh;
|
||||||
vector< pair<const TrieNodeInfo*, double> > dp;
|
DagType dag;
|
||||||
};
|
const TrieNodeInfo * pInfo;
|
||||||
|
double weight;
|
||||||
|
|
||||||
|
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
/*const TrieNodeInfo* pInfo;
|
||||||
|
double weight;
|
||||||
|
SegmentChar(uint16_t unich, const TrieNodeInfo* p, double w):uniCh(unich), pInfo(p), weight(w)
|
||||||
|
{
|
||||||
|
}*/
|
||||||
|
};
|
||||||
|
/*
|
||||||
|
struct SegmentContext
|
||||||
|
{
|
||||||
|
vector<SegmentChar> context;
|
||||||
|
bool getDA
|
||||||
|
};*/
|
||||||
|
typedef vector<SegmentChar> SegmentContext;
|
||||||
|
|
||||||
|
//struct SegmentContext
|
||||||
|
//{
|
||||||
|
// vector<SegmentChar> context;
|
||||||
|
// //vector<uint16_t> uintVec;
|
||||||
|
// //vector< vector<pair<uint, const TrieNodeInfo*> > > dag;
|
||||||
|
// //vector< pair<const TrieNodeInfo*, double> > dp;
|
||||||
|
//};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
struct SegmentWordInfo: public TrieNodeInfo
|
struct SegmentWordInfo: public TrieNodeInfo
|
||||||
@ -48,7 +76,7 @@ namespace CppJieba
|
|||||||
KeyWordInfo():idf(0.0),weight(0.0)
|
KeyWordInfo():idf(0.0),weight(0.0)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
KeyWordInfo(const string& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
|
KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
|
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
|
||||||
@ -56,13 +84,12 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
string toString() const
|
string toString() const
|
||||||
{
|
{
|
||||||
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::vecToStr(word.begin(), word.end()).c_str(), weight, idf);
|
||||||
}
|
}
|
||||||
KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
|
KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
|
||||||
{
|
{
|
||||||
word = trieNodeInfo.word;
|
word = trieNodeInfo.word;
|
||||||
freq = trieNodeInfo.freq;
|
freq = trieNodeInfo.freq;
|
||||||
wLen = trieNodeInfo.wLen;
|
|
||||||
tag = trieNodeInfo.tag;
|
tag = trieNodeInfo.tag;
|
||||||
logFreq = trieNodeInfo.logFreq;
|
logFreq = trieNodeInfo.logFreq;
|
||||||
return *this;
|
return *this;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user