adding prior into keywordext

This commit is contained in:
gwdwyy 2013-07-16 19:10:31 +08:00
parent 081c9904c7
commit 6cf4d5c8bc
3 changed files with 59 additions and 21 deletions

View File

@ -14,19 +14,24 @@ namespace CppJieba
bool KeyWordExt::init(const char * const filePath) bool KeyWordExt::init(const char * const filePath)
{ {
LogInfo(string_format("init(%s) start", filePath));
if(!checkFileExist(filePath)) if(!checkFileExist(filePath))
{ {
LogError(string_format("cann't find fiel[%s].",filePath)); LogError(string_format("cann't find file[%s].",filePath));
return false; return false;
} }
return _segment.init(filePath); bool retFlag = _segment.init(filePath);
LogInfo(string_format("init(%s) end", filePath));
return retFlag;
} }
bool KeyWordExt::loadPriorWordPrefixes(const char * const filePath) bool KeyWordExt::loadPriorWordPrefixes(const char * const filePath)
{ {
LogInfo(string_format("loadPriorWordPrefixes(%s) start", filePath));
if(!checkFileExist(filePath)) if(!checkFileExist(filePath))
{ {
LogError(string_format("cann't find fiel[%s].",filePath)); LogError(string_format("cann't find file[%s].",filePath));
return false; return false;
} }
bool retFlag = _priorPrefixTrie.init(filePath); bool retFlag = _priorPrefixTrie.init(filePath);
@ -35,11 +40,14 @@ namespace CppJieba
LogError("_priorPrefixTrie.init return false."); LogError("_priorPrefixTrie.init return false.");
return false; return false;
} }
LogInfo(string_format("loadPriorWordPrefixes(%s) end", filePath));
return true; return true;
} }
bool KeyWordExt::loadStopWords(const char * const filePath) bool KeyWordExt::loadStopWords(const char * const filePath)
{ {
LogInfo(string_format("loadStopWords(%s) start", filePath));
if(!_stopWords.empty()) if(!_stopWords.empty())
{ {
LogError("_stopWords has been loaded before! "); LogError("_stopWords has been loaded before! ");
@ -47,7 +55,7 @@ namespace CppJieba
} }
if(!checkFileExist(filePath)) if(!checkFileExist(filePath))
{ {
LogError(string_format("cann't find fiel[%s].",filePath)); LogError(string_format("cann't find file[%s].",filePath));
return false; return false;
} }
@ -57,7 +65,7 @@ namespace CppJieba
{ {
_stopWords.insert(line); _stopWords.insert(line);
} }
LogDebug(string_format("load stopwords[%d] finished.", _stopWords.size())); LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size()));
return true; return true;
} }
@ -73,6 +81,7 @@ namespace CppJieba
{ {
return a.weight < b.weight; return a.weight < b.weight;
} }
bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN) bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
{ {
keywords.clear(); keywords.clear();
@ -86,20 +95,11 @@ namespace CppJieba
wInfo.idf = w; wInfo.idf = w;
wordInfos.push_back(wInfo); wordInfos.push_back(wInfo);
} }
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare); sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str()));
//logging result _priorWordPrefixes(wordInfos);
/* LogDebug(string_format("_priorWordPrefixes res:\n%s", joinWordInfos(wordInfos).c_str()));
vector<string> logBuf;//for LogDebug
for(uint i = 0; i < tmp.size(); i++)
{
logBuf.push_back(string_format("(%s,%lf)", tmp[i].first.c_str(), tmp[i].second));
}
LogDebug(string_format("calc weight:%s",joinStr(logBuf, ",").c_str()));
*/
LogDebug(string_format("calc weight & sorted:%s",joinWordInfos(wordInfos).c_str()));
//extract TopN //extract TopN
for(uint i = 0; i < topN && i < wordInfos.size(); i++) for(uint i = 0; i < topN && i < wordInfos.size(); i++)
{ {
@ -274,8 +274,34 @@ namespace CppJieba
return true; return true;
} }
bool _priorWordPrefixes(vector<string>& utf8Strs) bool KeyWordExt::_priorWordPrefixes(vector<WordInfo>& wordInfos)
{ {
if(2 > wordInfos.size())
{
return true;
}
WordInfo prior;
bool flag = false;
for(vector<WordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
{
cout<<__FILE__<<__LINE__<<endl;
if(NULL != _priorPrefixTrie.findUtf8(it->word))
{
prior = *it;
it = wordInfos.erase(it);
flag = true;
break;
}
else
{
it ++;
}
}
if(flag)
{
wordInfos.insert(wordInfos.begin(), prior);
}
return true; return true;
} }
} }
@ -293,20 +319,27 @@ int main()
return 1; return 1;
} }
ext.loadStopWords("stopwords.tmp"); ext.loadStopWords("stopwords.tmp");
ext.loadPriorWordPrefixes("prior.utf8");
if(!ext.loadPriorWordPrefixes("prior.utf8"))
{
cerr<<"err"<<endl;
return 1;
}
//segment.init("dicts/jieba.dict.utf8"); //segment.init("dicts/jieba.dict.utf8");
vector<string> res; vector<string> res;
string title; string title;
title = "我来到北京清华大学"; /*title = "我来到北京清华大学";
res.clear(); res.clear();
ext.extract(title, res, 5); ext.extract(title, res, 5);
PRINT_VECTOR(res); PRINT_VECTOR(res);
title = "特价camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; title = "特价camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏";
res.clear(); res.clear();
ext.extract(title, res, 5); ext.extract(title, res, 5);
PRINT_VECTOR(res); PRINT_VECTOR(res);
*/
title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女装"; title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女装";

View File

@ -64,7 +64,7 @@ namespace CppJieba
bool _filterSubstr(vector<string>& utf8Strs); bool _filterSubstr(vector<string>& utf8Strs);
bool _filterStopWords(vector<string>& utf8Strs); bool _filterStopWords(vector<string>& utf8Strs);
private: private:
bool _priorWordPrefixes(vector<string>& utf8Strs); bool _priorWordPrefixes(vector<WordInfo>& wordInfos);
}; };

View File

@ -142,6 +142,11 @@ namespace CppJieba
const TrieNodeInfo* Trie::find(const ChUnicode* const chUniStr, size_t len) const TrieNodeInfo* Trie::find(const ChUnicode* const chUniStr, size_t len)
{ {
if(NULL == _root)
{
LogFatal("trie not initted!");
return NULL;
}
TrieNode* p = _root; TrieNode* p = _root;
for(uint i = 0; i < len; i++) for(uint i = 0; i < len; i++)
{ {