From 6cf4d5c8bcc868c15170db86cc7a57ff5c9d7f1f Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Tue, 16 Jul 2013 19:10:31 +0800 Subject: [PATCH] adding prior into keywordext --- src/KeyWordExt.cpp | 73 +++++++++++++++++++++++++++++++++------------- src/KeyWordExt.h | 2 +- src/Trie.cpp | 5 ++++ 3 files changed, 59 insertions(+), 21 deletions(-) diff --git a/src/KeyWordExt.cpp b/src/KeyWordExt.cpp index 80bdb1e..1c93b47 100644 --- a/src/KeyWordExt.cpp +++ b/src/KeyWordExt.cpp @@ -14,19 +14,24 @@ namespace CppJieba bool KeyWordExt::init(const char * const filePath) { + LogInfo(string_format("init(%s) start", filePath)); if(!checkFileExist(filePath)) { - LogError(string_format("cann't find fiel[%s].",filePath)); + LogError(string_format("cann't find file[%s].",filePath)); return false; } - return _segment.init(filePath); + bool retFlag = _segment.init(filePath); + LogInfo(string_format("init(%s) end", filePath)); + return retFlag; + } bool KeyWordExt::loadPriorWordPrefixes(const char * const filePath) { + LogInfo(string_format("loadPriorWordPrefixes(%s) start", filePath)); if(!checkFileExist(filePath)) { - LogError(string_format("cann't find fiel[%s].",filePath)); + LogError(string_format("cann't find file[%s].",filePath)); return false; } bool retFlag = _priorPrefixTrie.init(filePath); @@ -35,11 +40,14 @@ namespace CppJieba LogError("_priorPrefixTrie.init return false."); return false; } + LogInfo(string_format("loadPriorWordPrefixes(%s) end", filePath)); return true; } bool KeyWordExt::loadStopWords(const char * const filePath) { + + LogInfo(string_format("loadStopWords(%s) start", filePath)); if(!_stopWords.empty()) { LogError("_stopWords has been loaded before! "); @@ -47,7 +55,7 @@ namespace CppJieba } if(!checkFileExist(filePath)) { - LogError(string_format("cann't find fiel[%s].",filePath)); + LogError(string_format("cann't find file[%s].",filePath)); return false; } @@ -57,7 +65,7 @@ namespace CppJieba { _stopWords.insert(line); } - LogDebug(string_format("load stopwords[%d] finished.", _stopWords.size())); + LogInfo(string_format("load stopwords[%d] finished.", _stopWords.size())); return true; } @@ -73,6 +81,7 @@ namespace CppJieba { return a.weight < b.weight; } + bool KeyWordExt::_extractTopN(const vector& words, vector& keywords, uint topN) { keywords.clear(); @@ -86,20 +95,11 @@ namespace CppJieba wInfo.idf = w; wordInfos.push_back(wInfo); } - sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare); - - //logging result - /* - vector logBuf;//for LogDebug - for(uint i = 0; i < tmp.size(); i++) - { - logBuf.push_back(string_format("(%s,%lf)", tmp[i].first.c_str(), tmp[i].second)); - } - LogDebug(string_format("calc weight:%s",joinStr(logBuf, ",").c_str())); - */ - LogDebug(string_format("calc weight & sorted:%s",joinWordInfos(wordInfos).c_str())); + LogDebug(string_format("calc weight & sorted:\n%s",joinWordInfos(wordInfos).c_str())); + _priorWordPrefixes(wordInfos); + LogDebug(string_format("_priorWordPrefixes res:\n%s", joinWordInfos(wordInfos).c_str())); //extract TopN for(uint i = 0; i < topN && i < wordInfos.size(); i++) { @@ -274,8 +274,34 @@ namespace CppJieba return true; } - bool _priorWordPrefixes(vector& utf8Strs) + bool KeyWordExt::_priorWordPrefixes(vector& wordInfos) { + if(2 > wordInfos.size()) + { + return true; + } + + WordInfo prior; + bool flag = false; + for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) + { + cout<<__FILE__<<__LINE__<word)) + { + prior = *it; + it = wordInfos.erase(it); + flag = true; + break; + } + else + { + it ++; + } + } + if(flag) + { + wordInfos.insert(wordInfos.begin(), prior); + } return true; } } @@ -293,20 +319,27 @@ int main() return 1; } ext.loadStopWords("stopwords.tmp"); - ext.loadPriorWordPrefixes("prior.utf8"); + + if(!ext.loadPriorWordPrefixes("prior.utf8")) + { + cerr<<"err"< res; string title; - title = "我来到北京清华大学"; + /*title = "我来到北京清华大学"; res.clear(); ext.extract(title, res, 5); PRINT_VECTOR(res); + title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; res.clear(); ext.extract(title, res, 5); PRINT_VECTOR(res); + */ title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女装"; diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h index 90658c7..aeba697 100644 --- a/src/KeyWordExt.h +++ b/src/KeyWordExt.h @@ -64,7 +64,7 @@ namespace CppJieba bool _filterSubstr(vector& utf8Strs); bool _filterStopWords(vector& utf8Strs); private: - bool _priorWordPrefixes(vector& utf8Strs); + bool _priorWordPrefixes(vector& wordInfos); }; diff --git a/src/Trie.cpp b/src/Trie.cpp index f884d94..d3fe34e 100644 --- a/src/Trie.cpp +++ b/src/Trie.cpp @@ -142,6 +142,11 @@ namespace CppJieba const TrieNodeInfo* Trie::find(const ChUnicode* const chUniStr, size_t len) { + if(NULL == _root) + { + LogFatal("trie not initted!"); + return NULL; + } TrieNode* p = _root; for(uint i = 0; i < len; i++) {