diff --git a/Segment.cpp b/Segment.cpp index dc191f8..4043e0c 100644 --- a/Segment.cpp +++ b/Segment.cpp @@ -79,6 +79,9 @@ namespace CppJieba return false; } + // like str.join([]) in python + LogDebug(string_format("cutDAG result:[%s]", joinStr(tmp, ",").c_str())); + retFlag = _extract(tmp, keywords, 5); if(!retFlag) { @@ -108,14 +111,25 @@ namespace CppJieba { keywords.clear(); vector > tmp; + for(uint i = 0; i < words.size(); i++) { double w = getUtf8WordWeight(words[i]); tmp.push_back(pair(words[i], w)); - LogDebug(string_format("(%s,%lf)", words[i].c_str(), w)); } + sort(tmp.begin(), tmp.end(), _pair_compare); + + //logging result + vector logBuf;//for LogDebug + for(uint i = 0; i < tmp.size(); i++) + { + logBuf.push_back(string_format("(%s,%lf)", tmp[i].first.c_str(), tmp[i].second)); + } + LogDebug(string_format("calc weight:%s",joinStr(logBuf, ",").c_str())); + + //extract TopN for(uint i = 0; i < topN && i < tmp.size(); i++) { keywords.push_back(tmp[i].first); @@ -225,6 +239,26 @@ namespace CppJieba } bool Segment::_filter(vector& utf8Strs) + { + bool retFlag; + retFlag = _filterSingleWord(utf8Strs); + if(!retFlag) + { + LogError("_filterSingleWord failed."); + return false; + } + + retFlag = _filterSubstr(utf8Strs); + if(!retFlag) + { + LogError("_filterSubstr failed."); + return false; + } + + return true; + } + + bool Segment::_filterSingleWord(vector& utf8Strs) { for(vector::iterator it = utf8Strs.begin(); it != utf8Strs.end();) { @@ -248,6 +282,37 @@ namespace CppJieba return true; } + bool Segment::_filterSubstr(vector& utf8Strs) + { + vector tmp = utf8Strs; + set subs; + for(VSI it = utf8Strs.begin(); it != utf8Strs.end(); it ++) + { + for(uint j = 0; j < tmp.size(); j++) + { + if(*it != tmp[j] && string::npos != tmp[j].find(*it, 0)) + { + subs.insert(*it); + } + } + } + + //erase subs from utf8Strs + for(VSI it = utf8Strs.begin(); it != utf8Strs.end(); it++) + { + if(subs.end() != subs.find(*it)) + { + LogDebug(string_format("_filterSubstr filter [%s].", it->c_str())); + it = utf8Strs.erase(it); + } + else + { + it ++; + } + } + return true; + } + } @@ -263,8 +328,8 @@ int main() vector res; //string title = "我来到北京清华大学"; //string title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; - //string title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女"; - string title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女"; + string title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女"; + //string title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女"; cout< +#include #include "Trie.h" +#include "globals.h" namespace CppJieba { @@ -28,6 +30,8 @@ namespace CppJieba bool _calcDP(const string& uniStr, const vector >& dag, vector >& res); bool _cutDAG(const string& uniStr, const vector >& dp, vector& res); bool _filter(vector& utf8Strs); + bool _filterSingleWord(vector& utf8Strs); + bool _filterSubstr(vector& utf8Strs); static bool _pair_compare(const pair& a, const pair& b); bool _extract(const vector& words, vector& keywords, uint topN); diff --git a/globals.h b/globals.h index 7db0f1e..62bdef3 100644 --- a/globals.h +++ b/globals.h @@ -2,13 +2,19 @@ #define GLOBALS_H #include +#include +#include -//file path -const char * const DICT_FILE_PATH = "dict.txt"; +namespace CppJieba +{ + //file path + const char * const DICT_FILE_PATH = "dict.txt"; -//typedefs -typedef uint16_t ChUnicode; -typedef unsigned int uint; + //typedefs + typedef uint16_t ChUnicode; + typedef unsigned int uint; + typedef std::vector::iterator VSI; +} #endif