mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add _filterSubStrs
This commit is contained in:
parent
28c3260d4c
commit
09db567fb5
71
Segment.cpp
71
Segment.cpp
@ -79,6 +79,9 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
// like str.join([]) in python
|
||||
LogDebug(string_format("cutDAG result:[%s]", joinStr(tmp, ",").c_str()));
|
||||
|
||||
retFlag = _extract(tmp, keywords, 5);
|
||||
if(!retFlag)
|
||||
{
|
||||
@ -108,14 +111,25 @@ namespace CppJieba
|
||||
{
|
||||
keywords.clear();
|
||||
vector<pair<string, double> > tmp;
|
||||
|
||||
for(uint i = 0; i < words.size(); i++)
|
||||
{
|
||||
double w = getUtf8WordWeight(words[i]);
|
||||
tmp.push_back(pair<string, double>(words[i], w));
|
||||
LogDebug(string_format("(%s,%lf)", words[i].c_str(), w));
|
||||
}
|
||||
|
||||
|
||||
sort(tmp.begin(), tmp.end(), _pair_compare);
|
||||
|
||||
//logging result
|
||||
vector<string> logBuf;//for LogDebug
|
||||
for(uint i = 0; i < tmp.size(); i++)
|
||||
{
|
||||
logBuf.push_back(string_format("(%s,%lf)", tmp[i].first.c_str(), tmp[i].second));
|
||||
}
|
||||
LogDebug(string_format("calc weight:%s",joinStr(logBuf, ",").c_str()));
|
||||
|
||||
//extract TopN
|
||||
for(uint i = 0; i < topN && i < tmp.size(); i++)
|
||||
{
|
||||
keywords.push_back(tmp[i].first);
|
||||
@ -225,6 +239,26 @@ namespace CppJieba
|
||||
}
|
||||
|
||||
// Applies the keyword filters to utf8Strs in place, in this order:
//   1. _filterSingleWord
//   2. _filterSubstr
// Stops at the first filter that reports failure, logs which one it was,
// and returns false. Returns true when both filters succeed.
bool Segment::_filter(vector<string>& utf8Strs)
{
    if(!_filterSingleWord(utf8Strs))
    {
        LogError("_filterSingleWord failed.");
        return false;
    }

    if(!_filterSubstr(utf8Strs))
    {
        LogError("_filterSubstr failed.");
        return false;
    }

    return true;
}
|
||||
|
||||
bool Segment::_filterSingleWord(vector<string>& utf8Strs)
|
||||
{
|
||||
for(vector<string>::iterator it = utf8Strs.begin(); it != utf8Strs.end();)
|
||||
{
|
||||
@ -248,6 +282,37 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
// Removes from utf8Strs every entry that occurs as a substring of some
// *different* entry (comparison and search are done with std::string's
// operator!= and find(), i.e. on the raw bytes of the strings).
// Entries that are merely equal to another entry are not considered
// substrings of it and are kept.
//
// utf8Strs: candidate words; modified in place, relative order of the
//           surviving entries is preserved.
// Returns:  always true.
bool Segment::_filterSubstr(vector<string>& utf8Strs)
{
    // Pass 1: collect every entry that is contained in another entry.
    // The vector is only read here, so no working copy is needed.
    set<string> subs;
    for(VSI it = utf8Strs.begin(); it != utf8Strs.end(); ++it)
    {
        for(VSI other = utf8Strs.begin(); other != utf8Strs.end(); ++other)
        {
            if(*it != *other && string::npos != other->find(*it, 0))
            {
                subs.insert(*it);
                break; // one containing entry is enough
            }
        }
    }

    // Pass 2: erase the collected entries from utf8Strs.
    // The loop header deliberately has no increment: erase() already
    // returns the iterator to the next element, and the else-branch
    // advances otherwise. Incrementing in both places would skip
    // elements and could step past end().
    for(VSI it = utf8Strs.begin(); it != utf8Strs.end();)
    {
        if(subs.end() != subs.find(*it))
        {
            LogDebug(string_format("_filterSubstr filter [%s].", it->c_str()));
            it = utf8Strs.erase(it);
        }
        else
        {
            ++it;
        }
    }
    return true;
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -263,8 +328,8 @@ int main()
|
||||
vector<string> res;
|
||||
//string title = "我来到北京清华大学";
|
||||
//string title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏";
|
||||
//string title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女";
|
||||
string title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女";
|
||||
string title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女";
|
||||
//string title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女";
|
||||
cout<<title<<endl;
|
||||
//segment.cutDAG(title, res);
|
||||
segment.extract(title, res);
|
||||
|
@ -2,7 +2,9 @@
|
||||
#define SEGMENT_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "Trie.h"
|
||||
#include "globals.h"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
@ -28,6 +30,8 @@ namespace CppJieba
|
||||
bool _calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res);
|
||||
bool _cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res);
|
||||
bool _filter(vector<string>& utf8Strs);
|
||||
bool _filterSingleWord(vector<string>& utf8Strs);
|
||||
bool _filterSubstr(vector<string>& utf8Strs);
|
||||
static bool _pair_compare(const pair<string, double>& a, const pair<string, double>& b);
|
||||
bool _extract(const vector<string>& words, vector<string>& keywords, uint topN);
|
||||
|
||||
|
16
globals.h
16
globals.h
@ -2,13 +2,19 @@
|
||||
#define GLOBALS_H
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
//file path
|
||||
const char * const DICT_FILE_PATH = "dict.txt";
|
||||
namespace CppJieba
|
||||
{
|
||||
//file path
|
||||
const char * const DICT_FILE_PATH = "dict.txt";
|
||||
|
||||
//typedefs
|
||||
typedef uint16_t ChUnicode;
|
||||
typedef unsigned int uint;
|
||||
//typedefs
|
||||
typedef uint16_t ChUnicode;
|
||||
typedef unsigned int uint;
|
||||
typedef std::vector<std::string>::iterator VSI;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user