Add _filterDuplicate: drop repeated words before the other filters run

This commit is contained in:
gwdwyy 2013-07-12 17:10:34 +08:00
parent 41a4dbdea4
commit de776afa3f
4 changed files with 32 additions and 5 deletions

View File

@ -24,7 +24,7 @@ all: demo
$(CC) $(CCOPT) $<
demo: $(OBJS) $(SRCLIB)
$(DOLINK)
$(DOLINK) -liconv
$(SRCLIB):
cd $(SRCDIR) && $(MAKE)

View File

@ -20,19 +20,19 @@ int main()
string title;
title = "我来到北京清华大学";
res.clear();
segment.extract(title, res);
segment.extract(title, res, 10);
title = "特价camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏";
res.clear();
segment.extract(title, res);
segment.extract(title, res, 10);
title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女";
res.clear();
segment.extract(title, res);
segment.extract(title, res, 10);
title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女";
res.clear();
segment.extract(title, res);
segment.extract(title, res, 10);
segment.destroy();

View File

@ -246,6 +246,14 @@ namespace CppJieba
bool Segment::_filter(vector<string>& utf8Strs)
{
bool retFlag;
retFlag = _filterDuplicate(utf8Strs);
if(!retFlag)
{
LogError("_filterDuplicate failed.");
return false;
}
LogDebug(string_format("_filterDuplicate res:[%s]", joinStr(utf8Strs, ",").c_str()));
retFlag = _filterSingleWord(utf8Strs);
if(!retFlag)
{
@ -265,6 +273,24 @@ namespace CppJieba
return true;
}
/**
 * Removes repeated strings from utf8Strs in place.
 *
 * The first occurrence of every distinct string is kept and the relative
 * order of the kept strings is unchanged; later occurrences are dropped.
 *
 * @param utf8Strs strings to de-duplicate; modified in place.
 * @return always true.
 */
bool Segment::_filterDuplicate(vector<string>& utf8Strs)
{
    set<string> seen;
    // Next slot to be filled with a kept string; everything before it is
    // already de-duplicated.
    vector<string>::iterator keep = utf8Strs.begin();
    for(vector<string>::iterator it = utf8Strs.begin(); it != utf8Strs.end(); ++it)
    {
        // insert() reports whether the value was new, so one lookup is
        // enough to both test and record it.
        if(seen.insert(*it).second)
        {
            if(keep != it)
            {
                // Swap rather than copy: the value left behind at *it is
                // never read again and is either overwritten or erased.
                keep->swap(*it);
            }
            ++keep;
        }
    }
    // Drop the leftover tail with a single erase instead of erasing inside
    // the loop, which would shift the remaining elements on every hit.
    utf8Strs.erase(keep, utf8Strs.end());
    return true;
}
bool Segment::_filterSingleWord(vector<string>& utf8Strs)
{
for(vector<string>::iterator it = utf8Strs.begin(); it != utf8Strs.end();)

View File

@ -30,6 +30,7 @@ namespace CppJieba
bool _calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res);
bool _cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res);
bool _filter(vector<string>& utf8Strs);
bool _filterDuplicate(vector<string>& utf8Strs);
bool _filterSingleWord(vector<string>& utf8Strs);
bool _filterSubstr(vector<string>& utf8Strs);
static bool _pair_compare(const pair<string, double>& a, const pair<string, double>& b);