From de776afa3fb04b01cb0281cccc09e761b5c7b89d Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Fri, 12 Jul 2013 17:10:34 +0800 Subject: [PATCH] add _filterDup --- Makefile | 2 +- demo.cpp | 8 ++++---- src/Segment.cpp | 26 ++++++++++++++++++++++++++ src/Segment.h | 1 + 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 69ecac6..0536b1c 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ all: demo $(CC) $(CCOPT) $< demo: $(OBJS) $(SRCLIB) - $(DOLINK) + $(DOLINK) -liconv $(SRCLIB): cd $(SRCDIR) && $(MAKE) diff --git a/demo.cpp b/demo.cpp index 11f9e5c..40f0e42 100644 --- a/demo.cpp +++ b/demo.cpp @@ -20,19 +20,19 @@ int main() string title; title = "我来到北京清华大学"; res.clear(); - segment.extract(title, res); + segment.extract(title, res, 10); title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; res.clear(); - segment.extract(title, res); + segment.extract(title, res, 10); title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女"; res.clear(); - segment.extract(title, res); + segment.extract(title, res, 10); title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女"; res.clear(); - segment.extract(title, res); + segment.extract(title, res, 10); segment.destroy(); diff --git a/src/Segment.cpp b/src/Segment.cpp index 2c3b666..41479b2 100644 --- a/src/Segment.cpp +++ b/src/Segment.cpp @@ -246,6 +246,14 @@ namespace CppJieba bool Segment::_filter(vector& utf8Strs) { bool retFlag; + retFlag = _filterDuplicate(utf8Strs); + if(!retFlag) + { + LogError("_filterDuplicate failed."); + return false; + } + LogDebug(string_format("_filterDuplicate res:[%s]", joinStr(utf8Strs, ",").c_str())); + retFlag = _filterSingleWord(utf8Strs); if(!retFlag) { @@ -265,6 +273,24 @@ namespace CppJieba return true; } + bool Segment::_filterDuplicate(vector& utf8Strs) + { + set st; + for(VSI it = utf8Strs.begin(); it != utf8Strs.end(); ) + { + if(st.find(*it) != st.end()) + { + it = utf8Strs.erase(it); + } + else + { + st.insert(*it); + it++; + } + } + return true; + } + bool Segment::_filterSingleWord(vector& utf8Strs) { for(vector::iterator it = utf8Strs.begin(); it != utf8Strs.end();) diff --git a/src/Segment.h b/src/Segment.h index 7f73b77..f583b1b 100644 --- a/src/Segment.h +++ b/src/Segment.h @@ -30,6 +30,7 @@ namespace CppJieba bool _calcDP(const string& uniStr, const vector >& dag, vector >& res); bool _cutDAG(const string& uniStr, const vector >& dp, vector& res); bool _filter(vector& utf8Strs); + bool _filterDuplicate(vector& utf8Strs); bool _filterSingleWord(vector& utf8Strs); bool _filterSubstr(vector& utf8Strs); static bool _pair_compare(const pair& a, const pair& b);