From f98e94869c875e8dda65670444eb1b5e5db4adf7 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 13 Sep 2015 17:28:49 +0800 Subject: [PATCH] add optional argument: hmm --- ChangeLog.md | 1 + src/Application.hpp | 4 ++-- src/Jieba.hpp | 12 ++++-------- src/KeywordExtractor.hpp | 8 ++++---- src/MixSegment.hpp | 10 ++++++---- src/QuerySegment.hpp | 9 ++++----- 6 files changed, 21 insertions(+), 23 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index ad73af5..b440b5a 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -6,6 +6,7 @@ 2. 新增层次分词器: LevelSegment 。 3. 增加MPSegment的细粒度分词功能。 4. 增加 class Jieba ,提供可读性更好的接口。 +5. 放弃了统一接口ISegment,因为统一的接口限制了分词方式的灵活性,限制了一些功能的增加。 ## v3.1.0 diff --git a/src/Application.hpp b/src/Application.hpp index 0247d9c..1903b1d 100644 --- a/src/Application.hpp +++ b/src/Application.hpp @@ -34,13 +34,13 @@ class Application { CutMethod method = METHOD_MIX) const { switch(method) { case METHOD_MP: - jieba_.Cut(sentence, false, words); + jieba_.Cut(sentence, words); break; case METHOD_HMM: jieba_.CutHMM(sentence, words); break; case METHOD_MIX: - jieba_.Cut(sentence, true, words); + jieba_.Cut(sentence, words); break; case METHOD_FULL: jieba_.CutAll(sentence, words); diff --git a/src/Jieba.hpp b/src/Jieba.hpp index e1e8f7e..c4dce81 100644 --- a/src/Jieba.hpp +++ b/src/Jieba.hpp @@ -21,18 +21,14 @@ class Jieba { ~Jieba() { } - void Cut(const string& sentence, bool hmm, vector& words) const { - if (hmm) { - mix_seg_.cut(sentence, words); - } else { - mp_seg_.cut(sentence, words); - } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.cut(sentence, words, hmm); } void CutAll(const string& sentence, vector& words) const { full_seg_.cut(sentence, words); } - void CutForSearch(const string& sentence, vector& words) const { - query_seg_.cut(sentence, words); + void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { + query_seg_.cut(sentence, words, hmm); } void CutHMM(const string& sentence, vector& words) const { hmm_seg_.cut(sentence, words); diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 29ef466..6bc5bd1 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -31,9 +31,9 @@ class KeywordExtractor { ~KeywordExtractor() { } - bool extract(const string& str, vector& keywords, size_t topN) const { + bool extract(const string& sentence, vector& keywords, size_t topN) const { vector > topWords; - if(!extract(str, topWords, topN)) { + if(!extract(sentence, topWords, topN)) { return false; } for(size_t i = 0; i < topWords.size(); i++) { @@ -42,9 +42,9 @@ class KeywordExtractor { return true; } - bool extract(const string& str, vector >& keywords, size_t topN) const { + bool extract(const string& sentence, vector >& keywords, size_t topN) const { vector words; - segment_.cut(str, words); + segment_.cut(sentence, words); map wordmap; for(vector::iterator iter = words.begin(); iter != words.end(); iter++) { diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index a0dfce6..02e7563 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -21,23 +21,25 @@ class MixSegment: public SegmentBase { ~MixSegment() { } - void cut(const string& sentence, - vector& words) const { + void cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector uwords; uwords.reserve(sentence.size()); while (pre_filter.HasNext()) { range = pre_filter.Next(); - cut(range.begin, range.end, uwords); + cut(range.begin, range.end, uwords, hmm); } TransCode::encode(uwords, words); } - void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { vector words; words.reserve(end - begin); mpSeg_.cut(begin, end, words); + if (!hmm) { + return; + } vector hmmRes; hmmRes.reserve(end - begin); diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index c75fd43..b5d7bb4 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -26,22 +26,21 @@ class QuerySegment: public SegmentBase { } ~QuerySegment() { } - void cut(const string& sentence, - vector& words) const { + void cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector uwords; uwords.reserve(sentence.size()); while (pre_filter.HasNext()) { range = pre_filter.Next(); - cut(range.begin, range.end, uwords); + cut(range.begin, range.end, uwords, hmm); } TransCode::encode(uwords, words); } - void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { //use mix cut first vector mixRes; - mixSeg_.cut(begin, end, mixRes); + mixSeg_.cut(begin, end, mixRes, hmm); vector fullRes; for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {