add optional argument: hmm

This commit is contained in:
yanyiwu 2015-09-13 17:28:49 +08:00
parent 14974d51b4
commit f98e94869c
6 changed files with 21 additions and 23 deletions

View File

@ -6,6 +6,7 @@
2. 新增层次分词器: LevelSegment 。 2. 新增层次分词器: LevelSegment 。
3. 增加MPSegment的细粒度分词功能。 3. 增加MPSegment的细粒度分词功能。
4. 增加 class Jieba ,提供可读性更好的接口。 4. 增加 class Jieba ,提供可读性更好的接口。
5. 放弃了统一接口 ISegment ,因为统一的接口限制了分词方式的灵活性,限制了一些功能的增加。
## v3.1.0 ## v3.1.0

View File

@ -34,13 +34,13 @@ class Application {
CutMethod method = METHOD_MIX) const { CutMethod method = METHOD_MIX) const {
switch(method) { switch(method) {
case METHOD_MP: case METHOD_MP:
jieba_.Cut(sentence, false, words); jieba_.Cut(sentence, words);
break; break;
case METHOD_HMM: case METHOD_HMM:
jieba_.CutHMM(sentence, words); jieba_.CutHMM(sentence, words);
break; break;
case METHOD_MIX: case METHOD_MIX:
jieba_.Cut(sentence, true, words); jieba_.Cut(sentence, words);
break; break;
case METHOD_FULL: case METHOD_FULL:
jieba_.CutAll(sentence, words); jieba_.CutAll(sentence, words);

View File

@ -21,18 +21,14 @@ class Jieba {
~Jieba() { ~Jieba() {
} }
void Cut(const string& sentence, bool hmm, vector<string>& words) const { void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
if (hmm) { mix_seg_.cut(sentence, words, hmm);
mix_seg_.cut(sentence, words);
} else {
mp_seg_.cut(sentence, words);
}
} }
void CutAll(const string& sentence, vector<string>& words) const { void CutAll(const string& sentence, vector<string>& words) const {
full_seg_.cut(sentence, words); full_seg_.cut(sentence, words);
} }
void CutForSearch(const string& sentence, vector<string>& words) const { void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
query_seg_.cut(sentence, words); query_seg_.cut(sentence, words, hmm);
} }
void CutHMM(const string& sentence, vector<string>& words) const { void CutHMM(const string& sentence, vector<string>& words) const {
hmm_seg_.cut(sentence, words); hmm_seg_.cut(sentence, words);

View File

@ -31,9 +31,9 @@ class KeywordExtractor {
~KeywordExtractor() { ~KeywordExtractor() {
} }
bool extract(const string& str, vector<string>& keywords, size_t topN) const { bool extract(const string& sentence, vector<string>& keywords, size_t topN) const {
vector<pair<string, double> > topWords; vector<pair<string, double> > topWords;
if(!extract(str, topWords, topN)) { if(!extract(sentence, topWords, topN)) {
return false; return false;
} }
for(size_t i = 0; i < topWords.size(); i++) { for(size_t i = 0; i < topWords.size(); i++) {
@ -42,9 +42,9 @@ class KeywordExtractor {
return true; return true;
} }
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const { bool extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
vector<string> words; vector<string> words;
segment_.cut(str, words); segment_.cut(sentence, words);
map<string, double> wordmap; map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) { for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {

View File

@ -21,23 +21,25 @@ class MixSegment: public SegmentBase {
~MixSegment() { ~MixSegment() {
} }
void cut(const string& sentence, void cut(const string& sentence, vector<string>& words, bool hmm = true) const {
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence); PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range; PreFilter::Range range;
vector<Unicode> uwords; vector<Unicode> uwords;
uwords.reserve(sentence.size()); uwords.reserve(sentence.size());
while (pre_filter.HasNext()) { while (pre_filter.HasNext()) {
range = pre_filter.Next(); range = pre_filter.Next();
cut(range.begin, range.end, uwords); cut(range.begin, range.end, uwords, hmm);
} }
TransCode::encode(uwords, words); TransCode::encode(uwords, words);
} }
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const { void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
vector<Unicode> words; vector<Unicode> words;
words.reserve(end - begin); words.reserve(end - begin);
mpSeg_.cut(begin, end, words); mpSeg_.cut(begin, end, words);
if (!hmm) {
return;
}
vector<Unicode> hmmRes; vector<Unicode> hmmRes;
hmmRes.reserve(end - begin); hmmRes.reserve(end - begin);

View File

@ -26,22 +26,21 @@ class QuerySegment: public SegmentBase {
} }
~QuerySegment() { ~QuerySegment() {
} }
void cut(const string& sentence, void cut(const string& sentence, vector<string>& words, bool hmm = true) const {
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence); PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range; PreFilter::Range range;
vector<Unicode> uwords; vector<Unicode> uwords;
uwords.reserve(sentence.size()); uwords.reserve(sentence.size());
while (pre_filter.HasNext()) { while (pre_filter.HasNext()) {
range = pre_filter.Next(); range = pre_filter.Next();
cut(range.begin, range.end, uwords); cut(range.begin, range.end, uwords, hmm);
} }
TransCode::encode(uwords, words); TransCode::encode(uwords, words);
} }
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const { void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
//use mix cut first //use mix cut first
vector<Unicode> mixRes; vector<Unicode> mixRes;
mixSeg_.cut(begin, end, mixRes); mixSeg_.cut(begin, end, mixRes, hmm);
vector<Unicode> fullRes; vector<Unicode> fullRes;
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {