mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add optional argument: hmm
This commit is contained in:
parent
14974d51b4
commit
f98e94869c
@ -6,6 +6,7 @@
|
|||||||
2. 新增层次分词器: LevelSegment 。
|
2. 新增层次分词器: LevelSegment 。
|
||||||
3. 增加MPSegment的细粒度分词功能。
|
3. 增加MPSegment的细粒度分词功能。
|
||||||
4. 增加 class Jieba ,提供可读性更好的接口。
|
4. 增加 class Jieba ,提供可读性更好的接口。
|
||||||
|
5. 放弃了统一接口ISegment,因为统一的接口限制了分词方式的灵活性,限制了一些功能的增加。
|
||||||
|
|
||||||
## v3.1.0
|
## v3.1.0
|
||||||
|
|
||||||
|
@ -34,13 +34,13 @@ class Application {
|
|||||||
CutMethod method = METHOD_MIX) const {
|
CutMethod method = METHOD_MIX) const {
|
||||||
switch(method) {
|
switch(method) {
|
||||||
case METHOD_MP:
|
case METHOD_MP:
|
||||||
jieba_.Cut(sentence, false, words);
|
jieba_.Cut(sentence, words);
|
||||||
break;
|
break;
|
||||||
case METHOD_HMM:
|
case METHOD_HMM:
|
||||||
jieba_.CutHMM(sentence, words);
|
jieba_.CutHMM(sentence, words);
|
||||||
break;
|
break;
|
||||||
case METHOD_MIX:
|
case METHOD_MIX:
|
||||||
jieba_.Cut(sentence, true, words);
|
jieba_.Cut(sentence, words);
|
||||||
break;
|
break;
|
||||||
case METHOD_FULL:
|
case METHOD_FULL:
|
||||||
jieba_.CutAll(sentence, words);
|
jieba_.CutAll(sentence, words);
|
||||||
|
@ -21,18 +21,14 @@ class Jieba {
|
|||||||
~Jieba() {
|
~Jieba() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Cut(const string& sentence, bool hmm, vector<string>& words) const {
|
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||||
if (hmm) {
|
mix_seg_.cut(sentence, words, hmm);
|
||||||
mix_seg_.cut(sentence, words);
|
|
||||||
} else {
|
|
||||||
mp_seg_.cut(sentence, words);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
void CutAll(const string& sentence, vector<string>& words) const {
|
void CutAll(const string& sentence, vector<string>& words) const {
|
||||||
full_seg_.cut(sentence, words);
|
full_seg_.cut(sentence, words);
|
||||||
}
|
}
|
||||||
void CutForSearch(const string& sentence, vector<string>& words) const {
|
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||||
query_seg_.cut(sentence, words);
|
query_seg_.cut(sentence, words, hmm);
|
||||||
}
|
}
|
||||||
void CutHMM(const string& sentence, vector<string>& words) const {
|
void CutHMM(const string& sentence, vector<string>& words) const {
|
||||||
hmm_seg_.cut(sentence, words);
|
hmm_seg_.cut(sentence, words);
|
||||||
|
@ -31,9 +31,9 @@ class KeywordExtractor {
|
|||||||
~KeywordExtractor() {
|
~KeywordExtractor() {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool extract(const string& str, vector<string>& keywords, size_t topN) const {
|
bool extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
||||||
vector<pair<string, double> > topWords;
|
vector<pair<string, double> > topWords;
|
||||||
if(!extract(str, topWords, topN)) {
|
if(!extract(sentence, topWords, topN)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(size_t i = 0; i < topWords.size(); i++) {
|
for(size_t i = 0; i < topWords.size(); i++) {
|
||||||
@ -42,9 +42,9 @@ class KeywordExtractor {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
|
bool extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
segment_.cut(str, words);
|
segment_.cut(sentence, words);
|
||||||
|
|
||||||
map<string, double> wordmap;
|
map<string, double> wordmap;
|
||||||
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
||||||
|
@ -21,23 +21,25 @@ class MixSegment: public SegmentBase {
|
|||||||
~MixSegment() {
|
~MixSegment() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(const string& sentence,
|
void cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||||
vector<string>& words) const {
|
|
||||||
PreFilter pre_filter(symbols_, sentence);
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
PreFilter::Range range;
|
PreFilter::Range range;
|
||||||
vector<Unicode> uwords;
|
vector<Unicode> uwords;
|
||||||
uwords.reserve(sentence.size());
|
uwords.reserve(sentence.size());
|
||||||
while (pre_filter.HasNext()) {
|
while (pre_filter.HasNext()) {
|
||||||
range = pre_filter.Next();
|
range = pre_filter.Next();
|
||||||
cut(range.begin, range.end, uwords);
|
cut(range.begin, range.end, uwords, hmm);
|
||||||
}
|
}
|
||||||
TransCode::encode(uwords, words);
|
TransCode::encode(uwords, words);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
||||||
vector<Unicode> words;
|
vector<Unicode> words;
|
||||||
words.reserve(end - begin);
|
words.reserve(end - begin);
|
||||||
mpSeg_.cut(begin, end, words);
|
mpSeg_.cut(begin, end, words);
|
||||||
|
if (!hmm) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
vector<Unicode> hmmRes;
|
vector<Unicode> hmmRes;
|
||||||
hmmRes.reserve(end - begin);
|
hmmRes.reserve(end - begin);
|
||||||
|
@ -26,22 +26,21 @@ class QuerySegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
~QuerySegment() {
|
~QuerySegment() {
|
||||||
}
|
}
|
||||||
void cut(const string& sentence,
|
void cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||||
vector<string>& words) const {
|
|
||||||
PreFilter pre_filter(symbols_, sentence);
|
PreFilter pre_filter(symbols_, sentence);
|
||||||
PreFilter::Range range;
|
PreFilter::Range range;
|
||||||
vector<Unicode> uwords;
|
vector<Unicode> uwords;
|
||||||
uwords.reserve(sentence.size());
|
uwords.reserve(sentence.size());
|
||||||
while (pre_filter.HasNext()) {
|
while (pre_filter.HasNext()) {
|
||||||
range = pre_filter.Next();
|
range = pre_filter.Next();
|
||||||
cut(range.begin, range.end, uwords);
|
cut(range.begin, range.end, uwords, hmm);
|
||||||
}
|
}
|
||||||
TransCode::encode(uwords, words);
|
TransCode::encode(uwords, words);
|
||||||
}
|
}
|
||||||
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res, bool hmm) const {
|
||||||
//use mix cut first
|
//use mix cut first
|
||||||
vector<Unicode> mixRes;
|
vector<Unicode> mixRes;
|
||||||
mixSeg_.cut(begin, end, mixRes);
|
mixSeg_.cut(begin, end, mixRes, hmm);
|
||||||
|
|
||||||
vector<Unicode> fullRes;
|
vector<Unicode> fullRes;
|
||||||
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user