[QuerySegment] Add SetMaxWordLen and GetMaxWordLen, and skip all-ASCII words (e.g. English) in the secondary Cut

This commit is contained in:
yanyiwu 2015-10-29 14:23:01 +08:00
parent 087f3248f8
commit c3fd357a6d
4 changed files with 41 additions and 5 deletions

View File

@ -1,6 +1,11 @@
# CppJieba ChangeLog
# v4.0.0
## next version
1. QuerySegment切词时加一层判断,当长词满足IsAllAscii(比如英文单词)时,不进行细粒度分词。
2. QuerySegment新增SetMaxWordLen和GetMaxWordLen接口。
## v4.0.0
1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。
2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。

View File

@ -45,7 +45,7 @@ class QuerySegment: public SegmentBase {
vector<Unicode> fullRes;
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
// if it's too long, Cut with fullSeg_, put fullRes in res
if (mixResItr->size() > maxWordLen_) {
if (mixResItr->size() > maxWordLen_ && !IsAllAscii(*mixResItr)) {
fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes);
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
res.push_back(*fullResItr);
@ -58,12 +58,26 @@ class QuerySegment: public SegmentBase {
}
}
}
void SetMaxWordLen(size_t len) {
maxWordLen_ = len;
}
size_t GetMaxWordLen() const {
return maxWordLen_;
}
private:
bool IsAllAscii(const Unicode& s) const {
for(size_t i = 0; i < s.size(); i++) {
if (s[i] >= 0x80) {
return false;
}
}
return true;
}
MixSegment mixSeg_;
FullSegment fullSeg_;
size_t maxWordLen_;
}; // QuerySegment
};
}
} // namespace cppjieba
#endif

2
test/testdata/userdict.english vendored Normal file
View File

@ -0,0 +1,2 @@
in
internal

View File

@ -225,7 +225,7 @@ TEST(QuerySegment, Test1) {
}
TEST(QuerySegment, Test2) {
QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8", 3);
QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8:../test/testdata/userdict.english", 3);
{
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
@ -251,6 +251,21 @@ TEST(QuerySegment, Test2) {
ASSERT_EQ(s1, s2);
}
{
vector<string> words;
segment.Cut("internal", words);
string s = join(words.begin(), words.end(), "/");
ASSERT_EQ("internal", s);
}
segment.SetMaxWordLen(5);
{
vector<string> words;
segment.Cut("中国科学院", words);
string s = join(words.begin(), words.end(), "/");
ASSERT_EQ("中国科学院", s);
}
}
TEST(LevelSegmentTest, Test0) {