diff --git a/ChangeLog.md b/ChangeLog.md index f289fb4..23d5193 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,11 @@ # CppJieba ChangeLog -# v4.0.0 +## next version + +1. QuerySegment切词时加一层判断,当长词满足IsAllAscii(比如英文单词)时,不进行细粒度分词。 +2. QuerySegment新增SetMaxWordLen和GetMaxWordLen接口。 + +## v4.0.0 1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。 2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。 diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 354574b..56083d2 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -45,7 +45,7 @@ class QuerySegment: public SegmentBase { vector fullRes; for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { // if it's too long, Cut with fullSeg_, put fullRes in res - if (mixResItr->size() > maxWordLen_) { + if (mixResItr->size() > maxWordLen_ && !IsAllAscii(*mixResItr)) { fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes); for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { res.push_back(*fullResItr); @@ -58,12 +58,26 @@ class QuerySegment: public SegmentBase { } } } + void SetMaxWordLen(size_t len) { + maxWordLen_ = len; + } + size_t GetMaxWordLen() const { + return maxWordLen_; + } private: + bool IsAllAscii(const Unicode& s) const { + for(size_t i = 0; i < s.size(); i++) { + if (s[i] >= 0x80) { + return false; + } + } + return true; + } MixSegment mixSeg_; FullSegment fullSeg_; size_t maxWordLen_; +}; // QuerySegment -}; -} +} // namespace cppjieba #endif diff --git a/test/testdata/userdict.english b/test/testdata/userdict.english new file mode 100644 index 0000000..b0784a4 --- /dev/null +++ b/test/testdata/userdict.english @@ -0,0 +1,2 @@ +in +internal diff --git a/test/unittest/segments_test.cpp b/test/unittest/segments_test.cpp index cacae35..0c6b3ce 100644 --- a/test/unittest/segments_test.cpp +++ b/test/unittest/segments_test.cpp @@ -225,7 +225,7 @@ TEST(QuerySegment, Test1) { } TEST(QuerySegment, Test2) { - QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8", 3); + QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8:../test/testdata/userdict.english", 3); { const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; @@ -251,6 +251,21 @@ TEST(QuerySegment, Test2) { ASSERT_EQ(s1, s2); } + { + vector words; + segment.Cut("internal", words); + string s = join(words.begin(), words.end(), "/"); + ASSERT_EQ("internal", s); + } + + segment.SetMaxWordLen(5); + + { + vector words; + segment.Cut("中国科学院", words); + string s = join(words.begin(), words.end(), "/"); + ASSERT_EQ("中国科学院", s); + } } TEST(LevelSegmentTest, Test0) {