mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
[QuerySegment] add SetMaxWordLen, GetMaxWordLen, and filter the English sentence in secondary Cut
This commit is contained in:
parent
087f3248f8
commit
c3fd357a6d
@ -1,6 +1,11 @@
|
||||
# CppJieba ChangeLog
|
||||
|
||||
# v4.0.0
|
||||
## next version
|
||||
|
||||
1. QuerySegment切词时加一层判断,当长词满足IsAllAscii(比如英文单词)时,不进行细粒度分词。
|
||||
2. QuerySegment新增SetMaxWordLen和GetMaxWordLen接口。
|
||||
|
||||
## v4.0.0
|
||||
|
||||
1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。
|
||||
2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。
|
||||
|
@ -45,7 +45,7 @@ class QuerySegment: public SegmentBase {
|
||||
vector<Unicode> fullRes;
|
||||
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||
// if it's too long, Cut with fullSeg_, put fullRes in res
|
||||
if (mixResItr->size() > maxWordLen_) {
|
||||
if (mixResItr->size() > maxWordLen_ && !IsAllAscii(*mixResItr)) {
|
||||
fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes);
|
||||
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
|
||||
res.push_back(*fullResItr);
|
||||
@ -58,12 +58,26 @@ class QuerySegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
}
|
||||
void SetMaxWordLen(size_t len) {
|
||||
maxWordLen_ = len;
|
||||
}
|
||||
size_t GetMaxWordLen() const {
|
||||
return maxWordLen_;
|
||||
}
|
||||
private:
|
||||
bool IsAllAscii(const Unicode& s) const {
|
||||
for(size_t i = 0; i < s.size(); i++) {
|
||||
if (s[i] >= 0x80) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
MixSegment mixSeg_;
|
||||
FullSegment fullSeg_;
|
||||
size_t maxWordLen_;
|
||||
}; // QuerySegment
|
||||
|
||||
};
|
||||
}
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
||||
|
2
test/testdata/userdict.english
vendored
Normal file
2
test/testdata/userdict.english
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
in
|
||||
internal
|
@ -225,7 +225,7 @@ TEST(QuerySegment, Test1) {
|
||||
}
|
||||
|
||||
TEST(QuerySegment, Test2) {
|
||||
QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8", 3);
|
||||
QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8:../test/testdata/userdict.english", 3);
|
||||
|
||||
{
|
||||
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
||||
@ -251,6 +251,21 @@ TEST(QuerySegment, Test2) {
|
||||
ASSERT_EQ(s1, s2);
|
||||
}
|
||||
|
||||
{
|
||||
vector<string> words;
|
||||
segment.Cut("internal", words);
|
||||
string s = join(words.begin(), words.end(), "/");
|
||||
ASSERT_EQ("internal", s);
|
||||
}
|
||||
|
||||
segment.SetMaxWordLen(5);
|
||||
|
||||
{
|
||||
vector<string> words;
|
||||
segment.Cut("中国科学院", words);
|
||||
string s = join(words.begin(), words.end(), "/");
|
||||
ASSERT_EQ("中国科学院", s);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(LevelSegmentTest, Test0) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user