mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
[QuerySegment] add SetMaxWordLen and GetMaxWordLen, and skip the secondary (fine-grained) Cut for all-ASCII words such as English words
This commit is contained in:
parent
087f3248f8
commit
c3fd357a6d
@ -1,6 +1,11 @@
|
|||||||
# CppJieba ChangeLog
|
# CppJieba ChangeLog
|
||||||
|
|
||||||
# v4.0.0
|
## next version
|
||||||
|
|
||||||
|
1. QuerySegment切词时加一层判断,当长词满足IsAllAscii(比如英文单词)时,不进行细粒度分词。
|
||||||
|
2. QuerySegment新增SetMaxWordLen和GetMaxWordLen接口。
|
||||||
|
|
||||||
|
## v4.0.0
|
||||||
|
|
||||||
1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。
|
1. 支持多个userdict载入,多词典路径用英文冒号(:)作为分隔符,就当是向环境变量PATH致敬,哈哈。
|
||||||
2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。
|
2. userdict是不带权重的,之前对于新的userword默认设置词频权重为最大值,现已支持可配置,默认使用中位值。
|
||||||
|
@ -45,7 +45,7 @@ class QuerySegment: public SegmentBase {
|
|||||||
vector<Unicode> fullRes;
|
vector<Unicode> fullRes;
|
||||||
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||||
// if it's too long, Cut with fullSeg_, put fullRes in res
|
// if it's too long, Cut with fullSeg_, put fullRes in res
|
||||||
if (mixResItr->size() > maxWordLen_) {
|
if (mixResItr->size() > maxWordLen_ && !IsAllAscii(*mixResItr)) {
|
||||||
fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes);
|
fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes);
|
||||||
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
|
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
|
||||||
res.push_back(*fullResItr);
|
res.push_back(*fullResItr);
|
||||||
@ -58,12 +58,26 @@ class QuerySegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Sets maxWordLen_, the threshold used by the segmentation loop in this class:
// an entry of mixRes whose size() is greater than maxWordLen_ (and which is
// not all-ASCII) is cut again with fullSeg_. The length is compared against
// size() of a Unicode word, i.e. it counts Unicode elements, not UTF-8 bytes.
void SetMaxWordLen(size_t len) {
|
||||||
|
maxWordLen_ = len;
|
||||||
|
}
|
||||||
|
// Returns the current value of maxWordLen_ (the word-length threshold above
// which a word is cut again with fullSeg_).
size_t GetMaxWordLen() const {
|
||||||
|
return maxWordLen_;
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
|
// Returns true when every element of s is below 0x80 (the 7-bit ASCII range),
// and false as soon as one element is 0x80 or above. An empty s yields true.
// The segmentation loop uses this to leave long all-ASCII words (for example
// English words) whole instead of cutting them again with fullSeg_.
bool IsAllAscii(const Unicode& s) const {
|
||||||
|
for(size_t i = 0; i < s.size(); i++) {
|
||||||
|
// Any element outside the ASCII range makes the whole word non-ASCII.
if (s[i] >= 0x80) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
MixSegment mixSeg_;
|
MixSegment mixSeg_;
|
||||||
FullSegment fullSeg_;
|
FullSegment fullSeg_;
|
||||||
size_t maxWordLen_;
|
size_t maxWordLen_;
|
||||||
|
}; // QuerySegment
|
||||||
|
|
||||||
};
|
} // namespace cppjieba
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
2
test/testdata/userdict.english
vendored
Normal file
2
test/testdata/userdict.english
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
in
|
||||||
|
internal
|
@ -225,7 +225,7 @@ TEST(QuerySegment, Test1) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(QuerySegment, Test2) {
|
TEST(QuerySegment, Test2) {
|
||||||
QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8", 3);
|
QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8:../test/testdata/userdict.english", 3);
|
||||||
|
|
||||||
{
|
{
|
||||||
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
||||||
@ -251,6 +251,21 @@ TEST(QuerySegment, Test2) {
|
|||||||
ASSERT_EQ(s1, s2);
|
ASSERT_EQ(s1, s2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
vector<string> words;
|
||||||
|
segment.Cut("internal", words);
|
||||||
|
string s = join(words.begin(), words.end(), "/");
|
||||||
|
ASSERT_EQ("internal", s);
|
||||||
|
}
|
||||||
|
|
||||||
|
segment.SetMaxWordLen(5);
|
||||||
|
|
||||||
|
{
|
||||||
|
vector<string> words;
|
||||||
|
segment.Cut("中国科学院", words);
|
||||||
|
string s = join(words.begin(), words.end(), "/");
|
||||||
|
ASSERT_EQ("中国科学院", s);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(LevelSegmentTest, Test0) {
|
TEST(LevelSegmentTest, Test0) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user