From 5b654f66dbcd24b2356051e4754c253433b5955b Mon Sep 17 00:00:00 2001 From: wyy Date: Sat, 17 May 2014 16:22:54 +0800 Subject: [PATCH] make single-character Chinese words in userdict not be ignored in MixSegment.hpp --- src/DictTrie.hpp | 17 ++++++++++++++--- src/MPSegment.hpp | 4 ++++ src/MixSegment.hpp | 8 ++++---- test/testdata/testlines.utf8 | 1 + test/testdata/userdict.utf8 | 2 ++ 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 53a6291..f655d7d 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -50,6 +50,13 @@ namespace CppJieba TrieType * _trie; double _minWeight; + private: + unordered_set _userDictSingleChineseWord; + public: + bool isUserDictSingleChineseWord(const Unicode::value_type& word) const + { + return isIn(_userDictSingleChineseWord, word); + } public: double getMinWeight() const {return _minWeight;}; @@ -84,7 +91,7 @@ namespace CppJieba if(userDictPath.size()) { double maxWeight = _findMaxWeight(_nodeInfos); - _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos); + _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); } _shrink(_nodeInfos); _trie = _creatTrie(_nodeInfos); @@ -118,7 +125,7 @@ namespace CppJieba TrieType * trie = new TrieType(words, valuePointers); return trie; } - void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag, vector& nodeInfos) const + void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) { ifstream ifs(filePath.c_str()); assert(ifs); @@ -132,9 +139,13 @@ namespace CppJieba LogError("line[%u:%s] illegal.", lineno, line.c_str()); continue; } + if(nodeInfo.word.size() == 1) + { + _userDictSingleChineseWord.insert(nodeInfo.word[0]); + } nodeInfo.weight = defaultWeight; nodeInfo.tag = defaultTag; - nodeInfos.push_back(nodeInfo); + _nodeInfos.push_back(nodeInfo); } LogInfo("load userdict[%s] ok. 
lines[%u]", filePath.c_str(), lineno); } diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index f132b61..3ae78b9 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -53,6 +53,10 @@ namespace CppJieba LogInfo("MPSegment init(%s) ok", dictPath.c_str()); return _setInitFlag(true); } + bool isUserDictSingleChineseWord(const Unicode::value_type & value) const + { + return _dictTrie.isUserDictSingleChineseWord(value); + } public: using SegmentBase::cut; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index cc6a02a..e613c8f 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -56,15 +56,15 @@ namespace CppJieba for (size_t i = 0, j = 0; i < words.size(); i++) { //if mp get a word, it's ok, put it into result - if (1 != words[i].size()) + if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) { res.push_back(words[i]); continue; } - // if mp get a single one, collect it in sequence + // if mp get a single one and it is not in userdict, collect it in sequence j = i; - while (j < words.size() && words[j].size() == 1) + while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) { piece.push_back(words[j][0]); j++; @@ -77,7 +77,7 @@ namespace CppJieba return false; } - //put hmm result to return + //put hmm result to result for (size_t k = 0; k < hmmRes.size(); k++) { res.push_back(hmmRes[k]); diff --git a/test/testdata/testlines.utf8 b/test/testdata/testlines.utf8 index 18510c8..769e0d2 100644 --- a/test/testdata/testlines.utf8 +++ b/test/testdata/testlines.utf8 @@ -5,3 +5,4 @@ 人事处女干事 去医院做B超,编号123 令狐冲是云计算行业的专家 +AB diff --git a/test/testdata/userdict.utf8 b/test/testdata/userdict.utf8 index 0e77ef4..b34db3b 100644 --- a/test/testdata/userdict.utf8 +++ b/test/testdata/userdict.utf8 @@ -1,2 +1,4 @@ 云计算 韩玉鉴赏 +A +B