Make sure a single-character Chinese word in the userdict is not ignored in mixsegment.hpp

This commit is contained in:
wyy 2014-05-17 16:22:54 +08:00
parent 5174ac098a
commit 5b654f66db
5 changed files with 25 additions and 7 deletions

View File

@ -50,6 +50,13 @@ namespace CppJieba
TrieType * _trie; TrieType * _trie;
double _minWeight; double _minWeight;
private:
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
public:
// Tells whether `word` is recorded in _userDictSingleChineseWord.
// That set is filled in _loadUserDict with word[0] of every user-dictionary
// entry whose word has exactly one element.
// The lookup is delegated to isIn() (presumably a plain membership test;
// its definition is not in this part of the file -- confirm).
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
{
return isIn(_userDictSingleChineseWord, word);
}
public: public:
double getMinWeight() const {return _minWeight;}; double getMinWeight() const {return _minWeight;};
@ -84,7 +91,7 @@ namespace CppJieba
if(userDictPath.size()) if(userDictPath.size())
{ {
double maxWeight = _findMaxWeight(_nodeInfos); double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos); _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
} }
_shrink(_nodeInfos); _shrink(_nodeInfos);
_trie = _creatTrie(_nodeInfos); _trie = _creatTrie(_nodeInfos);
@ -118,7 +125,7 @@ namespace CppJieba
TrieType * trie = new TrieType(words, valuePointers); TrieType * trie = new TrieType(words, valuePointers);
return trie; return trie;
} }
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag, vector<DictUnit>& nodeInfos) const void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
{ {
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
assert(ifs); assert(ifs);
@ -132,9 +139,13 @@ namespace CppJieba
LogError("line[%u:%s] illegal.", lineno, line.c_str()); LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue; continue;
} }
if(nodeInfo.word.size() == 1)
{
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
}
nodeInfo.weight = defaultWeight; nodeInfo.weight = defaultWeight;
nodeInfo.tag = defaultTag; nodeInfo.tag = defaultTag;
nodeInfos.push_back(nodeInfo); _nodeInfos.push_back(nodeInfo);
} }
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
} }

View File

@ -53,6 +53,10 @@ namespace CppJieba
LogInfo("MPSegment init(%s) ok", dictPath.c_str()); LogInfo("MPSegment init(%s) ok", dictPath.c_str());
return _setInitFlag(true); return _setInitFlag(true);
} }
// Pass-through query: forwards `value` to
// _dictTrie.isUserDictSingleChineseWord() and returns its answer unchanged,
// so callers of this segmenter can ask the question without touching the
// dictionary object directly.
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
{
return _dictTrie.isUserDictSingleChineseWord(value);
}
public: public:
using SegmentBase::cut; using SegmentBase::cut;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const

View File

@ -56,15 +56,15 @@ namespace CppJieba
for (size_t i = 0, j = 0; i < words.size(); i++) for (size_t i = 0, j = 0; i < words.size(); i++)
{ {
//if mp get a word, it's ok, put it into result //if mp get a word, it's ok, put it into result
if (1 != words[i].size()) if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0])))
{ {
res.push_back(words[i]); res.push_back(words[i]);
continue; continue;
} }
// if mp get a single one, collect it in sequence // if mp get a single one and it is not in userdict, collect it in sequence
j = i; j = i;
while (j < words.size() && words[j].size() == 1) while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0]))
{ {
piece.push_back(words[j][0]); piece.push_back(words[j][0]);
j++; j++;
@ -77,7 +77,7 @@ namespace CppJieba
return false; return false;
} }
//put hmm result to return //put hmm result to result
for (size_t k = 0; k < hmmRes.size(); k++) for (size_t k = 0; k < hmmRes.size(); k++)
{ {
res.push_back(hmmRes[k]); res.push_back(hmmRes[k]);

View File

@ -5,3 +5,4 @@
人事处女干事 人事处女干事
去医院做B超编号123 去医院做B超编号123
令狐冲是云计算行业的专家 令狐冲是云计算行业的专家
AB

View File

@ -1,2 +1,4 @@
云计算 云计算
韩玉鉴赏 韩玉鉴赏
A
B