mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Ensure a single-character Chinese word in the user dictionary is not ignored in mixsegment.hpp
This commit is contained in:
parent
5174ac098a
commit
5b654f66db
@ -50,6 +50,13 @@ namespace CppJieba
|
|||||||
TrieType * _trie;
|
TrieType * _trie;
|
||||||
|
|
||||||
double _minWeight;
|
double _minWeight;
|
||||||
|
private:
|
||||||
|
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
|
||||||
|
public:
|
||||||
|
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
|
||||||
|
{
|
||||||
|
return isIn(_userDictSingleChineseWord, word);
|
||||||
|
}
|
||||||
public:
|
public:
|
||||||
double getMinWeight() const {return _minWeight;};
|
double getMinWeight() const {return _minWeight;};
|
||||||
|
|
||||||
@ -84,7 +91,7 @@ namespace CppJieba
|
|||||||
if(userDictPath.size())
|
if(userDictPath.size())
|
||||||
{
|
{
|
||||||
double maxWeight = _findMaxWeight(_nodeInfos);
|
double maxWeight = _findMaxWeight(_nodeInfos);
|
||||||
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
|
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
|
||||||
}
|
}
|
||||||
_shrink(_nodeInfos);
|
_shrink(_nodeInfos);
|
||||||
_trie = _creatTrie(_nodeInfos);
|
_trie = _creatTrie(_nodeInfos);
|
||||||
@ -118,7 +125,7 @@ namespace CppJieba
|
|||||||
TrieType * trie = new TrieType(words, valuePointers);
|
TrieType * trie = new TrieType(words, valuePointers);
|
||||||
return trie;
|
return trie;
|
||||||
}
|
}
|
||||||
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag, vector<DictUnit>& nodeInfos) const
|
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
|
||||||
{
|
{
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
assert(ifs);
|
assert(ifs);
|
||||||
@ -132,9 +139,13 @@ namespace CppJieba
|
|||||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if(nodeInfo.word.size() == 1)
|
||||||
|
{
|
||||||
|
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
|
||||||
|
}
|
||||||
nodeInfo.weight = defaultWeight;
|
nodeInfo.weight = defaultWeight;
|
||||||
nodeInfo.tag = defaultTag;
|
nodeInfo.tag = defaultTag;
|
||||||
nodeInfos.push_back(nodeInfo);
|
_nodeInfos.push_back(nodeInfo);
|
||||||
}
|
}
|
||||||
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
||||||
}
|
}
|
||||||
|
@ -53,6 +53,10 @@ namespace CppJieba
|
|||||||
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
||||||
return _setInitFlag(true);
|
return _setInitFlag(true);
|
||||||
}
|
}
|
||||||
|
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
|
||||||
|
{
|
||||||
|
return _dictTrie.isUserDictSingleChineseWord(value);
|
||||||
|
}
|
||||||
public:
|
public:
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||||
|
@ -56,15 +56,15 @@ namespace CppJieba
|
|||||||
for (size_t i = 0, j = 0; i < words.size(); i++)
|
for (size_t i = 0, j = 0; i < words.size(); i++)
|
||||||
{
|
{
|
||||||
//if mp get a word, it's ok, put it into result
|
//if mp get a word, it's ok, put it into result
|
||||||
if (1 != words[i].size())
|
if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0])))
|
||||||
{
|
{
|
||||||
res.push_back(words[i]);
|
res.push_back(words[i]);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// if mp get a single one, collect it in sequence
|
// if mp get a single one and it is not in userdict, collect it in sequence
|
||||||
j = i;
|
j = i;
|
||||||
while (j < words.size() && words[j].size() == 1)
|
while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0]))
|
||||||
{
|
{
|
||||||
piece.push_back(words[j][0]);
|
piece.push_back(words[j][0]);
|
||||||
j++;
|
j++;
|
||||||
@ -77,7 +77,7 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//put hmm result to return
|
//put hmm result to result
|
||||||
for (size_t k = 0; k < hmmRes.size(); k++)
|
for (size_t k = 0; k < hmmRes.size(); k++)
|
||||||
{
|
{
|
||||||
res.push_back(hmmRes[k]);
|
res.push_back(hmmRes[k]);
|
||||||
|
1
test/testdata/testlines.utf8
vendored
1
test/testdata/testlines.utf8
vendored
@ -5,3 +5,4 @@
|
|||||||
人事处女干事
|
人事处女干事
|
||||||
去医院做B超,编号123
|
去医院做B超,编号123
|
||||||
令狐冲是云计算行业的专家
|
令狐冲是云计算行业的专家
|
||||||
|
AB
|
||||||
|
2
test/testdata/userdict.utf8
vendored
2
test/testdata/userdict.utf8
vendored
@ -1,2 +1,4 @@
|
|||||||
云计算
|
云计算
|
||||||
韩玉鉴赏
|
韩玉鉴赏
|
||||||
|
A
|
||||||
|
B
|
||||||
|
Loading…
x
Reference in New Issue
Block a user