mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
adding user dict interface
This commit is contained in:
parent
dc96bb3795
commit
2937985243
@ -13,6 +13,7 @@ CppJieba是"结巴"中文分词的C++版本
|
||||
+ 内置分词服务,在linux环境下可安装使用。mac因为没有自带`epoll`,使用示例请看[libcppjieba]。
|
||||
+ [libcppjieba] 最简单易懂的CppJieba头文件库使用示例。
|
||||
+ 项目自带较为完善的单元测试,核心功能中文分词的稳定性接受过线上环境检验。
|
||||
+ 支持加载自定义用户词典。
|
||||
|
||||
## Usage & Example
|
||||
|
||||
@ -181,6 +182,12 @@ Full方法切出所有字典里的词语。
|
||||
|
||||
Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。
|
||||
|
||||
### 自定义用户词典
|
||||
|
||||
```
|
||||
```
|
||||
|
||||
|
||||
### 关键词抽取
|
||||
|
||||
```
|
||||
|
@ -60,10 +60,10 @@ namespace CppJieba
|
||||
_minWeight = MAX_DOUBLE;
|
||||
_setInitFlag(false);
|
||||
}
|
||||
DictTrie(const string& filePath)
|
||||
DictTrie(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
new (this) DictTrie();
|
||||
_setInitFlag(init(filePath));
|
||||
_setInitFlag(init(dictPath, userDictPath));
|
||||
}
|
||||
~DictTrie()
|
||||
{
|
||||
@ -80,9 +80,12 @@ namespace CppJieba
|
||||
_loadDict(dictPath, _nodeInfos);
|
||||
_calculateWeight(_nodeInfos);
|
||||
_minWeight = _findMinWeight(_nodeInfos);
|
||||
|
||||
if(userDictPath.size())
|
||||
{
|
||||
_loadUserDict(dictPath, _minWeight, UNKNOWN_TAG, _nodeInfos);
|
||||
double maxWeight = _findMaxWeight(_nodeInfos);
|
||||
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
|
||||
LogDebug("load userdict[%s] ok.", userDictPath.c_str());
|
||||
}
|
||||
_shrink(_nodeInfos);
|
||||
_trie = _creatTrie(_nodeInfos);
|
||||
@ -167,6 +170,15 @@ namespace CppJieba
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
// Returns the largest `weight` field found among the entries of nodeInfos.
// The scan starts from MIN_DOUBLE, so that value is returned unchanged when
// nodeInfos is empty.
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
|
||||
{
|
||||
double ret = MIN_DOUBLE;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
||||
{
|
||||
ret = max(nodeInfos[i].weight, ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void _calculateWeight(vector<DictUnit>& nodeInfos) const
|
||||
{
|
||||
|
@ -35,20 +35,20 @@ namespace CppJieba
|
||||
|
||||
public:
|
||||
MPSegment(){_setInitFlag(false);};
|
||||
explicit MPSegment(const string& dictPath)
|
||||
explicit MPSegment(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
_setInitFlag(init(dictPath));
|
||||
_setInitFlag(init(dictPath, userDictPath));
|
||||
};
|
||||
virtual ~MPSegment(){};
|
||||
public:
|
||||
bool init(const string& dictPath)
|
||||
bool init(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
if(_getInitFlag())
|
||||
{
|
||||
LogError("already inited before now.");
|
||||
return false;
|
||||
}
|
||||
_dictTrie.init(dictPath);
|
||||
_dictTrie.init(dictPath, userDictPath);
|
||||
assert(_dictTrie);
|
||||
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
||||
return _setInitFlag(true);
|
||||
|
@ -15,17 +15,17 @@ namespace CppJieba
|
||||
HMMSegment _hmmSeg;
|
||||
public:
|
||||
MixSegment(){_setInitFlag(false);};
|
||||
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict)
|
||||
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||
{
|
||||
_setInitFlag(init(mpSegDict, hmmSegDict));
|
||||
_setInitFlag(init(mpSegDict, hmmSegDict, userDict));
|
||||
assert(_getInitFlag());
|
||||
}
|
||||
virtual ~MixSegment(){}
|
||||
public:
|
||||
bool init(const string& mpSegDict, const string& hmmSegDict)
|
||||
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||
{
|
||||
assert(!_getInitFlag());
|
||||
if(!_mpSeg.init(mpSegDict))
|
||||
if(!_mpSeg.init(mpSegDict, userDict))
|
||||
{
|
||||
LogError("_mpSeg init");
|
||||
return false;
|
||||
|
@ -58,6 +58,21 @@ TEST(MixSegmentTest, Test1)
|
||||
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
||||
}
|
||||
|
||||
// Builds a MixSegment with a main dictionary, an HMM model and a user
// dictionary, cuts a sample sentence and prints the resulting words.
// NOTE(review): the only checks are that construction and cut() succeed; the
// segmentation itself is printed, not asserted.
TEST(MixSegmentTest, UserDict)
|
||||
{
|
||||
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
||||
//MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8");
|
||||
ASSERT_TRUE(segment);
|
||||
const char* str = "令狐冲是云计算方面的专家";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
print(words);
|
||||
// NOTE(review): exit(0) terminates the whole process here, so nothing after this call runs — looks like debugging scaffolding; confirm it is intended.
exit(0);
|
||||
|
||||
// NOTE(review): the sample output below is for a different sentence than `str` above — presumably copied from reference documentation; verify.
//* Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
||||
// After loading the custom dictionary: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
||||
}
|
||||
|
||||
TEST(MPSegmentTest, Test1)
|
||||
{
|
||||
MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8");;
|
||||
|
@ -54,3 +54,13 @@ TEST(DictTrieTest, Test1)
|
||||
// print(vec);
|
||||
}
|
||||
|
||||
// Builds a DictTrie from DICT_FILE, decodes the word "云计算" to Unicode,
// looks it up in the trie and prints what find() yields.
// NOTE(review): despite the test name, no user dictionary path is passed to
// the constructor here — confirm whether that is intended.
TEST(DictTrieTest, UserDict)
|
||||
{
|
||||
DictTrie trie(DICT_FILE);
|
||||
ASSERT_TRUE(trie);
|
||||
string word = "云计算";
|
||||
Unicode unicode;
|
||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||
// NOTE(review): the result of find() is dereferenced without any check — confirm it cannot be null/invalid when the word is absent.
print((*trie.find(unicode.begin(), unicode.end())));
|
||||
// NOTE(review): exit(0) terminates the whole process here, so nothing after this call runs — looks like debugging scaffolding; confirm it is intended.
exit(0);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user