mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
adding user dict interface
This commit is contained in:
parent
dc96bb3795
commit
2937985243
@ -13,6 +13,7 @@ CppJieba是"结巴"中文分词的C++版本
|
|||||||
+ 内置分词服务,在linux环境下可安装使用。mac因为没有自带`epoll`,使用示例请看[libcppjieba]。
|
+ 内置分词服务,在linux环境下可安装使用。mac因为没有自带`epoll`,使用示例请看[libcppjieba]。
|
||||||
+ [libcppjieba] 最简单易懂的CppJieba头文件库使用示例。
|
+ [libcppjieba] 最简单易懂的CppJieba头文件库使用示例。
|
||||||
+ 项目自带较为完善的单元测试,核心功能中文分词的稳定性接受过线上环境检验。
|
+ 项目自带较为完善的单元测试,核心功能中文分词的稳定性接受过线上环境检验。
|
||||||
|
+ 支持加载自定义用户词典。
|
||||||
|
|
||||||
## Usage & Example
|
## Usage & Example
|
||||||
|
|
||||||
@ -181,6 +182,12 @@ Full方法切出所有字典里的词语。
|
|||||||
|
|
||||||
Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。
|
Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。
|
||||||
|
|
||||||
|
### 自定义用户词典
|
||||||
|
|
||||||
|
```
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
### 关键词抽取
|
### 关键词抽取
|
||||||
|
|
||||||
```
|
```
|
||||||
|
@ -60,10 +60,10 @@ namespace CppJieba
|
|||||||
_minWeight = MAX_DOUBLE;
|
_minWeight = MAX_DOUBLE;
|
||||||
_setInitFlag(false);
|
_setInitFlag(false);
|
||||||
}
|
}
|
||||||
DictTrie(const string& filePath)
|
DictTrie(const string& dictPath, const string& userDictPath = "")
|
||||||
{
|
{
|
||||||
new (this) DictTrie();
|
new (this) DictTrie();
|
||||||
_setInitFlag(init(filePath));
|
_setInitFlag(init(dictPath, userDictPath));
|
||||||
}
|
}
|
||||||
~DictTrie()
|
~DictTrie()
|
||||||
{
|
{
|
||||||
@ -80,9 +80,12 @@ namespace CppJieba
|
|||||||
_loadDict(dictPath, _nodeInfos);
|
_loadDict(dictPath, _nodeInfos);
|
||||||
_calculateWeight(_nodeInfos);
|
_calculateWeight(_nodeInfos);
|
||||||
_minWeight = _findMinWeight(_nodeInfos);
|
_minWeight = _findMinWeight(_nodeInfos);
|
||||||
|
|
||||||
if(userDictPath.size())
|
if(userDictPath.size())
|
||||||
{
|
{
|
||||||
_loadUserDict(dictPath, _minWeight, UNKNOWN_TAG, _nodeInfos);
|
double maxWeight = _findMaxWeight(_nodeInfos);
|
||||||
|
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
|
||||||
|
LogDebug("load userdict[%s] ok.", userDictPath.c_str());
|
||||||
}
|
}
|
||||||
_shrink(_nodeInfos);
|
_shrink(_nodeInfos);
|
||||||
_trie = _creatTrie(_nodeInfos);
|
_trie = _creatTrie(_nodeInfos);
|
||||||
@ -167,6 +170,15 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
|
||||||
|
{
|
||||||
|
double ret = MIN_DOUBLE;
|
||||||
|
for(size_t i = 0; i < nodeInfos.size(); i++)
|
||||||
|
{
|
||||||
|
ret = max(nodeInfos[i].weight, ret);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
void _calculateWeight(vector<DictUnit>& nodeInfos) const
|
void _calculateWeight(vector<DictUnit>& nodeInfos) const
|
||||||
{
|
{
|
||||||
|
@ -35,20 +35,20 @@ namespace CppJieba
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
MPSegment(){_setInitFlag(false);};
|
MPSegment(){_setInitFlag(false);};
|
||||||
explicit MPSegment(const string& dictPath)
|
explicit MPSegment(const string& dictPath, const string& userDictPath = "")
|
||||||
{
|
{
|
||||||
_setInitFlag(init(dictPath));
|
_setInitFlag(init(dictPath, userDictPath));
|
||||||
};
|
};
|
||||||
virtual ~MPSegment(){};
|
virtual ~MPSegment(){};
|
||||||
public:
|
public:
|
||||||
bool init(const string& dictPath)
|
bool init(const string& dictPath, const string& userDictPath = "")
|
||||||
{
|
{
|
||||||
if(_getInitFlag())
|
if(_getInitFlag())
|
||||||
{
|
{
|
||||||
LogError("already inited before now.");
|
LogError("already inited before now.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
_dictTrie.init(dictPath);
|
_dictTrie.init(dictPath, userDictPath);
|
||||||
assert(_dictTrie);
|
assert(_dictTrie);
|
||||||
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
||||||
return _setInitFlag(true);
|
return _setInitFlag(true);
|
||||||
|
@ -15,17 +15,17 @@ namespace CppJieba
|
|||||||
HMMSegment _hmmSeg;
|
HMMSegment _hmmSeg;
|
||||||
public:
|
public:
|
||||||
MixSegment(){_setInitFlag(false);};
|
MixSegment(){_setInitFlag(false);};
|
||||||
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict)
|
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||||
{
|
{
|
||||||
_setInitFlag(init(mpSegDict, hmmSegDict));
|
_setInitFlag(init(mpSegDict, hmmSegDict, userDict));
|
||||||
assert(_getInitFlag());
|
assert(_getInitFlag());
|
||||||
}
|
}
|
||||||
virtual ~MixSegment(){}
|
virtual ~MixSegment(){}
|
||||||
public:
|
public:
|
||||||
bool init(const string& mpSegDict, const string& hmmSegDict)
|
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||||
{
|
{
|
||||||
assert(!_getInitFlag());
|
assert(!_getInitFlag());
|
||||||
if(!_mpSeg.init(mpSegDict))
|
if(!_mpSeg.init(mpSegDict, userDict))
|
||||||
{
|
{
|
||||||
LogError("_mpSeg init");
|
LogError("_mpSeg init");
|
||||||
return false;
|
return false;
|
||||||
|
@ -58,6 +58,21 @@ TEST(MixSegmentTest, Test1)
|
|||||||
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Builds a MixSegment with a user dictionary as its third constructor
// argument, cuts one sentence and prints the resulting words.
TEST(MixSegmentTest, UserDict)
{
    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
    //MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8");
    ASSERT_TRUE(segment);
    const char* str = "令狐冲是云计算方面的专家";
    vector<string> words;
    ASSERT_TRUE(segment.cut(str, words));
    print(words);
    // No exit(0) here: it would terminate the whole test binary with a
    // success status and silently skip every test registered after this one.

    // Illustration of the intended effect (sample sentence differs from `str` above):
    //   before:                        李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
    //   after loading the user dict:   李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
    // TODO(review): assert on the exact contents of `words` once the expected
    // segmentation for `str` with this user dictionary is confirmed.
}
|
||||||
|
|
||||||
TEST(MPSegmentTest, Test1)
|
TEST(MPSegmentTest, Test1)
|
||||||
{
|
{
|
||||||
MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8");;
|
MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8");;
|
||||||
|
@ -54,3 +54,13 @@ TEST(DictTrieTest, Test1)
|
|||||||
// print(vec);
|
// print(vec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Looks up the word "云计算" in a DictTrie built from DICT_FILE and prints
// the entry that was found.
// NOTE(review): despite the test name, no user dictionary path is passed to
// the DictTrie constructor here — confirm whether that is intended.
TEST(DictTrieTest, UserDict)
{
    DictTrie trie(DICT_FILE);
    ASSERT_TRUE(trie);
    string word = "云计算";
    Unicode unicode;
    ASSERT_TRUE(TransCode::decode(word, unicode));
    // Guard the dereference below: fail the test cleanly instead of crashing
    // when the word is not present in the trie.
    ASSERT_TRUE(trie.find(unicode.begin(), unicode.end()) != NULL);
    print((*trie.find(unicode.begin(), unicode.end())));
    // No exit(0) here: it would terminate the whole test binary with a
    // success status and silently skip every test registered after this one.
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user