adding user dict interface

This commit is contained in:
wyy 2014-04-25 18:47:22 +08:00
parent dc96bb3795
commit 2937985243
6 changed files with 55 additions and 11 deletions

View File

@ -13,6 +13,7 @@ CppJieba是"结巴"中文分词的C++版本
+ 内置分词服务在linux环境下可安装使用。mac因为没有自带`epoll`,使用示例请看[libcppjieba]。
+ [libcppjieba] 最简单易懂的CppJieba头文件库使用示例。
+ 项目自带较为完善的单元测试,核心功能中文分词的稳定性接受过线上环境检验。
+ 支持加载自定义用户词典。
## Usage & Example
@ -181,6 +182,12 @@ Full方法切出所有字典里的词语。
Query方法先使用Mix方法切词，对于切出来的较长的词再使用Full方法。
### 自定义用户词典
```
```
### 关键词抽取
```

View File

@ -60,10 +60,10 @@ namespace CppJieba
_minWeight = MAX_DOUBLE;
_setInitFlag(false);
}
// Convenience constructor: resets the object by re-running the default
// constructor in place, then calls init() and records its boolean result
// through _setInitFlag().
// NOTE(review): two signatures and two init() calls are visible here -- the
// one-argument form (filePath) and the form taking an optional userDictPath
// that defaults to "". Presumably only the two-argument form is meant to
// remain; confirm against the actual file.
DictTrie(const string& filePath)
DictTrie(const string& dictPath, const string& userDictPath = "")
{
new (this) DictTrie(); // placement new: run the default constructor on this same object
_setInitFlag(init(filePath));
_setInitFlag(init(dictPath, userDictPath));
}
~DictTrie()
{
@ -80,9 +80,12 @@ namespace CppJieba
_loadDict(dictPath, _nodeInfos);
_calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos);
if(userDictPath.size())
{
_loadUserDict(dictPath, _minWeight, UNKNOWN_TAG, _nodeInfos);
double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
LogDebug("load userdict[%s] ok.", userDictPath.c_str());
}
_shrink(_nodeInfos);
_trie = _creatTrie(_nodeInfos);
@ -167,6 +170,15 @@ namespace CppJieba
}
return ret;
}
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
{
double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
ret = max(nodeInfos[i].weight, ret);
}
return ret;
}
void _calculateWeight(vector<DictUnit>& nodeInfos) const
{

View File

@ -35,20 +35,20 @@ namespace CppJieba
public:
MPSegment(){_setInitFlag(false);};
// Constructs the segmenter and initialises it immediately: the boolean
// returned by init() is stored through _setInitFlag().
// NOTE(review): both the one-argument signature and the signature with an
// optional userDictPath (default "") are visible here, together with both
// init() calls; presumably only the userDictPath variant is meant to remain --
// confirm against the actual file.
explicit MPSegment(const string& dictPath)
explicit MPSegment(const string& dictPath, const string& userDictPath = "")
{
_setInitFlag(init(dictPath));
_setInitFlag(init(dictPath, userDictPath));
};
virtual ~MPSegment(){};
public:
bool init(const string& dictPath)
bool init(const string& dictPath, const string& userDictPath = "")
{
if(_getInitFlag())
{
LogError("already inited before now.");
return false;
}
_dictTrie.init(dictPath);
_dictTrie.init(dictPath, userDictPath);
assert(_dictTrie);
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
return _setInitFlag(true);

View File

@ -15,17 +15,17 @@ namespace CppJieba
HMMSegment _hmmSeg;
public:
MixSegment(){_setInitFlag(false);};
// Constructs the segmenter from the MP dictionary path and the HMM model path
// (plus, in the second signature, an optional user dictionary path that
// defaults to ""). The result of init() is stored through _setInitFlag() and
// then asserted, so a failed init aborts in builds where assert is enabled.
// NOTE(review): two signatures and two init() calls are visible here;
// presumably only the variant taking userDict is meant to remain -- confirm
// against the actual file.
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict)
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
{
_setInitFlag(init(mpSegDict, hmmSegDict));
_setInitFlag(init(mpSegDict, hmmSegDict, userDict));
assert(_getInitFlag());
}
virtual ~MixSegment(){}
public:
bool init(const string& mpSegDict, const string& hmmSegDict)
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
{
assert(!_getInitFlag());
if(!_mpSeg.init(mpSegDict))
if(!_mpSeg.init(mpSegDict, userDict))
{
LogError("_mpSeg init");
return false;

View File

@ -58,6 +58,21 @@ TEST(MixSegmentTest, Test1)
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
}
// Builds a MixSegment with a main dictionary, an HMM model and a user
// dictionary, then checks that construction and cut() both succeed on a sample
// sentence. The resulting words are only printed, not compared.
TEST(MixSegmentTest, UserDict)
{
    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
    //MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8");
    ASSERT_TRUE(segment);
    const char* str = "令狐冲是云计算方面的专家";
    vector<string> words;
    ASSERT_TRUE(segment.cut(str, words));
    print(words);
    // Deliberately no exit(0) here: exiting inside a test body terminates the
    // whole test binary with a success status and silently skips every test
    // that would run afterwards.
    // TODO(review): replace print() with an ASSERT_EQ on the expected
    // segmentation once the expected output for this sentence is settled.
    //
    // Reference example of what a user dictionary is meant to change:
    //   before:                             李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
    //   after loading the user dictionary:  李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
}
TEST(MPSegmentTest, Test1)
{
MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8");;

View File

@ -54,3 +54,13 @@ TEST(DictTrieTest, Test1)
// print(vec);
}
// Builds a DictTrie from DICT_FILE, decodes a sample word to Unicode and prints
// the entry returned by find() for it.
TEST(DictTrieTest, UserDict)
{
    // NOTE(review): despite the test name, no user dictionary path is passed
    // here -- only DICT_FILE is loaded. Confirm whether that is intended.
    DictTrie trie(DICT_FILE);
    ASSERT_TRUE(trie);
    string word = "云计算";
    Unicode unicode;
    ASSERT_TRUE(TransCode::decode(word, unicode));
    // NOTE(review): the result of find() is dereferenced without a check; this
    // assumes the word is present in the dictionary -- confirm, or guard it.
    print((*trie.find(unicode.begin(), unicode.end())));
    // Deliberately no exit(0) here: exiting inside a test body terminates the
    // whole test binary with a success status and silently skips every test
    // that would run afterwards.
}