From 29379852438ffe29e116bdf15493274ead1a98d4 Mon Sep 17 00:00:00 2001 From: wyy Date: Fri, 25 Apr 2014 18:47:22 +0800 Subject: [PATCH] adding user dict interface --- README.md | 7 +++++++ src/DictTrie.hpp | 18 +++++++++++++++--- src/MPSegment.hpp | 8 ++++---- src/MixSegment.hpp | 8 ++++---- test/unittest/TSegments.cpp | 15 +++++++++++++++ test/unittest/TTrie.cpp | 10 ++++++++++ 6 files changed, 55 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index e4a99a9..9281793 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ CppJieba是"结巴"中文分词的C++版本 + 内置分词服务,在linux环境下可安装使用。mac因为没有自带`epoll`,使用示例请看[libcppjieba]。 + [libcppjieba] 最简单易懂的CppJieba头文件库使用示例。 + 项目自带较为完善的单元测试,核心功能中文分词的稳定性接受过线上环境检验。 ++ 支持加载自定义用户词典。 ## Usage & Example @@ -181,6 +182,12 @@ Full方法切出所有字典里的词语。 Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。 +### 自定义用户词典 + +``` +``` + + ### 关键词抽取 ``` diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 7bce4ae..8137084 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -60,10 +60,10 @@ namespace CppJieba _minWeight = MAX_DOUBLE; _setInitFlag(false); } - DictTrie(const string& filePath) + DictTrie(const string& dictPath, const string& userDictPath = "") { new (this) DictTrie(); - _setInitFlag(init(filePath)); + _setInitFlag(init(dictPath, userDictPath)); } ~DictTrie() { @@ -80,9 +80,12 @@ namespace CppJieba _loadDict(dictPath, _nodeInfos); _calculateWeight(_nodeInfos); _minWeight = _findMinWeight(_nodeInfos); + if(userDictPath.size()) { - _loadUserDict(dictPath, _minWeight, UNKNOWN_TAG, _nodeInfos); + double maxWeight = _findMaxWeight(_nodeInfos); + _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos); + LogDebug("load userdict[%s] ok.", userDictPath.c_str()); } _shrink(_nodeInfos); _trie = _creatTrie(_nodeInfos); @@ -167,6 +170,15 @@ namespace CppJieba } return ret; } + double _findMaxWeight(const vector& nodeInfos) const + { + double ret = MIN_DOUBLE; + for(size_t i = 0; i < nodeInfos.size(); i++) + { + ret = max(nodeInfos[i].weight, ret); 
} + return ret; + } void _calculateWeight(vector& nodeInfos) const { diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 5c43ffc..f132b61 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -35,20 +35,20 @@ namespace CppJieba public: MPSegment(){_setInitFlag(false);}; - explicit MPSegment(const string& dictPath) + explicit MPSegment(const string& dictPath, const string& userDictPath = "") { - _setInitFlag(init(dictPath)); + _setInitFlag(init(dictPath, userDictPath)); }; virtual ~MPSegment(){}; public: - bool init(const string& dictPath) + bool init(const string& dictPath, const string& userDictPath = "") { if(_getInitFlag()) { LogError("already inited before now."); return false; } - _dictTrie.init(dictPath); + _dictTrie.init(dictPath, userDictPath); assert(_dictTrie); LogInfo("MPSegment init(%s) ok", dictPath.c_str()); return _setInitFlag(true); diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 36fcb08..cc6a02a 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -15,17 +15,17 @@ namespace CppJieba HMMSegment _hmmSeg; public: MixSegment(){_setInitFlag(false);}; - explicit MixSegment(const string& mpSegDict, const string& hmmSegDict) + explicit MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") { - _setInitFlag(init(mpSegDict, hmmSegDict)); + _setInitFlag(init(mpSegDict, hmmSegDict, userDict)); assert(_getInitFlag()); } virtual ~MixSegment(){} public: - bool init(const string& mpSegDict, const string& hmmSegDict) + bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") { assert(!_getInitFlag()); - if(!_mpSeg.init(mpSegDict)) + if(!_mpSeg.init(mpSegDict, userDict)) { LogError("_mpSeg init"); return false; diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index 8f3677f..b930d2e 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -58,6 +58,21 @@ TEST(MixSegmentTest, Test1) ASSERT_EQ(words, vector(res2, res2 + 
sizeof(res2)/sizeof(res2[0]))); } +TEST(MixSegmentTest, UserDict) +{ + MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); + //MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8"); + ASSERT_TRUE(segment); + const char* str = "令狐冲是云计算方面的专家"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + print(words); + exit(0); + + //* 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 / + // 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / +} + TEST(MPSegmentTest, Test1) { MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8");; diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index d2c68ae..d44f8d0 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -54,3 +54,13 @@ TEST(DictTrieTest, Test1) // print(vec); } +TEST(DictTrieTest, UserDict) +{ + DictTrie trie(DICT_FILE); + ASSERT_TRUE(trie); + string word = "云计算"; + Unicode unicode; + ASSERT_TRUE(TransCode::decode(word, unicode)); + print((*trie.find(unicode.begin(), unicode.end()))); + exit(0); +}