diff --git a/ChangeLog.md b/ChangeLog.md index a424076..15186a5 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,8 +1,9 @@ # CppJieba ChangeLog -## v2.4.3 (is coming) +## v2.4.3 (upcoming) 1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据丢失的问题。 +2. 修改 PosTagger 的参数结构,删除暂时无用的参数。并添加使用自定义字典的参数,也就是支持 **自定义词性**。 ## v2.4.2 diff --git a/dict/user.dict.utf8 b/dict/user.dict.utf8 index 0e77ef4..d7f7a08 100644 --- a/dict/user.dict.utf8 +++ b/dict/user.dict.utf8 @@ -1,2 +1,3 @@ 云计算 韩玉鉴赏 +蓝翔 3 nz diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 5a7390c..29f88a4 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -24,6 +24,7 @@ namespace CppJieba const char* const UNKNOWN_TAG = "x"; + struct DictUnit { Unicode word; @@ -82,7 +83,7 @@ namespace CppJieba bool init(const string& dictPath, const string& userDictPath = "") { assert(!_trie); - _loadDict(dictPath, _nodeInfos); + _loadDict(dictPath); _calculateWeight(_nodeInfos); _minWeight = _findMinWeight(_nodeInfos); @@ -92,7 +93,7 @@ namespace CppJieba _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); } _shrink(_nodeInfos); - _trie = _creatTrie(_nodeInfos); + _trie = _createTrie(_nodeInfos); assert(_trie); return true; } @@ -109,7 +110,7 @@ namespace CppJieba private: - TrieType * _creatTrie(const vector& dictUnits) + TrieType * _createTrie(const vector& dictUnits) { assert(dictUnits.size()); vector words; @@ -129,10 +130,14 @@ namespace CppJieba assert(ifs); string line; DictUnit nodeInfo; + vector buf; size_t lineno; for(lineno = 0; getline(ifs, line); lineno++) { - if(!TransCode::decode(line, nodeInfo.word)) + buf.clear(); + split(line, buf, " "); + assert(buf.size() >= 1); + if(!TransCode::decode(buf[0], nodeInfo.word)) { LogError("line[%u:%s] illegal.", lineno, line.c_str()); continue; @@ -141,13 +146,13 @@ namespace CppJieba { _userDictSingleChineseWord.insert(nodeInfo.word[0]); } - nodeInfo.weight = defaultWeight; - nodeInfo.tag = defaultTag; + nodeInfo.weight = (buf.size() == DICT_COLUMN_NUM ? atoi(buf[1].c_str()) : defaultWeight); + nodeInfo.tag = (buf.size() == DICT_COLUMN_NUM ? buf[2] : defaultTag); _nodeInfos.push_back(nodeInfo); } LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); } - void _loadDict(const string& filePath, vector& nodeInfos) const + void _loadDict(const string& filePath) { ifstream ifs(filePath.c_str()); assert(ifs); @@ -168,7 +173,7 @@ namespace CppJieba nodeInfo.weight = atof(buf[1].c_str()); nodeInfo.tag = buf[2]; - nodeInfos.push_back(nodeInfo); + _nodeInfos.push_back(nodeInfo); } } double _findMinWeight(const vector& nodeInfos) const diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 21513b9..3b62f4a 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -30,7 +30,7 @@ namespace CppJieba class MPSegment: public SegmentBase { - protected: + private: DictTrie _dictTrie; public: diff --git a/src/PosTagger.hpp b/src/PosTagger.hpp index b63f1de..4939e02 100644 --- a/src/PosTagger.hpp +++ b/src/PosTagger.hpp @@ -13,27 +13,33 @@ namespace CppJieba { private: MixSegment _segment; - DictTrie _dictTrie; + const DictTrie * _dictTrie; public: - PosTagger(){}; + PosTagger() + {} PosTagger( const string& dictPath, - const string& hmmFilePath + const string& hmmFilePath, + const string& userDictPath = "" ) { - LIMONP_CHECK(init(dictPath, hmmFilePath)); + init(dictPath, hmmFilePath, userDictPath); }; ~PosTagger(){}; public: - bool init(const string& dictPath, const string& hmmFilePath) + void init( + const string& dictPath, + const string& hmmFilePath, + const string& userDictPath = "" + ) { - LIMONP_CHECK(_dictTrie.init(dictPath)); - LIMONP_CHECK(_segment.init(dictPath, hmmFilePath)); - return true; + LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath)); + _dictTrie = _segment.getDictTrie(); + LIMONP_CHECK(_dictTrie); }; - bool tag(const string& src, vector >& res) + bool tag(const string& src, vector >& res) const { vector cutRes; if (!_segment.cut(src, cutRes)) @@ -51,7 +57,7 @@ namespace CppJieba LogError("decode failed."); return false; } - tmp = _dictTrie.find(unico.begin(), unico.end()); + tmp = _dictTrie->find(unico.begin(), unico.end()); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); } tmp = NULL; diff --git a/test/unittest/TPosTagger.cpp b/test/unittest/TPosTagger.cpp index 3f32e7e..051fd0c 100644 --- a/test/unittest/TPosTagger.cpp +++ b/test/unittest/TPosTagger.cpp @@ -5,6 +5,8 @@ using namespace CppJieba; const char * const QUERY_TEST1 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。"; const char * const ANS_TEST1 = "[\"我:r\", \"是:v\", \"蓝翔:x\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \",:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \",:x\", \"当上:t\", \"总经理:n\", \",:x\", \"出任:v\", \"CEO:x\", \",:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \",:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]"; +const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。"; +const char * const ANS_TEST2 = "[\"我:r\", \"是:v\", \"蓝翔:nz\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \",:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \",:x\", \"当上:t\", \"总经理:n\", \",:x\", \"出任:v\", \"CEO:x\", \",:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \",:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]"; TEST(PosTaggerTest, Test1) { @@ -15,3 +17,12 @@ TEST(PosTaggerTest, Test1) s << res; ASSERT_TRUE(s == ANS_TEST1); } +TEST(PosTaggerTest, Test2) +{ + PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8"); + vector > res; + tagger.tag(QUERY_TEST2, res); + string s; + s << res; + ASSERT_TRUE(s == ANS_TEST2); +}