支持自定义词性

This commit is contained in:
wyy 2014-09-28 13:22:37 +08:00
parent 28246fba5d
commit 6a8ebae344
6 changed files with 44 additions and 20 deletions

View File

@ -1,8 +1,9 @@
# CppJieba ChangeLog # CppJieba ChangeLog
## v2.4.3 (is coming) ## v2.4.3 (upcoming)
1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据丢失的问题。 1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据丢失的问题。
2. 修改 PosTagger 的参数结构,删除暂时无用的参数。并添加使用自定义字典的参数,也就是支持 **自定义词性**。
## v2.4.2 ## v2.4.2

View File

@ -1,2 +1,3 @@
云计算 云计算
韩玉鉴赏 韩玉鉴赏
蓝翔 3 nz

View File

@ -24,6 +24,7 @@ namespace CppJieba
const char* const UNKNOWN_TAG = "x"; const char* const UNKNOWN_TAG = "x";
struct DictUnit struct DictUnit
{ {
Unicode word; Unicode word;
@ -82,7 +83,7 @@ namespace CppJieba
bool init(const string& dictPath, const string& userDictPath = "") bool init(const string& dictPath, const string& userDictPath = "")
{ {
assert(!_trie); assert(!_trie);
_loadDict(dictPath, _nodeInfos); _loadDict(dictPath);
_calculateWeight(_nodeInfos); _calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos); _minWeight = _findMinWeight(_nodeInfos);
@ -92,7 +93,7 @@ namespace CppJieba
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
} }
_shrink(_nodeInfos); _shrink(_nodeInfos);
_trie = _creatTrie(_nodeInfos); _trie = _createTrie(_nodeInfos);
assert(_trie); assert(_trie);
return true; return true;
} }
@ -109,7 +110,7 @@ namespace CppJieba
private: private:
TrieType * _creatTrie(const vector<DictUnit>& dictUnits) TrieType * _createTrie(const vector<DictUnit>& dictUnits)
{ {
assert(dictUnits.size()); assert(dictUnits.size());
vector<Unicode> words; vector<Unicode> words;
@ -129,10 +130,14 @@ namespace CppJieba
assert(ifs); assert(ifs);
string line; string line;
DictUnit nodeInfo; DictUnit nodeInfo;
vector<string> buf;
size_t lineno; size_t lineno;
for(lineno = 0; getline(ifs, line); lineno++) for(lineno = 0; getline(ifs, line); lineno++)
{ {
if(!TransCode::decode(line, nodeInfo.word)) buf.clear();
split(line, buf, " ");
assert(buf.size() >= 1);
if(!TransCode::decode(buf[0], nodeInfo.word))
{ {
LogError("line[%u:%s] illegal.", lineno, line.c_str()); LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue; continue;
@ -141,13 +146,13 @@ namespace CppJieba
{ {
_userDictSingleChineseWord.insert(nodeInfo.word[0]); _userDictSingleChineseWord.insert(nodeInfo.word[0]);
} }
nodeInfo.weight = defaultWeight; nodeInfo.weight = (buf.size() == DICT_COLUMN_NUM ? atoi(buf[1].c_str()) : defaultWeight);
nodeInfo.tag = defaultTag; nodeInfo.tag = (buf.size() == DICT_COLUMN_NUM ? buf[2] : defaultTag);
_nodeInfos.push_back(nodeInfo); _nodeInfos.push_back(nodeInfo);
} }
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
} }
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const void _loadDict(const string& filePath)
{ {
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
assert(ifs); assert(ifs);
@ -168,7 +173,7 @@ namespace CppJieba
nodeInfo.weight = atof(buf[1].c_str()); nodeInfo.weight = atof(buf[1].c_str());
nodeInfo.tag = buf[2]; nodeInfo.tag = buf[2];
nodeInfos.push_back(nodeInfo); _nodeInfos.push_back(nodeInfo);
} }
} }
double _findMinWeight(const vector<DictUnit>& nodeInfos) const double _findMinWeight(const vector<DictUnit>& nodeInfos) const

View File

@ -30,7 +30,7 @@ namespace CppJieba
class MPSegment: public SegmentBase class MPSegment: public SegmentBase
{ {
protected: private:
DictTrie _dictTrie; DictTrie _dictTrie;
public: public:

View File

@ -13,27 +13,33 @@ namespace CppJieba
{ {
private: private:
MixSegment _segment; MixSegment _segment;
DictTrie _dictTrie; const DictTrie * _dictTrie;
public: public:
PosTagger(){}; PosTagger()
{}
PosTagger( PosTagger(
const string& dictPath, const string& dictPath,
const string& hmmFilePath const string& hmmFilePath,
const string& userDictPath = ""
) )
{ {
LIMONP_CHECK(init(dictPath, hmmFilePath)); init(dictPath, hmmFilePath, userDictPath);
}; };
~PosTagger(){}; ~PosTagger(){};
public: public:
bool init(const string& dictPath, const string& hmmFilePath) void init(
const string& dictPath,
const string& hmmFilePath,
const string& userDictPath = ""
)
{ {
LIMONP_CHECK(_dictTrie.init(dictPath)); LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath)); _dictTrie = _segment.getDictTrie();
return true; LIMONP_CHECK(_dictTrie);
}; };
bool tag(const string& src, vector<pair<string, string> >& res) bool tag(const string& src, vector<pair<string, string> >& res) const
{ {
vector<string> cutRes; vector<string> cutRes;
if (!_segment.cut(src, cutRes)) if (!_segment.cut(src, cutRes))
@ -51,7 +57,7 @@ namespace CppJieba
LogError("decode failed."); LogError("decode failed.");
return false; return false;
} }
tmp = _dictTrie.find(unico.begin(), unico.end()); tmp = _dictTrie->find(unico.begin(), unico.end());
res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
} }
tmp = NULL; tmp = NULL;

View File

@ -5,6 +5,8 @@ using namespace CppJieba;
const char * const QUERY_TEST1 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。"; const char * const QUERY_TEST1 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。";
const char * const ANS_TEST1 = "[\"我:r\", \"是:v\", \"蓝翔:x\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \":x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \":x\", \"当上:t\", \"总经理:n\", \":x\", \"出任:v\", \"CEO:x\", \":x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \":x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]"; const char * const ANS_TEST1 = "[\"我:r\", \"是:v\", \"蓝翔:x\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \":x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \":x\", \"当上:t\", \"总经理:n\", \":x\", \"出任:v\", \"CEO:x\", \":x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \":x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]";
const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。";
const char * const ANS_TEST2 = "[\"我:r\", \"是:v\", \"蓝翔:nz\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \":x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \":x\", \"当上:t\", \"总经理:n\", \":x\", \"出任:v\", \"CEO:x\", \":x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \":x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]";
TEST(PosTaggerTest, Test1) TEST(PosTaggerTest, Test1)
{ {
@ -15,3 +17,12 @@ TEST(PosTaggerTest, Test1)
s << res; s << res;
ASSERT_TRUE(s == ANS_TEST1); ASSERT_TRUE(s == ANS_TEST1);
} }
// Tags QUERY_TEST2 with a PosTagger that is also given a user dictionary
// path, and checks the serialized result against ANS_TEST2.
TEST(PosTaggerTest, Test2)
{
    // Third constructor argument is the user dictionary path.
    PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
    vector<pair<string, string> > res;
    // tag() returns bool; stop here on failure instead of reporting a
    // misleading string mismatch further down.
    ASSERT_TRUE(tagger.tag(QUERY_TEST2, res));
    string s;
    s << res;
    ASSERT_TRUE(s == ANS_TEST2);
}