mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
支持自定义词性
This commit is contained in:
parent
28246fba5d
commit
6a8ebae344
@ -1,8 +1,9 @@
|
||||
# CppJieba ChangeLog
|
||||
|
||||
## v2.4.3 (is coming)
|
||||
## v2.4.3 (upcoming)
|
||||
|
||||
1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据丢失的问题。
|
||||
2. 修改 PosTagger 的参数结构,删除暂时无用的参数。并添加使用自定义字典的参数,也就是支持 **自定义词性**。
|
||||
|
||||
## v2.4.2
|
||||
|
||||
|
@ -1,2 +1,3 @@
|
||||
云计算
|
||||
韩玉鉴赏
|
||||
蓝翔 3 nz
|
||||
|
@ -24,6 +24,7 @@ namespace CppJieba
|
||||
const char* const UNKNOWN_TAG = "x";
|
||||
|
||||
|
||||
|
||||
struct DictUnit
|
||||
{
|
||||
Unicode word;
|
||||
@ -82,7 +83,7 @@ namespace CppJieba
|
||||
bool init(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
assert(!_trie);
|
||||
_loadDict(dictPath, _nodeInfos);
|
||||
_loadDict(dictPath);
|
||||
_calculateWeight(_nodeInfos);
|
||||
_minWeight = _findMinWeight(_nodeInfos);
|
||||
|
||||
@ -92,7 +93,7 @@ namespace CppJieba
|
||||
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
|
||||
}
|
||||
_shrink(_nodeInfos);
|
||||
_trie = _creatTrie(_nodeInfos);
|
||||
_trie = _createTrie(_nodeInfos);
|
||||
assert(_trie);
|
||||
return true;
|
||||
}
|
||||
@ -109,7 +110,7 @@ namespace CppJieba
|
||||
|
||||
|
||||
private:
|
||||
TrieType * _creatTrie(const vector<DictUnit>& dictUnits)
|
||||
TrieType * _createTrie(const vector<DictUnit>& dictUnits)
|
||||
{
|
||||
assert(dictUnits.size());
|
||||
vector<Unicode> words;
|
||||
@ -129,10 +130,14 @@ namespace CppJieba
|
||||
assert(ifs);
|
||||
string line;
|
||||
DictUnit nodeInfo;
|
||||
vector<string> buf;
|
||||
size_t lineno;
|
||||
for(lineno = 0; getline(ifs, line); lineno++)
|
||||
{
|
||||
if(!TransCode::decode(line, nodeInfo.word))
|
||||
buf.clear();
|
||||
split(line, buf, " ");
|
||||
assert(buf.size() >= 1);
|
||||
if(!TransCode::decode(buf[0], nodeInfo.word))
|
||||
{
|
||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||
continue;
|
||||
@ -141,13 +146,13 @@ namespace CppJieba
|
||||
{
|
||||
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
|
||||
}
|
||||
nodeInfo.weight = defaultWeight;
|
||||
nodeInfo.tag = defaultTag;
|
||||
nodeInfo.weight = (buf.size() == DICT_COLUMN_NUM ? atoi(buf[1].c_str()) : defaultWeight);
|
||||
nodeInfo.tag = (buf.size() == DICT_COLUMN_NUM ? buf[2] : defaultTag);
|
||||
_nodeInfos.push_back(nodeInfo);
|
||||
}
|
||||
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
||||
}
|
||||
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
|
||||
void _loadDict(const string& filePath)
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
assert(ifs);
|
||||
@ -168,7 +173,7 @@ namespace CppJieba
|
||||
nodeInfo.weight = atof(buf[1].c_str());
|
||||
nodeInfo.tag = buf[2];
|
||||
|
||||
nodeInfos.push_back(nodeInfo);
|
||||
_nodeInfos.push_back(nodeInfo);
|
||||
}
|
||||
}
|
||||
double _findMinWeight(const vector<DictUnit>& nodeInfos) const
|
||||
|
@ -30,7 +30,7 @@ namespace CppJieba
|
||||
|
||||
class MPSegment: public SegmentBase
|
||||
{
|
||||
protected:
|
||||
private:
|
||||
DictTrie _dictTrie;
|
||||
|
||||
public:
|
||||
|
@ -13,27 +13,33 @@ namespace CppJieba
|
||||
{
|
||||
private:
|
||||
MixSegment _segment;
|
||||
DictTrie _dictTrie;
|
||||
const DictTrie * _dictTrie;
|
||||
|
||||
public:
|
||||
PosTagger(){};
|
||||
PosTagger()
|
||||
{}
|
||||
PosTagger(
|
||||
const string& dictPath,
|
||||
const string& hmmFilePath
|
||||
const string& hmmFilePath,
|
||||
const string& userDictPath = ""
|
||||
)
|
||||
{
|
||||
LIMONP_CHECK(init(dictPath, hmmFilePath));
|
||||
init(dictPath, hmmFilePath, userDictPath);
|
||||
};
|
||||
~PosTagger(){};
|
||||
public:
|
||||
bool init(const string& dictPath, const string& hmmFilePath)
|
||||
void init(
|
||||
const string& dictPath,
|
||||
const string& hmmFilePath,
|
||||
const string& userDictPath = ""
|
||||
)
|
||||
{
|
||||
LIMONP_CHECK(_dictTrie.init(dictPath));
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
|
||||
return true;
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
|
||||
_dictTrie = _segment.getDictTrie();
|
||||
LIMONP_CHECK(_dictTrie);
|
||||
};
|
||||
|
||||
bool tag(const string& src, vector<pair<string, string> >& res)
|
||||
bool tag(const string& src, vector<pair<string, string> >& res) const
|
||||
{
|
||||
vector<string> cutRes;
|
||||
if (!_segment.cut(src, cutRes))
|
||||
@ -51,7 +57,7 @@ namespace CppJieba
|
||||
LogError("decode failed.");
|
||||
return false;
|
||||
}
|
||||
tmp = _dictTrie.find(unico.begin(), unico.end());
|
||||
tmp = _dictTrie->find(unico.begin(), unico.end());
|
||||
res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
|
||||
}
|
||||
tmp = NULL;
|
||||
|
@ -5,6 +5,8 @@ using namespace CppJieba;
|
||||
|
||||
const char * const QUERY_TEST1 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。";
|
||||
const char * const ANS_TEST1 = "[\"我:r\", \"是:v\", \"蓝翔:x\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \",:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \",:x\", \"当上:t\", \"总经理:n\", \",:x\", \"出任:v\", \"CEO:x\", \",:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \",:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]";
|
||||
const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。";
|
||||
const char * const ANS_TEST2 = "[\"我:r\", \"是:v\", \"蓝翔:nz\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \",:x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \",:x\", \"当上:t\", \"总经理:n\", \",:x\", \"出任:v\", \"CEO:x\", \",:x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \",:x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]";
|
||||
|
||||
TEST(PosTaggerTest, Test1)
|
||||
{
|
||||
@ -15,3 +17,12 @@ TEST(PosTaggerTest, Test1)
|
||||
s << res;
|
||||
ASSERT_TRUE(s == ANS_TEST1);
|
||||
}
|
||||
TEST(PosTaggerTest, Test2)
|
||||
{
|
||||
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
|
||||
vector<pair<string, string> > res;
|
||||
tagger.tag(QUERY_TEST2, res);
|
||||
string s;
|
||||
s << res;
|
||||
ASSERT_TRUE(s == ANS_TEST2);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user