make QuerySegment support user.dict.utf8

This commit is contained in:
yanyiwu 2015-01-23 01:10:12 +08:00
parent a406c0f8cc
commit 269bc0fd0d
5 changed files with 40 additions and 9 deletions

View File

@ -344,7 +344,9 @@ http://cppjieba-webdemo.herokuapp.com/
## 客服
`wuyanyi09@foxmail.com`
`i@yanyiwu.com`
![image](http://yanyiwu.com/weedfs/2/5a7d1b5c0d/yanyiwu_personal_qrcodes.jpg)
## 鸣谢
@ -353,7 +355,7 @@ https://github.com/fxsjy/jieba
## 作者
- aszxqw https://github.com/aszxqw i@yanyiwu.com
- yanyiwu https://github.com/aszxqw i@yanyiwu.com
- aholic https://github.com/aholic ruochen.xu@gmail.com
[CppJieba]:https://github.com/aszxqw/cppjieba

View File

@ -24,15 +24,15 @@ namespace CppJieba
public:
// Default constructor: empty body, so init() is not called here.
QuerySegment(){};
QuerySegment(const string& dict, const string& model, size_t maxWordLen)
QuerySegment(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
{
init(dict, model, maxWordLen);
init(dict, model, maxWordLen, userDict);
};
// Virtual destructor with an empty body.
virtual ~QuerySegment(){};
public:
bool init(const string& dict, const string& model, size_t maxWordLen)
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "")
{
LIMONP_CHECK(_mixSeg.init(dict, model));
LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
assert(maxWordLen);
_maxWordLen = maxWordLen;

View File

@ -2,5 +2,5 @@
韩玉鉴赏
A
B
iphone6
iPhone6
蓝翔 nz

View File

@ -8,8 +8,8 @@ static const char * const ANS_TEST1 = "[\"我:r\", \"是:v\", \"蓝翔:x\", \"
static const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。";
static const char * const ANS_TEST2 = "[\"我:r\", \"是:v\", \"蓝翔:nz\", \"技工:n\", \"拖拉机:n\", \"学院:n\", \"手扶拖拉机:n\", \"专业:n\", \"的:uj\", \"。:x\", \"不用:v\", \"多久:m\", \":x\", \"我:r\", \"就:d\", \"会:v\", \"升职:v\", \"加薪:nr\", \":x\", \"当上:t\", \"总经理:n\", \":x\", \"出任:v\", \"CEO:eng\", \":x\", \"迎娶:v\", \"白富:x\", \"美:ns\", \":x\", \"走上:v\", \"人生:n\", \"巅峰:n\", \"。:x\"]";
static const char * const QUERY_TEST3 = "iphone6手机的最大特点是很容易弯曲。";
static const char * const ANS_TEST3 = "[\"iphone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]";
static const char * const QUERY_TEST3 = "iPhone6手机的最大特点是很容易弯曲。";
static const char * const ANS_TEST3 = "[\"iPhone6:eng\", \"手机:n\", \"的:uj\", \"最大:a\", \"特点:n\", \"是:v\", \"很:zg\", \"容易:a\", \"弯曲:v\", \"。:x\"]";
//static const char * const ANS_TEST3 = "";
TEST(PosTaggerTest, Test)

View File

@ -185,3 +185,32 @@ TEST(QuerySegment, Test1)
}
// Exercises QuerySegment when it is constructed with a user dictionary path
// (fourth constructor argument) and a maximum word length of 3.
TEST(QuerySegment, Test2)
{
    QuerySegment seg("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", 3, "../test/testdata/userdict.utf8");

    // Case 1: a plain Chinese sentence.
    {
        const char* sentence = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
        const string expected("[\"小明\", \"硕士\", \"毕业\", \"\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"\", \"\", \"\", \"日本\", \"京都\", \"京都大学\", \"大学\", \"深造\"]");

        vector<string> tokens;
        ASSERT_TRUE(seg.cut(sentence, tokens));

        // Serialize the token list so it can be compared as a single string.
        string actual;
        actual << tokens;
        ASSERT_EQ(actual, expected);
    }

    // Case 2: the sentence ends with "iPhone6", which is expected to come
    // out as one token.
    {
        const char* sentence = "小明硕士毕业于中国科学院计算所iPhone6";
        const string expected("[\"小明\", \"硕士\", \"毕业\", \"\", \"中国\", \"中国科学院\", \"科学\", \"科学院\", \"学院\", \"计算所\", \"iPhone6\"]");

        vector<string> tokens;
        ASSERT_TRUE(seg.cut(sentence, tokens));

        string actual;
        actual << tokens;
        ASSERT_EQ(actual, expected);
    }
}