mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
This commit is contained in:
commit
c2125b5371
12
README.md
12
README.md
@ -34,6 +34,13 @@ cmake ..
|
||||
make
|
||||
```
|
||||
|
||||
有兴趣的可以跑跑测试(可选):
|
||||
|
||||
```
|
||||
./test/test.run
|
||||
./load_test
|
||||
```
|
||||
|
||||
## 演示
|
||||
|
||||
```
|
||||
@ -301,6 +308,10 @@ make && ./keyword.demo
|
||||
|
||||
如果有需要在`erlang`中使用分词的话,不妨试一下[exjieba]。
|
||||
|
||||
### jiebaR
|
||||
|
||||
如果有需要在`R`中使用分词的话,不妨试一下[jiebaR]。
|
||||
|
||||
### libcppjieba
|
||||
|
||||
[libcppjieba] 是最简单易懂的CppJieba头文件库使用示例。
|
||||
@ -337,6 +348,7 @@ https://github.com/fxsjy/jieba
|
||||
[cppjiebapy]:https://github.com/jannson/cppjiebapy
|
||||
[cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1
|
||||
[NodeJieba]:https://github.com/aszxqw/nodejieba
|
||||
[jiebaR]:https://github.com/qinwf/jiebaR
|
||||
[simhash]:https://github.com/aszxqw/simhash
|
||||
[代码详解]:https://github.com/aszxqw/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
|
||||
[libcppjieba]:https://github.com/aszxqw/libcppjieba
|
||||
|
@ -74,10 +74,19 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
left = right;
|
||||
while(*right < 0x80 && right != end)
|
||||
{
|
||||
right++;
|
||||
}
|
||||
do {
|
||||
right = _sequentialLetterRule(left, end);
|
||||
if(right != left)
|
||||
{
|
||||
break;
|
||||
}
|
||||
right = _numbersRule(left, end);
|
||||
if(right != left)
|
||||
{
|
||||
break;
|
||||
}
|
||||
right ++;
|
||||
} while(false);
|
||||
res.push_back(Unicode(left, right));
|
||||
left = right;
|
||||
}
|
||||
@ -93,6 +102,50 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
// sequential letters rule
|
||||
// Skips the leading run of ASCII letters in [begin, end).
//
// Returns an iterator to the first element that is not in 'a'..'z' or
// 'A'..'Z', or `end` when every element is a letter. If the first element is
// not a letter (or the range is empty) the returned iterator equals `begin`.
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    for(; begin != end; begin++)
    {
        const Unicode::value_type ch = *begin;
        const bool isLower = ('a' <= ch && ch <= 'z');
        const bool isUpper = ('A' <= ch && ch <= 'Z');
        if(!isLower && !isUpper)
        {
            // First non-letter ends the run.
            break;
        }
    }
    return begin;
}
|
||||
// numbers rule
|
||||
// Matches a run that starts with an ASCII digit and continues with ASCII
// digits or '.' characters.
//
// Returns an iterator one past the last matched element. Returns `begin`
// unchanged when the range is empty or its first element is not a digit.
// Because '.' is accepted anywhere after the first digit, a trailing '.' is
// part of the match.
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    // Guard before dereferencing: an empty range has no first element.
    if(begin == end)
    {
        return begin;
    }
    Unicode::value_type x = *begin;
    if(x < '0' || '9' < x)
    {
        // The run must start with a digit; a leading '.' is not a number.
        return begin;
    }
    begin ++;
    while(begin != end)
    {
        x = *begin;
        if(('0' <= x && x <= '9') || x == '.')
        {
            begin ++;
        }
        else
        {
            break;
        }
    }
    return begin;
}
|
||||
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
vector<size_t> status;
|
||||
@ -231,7 +284,6 @@ namespace CppJieba
|
||||
for(size_t j = 0; j< tmp.size(); j++)
|
||||
{
|
||||
_startProb[j] = atof(tmp[j].c_str());
|
||||
//cout<<_startProb[j]<<endl;
|
||||
}
|
||||
|
||||
//load _transProb
|
||||
@ -250,7 +302,6 @@ namespace CppJieba
|
||||
for(size_t j =0; j < STATUS_SUM; j++)
|
||||
{
|
||||
_transProb[i][j] = atof(tmp[j].c_str());
|
||||
//cout<<_transProb[i][j]<<endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -12,7 +12,7 @@ using namespace CppJieba;
|
||||
const char * const TEST_FILE = "../test/testdata/testlines.utf8";
|
||||
const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
|
||||
const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
|
||||
const char * const USER_DICT_FILE = "../test/testdata/userdict.utf8";
|
||||
const char * const USER_DICT_FILE = "../dict/user.dict.utf8";
|
||||
|
||||
void cut(const ISegment& seg, const char * const filePath)
|
||||
{
|
||||
|
2
test/servertest/go_load_test.sh
Executable file
2
test/servertest/go_load_test.sh
Executable file
@ -0,0 +1,2 @@
|
||||
# Invokes go_http_load with method GET against the URL list in
# ../test/testdata/load_test.urls, passing loop_count=500 and goroutines=2.
# NOTE(review): presumably this expects the server to be listening already
# and the relative path to resolve from the build directory; confirm.
# The line below is the install hint for the go_http_load tool.
# go get github.com/aszxqw/go_http_load
|
||||
go_http_load -method=GET -get_urls="../test/testdata/load_test.urls" -loop_count=500 -goroutines=2
|
1
test/testdata/load_test.urls
vendored
1
test/testdata/load_test.urls
vendored
@ -1 +1,2 @@
|
||||
http://127.0.0.1:11200/?key=南京市长江大桥
|
||||
http://127.0.0.1:11200/?key=长春市长春药店
|
||||
|
2
test/testdata/testlines.utf8
vendored
2
test/testdata/testlines.utf8
vendored
@ -5,4 +5,4 @@
|
||||
人事处女干事
|
||||
去医院做B超,编号123
|
||||
令狐冲是云计算行业的专家
|
||||
AB
|
||||
IBM,3.14
|
||||
|
@ -35,14 +35,58 @@ TEST(MixSegmentTest, NoUserDict)
|
||||
|
||||
}
|
||||
// Checks MixSegment::cut with "../dict/user.dict.utf8" passed as the third
// constructor argument. Three inputs are cut and the serialized word list
// (produced by `res << words`) is compared with an expected string; with this
// dictionary the expectations keep "IBM" as a single token.
TEST(MixSegmentTest, UserDict)
|
||||
{
|
||||
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
|
||||
// Case 1: a sentence made only of Chinese words.
{
|
||||
const char* str = "令狐冲是云计算方面的专家";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||
}
|
||||
// Case 2: Chinese text with an embedded Latin-letter run ("IBM") and a comma.
{
|
||||
const char* str = "小明先就职于IBM,后在日本京都大学深造";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
res << words;
|
||||
ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
|
||||
}
|
||||
// Case 3: letters, a comma and a decimal number with no Chinese text.
{
|
||||
const char* str = "IBM,3.14";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
res << words;
|
||||
ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
|
||||
}
|
||||
}
|
||||
// Checks MixSegment::cut with "../test/testdata/userdict.utf8" passed as the
// third constructor argument. With this dictionary the expectations below
// split "IBM" into "I", "B", "M" while "3.14" stays a single token.
// NOTE(review): this text looks like a diff rendering in which removed and
// added lines are merged; the un-braced statements right after `segment`
// repeat the first braced case. Confirm which lines are live in the real file.
TEST(MixSegmentTest, UserDict2)
|
||||
{
|
||||
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
||||
const char* str = "令狐冲是云计算方面的专家";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||
|
||||
// Case 1: a sentence made only of Chinese words.
{
|
||||
const char* str = "令狐冲是云计算方面的专家";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||
}
|
||||
// Case 2: the embedded "IBM" is expected as three single-letter tokens.
{
|
||||
const char* str = "小明先就职于IBM,后在日本京都大学深造";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
res << words;
|
||||
ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
|
||||
}
|
||||
// Case 3: letters are split one by one, the decimal number is kept whole.
{
|
||||
const char* str = "IBM,3.14";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
res << words;
|
||||
ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MPSegmentTest, Test1)
|
||||
@ -97,11 +141,21 @@ TEST(MPSegmentTest, Test2)
|
||||
// Checks HMMSegment::cut, constructed from "../dict/hmm_model.utf8", by
// comparing the produced word vector with an expected array of C strings.
// NOTE(review): this text looks like a diff rendering in which removed and
// added lines are merged; the un-braced statements right after `segment`
// repeat the first braced case. Confirm which lines are live in the real file.
TEST(HMMSegmentTest, Test1)
|
||||
{
|
||||
HMMSegment segment("../dict/hmm_model.utf8");;
|
||||
const char* str = "我来自北京邮电大学。。。学号123456";
|
||||
const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
// Case 1: Chinese text with repeated punctuation and a trailing digit run,
// which is expected as the single token "123456".
{
|
||||
const char* str = "我来自北京邮电大学。。。学号123456";
|
||||
const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
// sizeof(res)/sizeof(res[0]) is the element count of the expected array.
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
}
|
||||
|
||||
// Case 2: a letter run, a decimal number and an integer separated by commas;
// each is expected as one token, with the commas as separate tokens.
{
|
||||
const char* str = "IBM,1.2,123";
|
||||
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(FullSegment, Test1)
|
||||
|
Loading…
x
Reference in New Issue
Block a user