This commit is contained in:
aholic 2014-11-05 11:12:33 +08:00
commit c2125b5371
7 changed files with 139 additions and 19 deletions

View File

@ -34,6 +34,13 @@ cmake ..
make
```
有兴趣的可以跑跑测试(可选):
```
./test/test.run
./load_test
```
## 演示
```
@ -301,6 +308,10 @@ make && ./keyword.demo
如果有需要在`erlang`中使用分词的话,不妨试一下[exjieba]。
### jiebaR
如果有需要在`R`中使用分词的话,不妨试一下[jiebaR]。
### libcppjieba
[libcppjieba] 是最简单易懂的CppJieba头文件库使用示例。
@ -337,6 +348,7 @@ https://github.com/fxsjy/jieba
[cppjiebapy]:https://github.com/jannson/cppjiebapy
[cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1
[NodeJieba]:https://github.com/aszxqw/nodejieba
[jiebaR]:https://github.com/qinwf/jiebaR
[simhash]:https://github.com/aszxqw/simhash
[代码详解]:https://github.com/aszxqw/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
[libcppjieba]:https://github.com/aszxqw/libcppjieba

View File

@ -74,10 +74,19 @@ namespace CppJieba
return false;
}
left = right;
while(*right < 0x80 && right != end)
{
right++;
}
do {
right = _sequentialLetterRule(left, end);
if(right != left)
{
break;
}
right = _numbersRule(left, end);
if(right != left)
{
break;
}
right ++;
} while(false);
res.push_back(Unicode(left, right));
left = right;
}
@ -93,6 +102,50 @@ namespace CppJieba
return true;
}
private:
// Sequential letters rule.
// Walks forward from `begin` while the current element is an ASCII letter
// ('a'..'z' or 'A'..'Z') and returns the position of the first element that
// is not one, or `end` when every element of [begin, end) is a letter.
// Returning `begin` unchanged therefore means "no letter at the start".
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    for(; begin != end; ++begin)
    {
        const Unicode::value_type ch = *begin;
        const bool lowerCase = ('a' <= ch && ch <= 'z');
        const bool upperCase = ('A' <= ch && ch <= 'Z');
        if(!lowerCase && !upperCase)
        {
            // first non-letter: the run ends here
            break;
        }
    }
    return begin;
}
// Numbers rule.
// Returns the end of the numeric run that starts at `begin`:
//  - `begin` unchanged when the range is empty or its first element is not
//    an ASCII digit ('0'..'9');
//  - otherwise the position after the first digit and after every directly
//    following element that is a digit or '.'.
// NOTE(review): once the first digit matched, '.' is accepted at any position,
// so a trailing or repeated '.' is consumed as part of the run as well.
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    // An empty range has no first element; reading *begin would be undefined behaviour.
    if(begin == end)
    {
        return begin;
    }
    Unicode::value_type x = *begin;
    if('0' <= x && x <= '9')
    {
        begin ++;
    }
    else
    {
        // a run must start with a digit, never with '.'
        return begin;
    }
    while(begin != end)
    {
        x = *begin;
        if( ('0' <= x && x <= '9') || x == '.')
        {
            begin++;
        }
        else
        {
            break;
        }
    }
    return begin;
}
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
vector<size_t> status;
@ -231,7 +284,6 @@ namespace CppJieba
for(size_t j = 0; j< tmp.size(); j++)
{
_startProb[j] = atof(tmp[j].c_str());
//cout<<_startProb[j]<<endl;
}
//load _transProb
@ -250,7 +302,6 @@ namespace CppJieba
for(size_t j =0; j < STATUS_SUM; j++)
{
_transProb[i][j] = atof(tmp[j].c_str());
//cout<<_transProb[i][j]<<endl;
}
}

View File

@ -12,7 +12,7 @@ using namespace CppJieba;
// Relative paths to the sample input file and to the dictionary files.
const char * const TEST_FILE = "../test/testdata/testlines.utf8";
const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
// User dictionary located under ../dict. This name must be defined only once.
const char * const USER_DICT_FILE = "../dict/user.dict.utf8";
void cut(const ISegment& seg, const char * const filePath)
{

View File

@ -0,0 +1,2 @@
# Load-test driver. The tool it runs can presumably be fetched with:
#   go get github.com/aszxqw/go_http_load
# Sends GET requests for the URLs listed in ../test/testdata/load_test.urls,
# passing loop_count=500 and goroutines=2 (see go_http_load for the exact
# meaning of these flags).
go_http_load -method=GET -get_urls="../test/testdata/load_test.urls" -loop_count=500 -goroutines=2

View File

@ -1 +1,2 @@
http://127.0.0.1:11200/?key=南京市长江大桥
http://127.0.0.1:11200/?key=长春市长春药店

View File

@ -5,4 +5,4 @@
人事处女干事
去医院做B超编号123
令狐冲是云计算行业的专家
AB
IBM,3.14

View File

@ -35,14 +35,58 @@ TEST(MixSegmentTest, NoUserDict)
}
// MixSegment built with the user dictionary ../dict/user.dict.utf8.
// Each scoped case cuts one sentence and compares the serialized token list.
// With this dictionary "IBM" is expected to come out as a single token.
TEST(MixSegmentTest, UserDict)
{
    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
    {
        const char* str = "令狐冲是云计算方面的专家";
        vector<string> words;
        ASSERT_TRUE(segment.cut(str, words));
        string res;
        // The tokens must concatenate back to the input, so the single-character
        // words "是" and "的" appear as tokens of their own.
        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
    }
    {
        const char* str = "小明先就职于IBM,后在日本京都大学深造";
        vector<string> words;
        ASSERT_TRUE(segment.cut(str, words));
        string res;
        res << words;
        ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
    }
    {
        // Letters, punctuation and a decimal number without any Chinese text.
        const char* str = "IBM,3.14";
        vector<string> words;
        ASSERT_TRUE(segment.cut(str, words));
        string res;
        res << words;
        ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
    }
}
// MixSegment built with the test user dictionary ../test/testdata/userdict.utf8.
// Same sentences as the UserDict case, but here "IBM" is expected to be
// split into the single letters "I", "B", "M".
TEST(MixSegmentTest, UserDict2)
{
    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
    {
        const char* str = "令狐冲是云计算方面的专家";
        vector<string> words;
        ASSERT_TRUE(segment.cut(str, words));
        string res;
        // The tokens must concatenate back to the input, so the single-character
        // words "是" and "的" appear as tokens of their own.
        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
    }
    {
        const char* str = "小明先就职于IBM,后在日本京都大学深造";
        vector<string> words;
        ASSERT_TRUE(segment.cut(str, words));
        string res;
        res << words;
        ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
    }
    {
        // The decimal number still stays in one piece.
        const char* str = "IBM,3.14";
        vector<string> words;
        ASSERT_TRUE(segment.cut(str, words));
        string res;
        res << words;
        ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
    }
}
TEST(MPSegmentTest, Test1)
@ -97,11 +141,21 @@ TEST(MPSegmentTest, Test2)
// HMMSegment loaded from ../dict/hmm_model.utf8.
// Each scoped case cuts one input and compares the tokens with a fixed list.
TEST(HMMSegmentTest, Test1)
{
    HMMSegment segment("../dict/hmm_model.utf8");
    {
        const char* str = "我来自北京邮电大学。。。学号123456";
        // The tokens must concatenate back to the input, so each of the three
        // "。" marks is a token of its own.
        const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
        vector<string> words;
        ASSERT_TRUE(segment.cut(str, words));
        ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
    }
    {
        // Letter runs and numbers (with or without '.') stay whole; ',' separates them.
        const char* str = "IBM,1.2,123";
        const char* res[] = {"IBM", ",", "1.2", ",", "123"};
        vector<string> words;
        ASSERT_TRUE(segment.cut(str, words));
        ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
    }
}
TEST(FullSegment, Test1)