mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
commit
2159798685
29
README.md
29
README.md
@ -5,7 +5,8 @@
|
|||||||
之所以全写成hpp文件,是因为这样在别的项目需要使用到中文分词功能的时候直接`#include"xx.hpp" `进来就可以使用,无需麻烦的链接。
|
之所以全写成hpp文件,是因为这样在别的项目需要使用到中文分词功能的时候直接`#include"xx.hpp" `进来就可以使用,无需麻烦的链接。
|
||||||
|
|
||||||
实践证明写成hpp使用起来真的很爽,在后面提到的在iOS应用中的使用,和包装成`Node.js`的扩展[NodeJieba]都特别顺利。
|
实践证明写成hpp使用起来真的很爽,在后面提到的在iOS应用中的使用,和包装成`Node.js`的扩展[NodeJieba]都特别顺利。
|
||||||
`
|
|
||||||
|
如果对代码细节感兴趣的请见 [代码详解]
|
||||||
|
|
||||||
## 中文编码
|
## 中文编码
|
||||||
|
|
||||||
@ -169,20 +170,26 @@ you will see:
|
|||||||
关键词抽取的demo代码请见`test/keyword_demo.cpp`
|
关键词抽取的demo代码请见`test/keyword_demo.cpp`
|
||||||
|
|
||||||
|
|
||||||
## 代码详解
|
## 相关应用
|
||||||
|
|
||||||
详见:http://aszxqw.com/jekyll/update/2014/02/10/cppjieba-dai-ma-xiang-jie.html
|
### 关于CppJieba的跨语言包装使用
|
||||||
|
|
||||||
|
|
||||||
## 关于CppJieba的跨语言包装使用
|
|
||||||
|
|
||||||
收到邮件询问跨语言包装(ios应用开发)使用的问题,这方面我没有相关的经验,建议参考如下python使用cppjieba的项目:
|
收到邮件询问跨语言包装(ios应用开发)使用的问题,这方面我没有相关的经验,建议参考如下python使用cppjieba的项目:
|
||||||
|
|
||||||
[jannson] 开发的供 python模块调用的项目 [cppjiebapy] , 和相关讨论 [cppjiebapy_discussion] .
|
[jannson] 开发的供 python模块调用的项目 [cppjiebapy] , 和相关讨论 [cppjiebapy_discussion] .
|
||||||
|
|
||||||
## NodeJieba
|
### NodeJieba
|
||||||
|
|
||||||
如果有需要在`nodejs`中使用分词,不妨试一下[NodeJieba]。
|
如果有需要在`node.js`中使用分词,不妨试一下[NodeJieba]。
|
||||||
|
|
||||||
|
### simhash
|
||||||
|
|
||||||
|
如果需要计算中文文档的相似度,不妨试一下[simhash]。
|
||||||
|
|
||||||
|
## 演示
|
||||||
|
|
||||||
|
http://cppjieba-webdemo.herokuapp.com/
|
||||||
|
(建议使用chrome打开)
|
||||||
|
|
||||||
## 客服
|
## 客服
|
||||||
|
|
||||||
@ -190,13 +197,15 @@ you will see:
|
|||||||
|
|
||||||
## 鸣谢
|
## 鸣谢
|
||||||
|
|
||||||
"结巴中文"分词作者: SunJunyi
|
"结巴"中文分词作者: SunJunyi
|
||||||
https://github.com/fxsjy/jieba
|
https://github.com/fxsjy/jieba
|
||||||
|
|
||||||
顾名思义,之所以叫CppJieba,是参照SunJunyi大神的Jieba分词Python程序写成的,所以饮水思源,再次感谢SunJunyi。
|
顾名思义,之所以叫CppJieba,是参照Jieba分词Python程序写成的,所以饮水思源,再次感谢SunJunyi。
|
||||||
|
|
||||||
[CppJieba]:https://github.com/aszxqw/cppjieba
|
[CppJieba]:https://github.com/aszxqw/cppjieba
|
||||||
[jannson]:https://github.com/jannson
|
[jannson]:https://github.com/jannson
|
||||||
[cppjiebapy]:https://github.com/jannson/cppjiebapy
|
[cppjiebapy]:https://github.com/jannson/cppjiebapy
|
||||||
[cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1
|
[cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1
|
||||||
[NodeJieba]:https://github.com/aszxqw/nodejieba
|
[NodeJieba]:https://github.com/aszxqw/nodejieba
|
||||||
|
[simhash]:https://github.com/aszxqw/simhash
|
||||||
|
[代码详解]:http://aszxqw.github.io/jekyll/update/2014/02/10/cppjieba-dai-ma-xiang-jie.html
|
||||||
|
6653
dict/pos_dict/char_state_tab.utf8
Normal file
6653
dict/pos_dict/char_state_tab.utf8
Normal file
File diff suppressed because it is too large
Load Diff
166
dict/pos_dict/prob_emit.utf8
Normal file
166
dict/pos_dict/prob_emit.utf8
Normal file
File diff suppressed because one or more lines are too long
259
dict/pos_dict/prob_start.utf8
Normal file
259
dict/pos_dict/prob_start.utf8
Normal file
@ -0,0 +1,259 @@
|
|||||||
|
#初始状态的概率
|
||||||
|
#格式
|
||||||
|
#状态:概率
|
||||||
|
B,a:-4.7623052146
|
||||||
|
B,ad:-6.68006603678
|
||||||
|
B,ag:-3.14e+100
|
||||||
|
B,an:-8.69708322302
|
||||||
|
B,b:-5.01837436211
|
||||||
|
B,bg:-3.14e+100
|
||||||
|
B,c:-3.42388018495
|
||||||
|
B,d:-3.97504752976
|
||||||
|
B,df:-8.88897423083
|
||||||
|
B,dg:-3.14e+100
|
||||||
|
B,e:-8.56355183039
|
||||||
|
B,en:-3.14e+100
|
||||||
|
B,f:-5.49163041848
|
||||||
|
B,g:-3.14e+100
|
||||||
|
B,h:-13.53336513
|
||||||
|
B,i:-6.11578472756
|
||||||
|
B,in:-3.14e+100
|
||||||
|
B,j:-5.05761912847
|
||||||
|
B,jn:-3.14e+100
|
||||||
|
B,k:-3.14e+100
|
||||||
|
B,l:-4.90588358466
|
||||||
|
B,ln:-3.14e+100
|
||||||
|
B,m:-3.6524299819
|
||||||
|
B,mg:-3.14e+100
|
||||||
|
B,mq:-6.7869530014
|
||||||
|
B,n:-1.69662577975
|
||||||
|
B,ng:-3.14e+100
|
||||||
|
B,nr:-2.23104959138
|
||||||
|
B,nrfg:-5.87372217541
|
||||||
|
B,nrt:-4.98564273352
|
||||||
|
B,ns:-2.8228438315
|
||||||
|
B,nt:-4.84609166818
|
||||||
|
B,nz:-3.94698846058
|
||||||
|
B,o:-8.43349870215
|
||||||
|
B,p:-4.20098413209
|
||||||
|
B,q:-6.99812385896
|
||||||
|
B,qe:-3.14e+100
|
||||||
|
B,qg:-3.14e+100
|
||||||
|
B,r:-3.40981877908
|
||||||
|
B,rg:-3.14e+100
|
||||||
|
B,rr:-12.4347528413
|
||||||
|
B,rz:-7.94611647157
|
||||||
|
B,s:-5.52267359084
|
||||||
|
B,t:-3.36474790945
|
||||||
|
B,tg:-3.14e+100
|
||||||
|
B,u:-9.1639172775
|
||||||
|
B,ud:-3.14e+100
|
||||||
|
B,ug:-3.14e+100
|
||||||
|
B,uj:-3.14e+100
|
||||||
|
B,ul:-3.14e+100
|
||||||
|
B,uv:-3.14e+100
|
||||||
|
B,uz:-3.14e+100
|
||||||
|
B,v:-2.67405848743
|
||||||
|
B,vd:-9.04472876024
|
||||||
|
B,vg:-3.14e+100
|
||||||
|
B,vi:-12.4347528413
|
||||||
|
B,vn:-4.33156108902
|
||||||
|
B,vq:-12.1470707689
|
||||||
|
B,w:-3.14e+100
|
||||||
|
B,x:-3.14e+100
|
||||||
|
B,y:-9.84448567586
|
||||||
|
B,yg:-3.14e+100
|
||||||
|
B,z:-7.04568111149
|
||||||
|
B,zg:-3.14e+100
|
||||||
|
E,a:-3.14e+100
|
||||||
|
E,ad:-3.14e+100
|
||||||
|
E,ag:-3.14e+100
|
||||||
|
E,an:-3.14e+100
|
||||||
|
E,b:-3.14e+100
|
||||||
|
E,bg:-3.14e+100
|
||||||
|
E,c:-3.14e+100
|
||||||
|
E,d:-3.14e+100
|
||||||
|
E,df:-3.14e+100
|
||||||
|
E,dg:-3.14e+100
|
||||||
|
E,e:-3.14e+100
|
||||||
|
E,en:-3.14e+100
|
||||||
|
E,f:-3.14e+100
|
||||||
|
E,g:-3.14e+100
|
||||||
|
E,h:-3.14e+100
|
||||||
|
E,i:-3.14e+100
|
||||||
|
E,in:-3.14e+100
|
||||||
|
E,j:-3.14e+100
|
||||||
|
E,jn:-3.14e+100
|
||||||
|
E,k:-3.14e+100
|
||||||
|
E,l:-3.14e+100
|
||||||
|
E,ln:-3.14e+100
|
||||||
|
E,m:-3.14e+100
|
||||||
|
E,mg:-3.14e+100
|
||||||
|
E,mq:-3.14e+100
|
||||||
|
E,n:-3.14e+100
|
||||||
|
E,ng:-3.14e+100
|
||||||
|
E,nr:-3.14e+100
|
||||||
|
E,nrfg:-3.14e+100
|
||||||
|
E,nrt:-3.14e+100
|
||||||
|
E,ns:-3.14e+100
|
||||||
|
E,nt:-3.14e+100
|
||||||
|
E,nz:-3.14e+100
|
||||||
|
E,o:-3.14e+100
|
||||||
|
E,p:-3.14e+100
|
||||||
|
E,q:-3.14e+100
|
||||||
|
E,qe:-3.14e+100
|
||||||
|
E,qg:-3.14e+100
|
||||||
|
E,r:-3.14e+100
|
||||||
|
E,rg:-3.14e+100
|
||||||
|
E,rr:-3.14e+100
|
||||||
|
E,rz:-3.14e+100
|
||||||
|
E,s:-3.14e+100
|
||||||
|
E,t:-3.14e+100
|
||||||
|
E,tg:-3.14e+100
|
||||||
|
E,u:-3.14e+100
|
||||||
|
E,ud:-3.14e+100
|
||||||
|
E,ug:-3.14e+100
|
||||||
|
E,uj:-3.14e+100
|
||||||
|
E,ul:-3.14e+100
|
||||||
|
E,uv:-3.14e+100
|
||||||
|
E,uz:-3.14e+100
|
||||||
|
E,v:-3.14e+100
|
||||||
|
E,vd:-3.14e+100
|
||||||
|
E,vg:-3.14e+100
|
||||||
|
E,vi:-3.14e+100
|
||||||
|
E,vn:-3.14e+100
|
||||||
|
E,vq:-3.14e+100
|
||||||
|
E,w:-3.14e+100
|
||||||
|
E,x:-3.14e+100
|
||||||
|
E,y:-3.14e+100
|
||||||
|
E,yg:-3.14e+100
|
||||||
|
E,z:-3.14e+100
|
||||||
|
E,zg:-3.14e+100
|
||||||
|
M,a:-3.14e+100
|
||||||
|
M,ad:-3.14e+100
|
||||||
|
M,ag:-3.14e+100
|
||||||
|
M,an:-3.14e+100
|
||||||
|
M,b:-3.14e+100
|
||||||
|
M,bg:-3.14e+100
|
||||||
|
M,c:-3.14e+100
|
||||||
|
M,d:-3.14e+100
|
||||||
|
M,df:-3.14e+100
|
||||||
|
M,dg:-3.14e+100
|
||||||
|
M,e:-3.14e+100
|
||||||
|
M,en:-3.14e+100
|
||||||
|
M,f:-3.14e+100
|
||||||
|
M,g:-3.14e+100
|
||||||
|
M,h:-3.14e+100
|
||||||
|
M,i:-3.14e+100
|
||||||
|
M,in:-3.14e+100
|
||||||
|
M,j:-3.14e+100
|
||||||
|
M,jn:-3.14e+100
|
||||||
|
M,k:-3.14e+100
|
||||||
|
M,l:-3.14e+100
|
||||||
|
M,ln:-3.14e+100
|
||||||
|
M,m:-3.14e+100
|
||||||
|
M,mg:-3.14e+100
|
||||||
|
M,mq:-3.14e+100
|
||||||
|
M,n:-3.14e+100
|
||||||
|
M,ng:-3.14e+100
|
||||||
|
M,nr:-3.14e+100
|
||||||
|
M,nrfg:-3.14e+100
|
||||||
|
M,nrt:-3.14e+100
|
||||||
|
M,ns:-3.14e+100
|
||||||
|
M,nt:-3.14e+100
|
||||||
|
M,nz:-3.14e+100
|
||||||
|
M,o:-3.14e+100
|
||||||
|
M,p:-3.14e+100
|
||||||
|
M,q:-3.14e+100
|
||||||
|
M,qe:-3.14e+100
|
||||||
|
M,qg:-3.14e+100
|
||||||
|
M,r:-3.14e+100
|
||||||
|
M,rg:-3.14e+100
|
||||||
|
M,rr:-3.14e+100
|
||||||
|
M,rz:-3.14e+100
|
||||||
|
M,s:-3.14e+100
|
||||||
|
M,t:-3.14e+100
|
||||||
|
M,tg:-3.14e+100
|
||||||
|
M,u:-3.14e+100
|
||||||
|
M,ud:-3.14e+100
|
||||||
|
M,ug:-3.14e+100
|
||||||
|
M,uj:-3.14e+100
|
||||||
|
M,ul:-3.14e+100
|
||||||
|
M,uv:-3.14e+100
|
||||||
|
M,uz:-3.14e+100
|
||||||
|
M,v:-3.14e+100
|
||||||
|
M,vd:-3.14e+100
|
||||||
|
M,vg:-3.14e+100
|
||||||
|
M,vi:-3.14e+100
|
||||||
|
M,vn:-3.14e+100
|
||||||
|
M,vq:-3.14e+100
|
||||||
|
M,w:-3.14e+100
|
||||||
|
M,x:-3.14e+100
|
||||||
|
M,y:-3.14e+100
|
||||||
|
M,yg:-3.14e+100
|
||||||
|
M,z:-3.14e+100
|
||||||
|
M,zg:-3.14e+100
|
||||||
|
S,a:-3.90253968313
|
||||||
|
S,ad:-11.0484584802
|
||||||
|
S,ag:-6.95411391796
|
||||||
|
S,an:-12.8402179494
|
||||||
|
S,b:-6.47288876397
|
||||||
|
S,bg:-3.14e+100
|
||||||
|
S,c:-4.78696679586
|
||||||
|
S,d:-3.90391976418
|
||||||
|
S,df:-3.14e+100
|
||||||
|
S,dg:-8.9483976513
|
||||||
|
S,e:-5.94251300628
|
||||||
|
S,en:-3.14e+100
|
||||||
|
S,f:-5.19482024998
|
||||||
|
S,g:-6.50782681533
|
||||||
|
S,h:-8.65056320738
|
||||||
|
S,i:-3.14e+100
|
||||||
|
S,in:-3.14e+100
|
||||||
|
S,j:-4.91199211964
|
||||||
|
S,jn:-3.14e+100
|
||||||
|
S,k:-6.94032059583
|
||||||
|
S,l:-3.14e+100
|
||||||
|
S,ln:-3.14e+100
|
||||||
|
S,m:-3.26920065212
|
||||||
|
S,mg:-10.8253149289
|
||||||
|
S,mq:-3.14e+100
|
||||||
|
S,n:-3.85514838976
|
||||||
|
S,ng:-4.9134348611
|
||||||
|
S,nr:-4.48366310396
|
||||||
|
S,nrfg:-3.14e+100
|
||||||
|
S,nrt:-3.14e+100
|
||||||
|
S,ns:-3.14e+100
|
||||||
|
S,nt:-12.1470707689
|
||||||
|
S,nz:-3.14e+100
|
||||||
|
S,o:-8.46446092775
|
||||||
|
S,p:-2.98684018136
|
||||||
|
S,q:-4.88865861826
|
||||||
|
S,qe:-3.14e+100
|
||||||
|
S,qg:-3.14e+100
|
||||||
|
S,r:-2.76353367841
|
||||||
|
S,rg:-10.2752685919
|
||||||
|
S,rr:-3.14e+100
|
||||||
|
S,rz:-3.14e+100
|
||||||
|
S,s:-3.14e+100
|
||||||
|
S,t:-3.14e+100
|
||||||
|
S,tg:-6.27284253188
|
||||||
|
S,u:-6.94032059583
|
||||||
|
S,ud:-7.72823016105
|
||||||
|
S,ug:-7.53940370266
|
||||||
|
S,uj:-6.85251045118
|
||||||
|
S,ul:-8.41537131755
|
||||||
|
S,uv:-8.15808672229
|
||||||
|
S,uz:-9.29925862537
|
||||||
|
S,v:-3.05329230341
|
||||||
|
S,vd:-3.14e+100
|
||||||
|
S,vg:-5.94301818437
|
||||||
|
S,vi:-3.14e+100
|
||||||
|
S,vn:-11.4539235883
|
||||||
|
S,vq:-3.14e+100
|
||||||
|
S,w:-3.14e+100
|
||||||
|
S,x:-8.42741965607
|
||||||
|
S,y:-6.19707946995
|
||||||
|
S,yg:-13.53336513
|
||||||
|
S,z:-3.14e+100
|
||||||
|
S,zg:-3.14e+100
|
5222
dict/pos_dict/prob_trans.utf8
Normal file
5222
dict/pos_dict/prob_trans.utf8
Normal file
File diff suppressed because it is too large
Load Diff
71
src/PosTagger.hpp
Normal file
71
src/PosTagger.hpp
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
#ifndef CPPJIEBA_POS_TAGGING_H
|
||||||
|
#define CPPJIEBA_POS_TAGGING_H
|
||||||
|
|
||||||
|
#include "MixSegment.hpp"
|
||||||
|
#include "Limonp/str_functs.hpp"
|
||||||
|
#include "Trie.hpp"
|
||||||
|
#include "TrieManager.hpp"
|
||||||
|
|
||||||
|
namespace CppJieba
|
||||||
|
{
|
||||||
|
using namespace Limonp;
|
||||||
|
|
||||||
|
class PosTagger: public InitOnOff
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
MixSegment _segment;
|
||||||
|
Trie* _trie;
|
||||||
|
|
||||||
|
public:
|
||||||
|
PosTagger(){_setInitFlag(false);};
|
||||||
|
explicit PosTagger(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string transProb)
|
||||||
|
{
|
||||||
|
_setInitFlag(init(dictPath, hmmFilePath, charStatus, startProb, emitProb, endProb, transProb));
|
||||||
|
};
|
||||||
|
~PosTagger(){};
|
||||||
|
public:
|
||||||
|
bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string transProb)
|
||||||
|
{
|
||||||
|
if (_getInitFlag())
|
||||||
|
{
|
||||||
|
LogError("already inited before.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
_trie = TrieManager::getInstance().getTrie(dictPath.c_str());
|
||||||
|
if (NULL == _trie)
|
||||||
|
{
|
||||||
|
LogError("get a NULL pointor from getTrie(\"%s\").", dictPath.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
||||||
|
};
|
||||||
|
|
||||||
|
bool tag(const string& src, vector<pair<string, string> >& res)
|
||||||
|
{
|
||||||
|
assert(_getInitFlag());
|
||||||
|
vector<string> cutRes;
|
||||||
|
if (!_segment.cut(src, cutRes))
|
||||||
|
{
|
||||||
|
LogError("_mixSegment cut failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const TrieNodeInfo *tmp = NULL;
|
||||||
|
Unicode unico;
|
||||||
|
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
|
||||||
|
{
|
||||||
|
if (!TransCode::decode(*itr, unico))
|
||||||
|
{
|
||||||
|
LogError("decode failed.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
tmp = _trie->find(unico.begin(), unico.end());
|
||||||
|
res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
|
||||||
|
}
|
||||||
|
tmp = NULL;
|
||||||
|
return !res.empty();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
12
test/tagging_demo.cpp
Normal file
12
test/tagging_demo.cpp
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
#include "../src/PosTagger.hpp"
|
||||||
|
using namespace CppJieba;
|
||||||
|
|
||||||
|
int main(int argc, char ** argv)
|
||||||
|
{
|
||||||
|
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "", "", "", "", "");
|
||||||
|
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。");
|
||||||
|
vector<pair<string, string> > res;
|
||||||
|
tagger.tag(s, res);
|
||||||
|
cout << res << endl;
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user