mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
commit
2159798685
29
README.md
29
README.md
@ -5,7 +5,8 @@
|
||||
之所以全写成hpp文件,是因为这样在别的项目需要使用到中文分词功能的时候直接`#include "xx.hpp"`进来就可以使用,无需麻烦的链接。
|
||||
|
||||
实践证明写成hpp使用起来真的很爽,在后面提到的在iOS应用中的使用,和包装成`Node.js`的扩展[NodeJieba]都特别顺利。
|
||||
`
|
||||
|
||||
如果对代码细节感兴趣的请见 [代码详解]
|
||||
|
||||
## 中文编码
|
||||
|
||||
@ -169,20 +170,26 @@ you will see:
|
||||
关键词抽取的demo代码请见`test/keyword_demo.cpp`
|
||||
|
||||
|
||||
## 代码详解
|
||||
## 相关应用
|
||||
|
||||
详见:http://aszxqw.com/jekyll/update/2014/02/10/cppjieba-dai-ma-xiang-jie.html
|
||||
|
||||
|
||||
## 关于CppJieba的跨语言包装使用
|
||||
### 关于CppJieba的跨语言包装使用
|
||||
|
||||
收到邮件询问跨语言包装(ios应用开发)使用的问题,这方面我没有相关的经验,建议参考如下python使用cppjieba的项目:
|
||||
|
||||
[jannson] 开发的供 python模块调用的项目 [cppjiebapy] , 和相关讨论 [cppjiebapy_discussion] .
|
||||
|
||||
## NodeJieba
|
||||
### NodeJieba
|
||||
|
||||
如果有需要在`nodejs`中使用分词,不妨试一下[NodeJieba]。
|
||||
如果有需要在`node.js`中使用分词,不妨试一下[NodeJieba]。
|
||||
|
||||
### simhash
|
||||
|
||||
如果有需要处理中文文档的相似度计算,不妨试一下[simhash]。
|
||||
|
||||
## 演示
|
||||
|
||||
http://cppjieba-webdemo.herokuapp.com/
|
||||
(建议使用chrome打开)
|
||||
|
||||
## 客服
|
||||
|
||||
@ -190,13 +197,15 @@ you will see:
|
||||
|
||||
## 鸣谢
|
||||
|
||||
"结巴中文"分词作者: SunJunyi
|
||||
"结巴"中文分词作者: SunJunyi
|
||||
https://github.com/fxsjy/jieba
|
||||
|
||||
顾名思义,之所以叫CppJieba,是参照SunJunyi大神的Jieba分词Python程序写成的,所以饮水思源,再次感谢SunJunyi。
|
||||
顾名思义,之所以叫CppJieba,是参照Jieba分词Python程序写成的,所以饮水思源,再次感谢SunJunyi。
|
||||
|
||||
[CppJieba]:https://github.com/aszxqw/cppjieba
|
||||
[jannson]:https://github.com/jannson
|
||||
[cppjiebapy]:https://github.com/jannson/cppjiebapy
|
||||
[cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1
|
||||
[NodeJieba]:https://github.com/aszxqw/nodejieba
|
||||
[simhash]:https://github.com/aszxqw/simhash
|
||||
[代码详解]:http://aszxqw.github.io/jekyll/update/2014/02/10/cppjieba-dai-ma-xiang-jie.html
|
||||
|
6653
dict/pos_dict/char_state_tab.utf8
Normal file
6653
dict/pos_dict/char_state_tab.utf8
Normal file
File diff suppressed because it is too large
Load Diff
166
dict/pos_dict/prob_emit.utf8
Normal file
166
dict/pos_dict/prob_emit.utf8
Normal file
File diff suppressed because one or more lines are too long
259
dict/pos_dict/prob_start.utf8
Normal file
259
dict/pos_dict/prob_start.utf8
Normal file
@ -0,0 +1,259 @@
|
||||
#初始状态的概率
|
||||
#格式
|
||||
#状态:概率
|
||||
B,a:-4.7623052146
|
||||
B,ad:-6.68006603678
|
||||
B,ag:-3.14e+100
|
||||
B,an:-8.69708322302
|
||||
B,b:-5.01837436211
|
||||
B,bg:-3.14e+100
|
||||
B,c:-3.42388018495
|
||||
B,d:-3.97504752976
|
||||
B,df:-8.88897423083
|
||||
B,dg:-3.14e+100
|
||||
B,e:-8.56355183039
|
||||
B,en:-3.14e+100
|
||||
B,f:-5.49163041848
|
||||
B,g:-3.14e+100
|
||||
B,h:-13.53336513
|
||||
B,i:-6.11578472756
|
||||
B,in:-3.14e+100
|
||||
B,j:-5.05761912847
|
||||
B,jn:-3.14e+100
|
||||
B,k:-3.14e+100
|
||||
B,l:-4.90588358466
|
||||
B,ln:-3.14e+100
|
||||
B,m:-3.6524299819
|
||||
B,mg:-3.14e+100
|
||||
B,mq:-6.7869530014
|
||||
B,n:-1.69662577975
|
||||
B,ng:-3.14e+100
|
||||
B,nr:-2.23104959138
|
||||
B,nrfg:-5.87372217541
|
||||
B,nrt:-4.98564273352
|
||||
B,ns:-2.8228438315
|
||||
B,nt:-4.84609166818
|
||||
B,nz:-3.94698846058
|
||||
B,o:-8.43349870215
|
||||
B,p:-4.20098413209
|
||||
B,q:-6.99812385896
|
||||
B,qe:-3.14e+100
|
||||
B,qg:-3.14e+100
|
||||
B,r:-3.40981877908
|
||||
B,rg:-3.14e+100
|
||||
B,rr:-12.4347528413
|
||||
B,rz:-7.94611647157
|
||||
B,s:-5.52267359084
|
||||
B,t:-3.36474790945
|
||||
B,tg:-3.14e+100
|
||||
B,u:-9.1639172775
|
||||
B,ud:-3.14e+100
|
||||
B,ug:-3.14e+100
|
||||
B,uj:-3.14e+100
|
||||
B,ul:-3.14e+100
|
||||
B,uv:-3.14e+100
|
||||
B,uz:-3.14e+100
|
||||
B,v:-2.67405848743
|
||||
B,vd:-9.04472876024
|
||||
B,vg:-3.14e+100
|
||||
B,vi:-12.4347528413
|
||||
B,vn:-4.33156108902
|
||||
B,vq:-12.1470707689
|
||||
B,w:-3.14e+100
|
||||
B,x:-3.14e+100
|
||||
B,y:-9.84448567586
|
||||
B,yg:-3.14e+100
|
||||
B,z:-7.04568111149
|
||||
B,zg:-3.14e+100
|
||||
E,a:-3.14e+100
|
||||
E,ad:-3.14e+100
|
||||
E,ag:-3.14e+100
|
||||
E,an:-3.14e+100
|
||||
E,b:-3.14e+100
|
||||
E,bg:-3.14e+100
|
||||
E,c:-3.14e+100
|
||||
E,d:-3.14e+100
|
||||
E,df:-3.14e+100
|
||||
E,dg:-3.14e+100
|
||||
E,e:-3.14e+100
|
||||
E,en:-3.14e+100
|
||||
E,f:-3.14e+100
|
||||
E,g:-3.14e+100
|
||||
E,h:-3.14e+100
|
||||
E,i:-3.14e+100
|
||||
E,in:-3.14e+100
|
||||
E,j:-3.14e+100
|
||||
E,jn:-3.14e+100
|
||||
E,k:-3.14e+100
|
||||
E,l:-3.14e+100
|
||||
E,ln:-3.14e+100
|
||||
E,m:-3.14e+100
|
||||
E,mg:-3.14e+100
|
||||
E,mq:-3.14e+100
|
||||
E,n:-3.14e+100
|
||||
E,ng:-3.14e+100
|
||||
E,nr:-3.14e+100
|
||||
E,nrfg:-3.14e+100
|
||||
E,nrt:-3.14e+100
|
||||
E,ns:-3.14e+100
|
||||
E,nt:-3.14e+100
|
||||
E,nz:-3.14e+100
|
||||
E,o:-3.14e+100
|
||||
E,p:-3.14e+100
|
||||
E,q:-3.14e+100
|
||||
E,qe:-3.14e+100
|
||||
E,qg:-3.14e+100
|
||||
E,r:-3.14e+100
|
||||
E,rg:-3.14e+100
|
||||
E,rr:-3.14e+100
|
||||
E,rz:-3.14e+100
|
||||
E,s:-3.14e+100
|
||||
E,t:-3.14e+100
|
||||
E,tg:-3.14e+100
|
||||
E,u:-3.14e+100
|
||||
E,ud:-3.14e+100
|
||||
E,ug:-3.14e+100
|
||||
E,uj:-3.14e+100
|
||||
E,ul:-3.14e+100
|
||||
E,uv:-3.14e+100
|
||||
E,uz:-3.14e+100
|
||||
E,v:-3.14e+100
|
||||
E,vd:-3.14e+100
|
||||
E,vg:-3.14e+100
|
||||
E,vi:-3.14e+100
|
||||
E,vn:-3.14e+100
|
||||
E,vq:-3.14e+100
|
||||
E,w:-3.14e+100
|
||||
E,x:-3.14e+100
|
||||
E,y:-3.14e+100
|
||||
E,yg:-3.14e+100
|
||||
E,z:-3.14e+100
|
||||
E,zg:-3.14e+100
|
||||
M,a:-3.14e+100
|
||||
M,ad:-3.14e+100
|
||||
M,ag:-3.14e+100
|
||||
M,an:-3.14e+100
|
||||
M,b:-3.14e+100
|
||||
M,bg:-3.14e+100
|
||||
M,c:-3.14e+100
|
||||
M,d:-3.14e+100
|
||||
M,df:-3.14e+100
|
||||
M,dg:-3.14e+100
|
||||
M,e:-3.14e+100
|
||||
M,en:-3.14e+100
|
||||
M,f:-3.14e+100
|
||||
M,g:-3.14e+100
|
||||
M,h:-3.14e+100
|
||||
M,i:-3.14e+100
|
||||
M,in:-3.14e+100
|
||||
M,j:-3.14e+100
|
||||
M,jn:-3.14e+100
|
||||
M,k:-3.14e+100
|
||||
M,l:-3.14e+100
|
||||
M,ln:-3.14e+100
|
||||
M,m:-3.14e+100
|
||||
M,mg:-3.14e+100
|
||||
M,mq:-3.14e+100
|
||||
M,n:-3.14e+100
|
||||
M,ng:-3.14e+100
|
||||
M,nr:-3.14e+100
|
||||
M,nrfg:-3.14e+100
|
||||
M,nrt:-3.14e+100
|
||||
M,ns:-3.14e+100
|
||||
M,nt:-3.14e+100
|
||||
M,nz:-3.14e+100
|
||||
M,o:-3.14e+100
|
||||
M,p:-3.14e+100
|
||||
M,q:-3.14e+100
|
||||
M,qe:-3.14e+100
|
||||
M,qg:-3.14e+100
|
||||
M,r:-3.14e+100
|
||||
M,rg:-3.14e+100
|
||||
M,rr:-3.14e+100
|
||||
M,rz:-3.14e+100
|
||||
M,s:-3.14e+100
|
||||
M,t:-3.14e+100
|
||||
M,tg:-3.14e+100
|
||||
M,u:-3.14e+100
|
||||
M,ud:-3.14e+100
|
||||
M,ug:-3.14e+100
|
||||
M,uj:-3.14e+100
|
||||
M,ul:-3.14e+100
|
||||
M,uv:-3.14e+100
|
||||
M,uz:-3.14e+100
|
||||
M,v:-3.14e+100
|
||||
M,vd:-3.14e+100
|
||||
M,vg:-3.14e+100
|
||||
M,vi:-3.14e+100
|
||||
M,vn:-3.14e+100
|
||||
M,vq:-3.14e+100
|
||||
M,w:-3.14e+100
|
||||
M,x:-3.14e+100
|
||||
M,y:-3.14e+100
|
||||
M,yg:-3.14e+100
|
||||
M,z:-3.14e+100
|
||||
M,zg:-3.14e+100
|
||||
S,a:-3.90253968313
|
||||
S,ad:-11.0484584802
|
||||
S,ag:-6.95411391796
|
||||
S,an:-12.8402179494
|
||||
S,b:-6.47288876397
|
||||
S,bg:-3.14e+100
|
||||
S,c:-4.78696679586
|
||||
S,d:-3.90391976418
|
||||
S,df:-3.14e+100
|
||||
S,dg:-8.9483976513
|
||||
S,e:-5.94251300628
|
||||
S,en:-3.14e+100
|
||||
S,f:-5.19482024998
|
||||
S,g:-6.50782681533
|
||||
S,h:-8.65056320738
|
||||
S,i:-3.14e+100
|
||||
S,in:-3.14e+100
|
||||
S,j:-4.91199211964
|
||||
S,jn:-3.14e+100
|
||||
S,k:-6.94032059583
|
||||
S,l:-3.14e+100
|
||||
S,ln:-3.14e+100
|
||||
S,m:-3.26920065212
|
||||
S,mg:-10.8253149289
|
||||
S,mq:-3.14e+100
|
||||
S,n:-3.85514838976
|
||||
S,ng:-4.9134348611
|
||||
S,nr:-4.48366310396
|
||||
S,nrfg:-3.14e+100
|
||||
S,nrt:-3.14e+100
|
||||
S,ns:-3.14e+100
|
||||
S,nt:-12.1470707689
|
||||
S,nz:-3.14e+100
|
||||
S,o:-8.46446092775
|
||||
S,p:-2.98684018136
|
||||
S,q:-4.88865861826
|
||||
S,qe:-3.14e+100
|
||||
S,qg:-3.14e+100
|
||||
S,r:-2.76353367841
|
||||
S,rg:-10.2752685919
|
||||
S,rr:-3.14e+100
|
||||
S,rz:-3.14e+100
|
||||
S,s:-3.14e+100
|
||||
S,t:-3.14e+100
|
||||
S,tg:-6.27284253188
|
||||
S,u:-6.94032059583
|
||||
S,ud:-7.72823016105
|
||||
S,ug:-7.53940370266
|
||||
S,uj:-6.85251045118
|
||||
S,ul:-8.41537131755
|
||||
S,uv:-8.15808672229
|
||||
S,uz:-9.29925862537
|
||||
S,v:-3.05329230341
|
||||
S,vd:-3.14e+100
|
||||
S,vg:-5.94301818437
|
||||
S,vi:-3.14e+100
|
||||
S,vn:-11.4539235883
|
||||
S,vq:-3.14e+100
|
||||
S,w:-3.14e+100
|
||||
S,x:-8.42741965607
|
||||
S,y:-6.19707946995
|
||||
S,yg:-13.53336513
|
||||
S,z:-3.14e+100
|
||||
S,zg:-3.14e+100
|
5222
dict/pos_dict/prob_trans.utf8
Normal file
5222
dict/pos_dict/prob_trans.utf8
Normal file
File diff suppressed because it is too large
Load Diff
71
src/PosTagger.hpp
Normal file
71
src/PosTagger.hpp
Normal file
@ -0,0 +1,71 @@
|
||||
#ifndef CPPJIEBA_POS_TAGGING_H
|
||||
#define CPPJIEBA_POS_TAGGING_H
|
||||
|
||||
#include "MixSegment.hpp"
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "Trie.hpp"
|
||||
#include "TrieManager.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
|
||||
class PosTagger: public InitOnOff
|
||||
{
|
||||
private:
|
||||
MixSegment _segment;
|
||||
Trie* _trie;
|
||||
|
||||
public:
|
||||
PosTagger(){_setInitFlag(false);};
|
||||
explicit PosTagger(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string transProb)
|
||||
{
|
||||
_setInitFlag(init(dictPath, hmmFilePath, charStatus, startProb, emitProb, endProb, transProb));
|
||||
};
|
||||
~PosTagger(){};
|
||||
public:
|
||||
bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string transProb)
|
||||
{
|
||||
if (_getInitFlag())
|
||||
{
|
||||
LogError("already inited before.");
|
||||
return false;
|
||||
}
|
||||
_trie = TrieManager::getInstance().getTrie(dictPath.c_str());
|
||||
if (NULL == _trie)
|
||||
{
|
||||
LogError("get a NULL pointor from getTrie(\"%s\").", dictPath.c_str());
|
||||
return false;
|
||||
}
|
||||
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
||||
};
|
||||
|
||||
bool tag(const string& src, vector<pair<string, string> >& res)
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
vector<string> cutRes;
|
||||
if (!_segment.cut(src, cutRes))
|
||||
{
|
||||
LogError("_mixSegment cut failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
const TrieNodeInfo *tmp = NULL;
|
||||
Unicode unico;
|
||||
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
|
||||
{
|
||||
if (!TransCode::decode(*itr, unico))
|
||||
{
|
||||
LogError("decode failed.");
|
||||
return false;
|
||||
}
|
||||
tmp = _trie->find(unico.begin(), unico.end());
|
||||
res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
|
||||
}
|
||||
tmp = NULL;
|
||||
return !res.empty();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
12
test/tagging_demo.cpp
Normal file
12
test/tagging_demo.cpp
Normal file
@ -0,0 +1,12 @@
|
||||
#include "../src/PosTagger.hpp"
|
||||
using namespace CppJieba;
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "", "", "", "", "");
|
||||
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。");
|
||||
vector<pair<string, string> > res;
|
||||
tagger.tag(s, res);
|
||||
cout << res << endl;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user