Merge pull request #21 from aholic/dev

add part of speech
This commit is contained in:
Yanyi Wu 2014-02-27 12:05:18 +08:00
commit 2159798685
7 changed files with 12402 additions and 10 deletions

View File

@ -5,7 +5,8 @@
之所以全写成hpp文件是因为这样在别的项目需要使用到中文分词功能的时候直接`#include"xx.hpp" `进来就可以使用,无需麻烦的链接。
实践证明写成hpp使用起来真的很爽，在后面提到的在iOS应用中的使用和包装成`Node.js`的扩展[NodeJieba]都特别顺利。
`
如果对代码细节感兴趣的请见 [代码详解]
## 中文编码
@ -169,20 +170,26 @@ you will see:
关键词抽取的demo代码请见`test/keyword_demo.cpp`
## 代码详解
## 相关应用
详见http://aszxqw.com/jekyll/update/2014/02/10/cppjieba-dai-ma-xiang-jie.html
## 关于CppJieba的跨语言包装使用
### 关于CppJieba的跨语言包装使用
收到邮件询问跨语言包装（iOS应用开发）使用的问题，这方面我没有相关的经验，建议参考如下Python使用CppJieba的项目：
[jannson] 开发的供 python模块调用的项目 [cppjiebapy] , 和相关讨论 [cppjiebapy_discussion] .
## NodeJieba
### NodeJieba
如果有需要在`nodejs`中使用分词,不妨试一下[NodeJieba]。
如果有需要在`node.js`中使用分词,不妨试一下[NodeJieba]。
### simhash
如果需要计算中文文档的相似度，不妨试一下[simhash]。
## 演示
http://cppjieba-webdemo.herokuapp.com/
(建议使用chrome打开)
## 客服
@ -190,13 +197,15 @@ you will see:
## 鸣谢
"结巴中文"分词作者: SunJunyi
"结巴"中文分词作者: SunJunyi
https://github.com/fxsjy/jieba
顾名思义之所以叫CppJieba是参照SunJunyi大神的Jieba分词Python程序写成的所以饮水思源再次感谢SunJunyi。
顾名思义，之所以叫CppJieba，是参照Jieba分词Python程序写成的，所以饮水思源，再次感谢SunJunyi。
[CppJieba]:https://github.com/aszxqw/cppjieba
[jannson]:https://github.com/jannson
[cppjiebapy]:https://github.com/jannson/cppjiebapy
[cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1
[NodeJieba]:https://github.com/aszxqw/nodejieba
[simhash]:https://github.com/aszxqw/simhash
[代码详解]:http://aszxqw.github.io/jekyll/update/2014/02/10/cppjieba-dai-ma-xiang-jie.html

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,259 @@
#初始状态的概率
#格式
#状态:概率
B,a:-4.7623052146
B,ad:-6.68006603678
B,ag:-3.14e+100
B,an:-8.69708322302
B,b:-5.01837436211
B,bg:-3.14e+100
B,c:-3.42388018495
B,d:-3.97504752976
B,df:-8.88897423083
B,dg:-3.14e+100
B,e:-8.56355183039
B,en:-3.14e+100
B,f:-5.49163041848
B,g:-3.14e+100
B,h:-13.53336513
B,i:-6.11578472756
B,in:-3.14e+100
B,j:-5.05761912847
B,jn:-3.14e+100
B,k:-3.14e+100
B,l:-4.90588358466
B,ln:-3.14e+100
B,m:-3.6524299819
B,mg:-3.14e+100
B,mq:-6.7869530014
B,n:-1.69662577975
B,ng:-3.14e+100
B,nr:-2.23104959138
B,nrfg:-5.87372217541
B,nrt:-4.98564273352
B,ns:-2.8228438315
B,nt:-4.84609166818
B,nz:-3.94698846058
B,o:-8.43349870215
B,p:-4.20098413209
B,q:-6.99812385896
B,qe:-3.14e+100
B,qg:-3.14e+100
B,r:-3.40981877908
B,rg:-3.14e+100
B,rr:-12.4347528413
B,rz:-7.94611647157
B,s:-5.52267359084
B,t:-3.36474790945
B,tg:-3.14e+100
B,u:-9.1639172775
B,ud:-3.14e+100
B,ug:-3.14e+100
B,uj:-3.14e+100
B,ul:-3.14e+100
B,uv:-3.14e+100
B,uz:-3.14e+100
B,v:-2.67405848743
B,vd:-9.04472876024
B,vg:-3.14e+100
B,vi:-12.4347528413
B,vn:-4.33156108902
B,vq:-12.1470707689
B,w:-3.14e+100
B,x:-3.14e+100
B,y:-9.84448567586
B,yg:-3.14e+100
B,z:-7.04568111149
B,zg:-3.14e+100
E,a:-3.14e+100
E,ad:-3.14e+100
E,ag:-3.14e+100
E,an:-3.14e+100
E,b:-3.14e+100
E,bg:-3.14e+100
E,c:-3.14e+100
E,d:-3.14e+100
E,df:-3.14e+100
E,dg:-3.14e+100
E,e:-3.14e+100
E,en:-3.14e+100
E,f:-3.14e+100
E,g:-3.14e+100
E,h:-3.14e+100
E,i:-3.14e+100
E,in:-3.14e+100
E,j:-3.14e+100
E,jn:-3.14e+100
E,k:-3.14e+100
E,l:-3.14e+100
E,ln:-3.14e+100
E,m:-3.14e+100
E,mg:-3.14e+100
E,mq:-3.14e+100
E,n:-3.14e+100
E,ng:-3.14e+100
E,nr:-3.14e+100
E,nrfg:-3.14e+100
E,nrt:-3.14e+100
E,ns:-3.14e+100
E,nt:-3.14e+100
E,nz:-3.14e+100
E,o:-3.14e+100
E,p:-3.14e+100
E,q:-3.14e+100
E,qe:-3.14e+100
E,qg:-3.14e+100
E,r:-3.14e+100
E,rg:-3.14e+100
E,rr:-3.14e+100
E,rz:-3.14e+100
E,s:-3.14e+100
E,t:-3.14e+100
E,tg:-3.14e+100
E,u:-3.14e+100
E,ud:-3.14e+100
E,ug:-3.14e+100
E,uj:-3.14e+100
E,ul:-3.14e+100
E,uv:-3.14e+100
E,uz:-3.14e+100
E,v:-3.14e+100
E,vd:-3.14e+100
E,vg:-3.14e+100
E,vi:-3.14e+100
E,vn:-3.14e+100
E,vq:-3.14e+100
E,w:-3.14e+100
E,x:-3.14e+100
E,y:-3.14e+100
E,yg:-3.14e+100
E,z:-3.14e+100
E,zg:-3.14e+100
M,a:-3.14e+100
M,ad:-3.14e+100
M,ag:-3.14e+100
M,an:-3.14e+100
M,b:-3.14e+100
M,bg:-3.14e+100
M,c:-3.14e+100
M,d:-3.14e+100
M,df:-3.14e+100
M,dg:-3.14e+100
M,e:-3.14e+100
M,en:-3.14e+100
M,f:-3.14e+100
M,g:-3.14e+100
M,h:-3.14e+100
M,i:-3.14e+100
M,in:-3.14e+100
M,j:-3.14e+100
M,jn:-3.14e+100
M,k:-3.14e+100
M,l:-3.14e+100
M,ln:-3.14e+100
M,m:-3.14e+100
M,mg:-3.14e+100
M,mq:-3.14e+100
M,n:-3.14e+100
M,ng:-3.14e+100
M,nr:-3.14e+100
M,nrfg:-3.14e+100
M,nrt:-3.14e+100
M,ns:-3.14e+100
M,nt:-3.14e+100
M,nz:-3.14e+100
M,o:-3.14e+100
M,p:-3.14e+100
M,q:-3.14e+100
M,qe:-3.14e+100
M,qg:-3.14e+100
M,r:-3.14e+100
M,rg:-3.14e+100
M,rr:-3.14e+100
M,rz:-3.14e+100
M,s:-3.14e+100
M,t:-3.14e+100
M,tg:-3.14e+100
M,u:-3.14e+100
M,ud:-3.14e+100
M,ug:-3.14e+100
M,uj:-3.14e+100
M,ul:-3.14e+100
M,uv:-3.14e+100
M,uz:-3.14e+100
M,v:-3.14e+100
M,vd:-3.14e+100
M,vg:-3.14e+100
M,vi:-3.14e+100
M,vn:-3.14e+100
M,vq:-3.14e+100
M,w:-3.14e+100
M,x:-3.14e+100
M,y:-3.14e+100
M,yg:-3.14e+100
M,z:-3.14e+100
M,zg:-3.14e+100
S,a:-3.90253968313
S,ad:-11.0484584802
S,ag:-6.95411391796
S,an:-12.8402179494
S,b:-6.47288876397
S,bg:-3.14e+100
S,c:-4.78696679586
S,d:-3.90391976418
S,df:-3.14e+100
S,dg:-8.9483976513
S,e:-5.94251300628
S,en:-3.14e+100
S,f:-5.19482024998
S,g:-6.50782681533
S,h:-8.65056320738
S,i:-3.14e+100
S,in:-3.14e+100
S,j:-4.91199211964
S,jn:-3.14e+100
S,k:-6.94032059583
S,l:-3.14e+100
S,ln:-3.14e+100
S,m:-3.26920065212
S,mg:-10.8253149289
S,mq:-3.14e+100
S,n:-3.85514838976
S,ng:-4.9134348611
S,nr:-4.48366310396
S,nrfg:-3.14e+100
S,nrt:-3.14e+100
S,ns:-3.14e+100
S,nt:-12.1470707689
S,nz:-3.14e+100
S,o:-8.46446092775
S,p:-2.98684018136
S,q:-4.88865861826
S,qe:-3.14e+100
S,qg:-3.14e+100
S,r:-2.76353367841
S,rg:-10.2752685919
S,rr:-3.14e+100
S,rz:-3.14e+100
S,s:-3.14e+100
S,t:-3.14e+100
S,tg:-6.27284253188
S,u:-6.94032059583
S,ud:-7.72823016105
S,ug:-7.53940370266
S,uj:-6.85251045118
S,ul:-8.41537131755
S,uv:-8.15808672229
S,uz:-9.29925862537
S,v:-3.05329230341
S,vd:-3.14e+100
S,vg:-5.94301818437
S,vi:-3.14e+100
S,vn:-11.4539235883
S,vq:-3.14e+100
S,w:-3.14e+100
S,x:-8.42741965607
S,y:-6.19707946995
S,yg:-13.53336513
S,z:-3.14e+100
S,zg:-3.14e+100

File diff suppressed because it is too large Load Diff

71
src/PosTagger.hpp Normal file
View File

@ -0,0 +1,71 @@
#ifndef CPPJIEBA_POS_TAGGING_H
#define CPPJIEBA_POS_TAGGING_H
#include "MixSegment.hpp"
#include "Limonp/str_functs.hpp"
#include "Trie.hpp"
#include "TrieManager.hpp"
namespace CppJieba
{
using namespace Limonp;
/**
 * Part-of-speech tagger.
 *
 * Cuts a sentence into words with MixSegment, then looks each word up in the
 * dictionary trie and pairs it with the tag stored there. Words that are not
 * found in the trie are given the tag "x".
 */
class PosTagger: public InitOnOff
{
    private:
        MixSegment _segment;
        // Fetched from TrieManager in init(); this class never deletes it.
        // Kept NULL until init() has looked it up so that it is never read
        // in an indeterminate state.
        Trie* _trie;
    public:
        /** Creates an uninitialised tagger; call init() before tag(). */
        PosTagger(): _trie(NULL)
        {
            _setInitFlag(false);
        }
        /**
         * Creates a tagger and initialises it immediately via init().
         * The init flag records whether initialisation succeeded.
         */
        explicit PosTagger(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb): _trie(NULL)
        {
            // init() consults the flag first, so make sure it starts out false.
            _setInitFlag(false);
            _setInitFlag(init(dictPath, hmmFilePath, charStatus, startProb, emitProb, endProb, transProb));
        }
        ~PosTagger(){}
    public:
        /**
         * Loads the dictionary trie and initialises the segmenter.
         *
         * @param dictPath     dictionary file; used both for the trie lookup
         *                     and for the segmenter.
         * @param hmmFilePath  HMM model file passed to the segmenter.
         * @param charStatus   currently unused.
         * @param startProb    currently unused.
         * @param emitProb     currently unused.
         * @param endProb      currently unused.
         * @param transProb    currently unused.
         * @return true on success; false if already initialised, if no trie
         *         could be obtained for dictPath, or if the segmenter failed
         *         to initialise.
         */
        bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
        {
            if (_getInitFlag())
            {
                LogError("already inited before.");
                return false;
            }
            _trie = TrieManager::getInstance().getTrie(dictPath.c_str());
            if (NULL == _trie)
            {
                LogError("get a NULL pointor from getTrie(\"%s\").", dictPath.c_str());
                return false;
            }
            return _setInitFlag(_segment.init(dictPath, hmmFilePath));
        }
        /**
         * Segments src and appends one (word, tag) pair per word to res.
         *
         * res is appended to, not cleared. If decoding a word fails part-way
         * through, the pairs appended so far remain in res.
         *
         * @param src  text to tag.
         * @param res  output vector receiving (word, tag) pairs.
         * @return false if the tagger is not initialised, if segmentation or
         *         decoding fails, or if res is empty afterwards; true otherwise.
         */
        bool tag(const string& src, vector<pair<string, string> >& res)
        {
            assert(_getInitFlag());
            // assert() is compiled out under NDEBUG, so guard at runtime too:
            // an uninitialised tagger must never dereference _trie.
            if (!_getInitFlag() || NULL == _trie)
            {
                LogError("not inited.");
                return false;
            }
            vector<string> cutRes;
            if (!_segment.cut(src, cutRes))
            {
                LogError("_mixSegment cut failed");
                return false;
            }
            Unicode unico;
            for (vector<string>::const_iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
            {
                if (!TransCode::decode(*itr, unico))
                {
                    LogError("decode failed.");
                    return false;
                }
                const TrieNodeInfo* info = _trie->find(unico.begin(), unico.end());
                // Words missing from the dictionary get the fallback tag "x".
                res.push_back(make_pair(*itr, info == NULL ? "x" : info->tag));
            }
            return !res.empty();
        }
};
}
#endif

12
test/tagging_demo.cpp Normal file
View File

@ -0,0 +1,12 @@
#include "../src/PosTagger.hpp"
using namespace CppJieba;
// Demo: tag a sample sentence and print the resulting (word, tag) pairs.
// The dictionary paths are relative, so run it from a directory where
// ../dict resolves to the project's dict folder.
int main(int argc, char ** argv)
{
    // Command-line arguments are not used by this demo.
    (void)argc;
    (void)argv;

    PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "", "", "", "", "");
    string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。");
    vector<pair<string, string> > res;
    // tag() reports failure through its return value; do not print an empty
    // result and claim success when tagging did not work.
    if (!tagger.tag(s, res))
    {
        return EXIT_FAILURE;
    }
    cout << res << endl;
    return EXIT_SUCCESS;
}