Mirror of https://github.com/fxsjy/jieba.git (synced 2025-07-10 00:01:33 +08:00)
Compare commits
461 Commits
13 .gitignore (vendored)
@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects
 _UpgradeReport_Files/
 Backup*/
 UpgradeLog*.XML
+############
+## pycharm
+############
+.idea
+
 ############
 ## Windows
@ -161,3 +163,10 @@ pip-log.txt

 # Mac crap
 .DS_Store
+*.log
+test/tmp/*
+
+#jython
+*.class
+
+MANIFEST
196 Changelog (Normal file)
@ -0,0 +1,196 @@
2019-1-20: version 0.42.1
1. 修复setup.py在python2.7版本无法工作的问题 (issue #809)

2019-1-13: version 0.42
1. 修复paddle模式空字符串coredump问题 @JesseyXujin
2. 修复cut_all模式切分丢字问题 @fxsjy
3. paddle安装检测优化 @vissssa

2019-1-8: version 0.41
1. 开启paddle模式更友好
2. 修复cut_all模式不支持中英混合词的bug

2019-12-25: version 0.40
1. 支持基于paddle的深度学习分词模式(use_paddle=True); by @JesseyXujin, @xyzhou-puck
2. 修复自定义Tokenizer实例的add_word方法指向全局的问题; by @linhx13
3. 修复whoosh测试用例的引用bug; by @ZhengZixiang
4. 修复自定义词库不支持含"-"符号的问题;by @JimCurryWang

2017-08-28: version 0.39
1. del_word支持强行拆开词语; by @gumblex,@fxsjy
2. 修复百分数的切词; by @fxsjy
3. 修复HMM=False在多进程模式下的bug; by @huntzhan

2015-12-16: version 0.38
1. 通过pkg_resources载入默认词典,支持在Spark等平台上运行, by @gumblex;
2. 扩充识别的汉字unicode范围:[\u4E00-\u9FD5], by @gumblex;
3. 关键词提取支持返回词性,修复posseg分词得到的pair做dict关键字的问题,by @jerryday;
4. 修复load_userdict加载用户词典不能识别含有空格等特殊字符的问题, by @gumblex;
5. 命令行分词支持返回词性, by @gumblex;

2015-06-27: version 0.37
1. 代码重构,分词器封装为Class,支持实例化,by @gumblex (https://github.com/fxsjy/jieba/commit/94840a734c32cfece05c0c3ec236ffc3d36b4ae6)
2. 修复cut_for_search的bug,完善posseg; by @gumblex
3. 修复posseg在0.36中引入的一处bug; by @wangbin
4. 修复load_userdict异常处理的bug; by @gip0
5. 修复生成词典二进制cache文件时跨文件系统的bug, 支持自定义; by @gumblex

2015-03-20: version 0.36
1. 代码同时兼容python2与python3, 若干性能优化; by @gumblex
2. 解决用户添加词的概率自动计算问题,分词更加准确;by @gumblex
3. 可自定义cache_file的文件系统路径; by @changyy
4. TextRank算法实现完善; by @sing1ee,@walkskyer

2014-11-15: version 0.35.1
1. 修复 Python 3.2 的兼容性问题

2014-11-13: version 0.35
1. 改进词典cache的dump和加载机制;by @gumblex
2. 提升关键词提取的性能; by @gumblex
3. 关键词提取新增基于textrank算法的子模块; by @singlee
4. 修复自定义stopwords功能的bug; by @walkskyer

2014-10-20: version 0.34
1. 提升性能,词典结构由Trie改为Prefix Set,内存占用减少2/3, 详见:https://github.com/fxsjy/jieba/pull/187;by @gumblex
2. 修复关键词提取功能的性能问题

2014-08-31: version 0.33
1. 支持自定义stop words; by @fukuball
2. 支持自定义idf词典; by @fukuball
3. 修复自定义词典的词性不能正常显示的bug; by @ShuraChow

2014-02-07: version 0.32
1. 新增分词选项:可以关闭新词发现功能;详见:https://github.com/fxsjy/jieba/blob/master/test/test_no_hmm.py#L8
2. 修复posseg子模块的Bug;详见: https://github.com/fxsjy/jieba/issues/111 https://github.com/fxsjy/jieba/issues/132
3. ChineseAnalyzer提供了更好的英文支持(感谢@jannson),例如单词Stemming; 详见:https://github.com/fxsjy/jieba/pull/106

2013-07-01: version 0.31
1. 修改了代码缩进格式,遵循PEP8标准
2. 支持Jython解析器,感谢 @piaolingxue
3. 修复中英混合词汇不能识别数字在前词语的Bug
4. 部分代码重构,感谢 @chao78787
5. 多进程并行分词模式下自动检测CPU个数设置合适的进程数,感谢@linkerlin
6. 修复了0.3版中jieba.extra_tags方法对whoosh模块的错误依赖

2013-07-01: version 0.30
==========================
1) 新增jieba.tokenize方法,返回每个词的起始位置
2) 新增ChineseAnalyzer,用于支持whoosh搜索引擎
3)添加了更多的中英混合词汇
4)修改了一些py文件的加载方法,从而支持py2exe,cxfree打包为exe

2013-06-17: version 0.29.1
==========================
1) 优化了viterbi算法的代码,分词速度提升15%
2) 去除了词典中的一些低质词

2013-06-07: version 0.29
==========================
1) 提升了finalseg子模块命名体识别的准确度
2) 修正了一些badcase

2013-06-01: version 0.28.4
==========================
1) 修正了一些badcase
2) add wraps decorator, by @cloudaice
3) unittest, by @cloudaice

2013-05-02: version 0.28.3
==========================
1) 修正了临时cache文件生成在pypy解析器下出错的问题

2013-04-28: version 0.28.2
==========================
1) 修正了initialize函数默认参数绑定的bug.

2013-04-27: version 0.28
========================
1) 新增词典lazy load功能,用户可以在'import jieba'后再改变词典的路径. 感谢hermanschaaf
2) 显示词典加载异常时错误的词条信息. 感谢neuront
3) 修正了词典被vim编辑后会加载失败的bug. 感谢neuront

2013-04-22: version 0.27
========================
1) 新增并行分词功能,可以在多核计算机上显著提高分词速度
2) 修正了“的”字频过高引起的bug;修正了对小数点和下划线的处理
3) 修正了python2.6存在的兼容性问题

2013-04-07: version 0.26
========================
1) 改进了对标点符号的处理,之前的版本会过滤掉所有的标点符号;
2) 允许用户在自定义词典中添加词性;
3) 改进了关键词提取的功能jieba.analyse.extract_tags;
4) 修复了一个在pypy解释器下运行的bug.

2013-02-18: version 0.25
========================
1)支持繁体中文的分词
2)修正了多python进程时生成cache文件失败的bug

2012-12-28: version 0.24
========================
1) 解决了没有标点的长句子分词效果差的问题,问题在于连续的小概率乘法可能会导致浮点下溢或为0.
2) 修复了0.23的全模式下英文分词的bug

2012-12-12: version 0.23
========================
1) 修复了之前版本不能识别中英混合词语的问题

2012-11-28: version 0.22
========================
1) 新增jieba.cut_for_search方法, 该方法在精确分词的基础上对“长词”进行再次切分,适用于搜索引擎领域的分词,比精确分词模式有更高的召回率。
2) 开始支持Python3.x版。 之前一直是只支持Python2.x系列,从这个版本起有一个单独的jieba3k

2012-11-23: version 0.21
========================
1) 修复了全模式分词中散字过多的问题
2) 用户自定义词典函数load_userdict支持file-like object作为输入

2012-11-06: version 0.20
========================
1) 新增词性标注功能

2012-10-25: version 0.19
========================
1) 提升了模块加载的速度
2) 增加了用户自定义词典的接口

2012-10-16: version 0.18
========================
1) 增加关键词提取功能

2012-10-12: version 0.17
========================
1) 将词典文件dict.txt排序后存储,提升了Trie树构建速度,使得组件初始化时间缩短了10%;
2) 增强了人名词语的训练,增强了未登录人名词语的识别能力

2012-10-09: version 0.16
========================
1)将求最优切分路径的记忆化递归搜索算法改用循环实现,使分词速度提高了15%
2) 修复了Viterbi算法实现上的一个Bug

2012-10-07: version 0.14
========================
1) 结巴分词被发布到了pypi,用户可以通过easy_install或者pip快速安装该组件;
2) 合并了搜狗开源词库2006版,删除了一些低频词
3) 优化了代码,缩短了程序初始化时间。
4) 增加了在线效果演示
20 LICENSE (Normal file)
@ -0,0 +1,20 @@
The MIT License (MIT)

Copyright (c) 2013 Sun Junyi

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
2 MANIFEST.in (Normal file)
@ -0,0 +1,2 @@
graft README.md
graft Changelog
825 README.md
@ -1,125 +1,483 @@
|
|||||||
jieba
|
jieba
|
||||||
========
|
========
|
||||||
"结巴"中文分词:做最好的Python中文分词组件
|
“结巴”中文分词:做最好的 Python 中文分词组件
|
||||||
|
|
||||||
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
|
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
|
||||||
|
|
||||||
- _Scroll down for English documentation._
|
- _Scroll down for English documentation._
|
||||||
|
|
||||||
Feature
|
|
||||||
========
|
|
||||||
* 支持两种分词模式:
|
|
||||||
* 1)精确模式,试图将句子最精确地切开,适合文本分析;
|
|
||||||
* 2)全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义;
|
|
||||||
* 3) 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
|
|
||||||
|
|
||||||
Usage
|
特点
|
||||||
========
|
========
|
||||||
* 全自动安装:`easy_install jieba` 或者 `pip install jieba`
|
* 支持四种分词模式:
|
||||||
* 半自动安装:先下载http://pypi.python.org/pypi/jieba/ ,解压后运行python setup.py install
|
* 精确模式,试图将句子最精确地切开,适合文本分析;
|
||||||
* 手动安装:将jieba目录放置于当前目录或者site-packages目录
|
* 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义;
|
||||||
* 通过import jieba 来引用 (第一次import时需要构建Trie树,需要几秒时间)
|
* 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
|
||||||
|
* paddle模式,利用PaddlePaddle深度学习框架,训练序列标注(双向GRU)网络模型实现分词。同时支持词性标注。paddle模式使用需安装paddlepaddle-tiny,`pip install paddlepaddle-tiny==1.6.1`。目前paddle模式支持jieba v0.40及以上版本。jieba v0.40以下版本,请升级jieba,`pip install jieba --upgrade` 。[PaddlePaddle官网](https://www.paddlepaddle.org.cn/)
|
||||||
|
* 支持繁体分词
|
||||||
|
* 支持自定义词典
|
||||||
|
* MIT 授权协议
|
||||||
|
|
||||||
Algorithm
|
安装说明
|
||||||
|
=======
|
||||||
|
|
||||||
|
代码对 Python 2/3 均兼容
|
||||||
|
|
||||||
|
* 全自动安装:`easy_install jieba` 或者 `pip install jieba` / `pip3 install jieba`
|
||||||
|
* 半自动安装:先下载 http://pypi.python.org/pypi/jieba/ ,解压后运行 `python setup.py install`
|
||||||
|
* 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
|
||||||
|
* 通过 `import jieba` 来引用
|
||||||
|
* 如果需要使用paddle模式下的分词和词性标注功能,请先安装paddlepaddle-tiny,`pip install paddlepaddle-tiny==1.6.1`。
|
||||||
|
|
||||||
|
算法
|
||||||
========
|
========
|
||||||
* 基于Trie树结构实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图(DAG)
|
* 基于前缀词典实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图 (DAG)
|
||||||
* 采用了动态规划查找最大概率路径, 找出基于词频的最大切分组合
|
* 采用了动态规划查找最大概率路径, 找出基于词频的最大切分组合
|
||||||
* 对于未登录词,采用了基于汉字成词能力的HMM模型,使用了Viterbi算法
|
* 对于未登录词,采用了基于汉字成词能力的 HMM 模型,使用了 Viterbi 算法
|
||||||
|
|
||||||
功能 1):分词
|
主要功能
|
||||||
==========
|
=======
|
||||||
* `jieba.cut`方法接受两个输入参数: 1) 第一个参数为需要分词的字符串 2)cut_all参数用来控制是否采用全模式
|
1. 分词
|
||||||
* `jieba.cut_for_search`方法接受一个参数:需要分词的字符串,该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细
|
--------
|
||||||
* 注意:待分词的字符串可以是gbk字符串、utf-8字符串或者unicode
|
* `jieba.cut` 方法接受四个输入参数: 需要分词的字符串;cut_all 参数用来控制是否采用全模式;HMM 参数用来控制是否使用 HMM 模型;use_paddle 参数用来控制是否使用paddle模式下的分词模式,paddle模式采用延迟加载方式,通过enable_paddle接口安装paddlepaddle-tiny,并且import相关代码;
|
||||||
* `jieba.cut`以及`jieba.cut_for_search`返回的结构都是一个可迭代的generator,可以使用for循环来获得分词后得到的每一个词语(unicode),也可以用list(jieba.cut(...))转化为list
|
* `jieba.cut_for_search` 方法接受两个参数:需要分词的字符串;是否使用 HMM 模型。该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细
|
||||||
|
* 待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。注意:不建议直接输入 GBK 字符串,可能无法预料地错误解码成 UTF-8
|
||||||
|
* `jieba.cut` 以及 `jieba.cut_for_search` 返回的结构都是一个可迭代的 generator,可以使用 for 循环来获得分词后得到的每一个词语(unicode),或者用
|
||||||
|
* `jieba.lcut` 以及 `jieba.lcut_for_search` 直接返回 list
|
||||||
|
* `jieba.Tokenizer(dictionary=DEFAULT_DICT)` 新建自定义分词器,可用于同时使用不同词典。`jieba.dt` 为默认分词器,所有全局分词相关函数都是该分词器的映射。
|
||||||
|
|
||||||
代码示例( 分词 )
|
代码示例
|
||||||
|
|
||||||
#encoding=utf-8
|
```python
|
||||||
import jieba
|
# encoding=utf-8
|
||||||
|
import jieba
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
|
jieba.enable_paddle()# 启动paddle模式。 0.40版之后开始支持,早期版本不支持
|
||||||
print "Full Mode:", "/ ".join(seg_list) #全模式
|
strs=["我来到北京清华大学","乒乓球拍卖完了","中国科学技术大学"]
|
||||||
|
for str in strs:
|
||||||
|
seg_list = jieba.cut(str,use_paddle=True) # 使用paddle模式
|
||||||
|
print("Paddle Mode: " + '/'.join(list(seg_list)))
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
|
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
|
||||||
print "Default Mode:", "/ ".join(seg_list) #精确模式
|
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
|
||||||
|
|
||||||
seg_list = jieba.cut("他来到了网易杭研大厦")
|
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
|
||||||
print ", ".join(seg_list)
|
print("Default Mode: " + "/ ".join(seg_list)) # 精确模式
|
||||||
|
|
||||||
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
|
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
|
||||||
print ", ".join(seg_list)
|
print(", ".join(seg_list))
|
||||||
|
|
||||||
Output:
|
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
|
||||||
|
print(", ".join(seg_list))
|
||||||
|
```
|
||||||
|
|
||||||
【全模式】: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
|
输出:
|
||||||
|
|
||||||
【精确模式】: 我/ 来到/ 北京/ 清华大学
|
【全模式】: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
|
||||||
|
|
||||||
【新词识别】:他, 来到, 了, 网易, 杭研, 大厦 (此处,“杭研”并没有在词典中,但是也被Viterbi算法识别出来了)
|
【精确模式】: 我/ 来到/ 北京/ 清华大学
|
||||||
|
|
||||||
【搜索引擎模式】: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在
|
【新词识别】:他, 来到, 了, 网易, 杭研, 大厦 (此处,“杭研”并没有在词典中,但是也被Viterbi算法识别出来了)
|
||||||
, 日本, 京都, 大学, 日本京都大学, 深造
|
|
||||||
|
|
||||||
功能 2) :添加自定义词典
|
【搜索引擎模式】: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造
|
||||||
================
|
|
||||||
|
2. 添加自定义词典
|
||||||
|
----------------
|
||||||
|
|
||||||
|
### 载入词典
|
||||||
|
|
||||||
|
* 开发者可以指定自己自定义的词典,以便包含 jieba 词库里没有的词。虽然 jieba 有新词识别能力,但是自行添加新词可以保证更高的正确率
|
||||||
|
* 用法: jieba.load_userdict(file_name) # file_name 为文件类对象或自定义词典的路径
|
||||||
|
* 词典格式和 `dict.txt` 一样,一个词占一行;每一行分三部分:词语、词频(可省略)、词性(可省略),用空格隔开,顺序不可颠倒。`file_name` 若为路径或二进制方式打开的文件,则文件必须为 UTF-8 编码。
|
||||||
|
* 词频省略时使用自动计算的能保证分出该词的词频。
|
||||||
|
|
||||||
|
**例如:**
|
||||||
|
|
||||||
|
```
|
||||||
|
创新办 3 i
|
||||||
|
云计算 5
|
||||||
|
凱特琳 nz
|
||||||
|
台中
|
||||||
|
```
|
||||||
|
|
||||||
|
* 更改分词器(默认为 `jieba.dt`)的 `tmp_dir` 和 `cache_file` 属性,可分别指定缓存文件所在的文件夹及其文件名,用于受限的文件系统。
|
||||||
|
|
||||||
* 开发者可以指定自己自定义的词典,以便包含jieba词库里没有的词。虽然jieba有新词识别能力,但是自行添加新词可以保证更高的正确率
|
|
||||||
* 用法: jieba.load_userdict(file_name) # file_name为自定义词典的路径
|
|
||||||
* 词典格式和`analyse/idf.txt`一样,一个词占一行;每一行分为两部分,一部分为词语,另一部分为词频,用空格隔开
|
|
||||||
* 范例:
|
* 范例:
|
||||||
|
|
||||||
云计算 5
|
* 自定义词典:https://github.com/fxsjy/jieba/blob/master/test/userdict.txt
|
||||||
李小福 2
|
|
||||||
创新办 3
|
|
||||||
|
|
||||||
之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/test_userdict.py
|
||||||
|
|
||||||
加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
|
||||||
|
|
||||||
* 代码示例:"通过用户自定义词典来增强歧义纠错能力" --- https://github.com/fxsjy/jieba/issues/14
|
|
||||||
|
|
||||||
功能 3) :关键词提取
|
* 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
||||||
================
|
|
||||||
* jieba.analyse.extract_tags(sentence,topK) #需要先import jieba.analyse
|
* 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
||||||
* setence为待提取的文本
|
|
||||||
* topK为返回几个TF/IDF权重最大的关键词,默认值为20
|
### 调整词典
|
||||||
|
|
||||||
|
* 使用 `add_word(word, freq=None, tag=None)` 和 `del_word(word)` 可在程序中动态修改词典。
|
||||||
|
* 使用 `suggest_freq(segment, tune=True)` 可调节单个词语的词频,使其能(或不能)被分出来。
|
||||||
|
|
||||||
|
* 注意:自动计算的词频在使用 HMM 新词发现功能时可能无效。
|
||||||
|
|
||||||
|
代码示例:
|
||||||
|
|
||||||
|
```pycon
|
||||||
|
>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
|
||||||
|
如果/放到/post/中将/出错/。
|
||||||
|
>>> jieba.suggest_freq(('中', '将'), True)
|
||||||
|
494
|
||||||
|
>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
|
||||||
|
如果/放到/post/中/将/出错/。
|
||||||
|
>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
|
||||||
|
「/台/中/」/正确/应该/不会/被/切开
|
||||||
|
>>> jieba.suggest_freq('台中', True)
|
||||||
|
69
|
||||||
|
>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
|
||||||
|
「/台中/」/正确/应该/不会/被/切开
|
||||||
|
```
|
||||||
|
|
||||||
|
* "通过用户自定义词典来增强歧义纠错能力" --- https://github.com/fxsjy/jieba/issues/14
|
||||||
|
|
||||||
|
3. 关键词提取
|
||||||
|
-------------
|
||||||
|
### 基于 TF-IDF 算法的关键词抽取
|
||||||
|
|
||||||
|
`import jieba.analyse`
|
||||||
|
|
||||||
|
* jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())
|
||||||
|
* sentence 为待提取的文本
|
||||||
|
* topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20
|
||||||
|
* withWeight 为是否一并返回关键词权重值,默认值为 False
|
||||||
|
* allowPOS 仅包括指定词性的词,默认值为空,即不筛选
|
||||||
|
* jieba.analyse.TFIDF(idf_path=None) 新建 TFIDF 实例,idf_path 为 IDF 频率文件
|
||||||
|
|
||||||
代码示例 (关键词提取)
|
代码示例 (关键词提取)
|
||||||
|
|
||||||
https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
|
https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
|
||||||
|
|
||||||
功能 4) : 词性标注
|
关键词提取所使用逆向文件频率(IDF)文本语料库可以切换成自定义语料库的路径
|
||||||
================
|
|
||||||
* 标注句子分词后每个词的词性,采用和ictclas兼容的标记法
|
* 用法: jieba.analyse.set_idf_path(file_name) # file_name为自定义语料库的路径
|
||||||
|
* 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/idf.txt.big
|
||||||
|
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_idfpath.py
|
||||||
|
|
||||||
|
关键词提取所使用停止词(Stop Words)文本语料库可以切换成自定义语料库的路径
|
||||||
|
|
||||||
|
* 用法: jieba.analyse.set_stop_words(file_name) # file_name为自定义语料库的路径
|
||||||
|
* 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
|
||||||
|
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
|
||||||
|
|
||||||
|
关键词一并返回关键词权重值示例
|
||||||
|
|
||||||
|
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py
|
||||||
|
|
||||||
|
### 基于 TextRank 算法的关键词抽取
|
||||||
|
|
||||||
|
* jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 直接使用,接口相同,注意默认过滤词性。
|
||||||
|
* jieba.analyse.TextRank() 新建自定义 TextRank 实例
|
||||||
|
|
||||||
|
算法论文: [TextRank: Bringing Order into Texts](http://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf)
|
||||||
|
|
||||||
|
#### 基本思想:
|
||||||
|
|
||||||
|
1. 将待抽取关键词的文本进行分词
|
||||||
|
2. 以固定窗口大小(默认为5,通过span属性调整),词之间的共现关系,构建图
|
||||||
|
3. 计算图中节点的PageRank,注意是无向带权图
|
||||||
|
|
||||||
|
#### 使用示例:
|
||||||
|
|
||||||
|
见 [test/demo.py](https://github.com/fxsjy/jieba/blob/master/test/demo.py)
|
||||||
|
|
||||||
|
4. 词性标注
|
||||||
|
-----------
|
||||||
|
* `jieba.posseg.POSTokenizer(tokenizer=None)` 新建自定义分词器,`tokenizer` 参数可指定内部使用的 `jieba.Tokenizer` 分词器。`jieba.posseg.dt` 为默认词性标注分词器。
|
||||||
|
* 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法。
|
||||||
|
* 除了jieba默认分词模式,提供paddle模式下的词性标注功能。paddle模式采用延迟加载方式,通过enable_paddle()安装paddlepaddle-tiny,并且import相关代码;
|
||||||
* 用法示例
|
* 用法示例
|
||||||
|
|
||||||
>>> import jieba.posseg as pseg
|
```pycon
|
||||||
>>> words =pseg.cut("我爱北京天安门")
|
>>> import jieba
|
||||||
>>> for w in words:
|
>>> import jieba.posseg as pseg
|
||||||
... print w.word,w.flag
|
>>> words = pseg.cut("我爱北京天安门") #jieba默认模式
|
||||||
...
|
>>> jieba.enable_paddle() #启动paddle模式。 0.40版之后开始支持,早期版本不支持
|
||||||
我 r
|
>>> words = pseg.cut("我爱北京天安门",use_paddle=True) #paddle模式
|
||||||
爱 v
|
>>> for word, flag in words:
|
||||||
北京 ns
|
... print('%s %s' % (word, flag))
|
||||||
天安门 ns
|
...
|
||||||
|
我 r
|
||||||
|
爱 v
|
||||||
|
北京 ns
|
||||||
|
天安门 ns
|
||||||
|
```
|
||||||
|
|
||||||
|
paddle模式词性标注对应表如下:
|
||||||
|
|
||||||
|
paddle模式词性和专名类别标签集合如下表,其中词性标签 24 个(小写字母),专名类别标签 4 个(大写字母)。
|
||||||
|
|
||||||
|
| 标签 | 含义 | 标签 | 含义 | 标签 | 含义 | 标签 | 含义 |
|
||||||
|
| ---- | -------- | ---- | -------- | ---- | -------- | ---- | -------- |
|
||||||
|
| n | 普通名词 | f | 方位名词 | s | 处所名词 | t | 时间 |
|
||||||
|
| nr | 人名 | ns | 地名 | nt | 机构名 | nw | 作品名 |
|
||||||
|
| nz | 其他专名 | v | 普通动词 | vd | 动副词 | vn | 名动词 |
|
||||||
|
| a | 形容词 | ad | 副形词 | an | 名形词 | d | 副词 |
|
||||||
|
| m | 数量词 | q | 量词 | r | 代词 | p | 介词 |
|
||||||
|
| c | 连词 | u | 助词 | xc | 其他虚词 | w | 标点符号 |
|
||||||
|
| PER | 人名 | LOC | 地名 | ORG | 机构名 | TIME | 时间 |
|
||||||
|
|
||||||
|
|
||||||
|
5. 并行分词
|
||||||
|
-----------
|
||||||
|
* 原理:将目标文本按行分隔后,把各行文本分配到多个 Python 进程并行分词,然后归并结果,从而获得分词速度的可观提升
|
||||||
|
* 基于 python 自带的 multiprocessing 模块,目前暂不支持 Windows
|
||||||
|
* 用法:
|
||||||
|
* `jieba.enable_parallel(4)` # 开启并行分词模式,参数为并行进程数
|
||||||
|
* `jieba.disable_parallel()` # 关闭并行分词模式
|
||||||
|
|
||||||
|
* 例子:https://github.com/fxsjy/jieba/blob/master/test/parallel/test_file.py
|
||||||
|
|
||||||
|
* 实验结果:在 4 核 3.4GHz Linux 机器上,对金庸全集进行精确分词,获得了 1MB/s 的速度,是单进程版的 3.3 倍。
|
||||||
|
|
||||||
|
* **注意**:并行分词仅支持默认分词器 `jieba.dt` 和 `jieba.posseg.dt`。
|
||||||
|
|
||||||
|
6. Tokenize:返回词语在原文的起止位置
|
||||||
|
----------------------------------
|
||||||
|
* 注意,输入参数只接受 unicode
|
||||||
|
* 默认模式
|
||||||
|
|
||||||
|
```python
|
||||||
|
result = jieba.tokenize(u'永和服装饰品有限公司')
|
||||||
|
for tk in result:
|
||||||
|
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
word 永和 start: 0 end:2
|
||||||
|
word 服装 start: 2 end:4
|
||||||
|
word 饰品 start: 4 end:6
|
||||||
|
word 有限公司 start: 6 end:10
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
* 搜索模式
|
||||||
|
|
||||||
|
```python
|
||||||
|
result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')
|
||||||
|
for tk in result:
|
||||||
|
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
word 永和 start: 0 end:2
|
||||||
|
word 服装 start: 2 end:4
|
||||||
|
word 饰品 start: 4 end:6
|
||||||
|
word 有限 start: 6 end:8
|
||||||
|
word 公司 start: 8 end:10
|
||||||
|
word 有限公司 start: 6 end:10
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
7. ChineseAnalyzer for Whoosh 搜索引擎
|
||||||
|
--------------------------------------------
|
||||||
|
* 引用: `from jieba.analyse import ChineseAnalyzer`
|
||||||
|
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py
|
||||||
|
|
||||||
|
8. 命令行分词
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
使用示例:`python -m jieba news.txt > cut_result.txt`
|
||||||
|
|
||||||
|
命令行选项(翻译):
|
||||||
|
|
||||||
|
使用: python -m jieba [options] filename
|
||||||
|
|
||||||
|
结巴命令行界面。
|
||||||
|
|
||||||
|
固定参数:
|
||||||
|
filename 输入文件
|
||||||
|
|
||||||
|
可选参数:
|
||||||
|
-h, --help 显示此帮助信息并退出
|
||||||
|
-d [DELIM], --delimiter [DELIM]
|
||||||
|
使用 DELIM 分隔词语,而不是用默认的' / '。
|
||||||
|
若不指定 DELIM,则使用一个空格分隔。
|
||||||
|
-p [DELIM], --pos [DELIM]
|
||||||
|
启用词性标注;如果指定 DELIM,词语和词性之间
|
||||||
|
用它分隔,否则用 _ 分隔
|
||||||
|
-D DICT, --dict DICT 使用 DICT 代替默认词典
|
||||||
|
-u USER_DICT, --user-dict USER_DICT
|
||||||
|
使用 USER_DICT 作为附加词典,与默认词典或自定义词典配合使用
|
||||||
|
-a, --cut-all 全模式分词(不支持词性标注)
|
||||||
|
-n, --no-hmm 不使用隐含马尔可夫模型
|
||||||
|
-q, --quiet 不输出载入信息到 STDERR
|
||||||
|
-V, --version 显示版本信息并退出
|
||||||
|
|
||||||
|
如果没有指定文件名,则使用标准输入。
|
||||||
|
|
||||||
|
`--help` 选项输出:
|
||||||
|
|
||||||
|
$> python -m jieba --help
|
||||||
|
Jieba command line interface.
|
||||||
|
|
||||||
|
positional arguments:
|
||||||
|
filename input file
|
||||||
|
|
||||||
|
optional arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-d [DELIM], --delimiter [DELIM]
|
||||||
|
use DELIM instead of ' / ' for word delimiter; or a
|
||||||
|
space if it is used without DELIM
|
||||||
|
-p [DELIM], --pos [DELIM]
|
||||||
|
enable POS tagging; if DELIM is specified, use DELIM
|
||||||
|
instead of '_' for POS delimiter
|
||||||
|
-D DICT, --dict DICT use DICT as dictionary
|
||||||
|
-u USER_DICT, --user-dict USER_DICT
|
||||||
|
use USER_DICT together with the default dictionary or
|
||||||
|
DICT (if specified)
|
||||||
|
-a, --cut-all full pattern cutting (ignored with POS tagging)
|
||||||
|
-n, --no-hmm don't use the Hidden Markov Model
|
||||||
|
-q, --quiet don't print loading messages to stderr
|
||||||
|
-V, --version show program's version number and exit
|
||||||
|
|
||||||
|
If no filename specified, use STDIN instead.
|
||||||
|
|
||||||
|
延迟加载机制
|
||||||
|
------------
|
||||||
|
|
||||||
|
jieba 采用延迟加载,`import jieba` 和 `jieba.Tokenizer()` 不会立即触发词典的加载,一旦有必要才开始加载词典、构建前缀词典。如果需要手工初始化 jieba,也可以显式调用:
|
||||||
|
|
||||||
|
import jieba
|
||||||
|
jieba.initialize() # 手动初始化(可选)
|
||||||
|
|
||||||
|
|
||||||
|
在 0.28 之前的版本是不能指定主词典的路径的,有了延迟加载机制后,你可以改变主词典的路径:
|
||||||
|
|
||||||
|
jieba.set_dictionary('data/dict.txt.big')
|
||||||
|
|
||||||
|
例子: https://github.com/fxsjy/jieba/blob/master/test/test_change_dictpath.py
|
||||||
|
|
||||||
|
其他词典
|
||||||
|
========
|
||||||
|
1. 占用内存较小的词典文件
|
||||||
|
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
|
||||||
|
|
||||||
|
2. 支持繁体分词更好的词典文件
|
||||||
|
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
|
||||||
|
|
||||||
|
下载你所需要的词典,然后覆盖 jieba/dict.txt 即可;或者用 `jieba.set_dictionary('data/dict.txt.big')`
|
||||||
|
|
||||||
|
其他语言实现
|
||||||
|
==========
|
||||||
|
|
||||||
|
结巴分词 Java 版本
|
||||||
|
----------------
|
||||||
|
作者:piaolingxue
|
||||||
|
地址:https://github.com/huaban/jieba-analysis
|
||||||
|
|
||||||
|
结巴分词 C++ 版本
|
||||||
|
----------------
|
||||||
|
作者:yanyiwu
|
||||||
|
地址:https://github.com/yanyiwu/cppjieba
|
||||||
|
|
||||||
|
结巴分词 Rust 版本
|
||||||
|
----------------
|
||||||
|
作者:messense, MnO2
|
||||||
|
地址:https://github.com/messense/jieba-rs
|
||||||
|
|
||||||
|
结巴分词 Node.js 版本
|
||||||
|
----------------
|
||||||
|
作者:yanyiwu
|
||||||
|
地址:https://github.com/yanyiwu/nodejieba
|
||||||
|
|
||||||
|
结巴分词 Erlang 版本
|
||||||
|
----------------
|
||||||
|
作者:falood
|
||||||
|
地址:https://github.com/falood/exjieba
|
||||||
|
|
||||||
|
结巴分词 R 版本
|
||||||
|
----------------
|
||||||
|
作者:qinwf
|
||||||
|
地址:https://github.com/qinwf/jiebaR
|
||||||
|
|
||||||
|
结巴分词 iOS 版本
|
||||||
|
----------------
|
||||||
|
作者:yanyiwu
|
||||||
|
地址:https://github.com/yanyiwu/iosjieba
|
||||||
|
|
||||||
|
结巴分词 PHP 版本
|
||||||
|
----------------
|
||||||
|
作者:fukuball
|
||||||
|
地址:https://github.com/fukuball/jieba-php
|
||||||
|
|
||||||
|
结巴分词 .NET(C#) 版本
|
||||||
|
----------------
|
||||||
|
作者:anderscui
|
||||||
|
地址:https://github.com/anderscui/jieba.NET/
|
||||||
|
|
||||||
|
结巴分词 Go 版本
|
||||||
|
----------------
|
||||||
|
|
||||||
|
+ 作者: wangbin 地址: https://github.com/wangbin/jiebago
|
||||||
|
+ 作者: yanyiwu 地址: https://github.com/yanyiwu/gojieba
|
||||||
|
|
||||||
|
结巴分词Android版本
|
||||||
|
------------------
|
||||||
|
+ 作者 Dongliang.W 地址:https://github.com/452896915/jieba-android
|
||||||
|
|
||||||
|
|
||||||
|
友情链接
|
||||||
|
=========
|
||||||
|
* https://github.com/baidu/lac 百度中文词法分析(分词+词性+专名)系统
|
||||||
|
* https://github.com/baidu/AnyQ 百度FAQ自动问答系统
|
||||||
|
* https://github.com/baidu/Senta 百度情感识别系统
|
||||||
|
|
||||||
|
系统集成
|
||||||
|
========
|
||||||
|
1. Solr: https://github.com/sing1ee/jieba-solr
|
||||||
|
|
||||||
分词速度
|
分词速度
|
||||||
=========
|
=========
|
||||||
* 1.5 MB / Second in Full Mode
|
* 1.5 MB / Second in Full Mode
|
||||||
* 400 KB / Second in Default Mode
|
* 400 KB / Second in Default Mode
|
||||||
* Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt
|
* 测试环境: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt
|
||||||
|
|
||||||
在线演示
|
|
||||||
=========
|
|
||||||
http://209.222.69.242:9000/
|
|
||||||
|
|
||||||
常见问题
|
常见问题
|
||||||
=========
|
=========
|
||||||
1)模型的数据是如何生成的?https://github.com/fxsjy/jieba/issues/7
|
|
||||||
|
|
||||||
2)这个库的授权是? https://github.com/fxsjy/jieba/issues/2
|
|
||||||
|
|
||||||
|
## 1. 模型的数据是如何生成的?
|
||||||
|
|
||||||
|
详见: https://github.com/fxsjy/jieba/issues/7
|
||||||
|
|
||||||
|
## 2. “台中”总是被切成“台 中”?(以及类似情况)
|
||||||
|
|
||||||
|
P(台中) < P(台)×P(中),“台中”词频不够导致其成词概率较低
|
||||||
|
|
||||||
|
解决方法:强制调高词频
|
||||||
|
|
||||||
|
`jieba.add_word('台中')` 或者 `jieba.suggest_freq('台中', True)`
|
||||||
|
|
||||||
|
## 3. “今天天气 不错”应该被切成“今天 天气 不错”?(以及类似情况)
|
||||||
|
|
||||||
|
解决方法:强制调低词频
|
||||||
|
|
||||||
|
`jieba.suggest_freq(('今天', '天气'), True)`
|
||||||
|
|
||||||
|
或者直接删除该词 `jieba.del_word('今天天气')`
|
||||||
|
|
||||||
|
## 4. 切出了词典中没有的词语,效果不理想?
|
||||||
|
|
||||||
|
解决方法:关闭新词发现
|
||||||
|
|
||||||
|
`jieba.cut('丰田太省了', HMM=False)`
|
||||||
|
`jieba.cut('我们中出了一个叛徒', HMM=False)`
|
||||||
|
|
||||||
|
**更多问题请点击**:https://github.com/fxsjy/jieba/issues?sort=updated&state=closed
|
||||||
|
|
||||||
|
修订历史
|
||||||
|
==========
|
||||||
|
https://github.com/fxsjy/jieba/blob/master/Changelog
|
||||||
|
|
||||||
|
--------------------
|
||||||
|
|
||||||
jieba
|
jieba
|
||||||
========
|
========
|
||||||
@ -128,85 +486,299 @@ jieba
|
|||||||
Features
|
Features
|
||||||
========
|
========
|
||||||
* Support three types of segmentation mode:
|
* Support three types of segmentation mode:
|
||||||
* 1) Accurate Mode, attempt to cut the sentence into the most accurate segmentation, which is suitable for text analysis;
|
|
||||||
* 2) Full Mode, break the words of the sentence into words scanned
|
1. Accurate Mode attempts to cut the sentence into the most accurate segmentations, which is suitable for text analysis.
|
||||||
* 3) Search Engine Mode, based on the Accurate Mode, with an attempt to cut the long words into several short words, which can enhance the recall rate
|
2. Full Mode gets all the possible words from the sentence. Fast but not accurate.
|
||||||
|
3. Search Engine Mode, based on the Accurate Mode, attempts to cut long words into several short words, which can raise the recall rate. Suitable for search engines.
|
||||||
|
|
||||||
|
* Supports Traditional Chinese
|
||||||
|
* Supports customized dictionaries
|
||||||
|
* MIT License
|
||||||
|
|
||||||
|
|
||||||
|
Online demo
|
||||||
|
=========
|
||||||
|
http://jiebademo.ap01.aws.af.cm/
|
||||||
|
|
||||||
|
(Powered by Appfog)
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
========
|
========
|
||||||
* Fully automatic installation: `easy_install jieba` or `pip install jieba`
|
* Fully automatic installation: `easy_install jieba` or `pip install jieba`
|
||||||
* Semi-automatic installation: Download http://pypi.python.org/pypi/jieba/ , after extracting run `python setup.py install`
|
* Semi-automatic installation: Download http://pypi.python.org/pypi/jieba/ , run `python setup.py install` after extracting.
|
||||||
* Manutal installation: place the `jieba` directory in the current directory or python site-packages directory.
|
* Manual installation: place the `jieba` directory in the current directory or python `site-packages` directory.
|
||||||
* Use `import jieba` to import, which will first build the Trie tree only on first import (takes a few seconds).
|
* `import jieba`.
|
||||||
|
|
||||||
Algorithm
|
Algorithm
|
||||||
========
|
========
|
||||||
* Based on the Trie tree structure to achieve efficient word graph scanning; sentences using Chinese characters constitute a directed acyclic graph (DAG)
|
* Based on a prefix dictionary structure to achieve efficient word graph scanning. Build a directed acyclic graph (DAG) for all possible word combinations.
|
||||||
* Employs memory search to calculate the maximum probability path, in order to identify the maximum tangential points based on word frequency combination
|
* Use dynamic programming to find the most probable combination based on the word frequency.
|
||||||
* For unknown words, the character position HMM-based model is used, using the Viterbi algorithm
|
* For unknown words, a HMM-based model is used with the Viterbi algorithm.
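To make the word-graph description concrete, here is an illustration-only sketch that peeks at the DAG jieba builds for a sentence. `get_DAG` is an internal helper of `jieba.Tokenizer`, not a documented public API, so treat it as a debugging aid rather than a stable interface.

```python
# Illustration only: inspect the DAG of candidate words for a sentence.
# get_DAG is an internal helper of jieba.Tokenizer (undocumented API).
import jieba

jieba.initialize()  # build the prefix dictionary up front
dag = jieba.dt.get_DAG("我来到北京清华大学")
print(dag)  # maps each start index to the end indices of possible words
```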
|
||||||
|
|
||||||
Function 1): cut
|
Main Functions
|
||||||
==========
|
==============
|
||||||
* The `jieba.cut` method accepts to input parameters: 1) the first parameter is the string that requires segmentation, and the 2) second parameter is `cut_all`, a parameter used to control the segmentation pattern.
|
|
||||||
* `jieba.cut` returned structure is an iterative generator, where you can use a `for` loop to get the word segmentation (in unicode), or `list(jieba.cut( ... ))` to create a list.
|
|
||||||
* `jieba.cut_for_search` accpets only on parameter: the string that requires segmentation, and it will cut the sentence into short words
|
|
||||||
|
|
||||||
Code example: segmentation
|
1. Cut
|
||||||
==========
|
--------
|
||||||
|
* The `jieba.cut` function accepts three input parameters: the first parameter is the string to be cut; the second parameter is `cut_all`, controlling the cut mode; the third parameter is to control whether to use the Hidden Markov Model.
|
||||||
|
* `jieba.cut_for_search` accepts two parameters: the string to be cut and whether to use the Hidden Markov Model. This method cuts the sentence into short words suitable for search engines.
|
||||||
|
* The input string can be a unicode/str object, or a str/bytes object encoded in UTF-8 or GBK. Note that GBK encoding is not recommended because it may be unexpectedly decoded as UTF-8.
|
||||||
|
* `jieba.cut` and `jieba.cut_for_search` return a generator, so you can use a `for` loop to get each segmented word (in unicode).
|
||||||
|
* `jieba.lcut` and `jieba.lcut_for_search` returns a list.
|
||||||
|
* `jieba.Tokenizer(dictionary=DEFAULT_DICT)` creates a new customized Tokenizer, which enables you to use different dictionaries at the same time. `jieba.dt` is the default Tokenizer, to which almost all global functions are mapped.
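For instance, a minimal sketch (the dictionary path is a placeholder, not part of the original README) of running a second `Tokenizer` with its own dictionary alongside the default one:

```python
# Sketch: an independent tokenizer backed by a different dictionary file.
import jieba

dt_big = jieba.Tokenizer(dictionary="extra_dict/dict.txt.big")  # placeholder path
print(dt_big.lcut("我来到北京清华大学"))  # uses the custom dictionary
print(jieba.lcut("我来到北京清华大学"))   # global functions still use jieba.dt
```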
|
||||||
|
|
||||||
#encoding=utf-8
|
|
||||||
import jieba
|
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
|
**Code example: segmentation**
|
||||||
print "Full Mode:", "/ ".join(seg_list) #全模式
|
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
|
```python
|
||||||
print "Default Mode:", "/ ".join(seg_list) #默认模式
|
#encoding=utf-8
|
||||||
|
import jieba
|
||||||
|
|
||||||
seg_list = jieba.cut("他来到了网易杭研大厦")
|
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
|
||||||
print ", ".join(seg_list)
|
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
|
||||||
|
|
||||||
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
|
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
|
||||||
print ", ".join(seg_list)
|
print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
|
||||||
|
|
||||||
|
seg_list = jieba.cut("他来到了网易杭研大厦")
|
||||||
|
print(", ".join(seg_list))
|
||||||
|
|
||||||
|
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
|
||||||
|
print(", ".join(seg_list))
|
||||||
|
```
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
|
|
||||||
[Full Mode]: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
|
[Full Mode]: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
|
||||||
|
|
||||||
[Accurate Mode]: 我/ 来到/ 北京/ 清华大学
|
[Accurate Mode]: 我/ 来到/ 北京/ 清华大学
|
||||||
|
|
||||||
[Unknown Words Recognize] 他, 来到, 了, 网易, 杭研, 大厦 (In this case, "杭研" is not in the dictionary, but is identified by the Viterbi algorithm)
|
[Unknown Words Recognize] 他, 来到, 了, 网易, 杭研, 大厦 (In this case, "杭研" is not in the dictionary, but is identified by the Viterbi algorithm)
|
||||||
|
|
||||||
[Search Engine Mode]: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在
|
[Search Engine Mode]: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造
|
||||||
, 日本, 京都, 大学, 日本京都大学, 深造
|
|
||||||
|
|
||||||
|
|
||||||
Function 2): Add a custom dictionary
|
2. Add a custom dictionary
|
||||||
==========
|
----------------------------
|
||||||
|
|
||||||
* Developers can specify their own custom dictionary to include in the jieba thesaurus. jieba has the ability to identify new words, but adding your own new words can ensure a higher rate of correct segmentation.
|
### Load dictionary
|
||||||
* Usage: `jieba.load_userdict(file_name) # file_name is a custom dictionary path`
|
|
||||||
* The dictionary format is the same as that of `analyse/idf.txt`: one word per line; each line is divided into two parts, the first is the word itself, the other is the word frequency, separated by a space
|
|
||||||
* Example:
|
|
||||||
|
|
||||||
云计算 5
|
* Developers can specify their own custom dictionary to supplement the jieba default dictionary. Jieba is able to identify new words, but adding your own entries ensures higher accuracy.
|
||||||
李小福 2
|
* Usage: `jieba.load_userdict(file_name)` # file_name is a file-like object or the path of the custom dictionary
|
||||||
创新办 3
|
* The dictionary format is the same as that of `dict.txt`: one word per line; each line is divided into three parts separated by a space: word, word frequency, POS tag. If `file_name` is a path or a file opened in binary mode, the dictionary must be UTF-8 encoded.
|
||||||
|
* The word frequency and POS tag can be omitted respectively. The word frequency will be filled with a suitable value if omitted.
|
||||||
|
|
||||||
之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
**For example:**
|
||||||
|
|
||||||
加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
|
```
|
||||||
|
创新办 3 i
|
||||||
|
云计算 5
|
||||||
|
凱特琳 nz
|
||||||
|
台中
|
||||||
|
```
|
||||||
|
|
||||||
Function 3): Keyword Extraction
|
|
||||||
================
|
|
||||||
* `jieba.analyse.extract_tags(sentence,topK) # needs to first import jieba.analyse`
|
|
||||||
* `setence`: the text to be extracted
|
|
||||||
* `topK`: To return several TF / IDF weights for the biggest keywords, the default value is 20
|
|
||||||
|
|
||||||
Code sample (keyword extraction)
|
* Change a Tokenizer's `tmp_dir` and `cache_file` to specify the path of the cache file, for using on a restricted file system.
|
||||||
|
|
||||||
https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
|
* Example:
|
||||||
|
|
||||||
|
云计算 5
|
||||||
|
李小福 2
|
||||||
|
创新办 3
|
||||||
|
|
||||||
|
[Before]: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
|
||||||
|
|
||||||
|
[After]: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
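A minimal sketch of reproducing the segmentation above; `userdict.txt` is a placeholder for a dictionary file containing entries like the ones shown earlier:

```python
# Sketch: load a user dictionary, optionally add words at runtime, then cut.
# "userdict.txt" is a placeholder path with lines such as "云计算 5" and "创新办 3 i".
import jieba

jieba.load_userdict("userdict.txt")
jieba.add_word("凱特琳", tag="nz")  # words can also be added dynamically
print(" / ".join(jieba.cut("李小福是创新办主任也是云计算方面的专家")))
```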
|
||||||
|
|
||||||
|
|
||||||
|
### Modify dictionary
|
||||||
|
|
||||||
|
* Use `add_word(word, freq=None, tag=None)` and `del_word(word)` to modify the dictionary dynamically in programs.
|
||||||
|
* Use `suggest_freq(segment, tune=True)` to adjust the frequency of a single word so that it can (or cannot) be segmented.
|
||||||
|
|
||||||
|
* Note that HMM may affect the final result.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```pycon
|
||||||
|
>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
|
||||||
|
如果/放到/post/中将/出错/。
|
||||||
|
>>> jieba.suggest_freq(('中', '将'), True)
|
||||||
|
494
|
||||||
|
>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
|
||||||
|
如果/放到/post/中/将/出错/。
|
||||||
|
>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
|
||||||
|
「/台/中/」/正确/应该/不会/被/切开
|
||||||
|
>>> jieba.suggest_freq('台中', True)
|
||||||
|
69
|
||||||
|
>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
|
||||||
|
「/台中/」/正确/应该/不会/被/切开
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Keyword Extraction
|
||||||
|
-----------------------
|
||||||
|
`import jieba.analyse`
|
||||||
|
|
||||||
|
* `jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())`
|
||||||
|
* `sentence`: the text to be extracted
|
||||||
|
* `topK`: return how many keywords with the highest TF/IDF weights. The default value is 20
|
||||||
|
* `withWeight`: whether return TF/IDF weights with the keywords. The default value is False
|
||||||
|
* `allowPOS`: filter words with which POSs are included. Empty for no filtering.
|
||||||
|
* `jieba.analyse.TFIDF(idf_path=None)` creates a new TFIDF instance, `idf_path` specifies IDF file path.
|
||||||
|
|
||||||
|
Example (keyword extraction)
|
||||||
|
|
||||||
|
https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
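A minimal sketch of calling it directly (the sample sentence is illustrative, not taken from the test script):

```python
# Sketch: print the top TF-IDF keywords together with their weights.
import jieba.analyse

sentence = "我来到北京清华大学,清华大学是著名的高等学府"
for word, weight in jieba.analyse.extract_tags(sentence, topK=5, withWeight=True):
    print(word, weight)
```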
|
||||||
|
|
||||||
|
Developers can specify their own custom IDF corpus in jieba keyword extraction
|
||||||
|
|
||||||
|
* Usage: `jieba.analyse.set_idf_path(file_name) # file_name is the path for the custom corpus`
|
||||||
|
* Custom Corpus Sample:https://github.com/fxsjy/jieba/blob/master/extra_dict/idf.txt.big
|
||||||
|
* Sample Code:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_idfpath.py
|
||||||
|
|
||||||
|
Developers can specify their own custom stop words corpus in jieba keyword extraction
|
||||||
|
|
||||||
|
* Usage: `jieba.analyse.set_stop_words(file_name) # file_name is the path for the custom corpus`
|
||||||
|
* Custom Corpus Sample:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
|
||||||
|
* Sample Code:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
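A sketch of switching both corpora before extraction; the paths below assume the sample files shipped in `extra_dict` are available locally:

```python
# Sketch: plug in a custom IDF corpus and stop-word list (paths are placeholders).
import jieba.analyse

jieba.analyse.set_idf_path("extra_dict/idf.txt.big")
jieba.analyse.set_stop_words("extra_dict/stop_words.txt")
print(jieba.analyse.extract_tags("我来到北京清华大学", topK=5))
```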
|
||||||
|
|
||||||
|
There's also a [TextRank](http://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf) implementation available.
|
||||||
|
|
||||||
|
Use: `jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))`
|
||||||
|
|
||||||
|
Note that it filters POS by default.
|
||||||
|
|
||||||
|
`jieba.analyse.TextRank()` creates a new TextRank instance.
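A minimal sketch of TextRank extraction with the default POS filter (the sample text is illustrative):

```python
# Sketch: TextRank keywords; only ns/n/vn/v words pass the default POS filter.
import jieba.analyse

text = "线程是程序执行时的最小单位,它是进程的一个执行流"
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True):
    print(word, weight)
```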
|
||||||
|
|
||||||
|
4. Part of Speech Tagging
|
||||||
|
-------------------------
|
||||||
|
* `jieba.posseg.POSTokenizer(tokenizer=None)` creates a new customized Tokenizer. `tokenizer` specifies the jieba.Tokenizer to internally use. `jieba.posseg.dt` is the default POSTokenizer.
|
||||||
|
* Tags the POS of each word after segmentation, using labels compatible with ictclas.
|
||||||
|
* Example:
|
||||||
|
|
||||||
|
```pycon
|
||||||
|
>>> import jieba.posseg as pseg
|
||||||
|
>>> words = pseg.cut("我爱北京天安门")
|
||||||
|
>>> for w in words:
|
||||||
|
... print('%s %s' % (w.word, w.flag))
|
||||||
|
...
|
||||||
|
我 r
|
||||||
|
爱 v
|
||||||
|
北京 ns
|
||||||
|
天安门 ns
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Parallel Processing
|
||||||
|
----------------------
|
||||||
|
* Principle: Split target text by line, assign the lines into multiple Python processes, and then merge the results, which is considerably faster.
|
||||||
|
* Based on the multiprocessing module of Python.
|
||||||
|
* Usage:
|
||||||
|
* `jieba.enable_parallel(4)` # Enable parallel processing. The parameter is the number of processes.
|
||||||
|
* `jieba.disable_parallel()` # Disable parallel processing.
|
||||||
|
|
||||||
|
* Example:
|
||||||
|
https://github.com/fxsjy/jieba/blob/master/test/parallel/test_file.py
|
||||||
|
|
||||||
|
* Result: On a four-core 3.4GHz Linux machine, do accurate word segmentation on Complete Works of Jin Yong, and the speed reaches 1MB/s, which is 3.3 times faster than the single-process version.
|
||||||
|
|
||||||
|
* **Note** that parallel processing supports only default tokenizers, `jieba.dt` and `jieba.posseg.dt`.
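A minimal sketch of the workflow (the input file name is a placeholder):

```python
# Sketch: segment a large file with 4 worker processes (POSIX platforms only).
import jieba

jieba.enable_parallel(4)  # start 4 worker processes
with open("news.txt", encoding="utf-8") as f:
    result = " / ".join(jieba.cut(f.read()))
jieba.disable_parallel()  # return to single-process mode
```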
|
||||||
|
|
||||||
|
6. Tokenize: return words with position
|
||||||
|
----------------------------------------
|
||||||
|
* The input must be unicode
|
||||||
|
* Default mode
|
||||||
|
|
||||||
|
```python
|
||||||
|
result = jieba.tokenize(u'永和服装饰品有限公司')
|
||||||
|
for tk in result:
|
||||||
|
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
word 永和 start: 0 end:2
|
||||||
|
word 服装 start: 2 end:4
|
||||||
|
word 饰品 start: 4 end:6
|
||||||
|
word 有限公司 start: 6 end:10
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
* Search mode
|
||||||
|
|
||||||
|
```python
|
||||||
|
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
|
||||||
|
for tk in result:
|
||||||
|
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
word 永和 start: 0 end:2
|
||||||
|
word 服装 start: 2 end:4
|
||||||
|
word 饰品 start: 4 end:6
|
||||||
|
word 有限 start: 6 end:8
|
||||||
|
word 公司 start: 8 end:10
|
||||||
|
word 有限公司 start: 6 end:10
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
7. ChineseAnalyzer for Whoosh
|
||||||
|
-------------------------------
|
||||||
|
* `from jieba.analyse import ChineseAnalyzer`
|
||||||
|
* Example: https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py
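A minimal sketch of wiring the analyzer into a Whoosh schema; the field names are assumptions for illustration, not taken from the test script:

```python
# Sketch: use jieba's ChineseAnalyzer when defining a Whoosh index schema.
from jieba.analyse import ChineseAnalyzer
from whoosh.fields import ID, TEXT, Schema

analyzer = ChineseAnalyzer()
schema = Schema(title=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
```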
|
||||||
|
|
||||||
|
8. Command Line Interface
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
$> python -m jieba --help
|
||||||
|
Jieba command line interface.
|
||||||
|
|
||||||
|
positional arguments:
|
||||||
|
filename input file
|
||||||
|
|
||||||
|
optional arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-d [DELIM], --delimiter [DELIM]
|
||||||
|
use DELIM instead of ' / ' for word delimiter; or a
|
||||||
|
space if it is used without DELIM
|
||||||
|
-p [DELIM], --pos [DELIM]
|
||||||
|
enable POS tagging; if DELIM is specified, use DELIM
|
||||||
|
instead of '_' for POS delimiter
|
||||||
|
-D DICT, --dict DICT use DICT as dictionary
|
||||||
|
-u USER_DICT, --user-dict USER_DICT
|
||||||
|
use USER_DICT together with the default dictionary or
|
||||||
|
DICT (if specified)
|
||||||
|
-a, --cut-all full pattern cutting (ignored with POS tagging)
|
||||||
|
-n, --no-hmm don't use the Hidden Markov Model
|
||||||
|
-q, --quiet don't print loading messages to stderr
|
||||||
|
-V, --version show program's version number and exit
|
||||||
|
|
||||||
|
If no filename specified, use STDIN instead.
|
||||||
|
|
||||||
|
Initialization
|
||||||
|
---------------
|
||||||
|
By default, Jieba does not build the prefix dictionary until it is needed. Building it takes 1 to 3 seconds and happens only once. If you want to initialize Jieba manually, you can call:
|
||||||
|
|
||||||
|
import jieba
|
||||||
|
jieba.initialize() # (optional)
|
||||||
|
|
||||||
|
You can also specify the dictionary (not supported before version 0.28) :
|
||||||
|
|
||||||
|
jieba.set_dictionary('data/dict.txt.big')
|
||||||
|
|
||||||
|
|
||||||
|
Using Other Dictionaries
|
||||||
|
===========================
|
||||||
|
|
||||||
|
It is possible to use your own dictionary with Jieba, and there are also two dictionaries ready for download:
|
||||||
|
|
||||||
|
1. A smaller dictionary for a smaller memory footprint:
|
||||||
|
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
|
||||||
|
|
||||||
|
2. There is also a bigger dictionary that has better support for traditional Chinese (繁體):
|
||||||
|
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
|
||||||
|
|
||||||
|
By default, an in-between dictionary is used, called `dict.txt` and included in the distribution.
|
||||||
|
|
||||||
|
In either case, download the file you want, and then call `jieba.set_dictionary('data/dict.txt.big')` or just replace the existing `dict.txt`.
|
||||||
|
|
||||||
Segmentation speed
|
Segmentation speed
|
||||||
=========
|
=========
|
||||||
@ -214,6 +786,3 @@ Segmentation speed
|
|||||||
* 400 KB / Second in Default Mode
|
* 400 KB / Second in Default Mode
|
||||||
* Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt
|
* Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt
|
||||||
|
|
||||||
Online demo
|
|
||||||
=========
|
|
||||||
http://209.222.69.242:9000/
|
|
||||||
|
584429 extra_dict/dict.txt.big (Normal file): file diff suppressed because it is too large
109750 extra_dict/dict.txt.small (Normal file): file diff suppressed because it is too large
176239 extra_dict/idf.txt.big (Normal file): file diff suppressed because it is too large
51 extra_dict/stop_words.txt (Normal file)
@@ -0,0 +1,51 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
的
了
和
是
就
都
而
及
與
著
或
一個
沒有
我們
你們
妳們
他們
她們
是否
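The list above is intended for keyword extraction. A minimal sketch of wiring it into `jieba.analyse` (the repository-relative path and sample sentence are illustrative):

    import jieba.analyse

    jieba.analyse.set_stop_words('extra_dict/stop_words.txt')
    text = '我们认为这是一个关于云计算和大数据的简单例子'
    print(jieba.analyse.extract_tags(text, topK=5))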
jieba/__init__.py
@@ -1,190 +1,619 @@
|
|||||||
import re
|
from __future__ import absolute_import, unicode_literals
|
||||||
import math
|
|
||||||
import os,sys
|
__version__ = '0.42.1'
|
||||||
import pprint
|
__license__ = 'MIT'
|
||||||
import finalseg
|
|
||||||
import time
|
|
||||||
import tempfile
|
|
||||||
import marshal
|
import marshal
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from hashlib import md5
|
||||||
|
from math import log
|
||||||
|
|
||||||
FREQ = {}
|
from . import finalseg
|
||||||
total =0.0
|
from ._compat import *
|
||||||
|
|
||||||
def gen_trie(f_name):
|
if os.name == 'nt':
|
||||||
lfreq = {}
|
from shutil import move as _replace_file
|
||||||
trie = {}
|
else:
|
||||||
ltotal = 0.0
|
_replace_file = os.rename
|
||||||
content = open(f_name,'rb').read().decode('utf-8')
|
|
||||||
for line in content.split("\n"):
|
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
|
||||||
word,freq,_ = line.split(" ")
|
|
||||||
freq = float(freq)
|
DEFAULT_DICT = None
|
||||||
lfreq[word] = freq
|
DEFAULT_DICT_NAME = "dict.txt"
|
||||||
ltotal+=freq
|
|
||||||
p = trie
|
log_console = logging.StreamHandler(sys.stderr)
|
||||||
for c in word:
|
default_logger = logging.getLogger(__name__)
|
||||||
if not c in p:
|
default_logger.setLevel(logging.DEBUG)
|
||||||
p[c] ={}
|
default_logger.addHandler(log_console)
|
||||||
p = p[c]
|
|
||||||
p['']='' #ending flag
|
DICT_WRITING = {}
|
||||||
return trie, lfreq,ltotal
|
|
||||||
|
pool = None
|
||||||
|
|
||||||
|
re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)
|
||||||
|
|
||||||
|
re_eng = re.compile('[a-zA-Z0-9]', re.U)
|
||||||
|
|
||||||
|
# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
|
||||||
|
# \r\n|\s : whitespace characters. Will not be handled.
|
||||||
|
# re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
|
||||||
|
# Adding "-" symbol in re_han_default
|
||||||
|
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
|
||||||
|
|
||||||
|
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
||||||
|
|
||||||
|
|
||||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
def setLogLevel(log_level):
|
||||||
|
default_logger.setLevel(log_level)
|
||||||
print >> sys.stderr, "Building Trie..."
|
|
||||||
t1 = time.time()
|
|
||||||
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
|
|
||||||
load_from_cache_fail = True
|
|
||||||
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")):
|
|
||||||
print >> sys.stderr, "loading model from cache"
|
|
||||||
try:
|
|
||||||
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
|
|
||||||
load_from_cache_fail = False
|
|
||||||
except:
|
|
||||||
load_from_cache_fail = True
|
|
||||||
|
|
||||||
if load_from_cache_fail:
|
|
||||||
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
|
|
||||||
FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize
|
|
||||||
min_freq = min(FREQ.itervalues())
|
|
||||||
print >> sys.stderr, "dumping model to file cache"
|
|
||||||
marshal.dump((trie,FREQ,total,min_freq),open(cache_file,'wb'))
|
|
||||||
|
|
||||||
print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
|
|
||||||
print >> sys.stderr, "Trie has been built succesfully."
|
|
||||||
|
|
||||||
|
|
||||||
def __cut_all(sentence):
|
class Tokenizer(object):
|
||||||
dag = get_DAG(sentence)
|
|
||||||
old_j = -1
|
|
||||||
for k,L in dag.iteritems():
|
|
||||||
if len(L)==1 and k>old_j:
|
|
||||||
yield sentence[k:L[0]+1]
|
|
||||||
old_j = L[0]
|
|
||||||
else:
|
|
||||||
for j in L:
|
|
||||||
if j>k:
|
|
||||||
yield sentence[k:j+1]
|
|
||||||
old_j = j
|
|
||||||
|
|
||||||
def calc(sentence,DAG,idx,route):
|
def __init__(self, dictionary=DEFAULT_DICT):
|
||||||
N = len(sentence)
|
self.lock = threading.RLock()
|
||||||
route[N] = (1.0,'')
|
if dictionary == DEFAULT_DICT:
|
||||||
for idx in xrange(N-1,-1,-1):
|
self.dictionary = dictionary
|
||||||
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
|
else:
|
||||||
route[idx] = max(candidates)
|
self.dictionary = _get_abs_path(dictionary)
|
||||||
|
self.FREQ = {}
|
||||||
|
self.total = 0
|
||||||
|
self.user_word_tag_tab = {}
|
||||||
|
self.initialized = False
|
||||||
|
self.tmp_dir = None
|
||||||
|
self.cache_file = None
|
||||||
|
|
||||||
def get_DAG(sentence):
|
def __repr__(self):
|
||||||
N = len(sentence)
|
return '<Tokenizer dictionary=%r>' % self.dictionary
|
||||||
i,j=0,0
|
|
||||||
p = trie
|
|
||||||
DAG = {}
|
|
||||||
while i<N:
|
|
||||||
c = sentence[j]
|
|
||||||
if c in p:
|
|
||||||
p = p[c]
|
|
||||||
if '' in p:
|
|
||||||
if not i in DAG:
|
|
||||||
DAG[i]=[]
|
|
||||||
DAG[i].append(j)
|
|
||||||
j+=1
|
|
||||||
if j>=N:
|
|
||||||
i+=1
|
|
||||||
j=i
|
|
||||||
p=trie
|
|
||||||
else:
|
|
||||||
p = trie
|
|
||||||
i+=1
|
|
||||||
j=i
|
|
||||||
for i in xrange(len(sentence)):
|
|
||||||
if not i in DAG:
|
|
||||||
DAG[i] =[i]
|
|
||||||
return DAG
|
|
||||||
|
|
||||||
def __cut_DAG(sentence):
|
@staticmethod
|
||||||
DAG = get_DAG(sentence)
|
def gen_pfdict(f):
|
||||||
route ={}
|
lfreq = {}
|
||||||
calc(sentence,DAG,0,route=route)
|
ltotal = 0
|
||||||
x = 0
|
f_name = resolve_filename(f)
|
||||||
buf =u''
|
for lineno, line in enumerate(f, 1):
|
||||||
N = len(sentence)
|
try:
|
||||||
while x<N:
|
line = line.strip().decode('utf-8')
|
||||||
y = route[x][1]+1
|
word, freq = line.split(' ')[:2]
|
||||||
l_word = sentence[x:y]
|
freq = int(freq)
|
||||||
if y-x==1:
|
lfreq[word] = freq
|
||||||
buf+= l_word
|
ltotal += freq
|
||||||
else:
|
for ch in xrange(len(word)):
|
||||||
if len(buf)>0:
|
wfrag = word[:ch + 1]
|
||||||
if len(buf)==1:
|
if wfrag not in lfreq:
|
||||||
yield buf
|
lfreq[wfrag] = 0
|
||||||
buf=u''
|
except ValueError:
|
||||||
else:
|
raise ValueError(
|
||||||
regognized = finalseg.__cut(buf)
|
'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
|
||||||
for t in regognized:
|
f.close()
|
||||||
yield t
|
return lfreq, ltotal
|
||||||
buf=u''
|
|
||||||
yield l_word
|
|
||||||
x =y
|
|
||||||
|
|
||||||
if len(buf)>0:
|
def initialize(self, dictionary=None):
|
||||||
if len(buf)==1:
|
if dictionary:
|
||||||
yield buf
|
abs_path = _get_abs_path(dictionary)
|
||||||
else:
|
if self.dictionary == abs_path and self.initialized:
|
||||||
regognized = finalseg.__cut(buf)
|
return
|
||||||
for t in regognized:
|
else:
|
||||||
yield t
|
self.dictionary = abs_path
|
||||||
|
self.initialized = False
|
||||||
|
else:
|
||||||
|
abs_path = self.dictionary
|
||||||
|
|
||||||
|
with self.lock:
|
||||||
|
try:
|
||||||
|
with DICT_WRITING[abs_path]:
|
||||||
|
pass
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
if self.initialized:
|
||||||
|
return
|
||||||
|
|
||||||
|
default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
|
||||||
|
t1 = time.time()
|
||||||
|
if self.cache_file:
|
||||||
|
cache_file = self.cache_file
|
||||||
|
# default dictionary
|
||||||
|
elif abs_path == DEFAULT_DICT:
|
||||||
|
cache_file = "jieba.cache"
|
||||||
|
# custom dictionary
|
||||||
|
else:
|
||||||
|
cache_file = "jieba.u%s.cache" % md5(
|
||||||
|
abs_path.encode('utf-8', 'replace')).hexdigest()
|
||||||
|
cache_file = os.path.join(
|
||||||
|
self.tmp_dir or tempfile.gettempdir(), cache_file)
|
||||||
|
# prevent absolute path in self.cache_file
|
||||||
|
tmpdir = os.path.dirname(cache_file)
|
||||||
|
|
||||||
|
load_from_cache_fail = True
|
||||||
|
if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
|
||||||
|
os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
|
||||||
|
default_logger.debug(
|
||||||
|
"Loading model from cache %s" % cache_file)
|
||||||
|
try:
|
||||||
|
with open(cache_file, 'rb') as cf:
|
||||||
|
self.FREQ, self.total = marshal.load(cf)
|
||||||
|
load_from_cache_fail = False
|
||||||
|
except Exception:
|
||||||
|
load_from_cache_fail = True
|
||||||
|
|
||||||
|
if load_from_cache_fail:
|
||||||
|
wlock = DICT_WRITING.get(abs_path, threading.RLock())
|
||||||
|
DICT_WRITING[abs_path] = wlock
|
||||||
|
with wlock:
|
||||||
|
self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
|
||||||
|
default_logger.debug(
|
||||||
|
"Dumping model to file cache %s" % cache_file)
|
||||||
|
try:
|
||||||
|
# prevent moving across different filesystems
|
||||||
|
fd, fpath = tempfile.mkstemp(dir=tmpdir)
|
||||||
|
with os.fdopen(fd, 'wb') as temp_cache_file:
|
||||||
|
marshal.dump(
|
||||||
|
(self.FREQ, self.total), temp_cache_file)
|
||||||
|
_replace_file(fpath, cache_file)
|
||||||
|
except Exception:
|
||||||
|
default_logger.exception("Dump cache file failed.")
|
||||||
|
|
||||||
|
try:
|
||||||
|
del DICT_WRITING[abs_path]
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
self.initialized = True
|
||||||
|
default_logger.debug(
|
||||||
|
"Loading model cost %.3f seconds." % (time.time() - t1))
|
||||||
|
default_logger.debug("Prefix dict has been built successfully.")
|
||||||
|
|
||||||
|
def check_initialized(self):
|
||||||
|
if not self.initialized:
|
||||||
|
self.initialize()
|
||||||
|
|
||||||
|
def calc(self, sentence, DAG, route):
|
||||||
|
N = len(sentence)
|
||||||
|
route[N] = (0, 0)
|
||||||
|
logtotal = log(self.total)
|
||||||
|
for idx in xrange(N - 1, -1, -1):
|
||||||
|
route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
|
||||||
|
logtotal + route[x + 1][0], x) for x in DAG[idx])
|
||||||
|
|
||||||
|
def get_DAG(self, sentence):
|
||||||
|
self.check_initialized()
|
||||||
|
DAG = {}
|
||||||
|
N = len(sentence)
|
||||||
|
for k in xrange(N):
|
||||||
|
tmplist = []
|
||||||
|
i = k
|
||||||
|
frag = sentence[k]
|
||||||
|
while i < N and frag in self.FREQ:
|
||||||
|
if self.FREQ[frag]:
|
||||||
|
tmplist.append(i)
|
||||||
|
i += 1
|
||||||
|
frag = sentence[k:i + 1]
|
||||||
|
if not tmplist:
|
||||||
|
tmplist.append(k)
|
||||||
|
DAG[k] = tmplist
|
||||||
|
return DAG
|
||||||
|
|
||||||
|
def __cut_all(self, sentence):
|
||||||
|
dag = self.get_DAG(sentence)
|
||||||
|
old_j = -1
|
||||||
|
eng_scan = 0
|
||||||
|
eng_buf = u''
|
||||||
|
for k, L in iteritems(dag):
|
||||||
|
if eng_scan == 1 and not re_eng.match(sentence[k]):
|
||||||
|
eng_scan = 0
|
||||||
|
yield eng_buf
|
||||||
|
if len(L) == 1 and k > old_j:
|
||||||
|
word = sentence[k:L[0] + 1]
|
||||||
|
if re_eng.match(word):
|
||||||
|
if eng_scan == 0:
|
||||||
|
eng_scan = 1
|
||||||
|
eng_buf = word
|
||||||
|
else:
|
||||||
|
eng_buf += word
|
||||||
|
if eng_scan == 0:
|
||||||
|
yield word
|
||||||
|
old_j = L[0]
|
||||||
|
else:
|
||||||
|
for j in L:
|
||||||
|
if j > k:
|
||||||
|
yield sentence[k:j + 1]
|
||||||
|
old_j = j
|
||||||
|
if eng_scan == 1:
|
||||||
|
yield eng_buf
|
||||||
|
|
||||||
|
def __cut_DAG_NO_HMM(self, sentence):
|
||||||
|
DAG = self.get_DAG(sentence)
|
||||||
|
route = {}
|
||||||
|
self.calc(sentence, DAG, route)
|
||||||
|
x = 0
|
||||||
|
N = len(sentence)
|
||||||
|
buf = ''
|
||||||
|
while x < N:
|
||||||
|
y = route[x][1] + 1
|
||||||
|
l_word = sentence[x:y]
|
||||||
|
if re_eng.match(l_word) and len(l_word) == 1:
|
||||||
|
buf += l_word
|
||||||
|
x = y
|
||||||
|
else:
|
||||||
|
if buf:
|
||||||
|
yield buf
|
||||||
|
buf = ''
|
||||||
|
yield l_word
|
||||||
|
x = y
|
||||||
|
if buf:
|
||||||
|
yield buf
|
||||||
|
buf = ''
|
||||||
|
|
||||||
|
def __cut_DAG(self, sentence):
|
||||||
|
DAG = self.get_DAG(sentence)
|
||||||
|
route = {}
|
||||||
|
self.calc(sentence, DAG, route)
|
||||||
|
x = 0
|
||||||
|
buf = ''
|
||||||
|
N = len(sentence)
|
||||||
|
while x < N:
|
||||||
|
y = route[x][1] + 1
|
||||||
|
l_word = sentence[x:y]
|
||||||
|
if y - x == 1:
|
||||||
|
buf += l_word
|
||||||
|
else:
|
||||||
|
if buf:
|
||||||
|
if len(buf) == 1:
|
||||||
|
yield buf
|
||||||
|
buf = ''
|
||||||
|
else:
|
||||||
|
if not self.FREQ.get(buf):
|
||||||
|
recognized = finalseg.cut(buf)
|
||||||
|
for t in recognized:
|
||||||
|
yield t
|
||||||
|
else:
|
||||||
|
for elem in buf:
|
||||||
|
yield elem
|
||||||
|
buf = ''
|
||||||
|
yield l_word
|
||||||
|
x = y
|
||||||
|
|
||||||
|
if buf:
|
||||||
|
if len(buf) == 1:
|
||||||
|
yield buf
|
||||||
|
elif not self.FREQ.get(buf):
|
||||||
|
recognized = finalseg.cut(buf)
|
||||||
|
for t in recognized:
|
||||||
|
yield t
|
||||||
|
else:
|
||||||
|
for elem in buf:
|
||||||
|
yield elem
|
||||||
|
|
||||||
|
def cut(self, sentence, cut_all=False, HMM=True, use_paddle=False):
|
||||||
|
"""
|
||||||
|
The main function that segments an entire sentence that contains
|
||||||
|
Chinese characters into separate words.
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
- sentence: The str(unicode) to be segmented.
|
||||||
|
- cut_all: Model type. True for full pattern, False for accurate pattern.
|
||||||
|
- HMM: Whether to use the Hidden Markov Model.
|
||||||
|
"""
|
||||||
|
is_paddle_installed = check_paddle_install['is_paddle_installed']
|
||||||
|
sentence = strdecode(sentence)
|
||||||
|
if use_paddle and is_paddle_installed:
|
||||||
|
# if sentence is null, it will raise core exception in paddle.
|
||||||
|
if sentence is None or len(sentence) == 0:
|
||||||
|
return
|
||||||
|
import jieba.lac_small.predict as predict
|
||||||
|
results = predict.get_sent(sentence)
|
||||||
|
for sent in results:
|
||||||
|
if sent is None:
|
||||||
|
continue
|
||||||
|
yield sent
|
||||||
|
return
|
||||||
|
re_han = re_han_default
|
||||||
|
re_skip = re_skip_default
|
||||||
|
if cut_all:
|
||||||
|
cut_block = self.__cut_all
|
||||||
|
elif HMM:
|
||||||
|
cut_block = self.__cut_DAG
|
||||||
|
else:
|
||||||
|
cut_block = self.__cut_DAG_NO_HMM
|
||||||
|
blocks = re_han.split(sentence)
|
||||||
|
for blk in blocks:
|
||||||
|
if not blk:
|
||||||
|
continue
|
||||||
|
if re_han.match(blk):
|
||||||
|
for word in cut_block(blk):
|
||||||
|
yield word
|
||||||
|
else:
|
||||||
|
tmp = re_skip.split(blk)
|
||||||
|
for x in tmp:
|
||||||
|
if re_skip.match(x):
|
||||||
|
yield x
|
||||||
|
elif not cut_all:
|
||||||
|
for xx in x:
|
||||||
|
yield xx
|
||||||
|
else:
|
||||||
|
yield x
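# Usage sketch (not part of the diff): the three cutting modes selected above,
# full mode, the default DAG + HMM mode, and DAG-only without the HMM.
import jieba

sent = '我来到北京清华大学'
print('Full Mode:    ' + '/ '.join(jieba.cut(sent, cut_all=True)))
print('Default Mode: ' + '/ '.join(jieba.cut(sent, cut_all=False)))
print('No HMM:       ' + '/ '.join(jieba.cut(sent, HMM=False)))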
|
||||||
|
|
||||||
|
def cut_for_search(self, sentence, HMM=True):
|
||||||
|
"""
|
||||||
|
Finer segmentation for search engines.
|
||||||
|
"""
|
||||||
|
words = self.cut(sentence, HMM=HMM)
|
||||||
|
for w in words:
|
||||||
|
if len(w) > 2:
|
||||||
|
for i in xrange(len(w) - 1):
|
||||||
|
gram2 = w[i:i + 2]
|
||||||
|
if self.FREQ.get(gram2):
|
||||||
|
yield gram2
|
||||||
|
if len(w) > 3:
|
||||||
|
for i in xrange(len(w) - 2):
|
||||||
|
gram3 = w[i:i + 3]
|
||||||
|
if self.FREQ.get(gram3):
|
||||||
|
yield gram3
|
||||||
|
yield w
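# Usage sketch (not part of the diff): cut_for_search() additionally emits
# in-dictionary 2-grams and 3-grams found inside longer words, which suits
# building inverted indexes for search engines; the sentence is illustrative.
import jieba

print(', '.join(jieba.cut_for_search('小明硕士毕业于中国科学院计算所,后在日本京都大学深造')))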
|
||||||
|
|
||||||
|
def lcut(self, *args, **kwargs):
|
||||||
|
return list(self.cut(*args, **kwargs))
|
||||||
|
|
||||||
|
def lcut_for_search(self, *args, **kwargs):
|
||||||
|
return list(self.cut_for_search(*args, **kwargs))
|
||||||
|
|
||||||
|
_lcut = lcut
|
||||||
|
_lcut_for_search = lcut_for_search
|
||||||
|
|
||||||
|
def _lcut_no_hmm(self, sentence):
|
||||||
|
return self.lcut(sentence, False, False)
|
||||||
|
|
||||||
|
def _lcut_all(self, sentence):
|
||||||
|
return self.lcut(sentence, True)
|
||||||
|
|
||||||
|
def _lcut_for_search_no_hmm(self, sentence):
|
||||||
|
return self.lcut_for_search(sentence, False)
|
||||||
|
|
||||||
|
def get_dict_file(self):
|
||||||
|
if self.dictionary == DEFAULT_DICT:
|
||||||
|
return get_module_res(DEFAULT_DICT_NAME)
|
||||||
|
else:
|
||||||
|
return open(self.dictionary, 'rb')
|
||||||
|
|
||||||
|
def load_userdict(self, f):
|
||||||
|
'''
|
||||||
|
Load personalized dict to improve detect rate.
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
- f : A plain text file that contains words and their occurrences.
|
||||||
|
Can be a file-like object, or the path of the dictionary file,
|
||||||
|
whose encoding must be utf-8.
|
||||||
|
|
||||||
|
Structure of dict file:
|
||||||
|
word1 freq1 word_type1
|
||||||
|
word2 freq2 word_type2
|
||||||
|
...
|
||||||
|
Word type may be ignored
|
||||||
|
'''
|
||||||
|
self.check_initialized()
|
||||||
|
if isinstance(f, string_types):
|
||||||
|
f_name = f
|
||||||
|
f = open(f, 'rb')
|
||||||
|
else:
|
||||||
|
f_name = resolve_filename(f)
|
||||||
|
for lineno, ln in enumerate(f, 1):
|
||||||
|
line = ln.strip()
|
||||||
|
if not isinstance(line, text_type):
|
||||||
|
try:
|
||||||
|
line = line.decode('utf-8').lstrip('\ufeff')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
raise ValueError('dictionary file %s must be utf-8' % f_name)
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
# match won't be None because there's at least one character
|
||||||
|
word, freq, tag = re_userdict.match(line).groups()
|
||||||
|
if freq is not None:
|
||||||
|
freq = freq.strip()
|
||||||
|
if tag is not None:
|
||||||
|
tag = tag.strip()
|
||||||
|
self.add_word(word, freq, tag)
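# Usage sketch (not part of the diff): load_userdict() accepts a utf-8 file
# with one "word [freq] [tag]" entry per line, as described in the docstring
# above; the file name and entries below are illustrative only.
import jieba

with open('userdict.txt', 'w', encoding='utf-8') as f:
    f.write('云计算 5\n创新办 3 i\n韩玉赏鉴 3 nz\n')

jieba.load_userdict('userdict.txt')
print('/'.join(jieba.cut('李小福是创新办主任也是云计算方面的专家')))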
|
||||||
|
|
||||||
|
def add_word(self, word, freq=None, tag=None):
|
||||||
|
"""
|
||||||
|
Add a word to dictionary.
|
||||||
|
|
||||||
|
freq and tag can be omitted, freq defaults to be a calculated value
|
||||||
|
that ensures the word can be cut out.
|
||||||
|
"""
|
||||||
|
self.check_initialized()
|
||||||
|
word = strdecode(word)
|
||||||
|
freq = int(freq) if freq is not None else self.suggest_freq(word, False)
|
||||||
|
self.FREQ[word] = freq
|
||||||
|
self.total += freq
|
||||||
|
if tag:
|
||||||
|
self.user_word_tag_tab[word] = tag
|
||||||
|
for ch in xrange(len(word)):
|
||||||
|
wfrag = word[:ch + 1]
|
||||||
|
if wfrag not in self.FREQ:
|
||||||
|
self.FREQ[wfrag] = 0
|
||||||
|
if freq == 0:
|
||||||
|
finalseg.add_force_split(word)
|
||||||
|
|
||||||
|
def del_word(self, word):
|
||||||
|
"""
|
||||||
|
Convenient function for deleting a word.
|
||||||
|
"""
|
||||||
|
self.add_word(word, 0)
|
||||||
|
|
||||||
|
def suggest_freq(self, segment, tune=False):
|
||||||
|
"""
|
||||||
|
Suggest word frequency to force the characters in a word to be
|
||||||
|
joined or split.
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
- segment : The segments that the word is expected to be cut into,
|
||||||
|
If the word should be treated as a whole, use a str.
|
||||||
|
- tune : If True, tune the word frequency.
|
||||||
|
|
||||||
|
Note that HMM may affect the final result. If the result doesn't change,
|
||||||
|
set HMM=False.
|
||||||
|
"""
|
||||||
|
self.check_initialized()
|
||||||
|
ftotal = float(self.total)
|
||||||
|
freq = 1
|
||||||
|
if isinstance(segment, string_types):
|
||||||
|
word = segment
|
||||||
|
for seg in self.cut(word, HMM=False):
|
||||||
|
freq *= self.FREQ.get(seg, 1) / ftotal
|
||||||
|
freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
|
||||||
|
else:
|
||||||
|
segment = tuple(map(strdecode, segment))
|
||||||
|
word = ''.join(segment)
|
||||||
|
for seg in segment:
|
||||||
|
freq *= self.FREQ.get(seg, 1) / ftotal
|
||||||
|
freq = min(int(freq * self.total), self.FREQ.get(word, 0))
|
||||||
|
if tune:
|
||||||
|
self.add_word(word, freq)
|
||||||
|
return freq
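# Usage sketch (not part of the diff): suggest_freq() with tune=True adjusts
# the dictionary so that a span is (or is not) segmented as one word; pass
# HMM=False so the effect of the frequency change is visible.
import jieba

print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
jieba.suggest_freq(('中', '将'), True)   # prefer splitting 中 / 将 in this context
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))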
|
||||||
|
|
||||||
|
def tokenize(self, unicode_sentence, mode="default", HMM=True):
|
||||||
|
"""
|
||||||
|
Tokenize a sentence and yield tuples of (word, start, end)
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
- sentence: the str(unicode) to be segmented.
|
||||||
|
- mode: "default" or "search", "search" is for finer segmentation.
|
||||||
|
- HMM: whether to use the Hidden Markov Model.
|
||||||
|
"""
|
||||||
|
if not isinstance(unicode_sentence, text_type):
|
||||||
|
raise ValueError("jieba: the input parameter should be unicode.")
|
||||||
|
start = 0
|
||||||
|
if mode == 'default':
|
||||||
|
for w in self.cut(unicode_sentence, HMM=HMM):
|
||||||
|
width = len(w)
|
||||||
|
yield (w, start, start + width)
|
||||||
|
start += width
|
||||||
|
else:
|
||||||
|
for w in self.cut(unicode_sentence, HMM=HMM):
|
||||||
|
width = len(w)
|
||||||
|
if len(w) > 2:
|
||||||
|
for i in xrange(len(w) - 1):
|
||||||
|
gram2 = w[i:i + 2]
|
||||||
|
if self.FREQ.get(gram2):
|
||||||
|
yield (gram2, start + i, start + i + 2)
|
||||||
|
if len(w) > 3:
|
||||||
|
for i in xrange(len(w) - 2):
|
||||||
|
gram3 = w[i:i + 3]
|
||||||
|
if self.FREQ.get(gram3):
|
||||||
|
yield (gram3, start + i, start + i + 3)
|
||||||
|
yield (w, start, start + width)
|
||||||
|
start += width
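# Usage sketch (not part of the diff): tokenize() yields (word, start, end)
# character offsets into the original unicode string.
import jieba

for word, start, end in jieba.tokenize('永和服装饰品有限公司'):
    print('%s\t\tstart: %d\tend: %d' % (word, start, end))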
|
||||||
|
|
||||||
|
def set_dictionary(self, dictionary_path):
|
||||||
|
with self.lock:
|
||||||
|
abs_path = _get_abs_path(dictionary_path)
|
||||||
|
if not os.path.isfile(abs_path):
|
||||||
|
raise Exception("jieba: file does not exist: " + abs_path)
|
||||||
|
self.dictionary = abs_path
|
||||||
|
self.initialized = False
|
||||||
|
|
||||||
|
|
||||||
def cut(sentence,cut_all=False):
|
# default Tokenizer instance
|
||||||
if not ( type(sentence) is unicode):
|
|
||||||
try:
|
|
||||||
sentence = sentence.decode('utf-8')
|
|
||||||
except:
|
|
||||||
sentence = sentence.decode('gbk','ignore')
|
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
|
||||||
blocks = re_han.split(sentence)
|
|
||||||
cut_block = __cut_DAG
|
|
||||||
if cut_all:
|
|
||||||
cut_block = __cut_all
|
|
||||||
for blk in blocks:
|
|
||||||
if re_han.match(blk):
|
|
||||||
#pprint.pprint(__cut_DAG(blk))
|
|
||||||
for word in cut_block(blk):
|
|
||||||
yield word
|
|
||||||
else:
|
|
||||||
tmp = re_skip.split(blk)
|
|
||||||
for x in tmp:
|
|
||||||
if x!="":
|
|
||||||
yield x
|
|
||||||
|
|
||||||
def cut_for_search(sentence):
|
dt = Tokenizer()
|
||||||
words = cut(sentence)
|
|
||||||
for w in words:
|
|
||||||
if len(w)>2:
|
|
||||||
for i in xrange(len(w)-1):
|
|
||||||
gram2 = w[i:i+2]
|
|
||||||
if gram2 in FREQ:
|
|
||||||
yield gram2
|
|
||||||
if len(w)>3:
|
|
||||||
for i in xrange(len(w)-2):
|
|
||||||
gram3 = w[i:i+3]
|
|
||||||
if gram3 in FREQ:
|
|
||||||
yield gram3
|
|
||||||
yield w
|
|
||||||
|
|
||||||
def load_userdict(f):
|
# global functions
|
||||||
global trie,total,FREQ
|
|
||||||
if isinstance(f, (str, unicode)):
|
get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
|
||||||
f = open(f, 'rb')
|
add_word = dt.add_word
|
||||||
content = f.read().decode('utf-8')
|
calc = dt.calc
|
||||||
for line in content.split("\n"):
|
cut = dt.cut
|
||||||
if line.rstrip()=='': continue
|
lcut = dt.lcut
|
||||||
word,freq = line.split(" ")
|
cut_for_search = dt.cut_for_search
|
||||||
freq = float(freq)
|
lcut_for_search = dt.lcut_for_search
|
||||||
FREQ[word] = freq / total
|
del_word = dt.del_word
|
||||||
p = trie
|
get_DAG = dt.get_DAG
|
||||||
for c in word:
|
get_dict_file = dt.get_dict_file
|
||||||
if not c in p:
|
initialize = dt.initialize
|
||||||
p[c] ={}
|
load_userdict = dt.load_userdict
|
||||||
p = p[c]
|
set_dictionary = dt.set_dictionary
|
||||||
p['']='' #ending flag
|
suggest_freq = dt.suggest_freq
|
||||||
|
tokenize = dt.tokenize
|
||||||
|
user_word_tag_tab = dt.user_word_tag_tab
|
||||||
|
|
||||||
|
|
||||||
|
def _lcut_all(s):
|
||||||
|
return dt._lcut_all(s)
|
||||||
|
|
||||||
|
|
||||||
|
def _lcut(s):
|
||||||
|
return dt._lcut(s)
|
||||||
|
|
||||||
|
|
||||||
|
def _lcut_no_hmm(s):
|
||||||
|
return dt._lcut_no_hmm(s)
|
||||||
|
|
||||||
|
|
||||||
|
def _lcut_all(s):
|
||||||
|
return dt._lcut_all(s)
|
||||||
|
|
||||||
|
|
||||||
|
def _lcut_for_search(s):
|
||||||
|
return dt._lcut_for_search(s)
|
||||||
|
|
||||||
|
|
||||||
|
def _lcut_for_search_no_hmm(s):
|
||||||
|
return dt._lcut_for_search_no_hmm(s)
|
||||||
|
|
||||||
|
|
||||||
|
def _pcut(sentence, cut_all=False, HMM=True):
|
||||||
|
parts = strdecode(sentence).splitlines(True)
|
||||||
|
if cut_all:
|
||||||
|
result = pool.map(_lcut_all, parts)
|
||||||
|
elif HMM:
|
||||||
|
result = pool.map(_lcut, parts)
|
||||||
|
else:
|
||||||
|
result = pool.map(_lcut_no_hmm, parts)
|
||||||
|
for r in result:
|
||||||
|
for w in r:
|
||||||
|
yield w
|
||||||
|
|
||||||
|
|
||||||
|
def _pcut_for_search(sentence, HMM=True):
|
||||||
|
parts = strdecode(sentence).splitlines(True)
|
||||||
|
if HMM:
|
||||||
|
result = pool.map(_lcut_for_search, parts)
|
||||||
|
else:
|
||||||
|
result = pool.map(_lcut_for_search_no_hmm, parts)
|
||||||
|
for r in result:
|
||||||
|
for w in r:
|
||||||
|
yield w
|
||||||
|
|
||||||
|
|
||||||
|
def enable_parallel(processnum=None):
|
||||||
|
"""
|
||||||
|
Change the module's `cut` and `cut_for_search` functions to the
|
||||||
|
parallel version.
|
||||||
|
|
||||||
|
Note that this only works using dt, custom Tokenizer
|
||||||
|
instances are not supported.
|
||||||
|
"""
|
||||||
|
global pool, dt, cut, cut_for_search
|
||||||
|
from multiprocessing import cpu_count
|
||||||
|
if os.name == 'nt':
|
||||||
|
raise NotImplementedError(
|
||||||
|
"jieba: parallel mode only supports posix system")
|
||||||
|
else:
|
||||||
|
from multiprocessing import Pool
|
||||||
|
dt.check_initialized()
|
||||||
|
if processnum is None:
|
||||||
|
processnum = cpu_count()
|
||||||
|
pool = Pool(processnum)
|
||||||
|
cut = _pcut
|
||||||
|
cut_for_search = _pcut_for_search
|
||||||
|
|
||||||
|
|
||||||
|
def disable_parallel():
|
||||||
|
global pool, dt, cut, cut_for_search
|
||||||
|
if pool:
|
||||||
|
pool.close()
|
||||||
|
pool = None
|
||||||
|
cut = dt.cut
|
||||||
|
cut_for_search = dt.cut_for_search
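# Usage sketch (not part of the diff): parallel mode swaps the module-level
# cut/cut_for_search for multiprocessing versions (POSIX only, and only for
# the default tokenizer); the corpus file name is a placeholder.
import jieba

jieba.enable_parallel(4)                    # 4 worker processes
with open('big_corpus.txt', encoding='utf-8') as f:
    words = list(jieba.cut(f.read()))
jieba.disable_parallel()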
|
||||||
|
61  jieba/__main__.py  (new file)
@@ -0,0 +1,61 @@
|
|||||||
|
"""Jieba command line interface."""
|
||||||
|
import sys
|
||||||
|
import jieba
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from ._compat import *
|
||||||
|
|
||||||
|
parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
|
||||||
|
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
|
||||||
|
nargs='?', const=' ',
|
||||||
|
help="use DELIM instead of ' / ' for word delimiter; or a space if it is used without DELIM")
|
||||||
|
parser.add_argument("-p", "--pos", metavar="DELIM", nargs='?', const='_',
|
||||||
|
help="enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter")
|
||||||
|
parser.add_argument("-D", "--dict", help="use DICT as dictionary")
|
||||||
|
parser.add_argument("-u", "--user-dict",
|
||||||
|
help="use USER_DICT together with the default dictionary or DICT (if specified)")
|
||||||
|
parser.add_argument("-a", "--cut-all",
|
||||||
|
action="store_true", dest="cutall", default=False,
|
||||||
|
help="full pattern cutting (ignored with POS tagging)")
|
||||||
|
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
|
||||||
|
default=True, help="don't use the Hidden Markov Model")
|
||||||
|
parser.add_argument("-q", "--quiet", action="store_true", default=False,
|
||||||
|
help="don't print loading messages to stderr")
|
||||||
|
parser.add_argument("-V", '--version', action='version',
|
||||||
|
version="Jieba " + jieba.__version__)
|
||||||
|
parser.add_argument("filename", nargs='?', help="input file")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.quiet:
|
||||||
|
jieba.setLogLevel(60)
|
||||||
|
if args.pos:
|
||||||
|
import jieba.posseg
|
||||||
|
posdelim = args.pos
|
||||||
|
def cutfunc(sentence, _, HMM=True):
|
||||||
|
for w, f in jieba.posseg.cut(sentence, HMM):
|
||||||
|
yield w + posdelim + f
|
||||||
|
else:
|
||||||
|
cutfunc = jieba.cut
|
||||||
|
|
||||||
|
delim = text_type(args.delimiter)
|
||||||
|
cutall = args.cutall
|
||||||
|
hmm = args.hmm
|
||||||
|
fp = open(args.filename, 'r') if args.filename else sys.stdin
|
||||||
|
|
||||||
|
if args.dict:
|
||||||
|
jieba.initialize(args.dict)
|
||||||
|
else:
|
||||||
|
jieba.initialize()
|
||||||
|
if args.user_dict:
|
||||||
|
jieba.load_userdict(args.user_dict)
|
||||||
|
|
||||||
|
ln = fp.readline()
|
||||||
|
while ln:
|
||||||
|
l = ln.rstrip('\r\n')
|
||||||
|
result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
|
||||||
|
if PY2:
|
||||||
|
result = result.encode(default_encoding)
|
||||||
|
print(result)
|
||||||
|
ln = fp.readline()
|
||||||
|
|
||||||
|
fp.close()
|
89  jieba/_compat.py  (new file)
@@ -0,0 +1,89 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
log_console = logging.StreamHandler(sys.stderr)
|
||||||
|
default_logger = logging.getLogger(__name__)
|
||||||
|
default_logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
def setLogLevel(log_level):
|
||||||
|
default_logger.setLevel(log_level)
|
||||||
|
|
||||||
|
|
||||||
|
check_paddle_install = {'is_paddle_installed': False}
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pkg_resources
|
||||||
|
|
||||||
|
get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
|
||||||
|
os.path.join(*res))
|
||||||
|
except ImportError:
|
||||||
|
get_module_res = lambda *res: open(os.path.normpath(os.path.join(
|
||||||
|
os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
|
||||||
|
|
||||||
|
|
||||||
|
def enable_paddle():
|
||||||
|
try:
|
||||||
|
import paddle
|
||||||
|
except ImportError:
|
||||||
|
default_logger.debug("Installing paddle-tiny, please wait a minute......")
|
||||||
|
os.system("pip install paddlepaddle-tiny")
|
||||||
|
try:
|
||||||
|
import paddle
|
||||||
|
except ImportError:
|
||||||
|
default_logger.debug(
|
||||||
|
"Import paddle error, please use command to install: pip install paddlepaddle-tiny==1.6.1."
|
||||||
|
"Now, back to jieba basic cut......")
|
||||||
|
if paddle.__version__ < '1.6.1':
|
||||||
|
default_logger.debug("Find your own paddle version doesn't satisfy the minimum requirement (1.6.1), "
|
||||||
|
"please install paddle tiny by 'pip install --upgrade paddlepaddle-tiny', "
|
||||||
|
"or upgrade paddle full version by "
|
||||||
|
"'pip install --upgrade paddlepaddle (-gpu for GPU version)' ")
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
import jieba.lac_small.predict as predict
|
||||||
|
default_logger.debug("Paddle enabled successfully......")
|
||||||
|
check_paddle_install['is_paddle_installed'] = True
|
||||||
|
except ImportError:
|
||||||
|
default_logger.debug("Import error, cannot find paddle.fluid and jieba.lac_small.predict module. "
|
||||||
|
"Now, back to jieba basic cut......")
|
||||||
|
|
||||||
|
|
||||||
|
PY2 = sys.version_info[0] == 2
|
||||||
|
|
||||||
|
default_encoding = sys.getfilesystemencoding()
|
||||||
|
|
||||||
|
if PY2:
|
||||||
|
text_type = unicode
|
||||||
|
string_types = (str, unicode)
|
||||||
|
|
||||||
|
iterkeys = lambda d: d.iterkeys()
|
||||||
|
itervalues = lambda d: d.itervalues()
|
||||||
|
iteritems = lambda d: d.iteritems()
|
||||||
|
|
||||||
|
else:
|
||||||
|
text_type = str
|
||||||
|
string_types = (str,)
|
||||||
|
xrange = range
|
||||||
|
|
||||||
|
iterkeys = lambda d: iter(d.keys())
|
||||||
|
itervalues = lambda d: iter(d.values())
|
||||||
|
iteritems = lambda d: iter(d.items())
|
||||||
|
|
||||||
|
|
||||||
|
def strdecode(sentence):
|
||||||
|
if not isinstance(sentence, text_type):
|
||||||
|
try:
|
||||||
|
sentence = sentence.decode('utf-8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
sentence = sentence.decode('gbk', 'ignore')
|
||||||
|
return sentence
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_filename(f):
|
||||||
|
try:
|
||||||
|
return f.name
|
||||||
|
except AttributeError:
|
||||||
|
return repr(f)
|
42  jieba/analyse/__init__.py  (mode changed: normal file → executable file)
@@ -1,30 +1,18 @@
|
|||||||
import jieba
|
from __future__ import absolute_import
|
||||||
import os
|
from .tfidf import TFIDF
|
||||||
|
from .textrank import TextRank
|
||||||
|
try:
|
||||||
|
from .analyzer import ChineseAnalyzer
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
default_tfidf = TFIDF()
|
||||||
f_name = os.path.join(_curpath,"idf.txt")
|
default_textrank = TextRank()
|
||||||
content = open(f_name,'rb').read().decode('utf-8')
|
|
||||||
|
|
||||||
idf_freq = {}
|
extract_tags = tfidf = default_tfidf.extract_tags
|
||||||
lines = content.split('\n')
|
set_idf_path = default_tfidf.set_idf_path
|
||||||
for line in lines:
|
textrank = default_textrank.extract_tags
|
||||||
word,freq = line.split(' ')
|
|
||||||
idf_freq[word] = float(freq)
|
|
||||||
max_idf = max(idf_freq.values())
|
|
||||||
|
|
||||||
def extract_tags(sentence,topK=20):
|
|
||||||
words = jieba.cut(sentence)
|
|
||||||
freq = {}
|
|
||||||
for w in words:
|
|
||||||
if len(w.strip())<2: continue
|
|
||||||
freq[w]=freq.get(w,0.0)+1.0
|
|
||||||
total = sum(freq.values())
|
|
||||||
freq = [(k,v/total) for k,v in freq.iteritems()]
|
|
||||||
|
|
||||||
tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq]
|
|
||||||
st_list = sorted(tf_idf_list,reverse=True)
|
|
||||||
|
|
||||||
top_tuples= st_list[:topK]
|
|
||||||
tags = [a[1] for a in top_tuples]
|
|
||||||
return tags
|
|
||||||
|
|
||||||
|
def set_stop_words(stop_words_path):
|
||||||
|
default_tfidf.set_stop_words(stop_words_path)
|
||||||
|
default_textrank.set_stop_words(stop_words_path)
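# Usage sketch (not part of the diff): TF-IDF keyword extraction through the
# module-level default_tfidf instance; the sample text is illustrative.
import jieba.analyse

text = '自然语言处理是人工智能和语言学领域的分支学科'
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print('%s %.4f' % (word, weight))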
|
||||||
|
37  jieba/analyse/analyzer.py  (new file)
@@ -0,0 +1,37 @@
|
|||||||
|
# encoding=utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
from whoosh.analysis import RegexAnalyzer, LowercaseFilter, StopFilter, StemFilter
|
||||||
|
from whoosh.analysis import Tokenizer, Token
|
||||||
|
from whoosh.lang.porter import stem
|
||||||
|
|
||||||
|
import jieba
|
||||||
|
import re
|
||||||
|
|
||||||
|
STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
|
||||||
|
'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
|
||||||
|
'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
|
||||||
|
'to', 'us', 'we', 'when', 'will', 'with', 'yet',
|
||||||
|
'you', 'your', '的', '了', '和'))
|
||||||
|
|
||||||
|
accepted_chars = re.compile(r"[\u4E00-\u9FD5]+")
|
||||||
|
|
||||||
|
|
||||||
|
class ChineseTokenizer(Tokenizer):
|
||||||
|
|
||||||
|
def __call__(self, text, **kargs):
|
||||||
|
words = jieba.tokenize(text, mode="search")
|
||||||
|
token = Token()
|
||||||
|
for (w, start_pos, stop_pos) in words:
|
||||||
|
if not accepted_chars.match(w) and len(w) <= 1:
|
||||||
|
continue
|
||||||
|
token.original = token.text = w
|
||||||
|
token.pos = start_pos
|
||||||
|
token.startchar = start_pos
|
||||||
|
token.endchar = stop_pos
|
||||||
|
yield token
|
||||||
|
|
||||||
|
|
||||||
|
def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
|
||||||
|
return (ChineseTokenizer() | LowercaseFilter() |
|
||||||
|
StopFilter(stoplist=stoplist, minsize=minsize) |
|
||||||
|
StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
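# Usage sketch (not part of the diff): plug ChineseAnalyzer into a whoosh
# schema so that indexed TEXT fields are segmented with jieba (assumes the
# whoosh package is installed; the field layout is illustrative).
from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer

schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(stored=True, analyzer=ChineseAnalyzer()))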
|
110  jieba/analyse/textrank.py  (new file)
@@ -0,0 +1,110 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
import sys
|
||||||
|
from operator import itemgetter
|
||||||
|
from collections import defaultdict
|
||||||
|
import jieba.posseg
|
||||||
|
from .tfidf import KeywordExtractor
|
||||||
|
from .._compat import *
|
||||||
|
|
||||||
|
|
||||||
|
class UndirectWeightedGraph:
|
||||||
|
d = 0.85
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.graph = defaultdict(list)
|
||||||
|
|
||||||
|
def addEdge(self, start, end, weight):
|
||||||
|
# use a tuple (start, end, weight) instead of a Edge object
|
||||||
|
self.graph[start].append((start, end, weight))
|
||||||
|
self.graph[end].append((end, start, weight))
|
||||||
|
|
||||||
|
def rank(self):
|
||||||
|
ws = defaultdict(float)
|
||||||
|
outSum = defaultdict(float)
|
||||||
|
|
||||||
|
wsdef = 1.0 / (len(self.graph) or 1.0)
|
||||||
|
for n, out in self.graph.items():
|
||||||
|
ws[n] = wsdef
|
||||||
|
outSum[n] = sum((e[2] for e in out), 0.0)
|
||||||
|
|
||||||
|
# this line for build stable iteration
|
||||||
|
sorted_keys = sorted(self.graph.keys())
|
||||||
|
for x in xrange(10): # 10 iters
|
||||||
|
for n in sorted_keys:
|
||||||
|
s = 0
|
||||||
|
for e in self.graph[n]:
|
||||||
|
s += e[2] / outSum[e[1]] * ws[e[1]]
|
||||||
|
ws[n] = (1 - self.d) + self.d * s
|
||||||
|
|
||||||
|
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
|
||||||
|
|
||||||
|
for w in itervalues(ws):
|
||||||
|
if w < min_rank:
|
||||||
|
min_rank = w
|
||||||
|
if w > max_rank:
|
||||||
|
max_rank = w
|
||||||
|
|
||||||
|
for n, w in ws.items():
|
||||||
|
# to unify the weights, don't *100.
|
||||||
|
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
|
||||||
|
|
||||||
|
return ws
|
||||||
|
|
||||||
|
|
||||||
|
class TextRank(KeywordExtractor):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.tokenizer = self.postokenizer = jieba.posseg.dt
|
||||||
|
self.stop_words = self.STOP_WORDS.copy()
|
||||||
|
self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
|
||||||
|
self.span = 5
|
||||||
|
|
||||||
|
def pairfilter(self, wp):
|
||||||
|
return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
|
||||||
|
and wp.word.lower() not in self.stop_words)
|
||||||
|
|
||||||
|
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
|
||||||
|
"""
|
||||||
|
Extract keywords from sentence using TextRank algorithm.
|
||||||
|
Parameter:
|
||||||
|
- topK: return how many top keywords. `None` for all possible words.
|
||||||
|
- withWeight: if True, return a list of (word, weight);
|
||||||
|
if False, return a list of words.
|
||||||
|
- allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
|
||||||
|
if the POS of w is not in this list, it will be filtered.
|
||||||
|
- withFlag: if True, return a list of pair(word, weight) like posseg.cut
|
||||||
|
if False, return a list of words
|
||||||
|
"""
|
||||||
|
self.pos_filt = frozenset(allowPOS)
|
||||||
|
g = UndirectWeightedGraph()
|
||||||
|
cm = defaultdict(int)
|
||||||
|
words = tuple(self.tokenizer.cut(sentence))
|
||||||
|
for i, wp in enumerate(words):
|
||||||
|
if self.pairfilter(wp):
|
||||||
|
for j in xrange(i + 1, i + self.span):
|
||||||
|
if j >= len(words):
|
||||||
|
break
|
||||||
|
if not self.pairfilter(words[j]):
|
||||||
|
continue
|
||||||
|
if allowPOS and withFlag:
|
||||||
|
cm[(wp, words[j])] += 1
|
||||||
|
else:
|
||||||
|
cm[(wp.word, words[j].word)] += 1
|
||||||
|
|
||||||
|
for terms, w in cm.items():
|
||||||
|
g.addEdge(terms[0], terms[1], w)
|
||||||
|
nodes_rank = g.rank()
|
||||||
|
if withWeight:
|
||||||
|
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
|
||||||
|
else:
|
||||||
|
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
|
||||||
|
|
||||||
|
if topK:
|
||||||
|
return tags[:topK]
|
||||||
|
else:
|
||||||
|
return tags
|
||||||
|
|
||||||
|
extract_tags = textrank
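# Usage sketch (not part of the diff): TextRank-based extraction restricted to
# nouns and verbs via allowPOS; the sample text is illustrative.
import jieba.analyse

text = '线程是程序执行时的最小单位,它是进程的一个执行流'
print(jieba.analyse.textrank(text, topK=5, allowPOS=('ns', 'n', 'vn', 'v')))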
|
116  jieba/analyse/tfidf.py  (new executable file)
@@ -0,0 +1,116 @@
|
|||||||
|
# encoding=utf-8
|
||||||
|
from __future__ import absolute_import
|
||||||
|
import os
|
||||||
|
import jieba
|
||||||
|
import jieba.posseg
|
||||||
|
from operator import itemgetter
|
||||||
|
|
||||||
|
_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
|
||||||
|
os.path.dirname(__file__), path))
|
||||||
|
_get_abs_path = jieba._get_abs_path
|
||||||
|
|
||||||
|
DEFAULT_IDF = _get_module_path("idf.txt")
|
||||||
|
|
||||||
|
|
||||||
|
class KeywordExtractor(object):
|
||||||
|
|
||||||
|
STOP_WORDS = set((
|
||||||
|
"the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
|
||||||
|
"by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
|
||||||
|
"this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
|
||||||
|
))
|
||||||
|
|
||||||
|
def set_stop_words(self, stop_words_path):
|
||||||
|
abs_path = _get_abs_path(stop_words_path)
|
||||||
|
if not os.path.isfile(abs_path):
|
||||||
|
raise Exception("jieba: file does not exist: " + abs_path)
|
||||||
|
content = open(abs_path, 'rb').read().decode('utf-8')
|
||||||
|
for line in content.splitlines():
|
||||||
|
self.stop_words.add(line)
|
||||||
|
|
||||||
|
def extract_tags(self, *args, **kwargs):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class IDFLoader(object):
|
||||||
|
|
||||||
|
def __init__(self, idf_path=None):
|
||||||
|
self.path = ""
|
||||||
|
self.idf_freq = {}
|
||||||
|
self.median_idf = 0.0
|
||||||
|
if idf_path:
|
||||||
|
self.set_new_path(idf_path)
|
||||||
|
|
||||||
|
def set_new_path(self, new_idf_path):
|
||||||
|
if self.path != new_idf_path:
|
||||||
|
self.path = new_idf_path
|
||||||
|
content = open(new_idf_path, 'rb').read().decode('utf-8')
|
||||||
|
self.idf_freq = {}
|
||||||
|
for line in content.splitlines():
|
||||||
|
word, freq = line.strip().split(' ')
|
||||||
|
self.idf_freq[word] = float(freq)
|
||||||
|
self.median_idf = sorted(
|
||||||
|
self.idf_freq.values())[len(self.idf_freq) // 2]
|
||||||
|
|
||||||
|
def get_idf(self):
|
||||||
|
return self.idf_freq, self.median_idf
|
||||||
|
|
||||||
|
|
||||||
|
class TFIDF(KeywordExtractor):
|
||||||
|
|
||||||
|
def __init__(self, idf_path=None):
|
||||||
|
self.tokenizer = jieba.dt
|
||||||
|
self.postokenizer = jieba.posseg.dt
|
||||||
|
self.stop_words = self.STOP_WORDS.copy()
|
||||||
|
self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
|
||||||
|
self.idf_freq, self.median_idf = self.idf_loader.get_idf()
|
||||||
|
|
||||||
|
def set_idf_path(self, idf_path):
|
||||||
|
new_abs_path = _get_abs_path(idf_path)
|
||||||
|
if not os.path.isfile(new_abs_path):
|
||||||
|
raise Exception("jieba: file does not exist: " + new_abs_path)
|
||||||
|
self.idf_loader.set_new_path(new_abs_path)
|
||||||
|
self.idf_freq, self.median_idf = self.idf_loader.get_idf()
|
||||||
|
|
||||||
|
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
|
||||||
|
"""
|
||||||
|
Extract keywords from sentence using TF-IDF algorithm.
|
||||||
|
Parameter:
|
||||||
|
- topK: return how many top keywords. `None` for all possible words.
|
||||||
|
- withWeight: if True, return a list of (word, weight);
|
||||||
|
if False, return a list of words.
|
||||||
|
- allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
|
||||||
|
if the POS of w is not in this list, it will be filtered.
|
||||||
|
- withFlag: only work with allowPOS is not empty.
|
||||||
|
if True, return a list of pair(word, weight) like posseg.cut
|
||||||
|
if False, return a list of words
|
||||||
|
"""
|
||||||
|
if allowPOS:
|
||||||
|
allowPOS = frozenset(allowPOS)
|
||||||
|
words = self.postokenizer.cut(sentence)
|
||||||
|
else:
|
||||||
|
words = self.tokenizer.cut(sentence)
|
||||||
|
freq = {}
|
||||||
|
for w in words:
|
||||||
|
if allowPOS:
|
||||||
|
if w.flag not in allowPOS:
|
||||||
|
continue
|
||||||
|
elif not withFlag:
|
||||||
|
w = w.word
|
||||||
|
wc = w.word if allowPOS and withFlag else w
|
||||||
|
if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
|
||||||
|
continue
|
||||||
|
freq[w] = freq.get(w, 0.0) + 1.0
|
||||||
|
total = sum(freq.values())
|
||||||
|
for k in freq:
|
||||||
|
kw = k.word if allowPOS and withFlag else k
|
||||||
|
freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
|
||||||
|
|
||||||
|
if withWeight:
|
||||||
|
tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
|
||||||
|
else:
|
||||||
|
tags = sorted(freq, key=freq.__getitem__, reverse=True)
|
||||||
|
if topK:
|
||||||
|
return tags[:topK]
|
||||||
|
else:
|
||||||
|
return tags
|
18544  jieba/dict.txt  (diff suppressed because it is too large)

jieba/finalseg/__init__.py
@@ -1,68 +1,100 @@
|
|||||||
|
from __future__ import absolute_import, unicode_literals
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
from .._compat import *
|
||||||
|
|
||||||
def load_model(f_name):
|
MIN_FLOAT = -3.14e100
|
||||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
|
||||||
prob_p_path = os.path.join(_curpath,f_name)
|
|
||||||
return eval(open(prob_p_path,"rb").read())
|
|
||||||
|
|
||||||
prob_start = load_model("prob_start.py")
|
PROB_START_P = "prob_start.p"
|
||||||
prob_trans = load_model("prob_trans.py")
|
PROB_TRANS_P = "prob_trans.p"
|
||||||
prob_emit = load_model("prob_emit.py")
|
PROB_EMIT_P = "prob_emit.p"
|
||||||
|
|
||||||
|
|
||||||
|
PrevStatus = {
|
||||||
|
'B': 'ES',
|
||||||
|
'M': 'MB',
|
||||||
|
'S': 'SE',
|
||||||
|
'E': 'BM'
|
||||||
|
}
|
||||||
|
|
||||||
|
Force_Split_Words = set([])
|
||||||
|
def load_model():
|
||||||
|
start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
|
||||||
|
trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
|
||||||
|
emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P))
|
||||||
|
return start_p, trans_p, emit_p
|
||||||
|
|
||||||
|
if sys.platform.startswith("java"):
|
||||||
|
start_P, trans_P, emit_P = load_model()
|
||||||
|
else:
|
||||||
|
from .prob_start import P as start_P
|
||||||
|
from .prob_trans import P as trans_P
|
||||||
|
from .prob_emit import P as emit_P
|
||||||
|
|
||||||
|
|
||||||
def viterbi(obs, states, start_p, trans_p, emit_p):
|
def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||||
V = [{}] #tabular
|
V = [{}] # tabular
|
||||||
path = {}
|
path = {}
|
||||||
for y in states: #init
|
for y in states: # init
|
||||||
V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
|
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
|
||||||
path[y] = [y]
|
path[y] = [y]
|
||||||
for t in range(1,len(obs)):
|
for t in xrange(1, len(obs)):
|
||||||
V.append({})
|
V.append({})
|
||||||
newpath = {}
|
newpath = {}
|
||||||
for y in states:
|
for y in states:
|
||||||
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states ])
|
em_p = emit_p[y].get(obs[t], MIN_FLOAT)
|
||||||
V[t][y] =prob
|
(prob, state) = max(
|
||||||
newpath[y] = path[state] + [y]
|
[(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
|
||||||
path = newpath
|
V[t][y] = prob
|
||||||
|
newpath[y] = path[state] + [y]
|
||||||
(prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
|
path = newpath
|
||||||
|
|
||||||
return (prob, path[state])
|
(prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES')
|
||||||
|
|
||||||
|
return (prob, path[state])
|
||||||
|
|
||||||
|
|
||||||
def __cut(sentence):
|
def __cut(sentence):
|
||||||
prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit)
|
global emit_P
|
||||||
begin, next = 0,0
|
prob, pos_list = viterbi(sentence, 'BMES', start_P, trans_P, emit_P)
|
||||||
#print pos_list, sentence
|
begin, nexti = 0, 0
|
||||||
for i,char in enumerate(sentence):
|
# print pos_list, sentence
|
||||||
pos = pos_list[i]
|
for i, char in enumerate(sentence):
|
||||||
if pos=='B':
|
pos = pos_list[i]
|
||||||
begin = i
|
if pos == 'B':
|
||||||
elif pos=='E':
|
begin = i
|
||||||
yield sentence[begin:i+1]
|
elif pos == 'E':
|
||||||
next = i+1
|
yield sentence[begin:i + 1]
|
||||||
elif pos=='S':
|
nexti = i + 1
|
||||||
yield char
|
elif pos == 'S':
|
||||||
next = i+1
|
yield char
|
||||||
if next<len(sentence):
|
nexti = i + 1
|
||||||
yield sentence[next:]
|
if nexti < len(sentence):
|
||||||
|
yield sentence[nexti:]
|
||||||
|
|
||||||
|
re_han = re.compile("([\u4E00-\u9FD5]+)")
|
||||||
|
re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
|
||||||
|
|
||||||
|
|
||||||
|
def add_force_split(word):
|
||||||
|
global Force_Split_Words
|
||||||
|
Force_Split_Words.add(word)
|
||||||
|
|
||||||
def cut(sentence):
|
def cut(sentence):
|
||||||
if not ( type(sentence) is unicode):
|
sentence = strdecode(sentence)
|
||||||
try:
|
blocks = re_han.split(sentence)
|
||||||
sentence = sentence.decode('utf-8')
|
for blk in blocks:
|
||||||
except:
|
if re_han.match(blk):
|
||||||
sentence = sentence.decode('gbk','ignore')
|
for word in __cut(blk):
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
if word not in Force_Split_Words:
|
||||||
blocks = re_han.split(sentence)
|
yield word
|
||||||
for blk in blocks:
|
else:
|
||||||
if re_han.match(blk):
|
for c in word:
|
||||||
for word in __cut(blk):
|
yield c
|
||||||
yield word
|
else:
|
||||||
else:
|
tmp = re_skip.split(blk)
|
||||||
tmp = re_skip.split(blk)
|
for x in tmp:
|
||||||
for x in tmp:
|
if x:
|
||||||
if x!="":
|
yield x
|
||||||
yield x
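# Usage sketch (not part of the diff): the HMM fallback can also be called on
# its own; finalseg.cut() runs the Viterbi decoder above over a raw string.
from jieba import finalseg

print(list(finalseg.cut('小明硕士毕业于中国科学院计算所')))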
|
|
||||||
|
105686  jieba/finalseg/prob_emit.p   (new file; diff suppressed because it is too large)
14      jieba/finalseg/prob_start.p  (new file)
@@ -0,0 +1,14 @@
|
|||||||
|
(dp0
|
||||||
|
S'B'
|
||||||
|
p1
|
||||||
|
F-0.26268660809250016
|
||||||
|
sS'E'
|
||||||
|
p2
|
||||||
|
F-3.14e+100
|
||||||
|
sS'M'
|
||||||
|
p3
|
||||||
|
F-3.14e+100
|
||||||
|
sS'S'
|
||||||
|
p4
|
||||||
|
F-1.4652633398537678
|
||||||
|
s.
|
jieba/finalseg/prob_start.py
@@ -1 +1,4 @@
|
|||||||
{'B': 0.7689828525554734, 'E': 0.0, 'M': 0.0, 'S': 0.23101714744452656}
|
P={'B': -0.26268660809250016,
|
||||||
|
'E': -3.14e+100,
|
||||||
|
'M': -3.14e+100,
|
||||||
|
'S': -1.4652633398537678}
|
||||||
|
30  jieba/finalseg/prob_trans.p  (new file)
@@ -0,0 +1,30 @@
|
|||||||
|
(dp0
|
||||||
|
S'B'
|
||||||
|
p1
|
||||||
|
(dp2
|
||||||
|
S'E'
|
||||||
|
p3
|
||||||
|
F-0.51082562376599
|
||||||
|
sS'M'
|
||||||
|
p4
|
||||||
|
F-0.916290731874155
|
||||||
|
ssg3
|
||||||
|
(dp5
|
||||||
|
g1
|
||||||
|
F-0.5897149736854513
|
||||||
|
sS'S'
|
||||||
|
p6
|
||||||
|
F-0.8085250474669937
|
||||||
|
ssg4
|
||||||
|
(dp7
|
||||||
|
g3
|
||||||
|
F-0.33344856811948514
|
||||||
|
sg4
|
||||||
|
F-1.2603623820268226
|
||||||
|
ssg6
|
||||||
|
(dp8
|
||||||
|
g1
|
||||||
|
F-0.7211965654669841
|
||||||
|
sg6
|
||||||
|
F-0.6658631448798212
|
||||||
|
ss.
|
jieba/finalseg/prob_trans.py
@@ -1,4 +1,4 @@
|
|||||||
{'B': {'E': 0.8518218565181658, 'M': 0.14817814348183422},
|
P={'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
|
||||||
'E': {'B': 0.5544853051164425, 'S': 0.44551469488355755},
|
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
|
||||||
'M': {'E': 0.7164487459986911, 'M': 0.2835512540013088},
|
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
|
||||||
'S': {'B': 0.48617017333894563, 'S': 0.5138298266610544}}
|
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
|
||||||
|
0   jieba/lac_small/__init__.py  (new file)
46  jieba/lac_small/creator.py   (new file)
@@ -0,0 +1,46 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Define the function to create lexical analysis model and model's data reader
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
from paddle.fluid.initializer import NormalInitializer
|
||||||
|
import jieba.lac_small.nets as nets
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(vocab_size, num_labels, mode='train'):
|
||||||
|
"""create lac model"""
|
||||||
|
|
||||||
|
# model's input data
|
||||||
|
words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
|
||||||
|
targets = fluid.data(
|
||||||
|
name='targets', shape=[-1, 1], dtype='int64', lod_level=1)
|
||||||
|
|
||||||
|
# for inference process
|
||||||
|
if mode == 'infer':
|
||||||
|
crf_decode = nets.lex_net(
|
||||||
|
words, vocab_size, num_labels, for_infer=True, target=None)
|
||||||
|
return {
|
||||||
|
"feed_list": [words],
|
||||||
|
"words": words,
|
||||||
|
"crf_decode": crf_decode,
|
||||||
|
}
|
||||||
|
return ret
|
||||||
|
|
BIN  jieba/lac_small/model_baseline/crfw       (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_0.b_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_0.w_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_1.b_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_1.w_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_2.b_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_2.w_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_3.b_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_3.w_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_4.b_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/fc_4.w_0   (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/gru_0.b_0  (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/gru_0.w_0  (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/gru_1.b_0  (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/gru_1.w_0  (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/gru_2.b_0  (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/gru_2.w_0  (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/gru_3.b_0  (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/gru_3.w_0  (new binary file, not shown)
BIN  jieba/lac_small/model_baseline/word_emb   (new binary file, not shown)
122  jieba/lac_small/nets.py  (new file)
@@ -0,0 +1,122 @@
|
|||||||
|
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The function lex_net(args) define the lexical analysis network structure
"""
import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def lex_net(word, vocab_size, num_labels, for_infer=True, target=None):
    """
    define the lexical analysis network structure
    word: stores the input of the model
    for_infer: a boolean value, indicating if the model to be created is for training or predicting.

    return:
        for infer: return the prediction
        otherwise: return the prediction
    """

    word_emb_dim=128
    grnn_hidden_dim=128
    bigru_num=2
    emb_lr = 1.0
    crf_lr = 1.0
    init_bound = 0.1
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        define the bidirectional gru layer
        """
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word, target=None):
        """
        Configure the network
        """
        word_embedding = fluid.embedding(
            input=word,
            size=[vocab_size, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound)))

        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output

        emission = fluid.layers.fc(
            size=num_labels,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        size = emission.shape[1]
        fluid.layers.create_parameter(
            shape=[size + 2, size], dtype=emission.dtype, name='crfw')
        crf_decode = fluid.layers.crf_decoding(
            input=emission, param_attr=fluid.ParamAttr(name='crfw'))

        return crf_decode

    return _net_conf(word)
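Note: the network above stacks two bidirectional GRU layers over character embeddings, projects to per-tag emission scores, and lets fluid.layers.crf_decoding pick the best tag sequence with the learned "crfw" transition matrix (shaped [num_labels + 2, num_labels] to hold the extra start/end transitions). As a rough, standalone illustration of what that decoding step computes, and not PaddlePaddle's implementation, here is a toy Viterbi over made-up emission and transition score matrices:

import numpy as np

def toy_viterbi_decode(emissions, transitions):
    """Toy CRF decode: emissions [T, L] and transitions [L, L] are log scores."""
    T, L = emissions.shape
    score = emissions[0].copy()                  # best score ending in each label at t = 0
    backptr = np.zeros((T, L), dtype=int)
    for t in range(1, T):
        # candidate[i, j] = best path ending in label i at t-1, then moving to label j at t
        candidate = score[:, None] + transitions + emissions[t][None, :]
        backptr[t] = candidate.argmax(axis=0)
        score = candidate.max(axis=0)
    best = [int(score.argmax())]
    for t in range(T - 1, 0, -1):                # follow the back-pointers
        best.append(int(backptr[t, best[-1]]))
    return best[::-1]                            # one label index per time step

# hypothetical 3-character input with 4 labels
print(toy_viterbi_decode(np.random.randn(3, 4), np.random.randn(4, 4)))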
82  jieba/lac_small/predict.py  (new file)
@@ -0,0 +1,82 @@
|
|||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
import jieba.lac_small.utils as utils
|
||||||
|
import jieba.lac_small.creator as creator
|
||||||
|
import jieba.lac_small.reader_small as reader_small
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
word_emb_dim=128
|
||||||
|
grnn_hidden_dim=128
|
||||||
|
bigru_num=2
|
||||||
|
use_cuda=False
|
||||||
|
basepath = os.path.abspath(__file__)
|
||||||
|
folder = os.path.dirname(basepath)
|
||||||
|
init_checkpoint = os.path.join(folder, "model_baseline")
|
||||||
|
batch_size=1
|
||||||
|
|
||||||
|
dataset = reader_small.Dataset()
|
||||||
|
infer_program = fluid.Program()
|
||||||
|
with fluid.program_guard(infer_program, fluid.default_startup_program()):
|
||||||
|
with fluid.unique_name.guard():
|
||||||
|
infer_ret = creator.create_model(dataset.vocab_size, dataset.num_labels, mode='infer')
|
||||||
|
infer_program = infer_program.clone(for_test=True)
|
||||||
|
place = fluid.CPUPlace()
|
||||||
|
exe = fluid.Executor(place)
|
||||||
|
exe.run(fluid.default_startup_program())
|
||||||
|
utils.init_checkpoint(exe, init_checkpoint, infer_program)
|
||||||
|
results = []
|
||||||
|
|
||||||
|
def get_sent(str1):
|
||||||
|
feed_data=dataset.get_vars(str1)
|
||||||
|
a = numpy.array(feed_data).astype(numpy.int64)
|
||||||
|
a=a.reshape(-1,1)
|
||||||
|
c = fluid.create_lod_tensor(a, [[a.shape[0]]], place)
|
||||||
|
|
||||||
|
words, crf_decode = exe.run(
|
||||||
|
infer_program,
|
||||||
|
fetch_list=[infer_ret['words'], infer_ret['crf_decode']],
|
||||||
|
feed={"words":c, },
|
||||||
|
return_numpy=False,
|
||||||
|
use_program_cache=True)
|
||||||
|
sents=[]
|
||||||
|
sent,tag = utils.parse_result(words, crf_decode, dataset)
|
||||||
|
sents = sents + sent
|
||||||
|
return sents
|
||||||
|
|
||||||
|
def get_result(str1):
|
||||||
|
feed_data=dataset.get_vars(str1)
|
||||||
|
a = numpy.array(feed_data).astype(numpy.int64)
|
||||||
|
a=a.reshape(-1,1)
|
||||||
|
c = fluid.create_lod_tensor(a, [[a.shape[0]]], place)
|
||||||
|
|
||||||
|
words, crf_decode = exe.run(
|
||||||
|
infer_program,
|
||||||
|
fetch_list=[infer_ret['words'], infer_ret['crf_decode']],
|
||||||
|
feed={"words":c, },
|
||||||
|
return_numpy=False,
|
||||||
|
use_program_cache=True)
|
||||||
|
results=[]
|
||||||
|
results += utils.parse_result(words, crf_decode, dataset)
|
||||||
|
return results
|
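For reference, this module builds its inference program and loads model_baseline at import time, so callers only need get_result, which returns the segmented words and their tags as two parallel lists; this is exactly how jieba.posseg.cut(..., use_paddle=True) consumes it further below. A minimal usage sketch, assuming paddlepaddle is installed:

import jieba.lac_small.predict as predict

sents, tags = predict.get_result("我爱北京天安门")   # two parallel lists from parse_result
for word, tag in zip(sents, tags):
    print(word, tag)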
100  jieba/lac_small/reader_small.py  (new file)
@@ -0,0 +1,100 @@
|
|||||||
|
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
The file_reader converts raw corpus to input.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import __future__
|
||||||
|
import io
|
||||||
|
import paddle
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
|
||||||
|
def load_kv_dict(dict_path,
|
||||||
|
reverse=False,
|
||||||
|
delimiter="\t",
|
||||||
|
key_func=None,
|
||||||
|
value_func=None):
|
||||||
|
"""
|
||||||
|
Load key-value dict from file
|
||||||
|
"""
|
||||||
|
result_dict = {}
|
||||||
|
for line in io.open(dict_path, "r", encoding='utf8'):
|
||||||
|
terms = line.strip("\n").split(delimiter)
|
||||||
|
if len(terms) != 2:
|
||||||
|
continue
|
||||||
|
if reverse:
|
||||||
|
value, key = terms
|
||||||
|
else:
|
||||||
|
key, value = terms
|
||||||
|
if key in result_dict:
|
||||||
|
raise KeyError("key duplicated with [%s]" % (key))
|
||||||
|
if key_func:
|
||||||
|
key = key_func(key)
|
||||||
|
if value_func:
|
||||||
|
value = value_func(value)
|
||||||
|
result_dict[key] = value
|
||||||
|
return result_dict
|
||||||
|
|
||||||
|
class Dataset(object):
|
||||||
|
"""data reader"""
|
||||||
|
def __init__(self):
|
||||||
|
# read dict
|
||||||
|
basepath = os.path.abspath(__file__)
|
||||||
|
folder = os.path.dirname(basepath)
|
||||||
|
word_dict_path = os.path.join(folder, "word.dic")
|
||||||
|
label_dict_path = os.path.join(folder, "tag.dic")
|
||||||
|
self.word2id_dict = load_kv_dict(
|
||||||
|
word_dict_path, reverse=True, value_func=int)
|
||||||
|
self.id2word_dict = load_kv_dict(word_dict_path)
|
||||||
|
self.label2id_dict = load_kv_dict(
|
||||||
|
label_dict_path, reverse=True, value_func=int)
|
||||||
|
self.id2label_dict = load_kv_dict(label_dict_path)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocab_size(self):
|
||||||
|
"""vocabulary size"""
|
||||||
|
return max(self.word2id_dict.values()) + 1
|
||||||
|
|
||||||
|
@property
|
||||||
|
def num_labels(self):
|
||||||
|
"""num_labels"""
|
||||||
|
return max(self.label2id_dict.values()) + 1
|
||||||
|
|
||||||
|
def word_to_ids(self, words):
|
||||||
|
"""convert word to word index"""
|
||||||
|
word_ids = []
|
||||||
|
for word in words:
|
||||||
|
if word not in self.word2id_dict:
|
||||||
|
word = "OOV"
|
||||||
|
word_id = self.word2id_dict[word]
|
||||||
|
word_ids.append(word_id)
|
||||||
|
return word_ids
|
||||||
|
|
||||||
|
def label_to_ids(self, labels):
|
||||||
|
"""convert label to label index"""
|
||||||
|
label_ids = []
|
||||||
|
for label in labels:
|
||||||
|
if label not in self.label2id_dict:
|
||||||
|
label = "O"
|
||||||
|
label_id = self.label2id_dict[label]
|
||||||
|
label_ids.append(label_id)
|
||||||
|
return label_ids
|
||||||
|
|
||||||
|
def get_vars(self,str1):
|
||||||
|
words = str1.strip()
|
||||||
|
word_ids = self.word_to_ids(words)
|
||||||
|
return word_ids
|
||||||
|
|
||||||
|
|
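The dictionaries loaded here are plain "id<delimiter>token" files (one entry per line, tab-separated by default), and word_to_ids maps any unseen character to the special OOV entry. A small sketch of the intended call pattern; the sample sentence is arbitrary:

from jieba.lac_small.reader_small import Dataset

dataset = Dataset()                        # loads word.dic / tag.dic next to the module
ids = dataset.get_vars("百度是一家公司")     # one id per character, OOV-mapped if unseen
print(dataset.vocab_size, dataset.num_labels, ids)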
57  jieba/lac_small/tag.dic  (new file)
@@ -0,0 +1,57 @@
0 a-B
1 a-I
2 ad-B
3 ad-I
4 an-B
5 an-I
6 c-B
7 c-I
8 d-B
9 d-I
10 f-B
11 f-I
12 m-B
13 m-I
14 n-B
15 n-I
16 nr-B
17 nr-I
18 ns-B
19 ns-I
20 nt-B
21 nt-I
22 nw-B
23 nw-I
24 nz-B
25 nz-I
26 p-B
27 p-I
28 q-B
29 q-I
30 r-B
31 r-I
32 s-B
33 s-I
34 t-B
35 t-I
36 u-B
37 u-I
38 v-B
39 v-I
40 vd-B
41 vd-I
42 vn-B
43 vn-I
44 w-B
45 w-I
46 xc-B
47 xc-I
48 PER-B
49 PER-I
50 LOC-B
51 LOC-I
52 ORG-B
53 ORG-I
54 TIME-B
55 TIME-I
56 O
142  jieba/lac_small/utils.py  (new file)
@@ -0,0 +1,142 @@
|
|||||||
|
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
util tools
|
||||||
|
"""
|
||||||
|
from __future__ import print_function
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
import paddle.fluid as fluid
|
||||||
|
import io
|
||||||
|
|
||||||
|
|
||||||
|
def str2bool(v):
|
||||||
|
"""
|
||||||
|
argparse does not support True or False in python
|
||||||
|
"""
|
||||||
|
return v.lower() in ("true", "t", "1")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def parse_result(words, crf_decode, dataset):
|
||||||
|
""" parse result """
|
||||||
|
offset_list = (crf_decode.lod())[0]
|
||||||
|
words = np.array(words)
|
||||||
|
crf_decode = np.array(crf_decode)
|
||||||
|
batch_size = len(offset_list) - 1
|
||||||
|
|
||||||
|
for sent_index in range(batch_size):
|
||||||
|
begin, end = offset_list[sent_index], offset_list[sent_index + 1]
|
||||||
|
sent=[]
|
||||||
|
for id in words[begin:end]:
|
||||||
|
if dataset.id2word_dict[str(id[0])]=='OOV':
|
||||||
|
sent.append(' ')
|
||||||
|
else:
|
||||||
|
sent.append(dataset.id2word_dict[str(id[0])])
|
||||||
|
tags = [
|
||||||
|
dataset.id2label_dict[str(id[0])] for id in crf_decode[begin:end]
|
||||||
|
]
|
||||||
|
|
||||||
|
sent_out = []
|
||||||
|
tags_out = []
|
||||||
|
parital_word = ""
|
||||||
|
for ind, tag in enumerate(tags):
|
||||||
|
# for the first word
|
||||||
|
if parital_word == "":
|
||||||
|
parital_word = sent[ind]
|
||||||
|
tags_out.append(tag.split('-')[0])
|
||||||
|
continue
|
||||||
|
|
||||||
|
# for the beginning of word
|
||||||
|
if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
|
||||||
|
sent_out.append(parital_word)
|
||||||
|
tags_out.append(tag.split('-')[0])
|
||||||
|
parital_word = sent[ind]
|
||||||
|
continue
|
||||||
|
|
||||||
|
parital_word += sent[ind]
|
||||||
|
|
||||||
|
# append the last word, except for len(tags)=0
|
||||||
|
if len(sent_out) < len(tags_out):
|
||||||
|
sent_out.append(parital_word)
|
||||||
|
return sent_out,tags_out
|
||||||
|
|
||||||
|
def parse_padding_result(words, crf_decode, seq_lens, dataset):
|
||||||
|
""" parse padding result """
|
||||||
|
words = np.squeeze(words)
|
||||||
|
batch_size = len(seq_lens)
|
||||||
|
|
||||||
|
batch_out = []
|
||||||
|
for sent_index in range(batch_size):
|
||||||
|
|
||||||
|
sent=[]
|
||||||
|
for id in words[begin:end]:
|
||||||
|
if dataset.id2word_dict[str(id[0])]=='OOV':
|
||||||
|
sent.append(' ')
|
||||||
|
else:
|
||||||
|
sent.append(dataset.id2word_dict[str(id[0])])
|
||||||
|
tags = [
|
||||||
|
dataset.id2label_dict[str(id)]
|
||||||
|
for id in crf_decode[sent_index][1:seq_lens[sent_index] - 1]
|
||||||
|
]
|
||||||
|
|
||||||
|
sent_out = []
|
||||||
|
tags_out = []
|
||||||
|
parital_word = ""
|
||||||
|
for ind, tag in enumerate(tags):
|
||||||
|
# for the first word
|
||||||
|
if parital_word == "":
|
||||||
|
parital_word = sent[ind]
|
||||||
|
tags_out.append(tag.split('-')[0])
|
||||||
|
continue
|
||||||
|
|
||||||
|
# for the beginning of word
|
||||||
|
if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
|
||||||
|
sent_out.append(parital_word)
|
||||||
|
tags_out.append(tag.split('-')[0])
|
||||||
|
parital_word = sent[ind]
|
||||||
|
continue
|
||||||
|
|
||||||
|
parital_word += sent[ind]
|
||||||
|
|
||||||
|
# append the last word, except for len(tags)=0
|
||||||
|
if len(sent_out) < len(tags_out):
|
||||||
|
sent_out.append(parital_word)
|
||||||
|
|
||||||
|
batch_out.append([sent_out, tags_out])
|
||||||
|
return batch_out
|
||||||
|
|
||||||
|
|
||||||
|
def init_checkpoint(exe, init_checkpoint_path, main_program):
|
||||||
|
"""
|
||||||
|
Init CheckPoint
|
||||||
|
"""
|
||||||
|
assert os.path.exists(
|
||||||
|
init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
|
||||||
|
|
||||||
|
def existed_persitables(var):
|
||||||
|
"""
|
||||||
|
If existed presitabels
|
||||||
|
"""
|
||||||
|
if not fluid.io.is_persistable(var):
|
||||||
|
return False
|
||||||
|
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
|
||||||
|
|
||||||
|
fluid.io.load_vars(
|
||||||
|
exe,
|
||||||
|
init_checkpoint_path,
|
||||||
|
main_program=main_program,
|
||||||
|
predicate=existed_persitables)
|
||||||
|
|
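parse_result stitches characters back into words using the -B/-I suffix convention from tag.dic: a *-B tag (or an O that follows a non-O) starts a new word, everything else extends the current one, and the trailing buffer is flushed at the end. The same merging rule as a standalone sketch, with made-up characters and tags:

def merge_bi(chars, tags):
    """Merge per-character B/I tags into (word, pos) pairs, mirroring parse_result."""
    words, flags, buf = [], [], ""
    for i, tag in enumerate(tags):
        if buf == "":                                 # first character
            buf = chars[i]
            flags.append(tag.split('-')[0])
            continue
        if tag.endswith("-B") or (tag == "O" and tags[i - 1] != "O"):
            words.append(buf)                         # close the previous word
            flags.append(tag.split('-')[0])
            buf = chars[i]
        else:
            buf += chars[i]                           # continue the current word
    if len(words) < len(flags):                       # flush the last word
        words.append(buf)
    return list(zip(words, flags))

print(merge_bi(list("我爱北京"), ["r-B", "v-B", "ns-B", "ns-I"]))
# [('我', 'r'), ('爱', 'v'), ('北京', 'ns')]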
20940  jieba/lac_small/word.dic  (new file; diff suppressed, too large)
394  jieba/posseg/__init__.py  (Normal file → Executable file)
@@ -1,120 +1,310 @@
|
|||||||
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
|
import pickle
|
||||||
import re
|
import re
|
||||||
import os
|
|
||||||
import viterbi
|
|
||||||
import jieba
|
import jieba
|
||||||
import sys
|
from .viterbi import viterbi
|
||||||
default_encoding = sys.getfilesystemencoding()
|
from .._compat import *
|
||||||
|
|
||||||
def load_model(f_name):
|
PROB_START_P = "prob_start.p"
|
||||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
PROB_TRANS_P = "prob_trans.p"
|
||||||
prob_p_path = os.path.join(_curpath,f_name)
|
PROB_EMIT_P = "prob_emit.p"
|
||||||
if f_name.endswith(".py"):
|
CHAR_STATE_TAB_P = "char_state_tab.p"
|
||||||
return eval(open(prob_p_path,"rb").read())
|
|
||||||
else:
|
re_han_detail = re.compile("([\u4E00-\u9FD5]+)")
|
||||||
result = {}
|
re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
||||||
for line in open(prob_p_path,"rb"):
|
re_han_internal = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
|
||||||
line = line.strip()
|
re_skip_internal = re.compile("(\r\n|\s)")
|
||||||
if line=="":continue
|
|
||||||
word, _, tag = line.split(' ')
|
re_eng = re.compile("[a-zA-Z0-9]+")
|
||||||
result[word.decode('utf-8')]=tag
|
re_num = re.compile("[\.0-9]+")
|
||||||
return result
|
|
||||||
|
re_eng1 = re.compile('^[a-zA-Z0-9]$', re.U)
|
||||||
|
|
||||||
|
|
||||||
prob_start = load_model("prob_start.py")
|
def load_model():
|
||||||
prob_trans = load_model("prob_trans.py")
|
# For Jython
|
||||||
prob_emit = load_model("prob_emit.py")
|
start_p = pickle.load(get_module_res("posseg", PROB_START_P))
|
||||||
char_state_tab = load_model("char_state_tab.py")
|
trans_p = pickle.load(get_module_res("posseg", PROB_TRANS_P))
|
||||||
word_tag_tab = load_model("../dict.txt")
|
emit_p = pickle.load(get_module_res("posseg", PROB_EMIT_P))
|
||||||
|
state = pickle.load(get_module_res("posseg", CHAR_STATE_TAB_P))
|
||||||
|
return state, start_p, trans_p, emit_p
|
||||||
|
|
||||||
|
|
||||||
|
if sys.platform.startswith("java"):
|
||||||
|
char_state_tab_P, start_P, trans_P, emit_P = load_model()
|
||||||
|
else:
|
||||||
|
from .char_state_tab import P as char_state_tab_P
|
||||||
|
from .prob_start import P as start_P
|
||||||
|
from .prob_trans import P as trans_P
|
||||||
|
from .prob_emit import P as emit_P
|
||||||
|
|
||||||
|
|
||||||
class pair(object):
|
class pair(object):
|
||||||
def __init__(self,word,flag):
|
|
||||||
self.word = word
|
|
||||||
self.flag = flag
|
|
||||||
|
|
||||||
def __unicode__(self):
|
def __init__(self, word, flag):
|
||||||
return self.word+u"/"+self.flag
|
self.word = word
|
||||||
|
self.flag = flag
|
||||||
|
|
||||||
def __repr__(self):
|
def __unicode__(self):
|
||||||
return self.__str__()
|
return '%s/%s' % (self.word, self.flag)
|
||||||
|
|
||||||
def __str__(self):
|
def __repr__(self):
|
||||||
return self.__unicode__().encode(default_encoding)
|
return 'pair(%r, %r)' % (self.word, self.flag)
|
||||||
|
|
||||||
def encode(self,arg):
|
def __str__(self):
|
||||||
return self.__unicode__().encode(arg)
|
if PY2:
|
||||||
|
return self.__unicode__().encode(default_encoding)
|
||||||
|
else:
|
||||||
|
return self.__unicode__()
|
||||||
|
|
||||||
def __cut(sentence):
|
def __iter__(self):
|
||||||
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit)
|
return iter((self.word, self.flag))
|
||||||
begin, next = 0,0
|
|
||||||
|
|
||||||
for i,char in enumerate(sentence):
|
def __lt__(self, other):
|
||||||
pos = pos_list[i][0]
|
return self.word < other.word
|
||||||
if pos=='B':
|
|
||||||
begin = i
|
|
||||||
elif pos=='E':
|
|
||||||
yield pair(sentence[begin:i+1], pos_list[i][1])
|
|
||||||
next = i+1
|
|
||||||
elif pos=='S':
|
|
||||||
yield pair(char,pos_list[i][1])
|
|
||||||
next = i+1
|
|
||||||
if next<len(sentence):
|
|
||||||
yield pair(sentence[next:], pos_list[next][1] )
|
|
||||||
|
|
||||||
def __cut_DAG(sentence):
|
def __eq__(self, other):
|
||||||
DAG = jieba.get_DAG(sentence)
|
return isinstance(other, pair) and self.word == other.word and self.flag == other.flag
|
||||||
route ={}
|
|
||||||
jieba.calc(sentence,DAG,0,route=route)
|
|
||||||
x = 0
|
|
||||||
buf =u''
|
|
||||||
N = len(sentence)
|
|
||||||
while x<N:
|
|
||||||
y = route[x][1]+1
|
|
||||||
l_word = sentence[x:y]
|
|
||||||
if y-x==1:
|
|
||||||
buf+= l_word
|
|
||||||
else:
|
|
||||||
if len(buf)>0:
|
|
||||||
if len(buf)==1:
|
|
||||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
|
||||||
buf=u''
|
|
||||||
else:
|
|
||||||
regognized = __cut(buf)
|
|
||||||
for t in regognized:
|
|
||||||
yield t
|
|
||||||
buf=u''
|
|
||||||
yield pair(l_word,word_tag_tab.get(l_word,'x'))
|
|
||||||
x =y
|
|
||||||
|
|
||||||
if len(buf)>0:
|
def __hash__(self):
|
||||||
if len(buf)==1:
|
return hash(self.word)
|
||||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
|
||||||
else:
|
def encode(self, arg):
|
||||||
regognized = __cut(buf)
|
return self.__unicode__().encode(arg)
|
||||||
for t in regognized:
|
|
||||||
yield t
|
|
||||||
|
|
||||||
|
|
||||||
def cut(sentence):
|
class POSTokenizer(object):
|
||||||
if not ( type(sentence) is unicode):
|
|
||||||
try:
|
|
||||||
sentence = sentence.decode('utf-8')
|
|
||||||
except:
|
|
||||||
sentence = sentence.decode('gbk','ignore')
|
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
|
||||||
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
|
||||||
blocks = re_han.split(sentence)
|
|
||||||
|
|
||||||
for blk in blocks:
|
def __init__(self, tokenizer=None):
|
||||||
if re_han.match(blk):
|
self.tokenizer = tokenizer or jieba.Tokenizer()
|
||||||
for word in __cut_DAG(blk):
|
self.load_word_tag(self.tokenizer.get_dict_file())
|
||||||
yield word
|
|
||||||
else:
|
def __repr__(self):
|
||||||
tmp = re_skip.split(blk)
|
return '<POSTokenizer tokenizer=%r>' % self.tokenizer
|
||||||
for x in tmp:
|
|
||||||
if x!="":
|
def __getattr__(self, name):
|
||||||
if re_num.match(x):
|
if name in ('cut_for_search', 'lcut_for_search', 'tokenize'):
|
||||||
yield pair(x,'m')
|
# may be possible?
|
||||||
elif re_eng.match(x):
|
raise NotImplementedError
|
||||||
yield pair(x,'eng')
|
return getattr(self.tokenizer, name)
|
||||||
else:
|
|
||||||
yield pair(x,'x')
|
def initialize(self, dictionary=None):
|
||||||
|
self.tokenizer.initialize(dictionary)
|
||||||
|
self.load_word_tag(self.tokenizer.get_dict_file())
|
||||||
|
|
||||||
|
def load_word_tag(self, f):
|
||||||
|
self.word_tag_tab = {}
|
||||||
|
f_name = resolve_filename(f)
|
||||||
|
for lineno, line in enumerate(f, 1):
|
||||||
|
try:
|
||||||
|
line = line.strip().decode("utf-8")
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
word, _, tag = line.split(" ")
|
||||||
|
self.word_tag_tab[word] = tag
|
||||||
|
except Exception:
|
||||||
|
raise ValueError(
|
||||||
|
'invalid POS dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
def makesure_userdict_loaded(self):
|
||||||
|
if self.tokenizer.user_word_tag_tab:
|
||||||
|
self.word_tag_tab.update(self.tokenizer.user_word_tag_tab)
|
||||||
|
self.tokenizer.user_word_tag_tab = {}
|
||||||
|
|
||||||
|
def __cut(self, sentence):
|
||||||
|
prob, pos_list = viterbi(
|
||||||
|
sentence, char_state_tab_P, start_P, trans_P, emit_P)
|
||||||
|
begin, nexti = 0, 0
|
||||||
|
|
||||||
|
for i, char in enumerate(sentence):
|
||||||
|
pos = pos_list[i][0]
|
||||||
|
if pos == 'B':
|
||||||
|
begin = i
|
||||||
|
elif pos == 'E':
|
||||||
|
yield pair(sentence[begin:i + 1], pos_list[i][1])
|
||||||
|
nexti = i + 1
|
||||||
|
elif pos == 'S':
|
||||||
|
yield pair(char, pos_list[i][1])
|
||||||
|
nexti = i + 1
|
||||||
|
if nexti < len(sentence):
|
||||||
|
yield pair(sentence[nexti:], pos_list[nexti][1])
|
||||||
|
|
||||||
|
def __cut_detail(self, sentence):
|
||||||
|
blocks = re_han_detail.split(sentence)
|
||||||
|
for blk in blocks:
|
||||||
|
if re_han_detail.match(blk):
|
||||||
|
for word in self.__cut(blk):
|
||||||
|
yield word
|
||||||
|
else:
|
||||||
|
tmp = re_skip_detail.split(blk)
|
||||||
|
for x in tmp:
|
||||||
|
if x:
|
||||||
|
if re_num.match(x):
|
||||||
|
yield pair(x, 'm')
|
||||||
|
elif re_eng.match(x):
|
||||||
|
yield pair(x, 'eng')
|
||||||
|
else:
|
||||||
|
yield pair(x, 'x')
|
||||||
|
|
||||||
|
def __cut_DAG_NO_HMM(self, sentence):
|
||||||
|
DAG = self.tokenizer.get_DAG(sentence)
|
||||||
|
route = {}
|
||||||
|
self.tokenizer.calc(sentence, DAG, route)
|
||||||
|
x = 0
|
||||||
|
N = len(sentence)
|
||||||
|
buf = ''
|
||||||
|
while x < N:
|
||||||
|
y = route[x][1] + 1
|
||||||
|
l_word = sentence[x:y]
|
||||||
|
if re_eng1.match(l_word):
|
||||||
|
buf += l_word
|
||||||
|
x = y
|
||||||
|
else:
|
||||||
|
if buf:
|
||||||
|
yield pair(buf, 'eng')
|
||||||
|
buf = ''
|
||||||
|
yield pair(l_word, self.word_tag_tab.get(l_word, 'x'))
|
||||||
|
x = y
|
||||||
|
if buf:
|
||||||
|
yield pair(buf, 'eng')
|
||||||
|
buf = ''
|
||||||
|
|
||||||
|
def __cut_DAG(self, sentence):
|
||||||
|
DAG = self.tokenizer.get_DAG(sentence)
|
||||||
|
route = {}
|
||||||
|
|
||||||
|
self.tokenizer.calc(sentence, DAG, route)
|
||||||
|
|
||||||
|
x = 0
|
||||||
|
buf = ''
|
||||||
|
N = len(sentence)
|
||||||
|
while x < N:
|
||||||
|
y = route[x][1] + 1
|
||||||
|
l_word = sentence[x:y]
|
||||||
|
if y - x == 1:
|
||||||
|
buf += l_word
|
||||||
|
else:
|
||||||
|
if buf:
|
||||||
|
if len(buf) == 1:
|
||||||
|
yield pair(buf, self.word_tag_tab.get(buf, 'x'))
|
||||||
|
elif not self.tokenizer.FREQ.get(buf):
|
||||||
|
recognized = self.__cut_detail(buf)
|
||||||
|
for t in recognized:
|
||||||
|
yield t
|
||||||
|
else:
|
||||||
|
for elem in buf:
|
||||||
|
yield pair(elem, self.word_tag_tab.get(elem, 'x'))
|
||||||
|
buf = ''
|
||||||
|
yield pair(l_word, self.word_tag_tab.get(l_word, 'x'))
|
||||||
|
x = y
|
||||||
|
|
||||||
|
if buf:
|
||||||
|
if len(buf) == 1:
|
||||||
|
yield pair(buf, self.word_tag_tab.get(buf, 'x'))
|
||||||
|
elif not self.tokenizer.FREQ.get(buf):
|
||||||
|
recognized = self.__cut_detail(buf)
|
||||||
|
for t in recognized:
|
||||||
|
yield t
|
||||||
|
else:
|
||||||
|
for elem in buf:
|
||||||
|
yield pair(elem, self.word_tag_tab.get(elem, 'x'))
|
||||||
|
|
||||||
|
def __cut_internal(self, sentence, HMM=True):
|
||||||
|
self.makesure_userdict_loaded()
|
||||||
|
sentence = strdecode(sentence)
|
||||||
|
blocks = re_han_internal.split(sentence)
|
||||||
|
if HMM:
|
||||||
|
cut_blk = self.__cut_DAG
|
||||||
|
else:
|
||||||
|
cut_blk = self.__cut_DAG_NO_HMM
|
||||||
|
|
||||||
|
for blk in blocks:
|
||||||
|
if re_han_internal.match(blk):
|
||||||
|
for word in cut_blk(blk):
|
||||||
|
yield word
|
||||||
|
else:
|
||||||
|
tmp = re_skip_internal.split(blk)
|
||||||
|
for x in tmp:
|
||||||
|
if re_skip_internal.match(x):
|
||||||
|
yield pair(x, 'x')
|
||||||
|
else:
|
||||||
|
for xx in x:
|
||||||
|
if re_num.match(xx):
|
||||||
|
yield pair(xx, 'm')
|
||||||
|
elif re_eng.match(x):
|
||||||
|
yield pair(xx, 'eng')
|
||||||
|
else:
|
||||||
|
yield pair(xx, 'x')
|
||||||
|
|
||||||
|
def _lcut_internal(self, sentence):
|
||||||
|
return list(self.__cut_internal(sentence))
|
||||||
|
|
||||||
|
def _lcut_internal_no_hmm(self, sentence):
|
||||||
|
return list(self.__cut_internal(sentence, False))
|
||||||
|
|
||||||
|
def cut(self, sentence, HMM=True):
|
||||||
|
for w in self.__cut_internal(sentence, HMM=HMM):
|
||||||
|
yield w
|
||||||
|
|
||||||
|
def lcut(self, *args, **kwargs):
|
||||||
|
return list(self.cut(*args, **kwargs))
|
||||||
|
|
||||||
|
|
||||||
|
# default Tokenizer instance
|
||||||
|
|
||||||
|
dt = POSTokenizer(jieba.dt)
|
||||||
|
|
||||||
|
# global functions
|
||||||
|
|
||||||
|
initialize = dt.initialize
|
||||||
|
|
||||||
|
|
||||||
|
def _lcut_internal(s):
|
||||||
|
return dt._lcut_internal(s)
|
||||||
|
|
||||||
|
|
||||||
|
def _lcut_internal_no_hmm(s):
|
||||||
|
return dt._lcut_internal_no_hmm(s)
|
||||||
|
|
||||||
|
|
||||||
|
def cut(sentence, HMM=True, use_paddle=False):
|
||||||
|
"""
|
||||||
|
Global `cut` function that supports parallel processing.
|
||||||
|
|
||||||
|
Note that this only works using dt, custom POSTokenizer
|
||||||
|
instances are not supported.
|
||||||
|
"""
|
||||||
|
is_paddle_installed = check_paddle_install['is_paddle_installed']
|
||||||
|
if use_paddle and is_paddle_installed:
|
||||||
|
# if sentence is null, it will raise core exception in paddle.
|
||||||
|
if sentence is None or sentence == "" or sentence == u"":
|
||||||
|
return
|
||||||
|
import jieba.lac_small.predict as predict
|
||||||
|
sents, tags = predict.get_result(strdecode(sentence))
|
||||||
|
for i, sent in enumerate(sents):
|
||||||
|
if sent is None or tags[i] is None:
|
||||||
|
continue
|
||||||
|
yield pair(sent, tags[i])
|
||||||
|
return
|
||||||
|
global dt
|
||||||
|
if jieba.pool is None:
|
||||||
|
for w in dt.cut(sentence, HMM=HMM):
|
||||||
|
yield w
|
||||||
|
else:
|
||||||
|
parts = strdecode(sentence).splitlines(True)
|
||||||
|
if HMM:
|
||||||
|
result = jieba.pool.map(_lcut_internal, parts)
|
||||||
|
else:
|
||||||
|
result = jieba.pool.map(_lcut_internal_no_hmm, parts)
|
||||||
|
for r in result:
|
||||||
|
for w in r:
|
||||||
|
yield w
|
||||||
|
|
||||||
|
|
||||||
|
def lcut(sentence, HMM=True, use_paddle=False):
|
||||||
|
if use_paddle:
|
||||||
|
return list(cut(sentence, use_paddle=True))
|
||||||
|
return list(cut(sentence, HMM))
|
||||||
|
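After this rewrite the module-level cut/lcut delegate to the default POSTokenizer instance dt, and every yielded result is a pair that unpacks to (word, flag) and hashes/compares by word. A short usage sketch:

import jieba.posseg as pseg

for word, flag in pseg.cut("我爱北京天安门"):   # pair supports tuple-style unpacking
    print(word, flag)

print(pseg.lcut("我爱北京天安门", HMM=False))    # list of pair(word, flag) objects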
335946  jieba/posseg/char_state_tab.p  (new file; diff suppressed, too large)
269408  jieba/posseg/prob_emit.p       (new file; diff suppressed, too large)
155429  jieba/posseg/prob_emit.py      (diff suppressed, too large)
1094    jieba/posseg/prob_start.p      (new file; diff suppressed, too large)
@@ -1,256 +1,256 @@
|
|||||||
{('B', 'a'): 0.008545886571090637,
|
P={('B', 'a'): -4.762305214596967,
|
||||||
('B', 'ad'): 0.0012556950477614949,
|
('B', 'ad'): -6.680066036784177,
|
||||||
('B', 'ag'): 0.0,
|
('B', 'ag'): -3.14e+100,
|
||||||
('B', 'an'): 0.0001670724139577068,
|
('B', 'an'): -8.697083223018778,
|
||||||
('B', 'b'): 0.006615272009801582,
|
('B', 'b'): -5.018374362109218,
|
||||||
('B', 'bg'): 0.0,
|
('B', 'bg'): -3.14e+100,
|
||||||
('B', 'c'): 0.03258575057944956,
|
('B', 'c'): -3.423880184954888,
|
||||||
('B', 'd'): 0.018778408940230508,
|
('B', 'd'): -3.9750475297585357,
|
||||||
('B', 'df'): 0.00013790104009207547,
|
('B', 'df'): -8.888974230828882,
|
||||||
('B', 'dg'): 0.0,
|
('B', 'dg'): -3.14e+100,
|
||||||
('B', 'e'): 0.00019093990166595064,
|
('B', 'e'): -8.563551830394255,
|
||||||
('B', 'en'): 0.0,
|
('B', 'en'): -3.14e+100,
|
||||||
('B', 'f'): 0.004121119544290101,
|
('B', 'f'): -5.491630418482717,
|
||||||
('B', 'g'): 0.0,
|
('B', 'g'): -3.14e+100,
|
||||||
('B', 'h'): 1.3259715393468796e-06,
|
('B', 'h'): -13.533365129970255,
|
||||||
('B', 'i'): 0.0022077426130125543,
|
('B', 'i'): -6.1157847275557105,
|
||||||
('B', 'in'): 0.0,
|
('B', 'in'): -3.14e+100,
|
||||||
('B', 'j'): 0.006360685474246981,
|
('B', 'j'): -5.0576191284681915,
|
||||||
('B', 'jn'): 0.0,
|
('B', 'jn'): -3.14e+100,
|
||||||
('B', 'k'): 0.0,
|
('B', 'k'): -3.14e+100,
|
||||||
('B', 'l'): 0.007402899104173628,
|
('B', 'l'): -4.905883584659895,
|
||||||
('B', 'ln'): 0.0,
|
('B', 'ln'): -3.14e+100,
|
||||||
('B', 'm'): 0.02592804748038888,
|
('B', 'm'): -3.6524299819046386,
|
||||||
('B', 'mg'): 0.0,
|
('B', 'mg'): -3.14e+100,
|
||||||
('B', 'mq'): 0.0011284017799841944,
|
('B', 'mq'): -6.78695300139688,
|
||||||
('B', 'n'): 0.18330097962777328,
|
('B', 'n'): -1.6966257797548328,
|
||||||
('B', 'ng'): 0.0,
|
('B', 'ng'): -3.14e+100,
|
||||||
('B', 'nr'): 0.10741562843095136,
|
('B', 'nr'): -2.2310495913769506,
|
||||||
('B', 'nrfg'): 0.0028123856349547313,
|
('B', 'nrfg'): -5.873722175405573,
|
||||||
('B', 'nrt'): 0.006835383285333164,
|
('B', 'nrt'): -4.985642733519195,
|
||||||
('B', 'ns'): 0.05943667425122387,
|
('B', 'ns'): -2.8228438314969213,
|
||||||
('B', 'nt'): 0.007859033313708954,
|
('B', 'nt'): -4.846091668182416,
|
||||||
('B', 'nz'): 0.0193127754705873,
|
('B', 'nz'): -3.94698846057672,
|
||||||
('B', 'o'): 0.00021745933245288822,
|
('B', 'o'): -8.433498702146057,
|
||||||
('B', 'p'): 0.014980826451541043,
|
('B', 'p'): -4.200984132085048,
|
||||||
('B', 'q'): 0.00091359439061,
|
('B', 'q'): -6.998123858956596,
|
||||||
('B', 'qe'): 0.0,
|
('B', 'qe'): -3.14e+100,
|
||||||
('B', 'qg'): 0.0,
|
('B', 'qg'): -3.14e+100,
|
||||||
('B', 'r'): 0.033047188675142274,
|
('B', 'r'): -3.4098187790818413,
|
||||||
('B', 'rg'): 0.0,
|
('B', 'rg'): -3.14e+100,
|
||||||
('B', 'rr'): 3.977914618040638e-06,
|
('B', 'rr'): -12.434752841302146,
|
||||||
('B', 'rz'): 0.0003540344010056168,
|
('B', 'rz'): -7.946116471570005,
|
||||||
('B', 's'): 0.0039951522480521475,
|
('B', 's'): -5.522673590839954,
|
||||||
('B', 't'): 0.03457072997385184,
|
('B', 't'): -3.3647479094528574,
|
||||||
('B', 'tg'): 0.0,
|
('B', 'tg'): -3.14e+100,
|
||||||
('B', 'u'): 0.00010475175160840347,
|
('B', 'u'): -9.163917277503234,
|
||||||
('B', 'ud'): 0.0,
|
('B', 'ud'): -3.14e+100,
|
||||||
('B', 'ug'): 0.0,
|
('B', 'ug'): -3.14e+100,
|
||||||
('B', 'uj'): 0.0,
|
('B', 'uj'): -3.14e+100,
|
||||||
('B', 'ul'): 0.0,
|
('B', 'ul'): -3.14e+100,
|
||||||
('B', 'uv'): 0.0,
|
('B', 'uv'): -3.14e+100,
|
||||||
('B', 'uz'): 0.0,
|
('B', 'uz'): -3.14e+100,
|
||||||
('B', 'v'): 0.06897173559066729,
|
('B', 'v'): -2.6740584874265685,
|
||||||
('B', 'vd'): 0.00011801146700187228,
|
('B', 'vd'): -9.044728760238115,
|
||||||
('B', 'vg'): 0.0,
|
('B', 'vg'): -3.14e+100,
|
||||||
('B', 'vi'): 3.977914618040638e-06,
|
('B', 'vi'): -12.434752841302146,
|
||||||
('B', 'vn'): 0.01314700781262431,
|
('B', 'vn'): -4.3315610890163585,
|
||||||
('B', 'vq'): 5.303886157387518e-06,
|
('B', 'vq'): -12.147070768850364,
|
||||||
('B', 'w'): 0.0,
|
('B', 'w'): -3.14e+100,
|
||||||
('B', 'x'): 0.0,
|
('B', 'x'): -3.14e+100,
|
||||||
('B', 'y'): 5.303886157387518e-05,
|
('B', 'y'): -9.844485675856319,
|
||||||
('B', 'yg'): 0.0,
|
('B', 'yg'): -3.14e+100,
|
||||||
('B', 'z'): 0.0008711633013508998,
|
('B', 'z'): -7.045681111485645,
|
||||||
('B', 'zg'): 0.0,
|
('B', 'zg'): -3.14e+100,
|
||||||
('E', 'a'): 0.0,
|
('E', 'a'): -3.14e+100,
|
||||||
('E', 'ad'): 0.0,
|
('E', 'ad'): -3.14e+100,
|
||||||
('E', 'ag'): 0.0,
|
('E', 'ag'): -3.14e+100,
|
||||||
('E', 'an'): 0.0,
|
('E', 'an'): -3.14e+100,
|
||||||
('E', 'b'): 0.0,
|
('E', 'b'): -3.14e+100,
|
||||||
('E', 'bg'): 0.0,
|
('E', 'bg'): -3.14e+100,
|
||||||
('E', 'c'): 0.0,
|
('E', 'c'): -3.14e+100,
|
||||||
('E', 'd'): 0.0,
|
('E', 'd'): -3.14e+100,
|
||||||
('E', 'df'): 0.0,
|
('E', 'df'): -3.14e+100,
|
||||||
('E', 'dg'): 0.0,
|
('E', 'dg'): -3.14e+100,
|
||||||
('E', 'e'): 0.0,
|
('E', 'e'): -3.14e+100,
|
||||||
('E', 'en'): 0.0,
|
('E', 'en'): -3.14e+100,
|
||||||
('E', 'f'): 0.0,
|
('E', 'f'): -3.14e+100,
|
||||||
('E', 'g'): 0.0,
|
('E', 'g'): -3.14e+100,
|
||||||
('E', 'h'): 0.0,
|
('E', 'h'): -3.14e+100,
|
||||||
('E', 'i'): 0.0,
|
('E', 'i'): -3.14e+100,
|
||||||
('E', 'in'): 0.0,
|
('E', 'in'): -3.14e+100,
|
||||||
('E', 'j'): 0.0,
|
('E', 'j'): -3.14e+100,
|
||||||
('E', 'jn'): 0.0,
|
('E', 'jn'): -3.14e+100,
|
||||||
('E', 'k'): 0.0,
|
('E', 'k'): -3.14e+100,
|
||||||
('E', 'l'): 0.0,
|
('E', 'l'): -3.14e+100,
|
||||||
('E', 'ln'): 0.0,
|
('E', 'ln'): -3.14e+100,
|
||||||
('E', 'm'): 0.0,
|
('E', 'm'): -3.14e+100,
|
||||||
('E', 'mg'): 0.0,
|
('E', 'mg'): -3.14e+100,
|
||||||
('E', 'mq'): 0.0,
|
('E', 'mq'): -3.14e+100,
|
||||||
('E', 'n'): 0.0,
|
('E', 'n'): -3.14e+100,
|
||||||
('E', 'ng'): 0.0,
|
('E', 'ng'): -3.14e+100,
|
||||||
('E', 'nr'): 0.0,
|
('E', 'nr'): -3.14e+100,
|
||||||
('E', 'nrfg'): 0.0,
|
('E', 'nrfg'): -3.14e+100,
|
||||||
('E', 'nrt'): 0.0,
|
('E', 'nrt'): -3.14e+100,
|
||||||
('E', 'ns'): 0.0,
|
('E', 'ns'): -3.14e+100,
|
||||||
('E', 'nt'): 0.0,
|
('E', 'nt'): -3.14e+100,
|
||||||
('E', 'nz'): 0.0,
|
('E', 'nz'): -3.14e+100,
|
||||||
('E', 'o'): 0.0,
|
('E', 'o'): -3.14e+100,
|
||||||
('E', 'p'): 0.0,
|
('E', 'p'): -3.14e+100,
|
||||||
('E', 'q'): 0.0,
|
('E', 'q'): -3.14e+100,
|
||||||
('E', 'qe'): 0.0,
|
('E', 'qe'): -3.14e+100,
|
||||||
('E', 'qg'): 0.0,
|
('E', 'qg'): -3.14e+100,
|
||||||
('E', 'r'): 0.0,
|
('E', 'r'): -3.14e+100,
|
||||||
('E', 'rg'): 0.0,
|
('E', 'rg'): -3.14e+100,
|
||||||
('E', 'rr'): 0.0,
|
('E', 'rr'): -3.14e+100,
|
||||||
('E', 'rz'): 0.0,
|
('E', 'rz'): -3.14e+100,
|
||||||
('E', 's'): 0.0,
|
('E', 's'): -3.14e+100,
|
||||||
('E', 't'): 0.0,
|
('E', 't'): -3.14e+100,
|
||||||
('E', 'tg'): 0.0,
|
('E', 'tg'): -3.14e+100,
|
||||||
('E', 'u'): 0.0,
|
('E', 'u'): -3.14e+100,
|
||||||
('E', 'ud'): 0.0,
|
('E', 'ud'): -3.14e+100,
|
||||||
('E', 'ug'): 0.0,
|
('E', 'ug'): -3.14e+100,
|
||||||
('E', 'uj'): 0.0,
|
('E', 'uj'): -3.14e+100,
|
||||||
('E', 'ul'): 0.0,
|
('E', 'ul'): -3.14e+100,
|
||||||
('E', 'uv'): 0.0,
|
('E', 'uv'): -3.14e+100,
|
||||||
('E', 'uz'): 0.0,
|
('E', 'uz'): -3.14e+100,
|
||||||
('E', 'v'): 0.0,
|
('E', 'v'): -3.14e+100,
|
||||||
('E', 'vd'): 0.0,
|
('E', 'vd'): -3.14e+100,
|
||||||
('E', 'vg'): 0.0,
|
('E', 'vg'): -3.14e+100,
|
||||||
('E', 'vi'): 0.0,
|
('E', 'vi'): -3.14e+100,
|
||||||
('E', 'vn'): 0.0,
|
('E', 'vn'): -3.14e+100,
|
||||||
('E', 'vq'): 0.0,
|
('E', 'vq'): -3.14e+100,
|
||||||
('E', 'w'): 0.0,
|
('E', 'w'): -3.14e+100,
|
||||||
('E', 'x'): 0.0,
|
('E', 'x'): -3.14e+100,
|
||||||
('E', 'y'): 0.0,
|
('E', 'y'): -3.14e+100,
|
||||||
('E', 'yg'): 0.0,
|
('E', 'yg'): -3.14e+100,
|
||||||
('E', 'z'): 0.0,
|
('E', 'z'): -3.14e+100,
|
||||||
('E', 'zg'): 0.0,
|
('E', 'zg'): -3.14e+100,
|
||||||
('M', 'a'): 0.0,
|
('M', 'a'): -3.14e+100,
|
||||||
('M', 'ad'): 0.0,
|
('M', 'ad'): -3.14e+100,
|
||||||
('M', 'ag'): 0.0,
|
('M', 'ag'): -3.14e+100,
|
||||||
('M', 'an'): 0.0,
|
('M', 'an'): -3.14e+100,
|
||||||
('M', 'b'): 0.0,
|
('M', 'b'): -3.14e+100,
|
||||||
('M', 'bg'): 0.0,
|
('M', 'bg'): -3.14e+100,
|
||||||
('M', 'c'): 0.0,
|
('M', 'c'): -3.14e+100,
|
||||||
('M', 'd'): 0.0,
|
('M', 'd'): -3.14e+100,
|
||||||
('M', 'df'): 0.0,
|
('M', 'df'): -3.14e+100,
|
||||||
('M', 'dg'): 0.0,
|
('M', 'dg'): -3.14e+100,
|
||||||
('M', 'e'): 0.0,
|
('M', 'e'): -3.14e+100,
|
||||||
('M', 'en'): 0.0,
|
('M', 'en'): -3.14e+100,
|
||||||
('M', 'f'): 0.0,
|
('M', 'f'): -3.14e+100,
|
||||||
('M', 'g'): 0.0,
|
('M', 'g'): -3.14e+100,
|
||||||
('M', 'h'): 0.0,
|
('M', 'h'): -3.14e+100,
|
||||||
('M', 'i'): 0.0,
|
('M', 'i'): -3.14e+100,
|
||||||
('M', 'in'): 0.0,
|
('M', 'in'): -3.14e+100,
|
||||||
('M', 'j'): 0.0,
|
('M', 'j'): -3.14e+100,
|
||||||
('M', 'jn'): 0.0,
|
('M', 'jn'): -3.14e+100,
|
||||||
('M', 'k'): 0.0,
|
('M', 'k'): -3.14e+100,
|
||||||
('M', 'l'): 0.0,
|
('M', 'l'): -3.14e+100,
|
||||||
('M', 'ln'): 0.0,
|
('M', 'ln'): -3.14e+100,
|
||||||
('M', 'm'): 0.0,
|
('M', 'm'): -3.14e+100,
|
||||||
('M', 'mg'): 0.0,
|
('M', 'mg'): -3.14e+100,
|
||||||
('M', 'mq'): 0.0,
|
('M', 'mq'): -3.14e+100,
|
||||||
('M', 'n'): 0.0,
|
('M', 'n'): -3.14e+100,
|
||||||
('M', 'ng'): 0.0,
|
('M', 'ng'): -3.14e+100,
|
||||||
('M', 'nr'): 0.0,
|
('M', 'nr'): -3.14e+100,
|
||||||
('M', 'nrfg'): 0.0,
|
('M', 'nrfg'): -3.14e+100,
|
||||||
('M', 'nrt'): 0.0,
|
('M', 'nrt'): -3.14e+100,
|
||||||
('M', 'ns'): 0.0,
|
('M', 'ns'): -3.14e+100,
|
||||||
('M', 'nt'): 0.0,
|
('M', 'nt'): -3.14e+100,
|
||||||
('M', 'nz'): 0.0,
|
('M', 'nz'): -3.14e+100,
|
||||||
('M', 'o'): 0.0,
|
('M', 'o'): -3.14e+100,
|
||||||
('M', 'p'): 0.0,
|
('M', 'p'): -3.14e+100,
|
||||||
('M', 'q'): 0.0,
|
('M', 'q'): -3.14e+100,
|
||||||
('M', 'qe'): 0.0,
|
('M', 'qe'): -3.14e+100,
|
||||||
('M', 'qg'): 0.0,
|
('M', 'qg'): -3.14e+100,
|
||||||
('M', 'r'): 0.0,
|
('M', 'r'): -3.14e+100,
|
||||||
('M', 'rg'): 0.0,
|
('M', 'rg'): -3.14e+100,
|
||||||
('M', 'rr'): 0.0,
|
('M', 'rr'): -3.14e+100,
|
||||||
('M', 'rz'): 0.0,
|
('M', 'rz'): -3.14e+100,
|
||||||
('M', 's'): 0.0,
|
('M', 's'): -3.14e+100,
|
||||||
('M', 't'): 0.0,
|
('M', 't'): -3.14e+100,
|
||||||
('M', 'tg'): 0.0,
|
('M', 'tg'): -3.14e+100,
|
||||||
('M', 'u'): 0.0,
|
('M', 'u'): -3.14e+100,
|
||||||
('M', 'ud'): 0.0,
|
('M', 'ud'): -3.14e+100,
|
||||||
('M', 'ug'): 0.0,
|
('M', 'ug'): -3.14e+100,
|
||||||
('M', 'uj'): 0.0,
|
('M', 'uj'): -3.14e+100,
|
||||||
('M', 'ul'): 0.0,
|
('M', 'ul'): -3.14e+100,
|
||||||
('M', 'uv'): 0.0,
|
('M', 'uv'): -3.14e+100,
|
||||||
('M', 'uz'): 0.0,
|
('M', 'uz'): -3.14e+100,
|
||||||
('M', 'v'): 0.0,
|
('M', 'v'): -3.14e+100,
|
||||||
('M', 'vd'): 0.0,
|
('M', 'vd'): -3.14e+100,
|
||||||
('M', 'vg'): 0.0,
|
('M', 'vg'): -3.14e+100,
|
||||||
('M', 'vi'): 0.0,
|
('M', 'vi'): -3.14e+100,
|
||||||
('M', 'vn'): 0.0,
|
('M', 'vn'): -3.14e+100,
|
||||||
('M', 'vq'): 0.0,
|
('M', 'vq'): -3.14e+100,
|
||||||
('M', 'w'): 0.0,
|
('M', 'w'): -3.14e+100,
|
||||||
('M', 'x'): 0.0,
|
('M', 'x'): -3.14e+100,
|
||||||
('M', 'y'): 0.0,
|
('M', 'y'): -3.14e+100,
|
||||||
('M', 'yg'): 0.0,
|
('M', 'yg'): -3.14e+100,
|
||||||
('M', 'z'): 0.0,
|
('M', 'z'): -3.14e+100,
|
||||||
('M', 'zg'): 0.0,
|
('M', 'zg'): -3.14e+100,
|
||||||
('S', 'a'): 0.020190568629634933,
|
('S', 'a'): -3.9025396831295227,
|
||||||
('S', 'ad'): 1.5911658472162552e-05,
|
('S', 'ad'): -11.048458480182255,
|
||||||
('S', 'ag'): 0.0009546995083297532,
|
('S', 'ag'): -6.954113917960154,
|
||||||
('S', 'an'): 2.651943078693759e-06,
|
('S', 'an'): -12.84021794941031,
|
||||||
('S', 'b'): 0.0015447568433391145,
|
('S', 'b'): -6.472888763970454,
|
||||||
('S', 'bg'): 0.0,
|
('S', 'bg'): -3.14e+100,
|
||||||
('S', 'c'): 0.008337709039413178,
|
('S', 'c'): -4.786966795861212,
|
||||||
('S', 'd'): 0.020162723227308648,
|
('S', 'd'): -3.903919764181873,
|
||||||
('S', 'df'): 0.0,
|
('S', 'df'): -3.14e+100,
|
||||||
('S', 'dg'): 0.0001299452108559942,
|
('S', 'dg'): -8.948397651299683,
|
||||||
('S', 'e'): 0.0026254236479068215,
|
('S', 'e'): -5.942513006281674,
|
||||||
('S', 'en'): 0.0,
|
('S', 'en'): -3.14e+100,
|
||||||
('S', 'f'): 0.0055452129775486496,
|
('S', 'f'): -5.194820249981676,
|
||||||
('S', 'g'): 0.0014917179817652395,
|
('S', 'g'): -6.507826815331734,
|
||||||
('S', 'h'): 0.00017502824319378808,
|
('S', 'h'): -8.650563207383884,
|
||||||
('S', 'i'): 0.0,
|
('S', 'i'): -3.14e+100,
|
||||||
('S', 'in'): 0.0,
|
('S', 'in'): -3.14e+100,
|
||||||
('S', 'j'): 0.007357816071835834,
|
('S', 'j'): -4.911992119644354,
|
||||||
('S', 'jn'): 0.0,
|
('S', 'jn'): -3.14e+100,
|
||||||
('S', 'k'): 0.000967959223723222,
|
('S', 'k'): -6.940320595827818,
|
||||||
('S', 'l'): 0.0,
|
('S', 'l'): -3.14e+100,
|
||||||
('S', 'ln'): 0.0,
|
('S', 'ln'): -3.14e+100,
|
||||||
('S', 'm'): 0.038036819577704585,
|
('S', 'm'): -3.269200652116097,
|
||||||
('S', 'mg'): 1.988957309020319e-05,
|
('S', 'mg'): -10.825314928868044,
|
||||||
('S', 'mq'): 0.0,
|
('S', 'mq'): -3.14e+100,
|
||||||
('S', 'n'): 0.021170461597212278,
|
('S', 'n'): -3.8551483897645107,
|
||||||
('S', 'ng'): 0.007347208299521059,
|
('S', 'ng'): -4.913434861102905,
|
||||||
('S', 'nr'): 0.011291973629078026,
|
('S', 'nr'): -4.483663103956885,
|
||||||
('S', 'nrfg'): 0.0,
|
('S', 'nrfg'): -3.14e+100,
|
||||||
('S', 'nrt'): 0.0,
|
('S', 'nrt'): -3.14e+100,
|
||||||
('S', 'ns'): 0.0,
|
('S', 'ns'): -3.14e+100,
|
||||||
('S', 'nt'): 5.303886157387518e-06,
|
('S', 'nt'): -12.147070768850364,
|
||||||
('S', 'nz'): 0.0,
|
('S', 'nz'): -3.14e+100,
|
||||||
('S', 'o'): 0.00021082947475615385,
|
('S', 'o'): -8.464460927750023,
|
||||||
('S', 'p'): 0.05044658721445203,
|
('S', 'p'): -2.9868401813596317,
|
||||||
('S', 'q'): 0.007531518343490275,
|
('S', 'q'): -4.888658618255058,
|
||||||
('S', 'qe'): 0.0,
|
('S', 'qe'): -3.14e+100,
|
||||||
('S', 'qg'): 0.0,
|
('S', 'qg'): -3.14e+100,
|
||||||
('S', 'r'): 0.06306851029749498,
|
('S', 'r'): -2.7635336784127853,
|
||||||
('S', 'rg'): 3.447526002301887e-05,
|
('S', 'rg'): -10.275268591948773,
|
||||||
('S', 'rr'): 0.0,
|
('S', 'rr'): -3.14e+100,
|
||||||
('S', 'rz'): 0.0,
|
('S', 'rz'): -3.14e+100,
|
||||||
('S', 's'): 0.0,
|
('S', 's'): -3.14e+100,
|
||||||
('S', 't'): 0.0,
|
('S', 't'): -3.14e+100,
|
||||||
('S', 'tg'): 0.0018868575004906095,
|
('S', 'tg'): -6.272842531880403,
|
||||||
('S', 'u'): 0.000967959223723222,
|
('S', 'u'): -6.940320595827818,
|
||||||
('S', 'ud'): 0.000440222551063164,
|
('S', 'ud'): -7.728230161053767,
|
||||||
('S', 'ug'): 0.0005317145872780986,
|
('S', 'ug'): -7.5394037026636855,
|
||||||
('S', 'uj'): 0.001056799316859463,
|
('S', 'uj'): -6.85251045118004,
|
||||||
('S', 'ul'): 0.00022143724707092888,
|
('S', 'ul'): -8.4153713175535,
|
||||||
('S', 'uv'): 0.00028640985249892595,
|
('S', 'uv'): -8.15808672228609,
|
||||||
('S', 'uz'): 9.149203621493468e-05,
|
('S', 'uz'): -9.299258625372996,
|
||||||
('S', 'v'): 0.04720326082920956,
|
('S', 'v'): -3.053292303412302,
|
||||||
('S', 'vd'): 0.0,
|
('S', 'vd'): -3.14e+100,
|
||||||
('S', 'vg'): 0.0026240976763674743,
|
('S', 'vg'): -5.9430181843676895,
|
||||||
('S', 'vi'): 0.0,
|
('S', 'vi'): -3.14e+100,
|
||||||
('S', 'vn'): 1.0607772314775036e-05,
|
('S', 'vn'): -11.453923588290419,
|
||||||
('S', 'vq'): 0.0,
|
('S', 'vq'): -3.14e+100,
|
||||||
('S', 'w'): 0.0,
|
('S', 'w'): -3.14e+100,
|
||||||
('S', 'x'): 0.0002187853039922351,
|
('S', 'x'): -8.427419656069674,
|
||||||
('S', 'y'): 0.00203536631289746,
|
('S', 'y'): -6.1970794699489575,
|
||||||
('S', 'yg'): 1.3259715393468796e-06,
|
('S', 'yg'): -13.533365129970255,
|
||||||
('S', 'z'): 0.0,
|
('S', 'z'): -3.14e+100,
|
||||||
('S', 'zg'): 0.0}
|
('S', 'zg'): -3.14e+100}
|
||||||
|
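The regenerated start-probability table stores log probabilities instead of raw probabilities, with -3.14e+100 standing in for log(0) so that impossible (state, tag) pairs can never win the Viterbi max. A sketch of the conversion (an illustration, not the script that actually generated the table):

import math

MIN_FLOAT = -3.14e100   # same "practically minus infinity" sentinel used in viterbi.py

def to_log(prob_table):
    """Map raw probabilities to log space, flooring zero probabilities at MIN_FLOAT."""
    return {key: (math.log(p) if p > 0 else MIN_FLOAT) for key, p in prob_table.items()}

# e.g. 0.008545886571090637 -> about -4.7623, matching the ('B', 'a') entry above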
11530  jieba/posseg/prob_trans.p  (new file; diff suppressed, too large)
(another file's diff suppressed because it is too large)
@@ -1,42 +1,53 @@
|
|||||||
|
import sys
|
||||||
import operator
|
import operator
|
||||||
|
MIN_FLOAT = -3.14e100
|
||||||
|
MIN_INF = float("-inf")
|
||||||
|
|
||||||
|
if sys.version_info[0] > 2:
|
||||||
|
xrange = range
|
||||||
|
|
||||||
|
|
||||||
|
def get_top_states(t_state_v, K=4):
|
||||||
|
return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]
|
||||||
|
|
||||||
def get_top_states(t_state_v,K=4):
|
|
||||||
items = t_state_v.items()
|
|
||||||
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
|
||||||
return [x[0] for x in topK]
|
|
||||||
|
|
||||||
def viterbi(obs, states, start_p, trans_p, emit_p):
|
def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||||
V = [{}] #tabular
|
V = [{}] # tabular
|
||||||
mem_path = [{}]
|
mem_path = [{}]
|
||||||
all_states = trans_p.keys()
|
all_states = trans_p.keys()
|
||||||
for y in states.get(obs[0],all_states): #init
|
for y in states.get(obs[0], all_states): # init
|
||||||
V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
|
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
|
||||||
mem_path[0][y] = ''
|
mem_path[0][y] = ''
|
||||||
for t in range(1,len(obs)):
|
for t in xrange(1, len(obs)):
|
||||||
V.append({})
|
V.append({})
|
||||||
mem_path.append({})
|
mem_path.append({})
|
||||||
prev_states = get_top_states(V[t-1])
|
#prev_states = get_top_states(V[t-1])
|
||||||
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
prev_states = [
|
||||||
|
x for x in mem_path[t - 1].keys() if len(trans_p[x]) > 0]
|
||||||
|
|
||||||
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
prev_states_expect_next = set(
|
||||||
obs_states = states.get(obs[t],all_states)
|
(y for x in prev_states for y in trans_p[x].keys()))
|
||||||
obs_states = set(obs_states) & set(prev_states_expect_next)
|
obs_states = set(
|
||||||
|
states.get(obs[t], all_states)) & prev_states_expect_next
|
||||||
|
|
||||||
if len(obs_states)==0: obs_states = all_states
|
if not obs_states:
|
||||||
for y in obs_states:
|
obs_states = prev_states_expect_next if prev_states_expect_next else all_states
|
||||||
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
|
|
||||||
V[t][y] =prob
|
|
||||||
mem_path[t][y] = state
|
|
||||||
|
|
||||||
last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
|
for y in obs_states:
|
||||||
#if len(last)==0:
|
prob, state = max((V[t - 1][y0] + trans_p[y0].get(y, MIN_INF) +
|
||||||
#print obs
|
emit_p[y].get(obs[t], MIN_FLOAT), y0) for y0 in prev_states)
|
||||||
(prob, state) = max(last)
|
V[t][y] = prob
|
||||||
|
mem_path[t][y] = state
|
||||||
|
|
||||||
route = [None] * len(obs)
|
last = [(V[-1][y], y) for y in mem_path[-1].keys()]
|
||||||
i = len(obs)-1
|
# if len(last)==0:
|
||||||
while i>=0:
|
# print obs
|
||||||
route[i] = state
|
prob, state = max(last)
|
||||||
state = mem_path[i][state]
|
|
||||||
i-=1
|
route = [None] * len(obs)
|
||||||
return (prob, route)
|
i = len(obs) - 1
|
||||||
|
while i >= 0:
|
||||||
|
route[i] = state
|
||||||
|
state = mem_path[i][state]
|
||||||
|
i -= 1
|
||||||
|
return (prob, route)
|
||||||
|
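The rewritten viterbi works entirely in log space: scores are added instead of multiplied, a missing transition falls back to MIN_INF and a missing emission to MIN_FLOAT, which avoids the underflow the old probability products suffered on long inputs. A toy call with hypothetical two-state tables, just to show the expected argument shapes:

from jieba.posseg.viterbi import viterbi

# hypothetical HMM over two (state, tag) pairs; all values are log probabilities
start_p = {('B', 'x'): -1.0, ('S', 'x'): -1.5}
trans_p = {('B', 'x'): {('S', 'x'): -0.5}, ('S', 'x'): {('B', 'x'): -0.7}}
emit_p = {('B', 'x'): {'中': -0.3}, ('S', 'x'): {'国': -0.4}}
states = {}   # no per-character restriction -> every state is considered

prob, route = viterbi('中国', states, start_p, trans_p, emit_p)
print(prob, route)   # best log score and one (state, tag) per character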
84  setup.py
@@ -1,11 +1,75 @@
|
|||||||
from distutils.core import setup
|
# -*- coding: utf-8 -*-
|
||||||
setup(name='jieba',
|
from distutils.core import setup
|
||||||
version='0.22',
|
LONGDOC = """
|
||||||
description='Chinese Words Segementation Utilities',
|
jieba
|
||||||
author='Sun, Junyi',
|
=====
|
||||||
author_email='ccnusjy@gmail.com',
|
|
||||||
url='http://github.com/fxsjy',
|
“结巴”中文分词:做最好的 Python 中文分词组件
|
||||||
packages=['jieba'],
|
|
||||||
|
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to
|
||||||
|
be the best Python Chinese word segmentation module.
|
||||||
|
|
||||||
|
完整文档见 ``README.md``
|
||||||
|
|
||||||
|
GitHub: https://github.com/fxsjy/jieba
|
||||||
|
|
||||||
|
特点
|
||||||
|
====
|
||||||
|
|
||||||
|
- 支持三种分词模式:
|
||||||
|
|
||||||
|
- 精确模式,试图将句子最精确地切开,适合文本分析;
|
||||||
|
- 全模式,把句子中所有的可以成词的词语都扫描出来,
|
||||||
|
速度非常快,但是不能解决歧义;
|
||||||
|
- 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
|
||||||
|
|
||||||
|
- 支持繁体分词
|
||||||
|
- 支持自定义词典
|
||||||
|
- MIT 授权协议
|
||||||
|
|
||||||
|
在线演示: http://jiebademo.ap01.aws.af.cm/
|
||||||
|
|
||||||
|
安装说明
|
||||||
|
========
|
||||||
|
|
||||||
|
代码对 Python 2/3 均兼容
|
||||||
|
|
||||||
|
- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba`` / ``pip3 install jieba``
|
||||||
|
- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
|
||||||
|
python setup.py install
|
||||||
|
- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
|
||||||
|
- 通过 ``import jieba`` 来引用
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
setup(name='jieba',
|
||||||
|
version='0.42.1',
|
||||||
|
description='Chinese Words Segmentation Utilities',
|
||||||
|
long_description=LONGDOC,
|
||||||
|
author='Sun, Junyi',
|
||||||
|
author_email='ccnusjy@gmail.com',
|
||||||
|
url='https://github.com/fxsjy/jieba',
|
||||||
|
license="MIT",
|
||||||
|
classifiers=[
|
||||||
|
'Intended Audience :: Developers',
|
||||||
|
'License :: OSI Approved :: MIT License',
|
||||||
|
'Operating System :: OS Independent',
|
||||||
|
'Natural Language :: Chinese (Simplified)',
|
||||||
|
'Natural Language :: Chinese (Traditional)',
|
||||||
|
'Programming Language :: Python',
|
||||||
|
'Programming Language :: Python :: 2',
|
||||||
|
'Programming Language :: Python :: 2.6',
|
||||||
|
'Programming Language :: Python :: 2.7',
|
||||||
|
'Programming Language :: Python :: 3',
|
||||||
|
'Programming Language :: Python :: 3.2',
|
||||||
|
'Programming Language :: Python :: 3.3',
|
||||||
|
'Programming Language :: Python :: 3.4',
|
||||||
|
'Topic :: Text Processing',
|
||||||
|
'Topic :: Text Processing :: Indexing',
|
||||||
|
'Topic :: Text Processing :: Linguistic',
|
||||||
|
],
|
||||||
|
keywords='NLP,tokenizing,Chinese word segementation',
|
||||||
|
packages=['jieba'],
|
||||||
package_dir={'jieba':'jieba'},
|
package_dir={'jieba':'jieba'},
|
||||||
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
|
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*', 'lac_small/*.py','lac_small/*.dic', 'lac_small/model_baseline/*']}
|
||||||
)
|
)
|
||||||
|
81  test/demo.py
@@ -1,17 +1,84 @@
|
|||||||
#encoding=utf-8
|
#encoding=utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
import sys
|
import sys
|
||||||
sys.path.append("../")
|
sys.path.append("../")
|
||||||
|
|
||||||
import jieba
|
import jieba
|
||||||
|
import jieba.posseg
|
||||||
|
import jieba.analyse
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
|
print('='*40)
|
||||||
print "Full Mode:", "/ ".join(seg_list) #全模式
|
print('1. 分词')
|
||||||
|
print('-'*40)
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
|
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
|
||||||
print "Default Mode:", "/ ".join(seg_list) #默认模式
|
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
|
||||||
|
|
||||||
|
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
|
||||||
|
print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
|
||||||
|
|
||||||
seg_list = jieba.cut("他来到了网易杭研大厦")
|
seg_list = jieba.cut("他来到了网易杭研大厦")
|
||||||
print ", ".join(seg_list)
|
print(", ".join(seg_list))
|
||||||
|
|
||||||
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
|
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
|
||||||
print ", ".join(seg_list)
|
print(", ".join(seg_list))
|
||||||
|
|
||||||
|
print('='*40)
|
||||||
|
print('2. 添加自定义词典/调整词典')
|
||||||
|
print('-'*40)
|
||||||
|
|
||||||
|
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
|
||||||
|
#如果/放到/post/中将/出错/。
|
||||||
|
print(jieba.suggest_freq(('中', '将'), True))
|
||||||
|
#494
|
||||||
|
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
|
||||||
|
#如果/放到/post/中/将/出错/。
|
||||||
|
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
|
||||||
|
#「/台/中/」/正确/应该/不会/被/切开
|
||||||
|
print(jieba.suggest_freq('台中', True))
|
||||||
|
#69
|
||||||
|
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
|
||||||
|
#「/台中/」/正确/应该/不会/被/切开
|
||||||
|
|
||||||
|
print('='*40)
|
||||||
|
print('3. 关键词提取')
|
||||||
|
print('-'*40)
|
||||||
|
print(' TF-IDF')
|
||||||
|
print('-'*40)
|
||||||
|
|
||||||
|
s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
|
||||||
|
for x, w in jieba.analyse.extract_tags(s, withWeight=True):
|
||||||
|
print('%s %s' % (x, w))
|
||||||
|
|
||||||
|
print('-'*40)
|
||||||
|
print(' TextRank')
|
||||||
|
print('-'*40)
|
||||||
|
|
||||||
|
for x, w in jieba.analyse.textrank(s, withWeight=True):
|
||||||
|
print('%s %s' % (x, w))
|
||||||
|
|
||||||
|
print('='*40)
|
||||||
|
print('4. 词性标注')
|
||||||
|
print('-'*40)
|
||||||
|
|
||||||
|
words = jieba.posseg.cut("我爱北京天安门")
|
||||||
|
for word, flag in words:
|
||||||
|
print('%s %s' % (word, flag))
|
||||||
|
|
||||||
|
print('='*40)
|
||||||
|
print('6. Tokenize: 返回词语在原文的起止位置')
|
||||||
|
print('-'*40)
|
||||||
|
print(' 默认模式')
|
||||||
|
print('-'*40)
|
||||||
|
|
||||||
|
result = jieba.tokenize('永和服装饰品有限公司')
|
||||||
|
for tk in result:
|
||||||
|
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
|
||||||
|
|
||||||
|
print('-'*40)
|
||||||
|
print(' 搜索模式')
|
||||||
|
print('-'*40)
|
||||||
|
|
||||||
|
result = jieba.tokenize('永和服装饰品有限公司', mode='search')
|
||||||
|
for tk in result:
|
||||||
|
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
|
||||||
|
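The new demo.py above drives everything through generators and str.join; for quick interactive checks jieba also exposes list-returning variants (lcut, lcut_for_search, posseg.lcut). A minimal sketch, not part of this diff, assuming the same package layout:

# Hedged sketch (not in the diff): list-returning helpers that mirror the
# generator-based calls used in test/demo.py.
import jieba
import jieba.posseg as pseg

print(jieba.lcut("我来到北京清华大学"))                        # default (accurate) mode, returns a list
print(jieba.lcut("我来到北京清华大学", cut_all=True))          # full mode
print(jieba.lcut_for_search("小明硕士毕业于中国科学院计算所"))  # search-engine mode
print(pseg.lcut("我爱北京天安门"))                             # list of pair(word, flag) objects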
@@ -5,29 +5,26 @@ import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()


if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

content = open(file_name, 'rb').read()

tags = jieba.analyse.extract_tags(content, topK=topK)

print(",".join(tags))
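extract_tags.py reads the whole file and prints the top-k keywords; the same call also accepts an allowPOS filter. A hedged sketch of programmatic use (the sample sentence is borrowed from demo.py; the POS whitelist is an illustrative choice, not something this script sets):

# Hedged sketch: keyword extraction restricted to nouns and proper nouns.
import jieba.analyse

text = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元。"
for kw in jieba.analyse.extract_tags(text, topK=5, allowPOS=('n', 'nr', 'ns')):
    print(kw)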
test/extract_tags_idfpath.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import sys
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_idfpath.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()


if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

content = open(file_name, 'rb').read()

jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");

tags = jieba.analyse.extract_tags(content, topK=topK)

print(",".join(tags))
test/extract_tags_stop_words.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import sys
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()


if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

content = open(file_name, 'rb').read()

jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");

tags = jieba.analyse.extract_tags(content, topK=topK)

print(",".join(tags))
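Both scripts point jieba.analyse at files under ../extra_dict. A hedged sketch of supplying your own files at runtime; the formats are assumptions based on the shipped extra_dict files (one "word idf_value" pair per line for the IDF file, one word per line for the stop-word list), and the file names are illustrative:

# Hedged sketch: building a tiny custom IDF file and stop-word list on the fly.
import jieba.analyse

with open("my_idf.txt", "w", encoding="utf-8") as f:
    f.write("置业 8.5\n增资 9.1\n")            # assumed format: word <space> idf value
with open("my_stop_words.txt", "w", encoding="utf-8") as f:
    f.write("此外\n目前\n")                    # assumed format: one stop word per line

jieba.analyse.set_idf_path("my_idf.txt")
jieba.analyse.set_stop_words("my_stop_words.txt")
print(jieba.analyse.extract_tags("此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元", topK=5))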
test/extract_tags_with_weight.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import sys
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
parser.add_option("-w", dest="withWeight")
opt, args = parser.parse_args()


if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

if opt.withWeight is None:
    withWeight = False
else:
    if int(opt.withWeight) is 1:
        withWeight = True
    else:
        withWeight = False

content = open(file_name, 'rb').read()

tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)

if withWeight is True:
    for tag in tags:
        print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
else:
    print(",".join(tags))
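With -w 1 the script passes withWeight=True, in which case extract_tags returns (keyword, weight) pairs already sorted by weight instead of bare strings. A short hedged sketch of the same call used directly:

# Hedged sketch: withWeight=True yields (keyword, weight) tuples.
import jieba.analyse

s = "吉林欧亚置业注册资本由7000万元增加到5亿元"
for word, weight in jieba.analyse.extract_tags(s, topK=5, withWeight=True):
    print("%-10s %.4f" % (word, weight))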
test/extract_topic.py (new file, 63 lines)
@@ -0,0 +1,63 @@
import sys
sys.path.append("../")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition

import jieba
import time
import glob
import sys
import os
import random

if len(sys.argv)<2:
    print("usage: extract_topic.py directory [n_topic] [n_top_words]")
    sys.exit(0)

n_topic = 10
n_top_words = 25

if len(sys.argv)>2:
    n_topic = int(sys.argv[2])

if len(sys.argv)>3:
    n_top_words = int(sys.argv[3])

count_vect = CountVectorizer()
docs = []

pattern = os.path.join(sys.argv[1],"*.txt")
print("read "+pattern)

for f_name in glob.glob(pattern):
    with open(f_name) as f:
        print("read file:", f_name)
        for line in f:  # one line as a document
            words = " ".join(jieba.cut(line))
            docs.append(words)

random.shuffle(docs)

print("read done.")

print("transform")
counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)
print(tfidf.shape)


t0 = time.time()
print("training...")

nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))

# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print("")
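extract_topic.py is written against an older scikit-learn API; on recent releases (1.2 and later) CountVectorizer.get_feature_names was removed in favour of get_feature_names_out. A self-contained hedged sketch of the same jieba, tf-idf, NMF pipeline under that assumption (the three toy documents and the init/max_iter choices are illustrative):

# Hedged sketch for newer scikit-learn (>=1.2): jieba-tokenised documents -> tf-idf -> NMF topics.
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF

docs = [" ".join(jieba.cut(t)) for t in ["我爱北京天安门", "吉林欧亚置业注册资本增加", "公司实现营业收入"]]
count_vect = CountVectorizer()
tfidf = TfidfTransformer().fit_transform(count_vect.fit_transform(docs))
nmf = NMF(n_components=2, init='nndsvda', max_iter=400).fit(tfidf)
feature_names = count_vect.get_feature_names_out()   # replaces get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
    top = [feature_names[i] for i in topic.argsort()[:-6:-1]]
    print("Topic #%d: %s" % (topic_idx, " ".join(top)))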
test/foobar.txt (new file, 1 line)
@@ -0,0 +1 @@
好人 12 n
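foobar.txt is the one-line dictionary used by the tests below; each line follows the "word frequency POS" layout that jieba.set_dictionary and jieba.load_userdict accept (frequency and POS are optional for user dictionaries). A hedged sketch, where the file name and the second entry are illustrative:

# Hedged sketch: writing and loading a user dictionary in the same format as test/foobar.txt.
import jieba

with open("userdict_demo.txt", "w", encoding="utf-8") as f:
    f.write("好人 12 n\n")        # word  frequency  POS tag
    f.write("网易杭研 5 nt\n")    # illustrative extra entry

jieba.load_userdict("userdict_demo.txt")
print("/".join(jieba.cut("他是个好人,后来去了网易杭研大厦")))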
test/jieba_test.py (new file, 205 lines)
@@ -0,0 +1,205 @@
|
|||||||
|
#-*-coding: utf-8 -*-
|
||||||
|
from __future__ import unicode_literals, print_function
|
||||||
|
import sys
|
||||||
|
sys.path.append("../")
|
||||||
|
import unittest
|
||||||
|
import types
|
||||||
|
import jieba
|
||||||
|
if sys.version_info[0] > 2:
|
||||||
|
from imp import reload
|
||||||
|
|
||||||
|
jieba.initialize()
|
||||||
|
|
||||||
|
|
||||||
|
test_contents = [
|
||||||
|
"这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
|
||||||
|
"我不喜欢日本和服。",
|
||||||
|
"雷猴回归人间。",
|
||||||
|
"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
|
||||||
|
"我需要廉租房",
|
||||||
|
"永和服装饰品有限公司",
|
||||||
|
"我爱北京天安门",
|
||||||
|
"abc",
|
||||||
|
"隐马尔可夫",
|
||||||
|
"雷猴是个好网站",
|
||||||
|
"“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成",
|
||||||
|
"草泥马和欺实马是今年的流行词汇",
|
||||||
|
"伊藤洋华堂总府店",
|
||||||
|
"中国科学院计算技术研究所",
|
||||||
|
"罗密欧与朱丽叶",
|
||||||
|
"我购买了道具和服装",
|
||||||
|
"PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍",
|
||||||
|
"湖北省石首市",
|
||||||
|
"湖北省十堰市",
|
||||||
|
"总经理完成了这件事情",
|
||||||
|
"电脑修好了",
|
||||||
|
"做好了这件事情就一了百了了",
|
||||||
|
"人们审美的观点是不同的",
|
||||||
|
"我们买了一个美的空调",
|
||||||
|
"线程初始化时我们要注意",
|
||||||
|
"一个分子是由好多原子组织成的",
|
||||||
|
"祝你马到功成",
|
||||||
|
"他掉进了无底洞里",
|
||||||
|
"中国的首都是北京",
|
||||||
|
"孙君意",
|
||||||
|
"外交部发言人马朝旭",
|
||||||
|
"领导人会议和第四届东亚峰会",
|
||||||
|
"在过去的这五年",
|
||||||
|
"还需要很长的路要走",
|
||||||
|
"60周年首都阅兵",
|
||||||
|
"你好人们审美的观点是不同的",
|
||||||
|
"买水果然后来世博园",
|
||||||
|
"买水果然后去世博园",
|
||||||
|
"但是后来我才知道你是对的",
|
||||||
|
"存在即合理",
|
||||||
|
"的的的的的在的的的的就以和和和",
|
||||||
|
"I love你,不以为耻,反以为rong",
|
||||||
|
"因",
|
||||||
|
"",
|
||||||
|
"hello你好人们审美的观点是不同的",
|
||||||
|
"很好但主要是基于网页形式",
|
||||||
|
"hello你好人们审美的观点是不同的",
|
||||||
|
"为什么我不能拥有想要的生活",
|
||||||
|
"后来我才",
|
||||||
|
"此次来中国是为了",
|
||||||
|
"使用了它就可以解决一些问题",
|
||||||
|
",使用了它就可以解决一些问题",
|
||||||
|
"其实使用了它就可以解决一些问题",
|
||||||
|
"好人使用了它就可以解决一些问题",
|
||||||
|
"是因为和国家",
|
||||||
|
"老年搜索还支持",
|
||||||
|
"干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ",
|
||||||
|
"大",
|
||||||
|
"",
|
||||||
|
"他说的确实在理",
|
||||||
|
"长春市长春节讲话",
|
||||||
|
"结婚的和尚未结婚的",
|
||||||
|
"结合成分子时",
|
||||||
|
"旅游和服务是最好的",
|
||||||
|
"这件事情的确是我的错",
|
||||||
|
"供大家参考指正",
|
||||||
|
"哈尔滨政府公布塌桥原因",
|
||||||
|
"我在机场入口处",
|
||||||
|
"邢永臣摄影报道",
|
||||||
|
"BP神经网络如何训练才能在分类时增加区分度?",
|
||||||
|
"南京市长江大桥",
|
||||||
|
"应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究",
|
||||||
|
'长春市长春药店',
|
||||||
|
'邓颖超生前最喜欢的衣服',
|
||||||
|
'胡锦涛是热爱世界和平的政治局常委',
|
||||||
|
'程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪',
|
||||||
|
'一次性交多少钱',
|
||||||
|
'两块五一套,三块八一斤,四块七一本,五块六一条',
|
||||||
|
'小和尚留了一个像大和尚一样的和尚头',
|
||||||
|
'我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站',
|
||||||
|
'张晓梅去人民医院做了个B超然后去买了件T恤',
|
||||||
|
'AT&T是一件不错的公司,给你发offer了吗?',
|
||||||
|
'C++和c#是什么关系?11+122=133,是吗?PI=3.14159',
|
||||||
|
'你认识那个和主席握手的的哥吗?他开一辆黑色的士。',
|
||||||
|
'枪杆子中出政权']
|
||||||
|
|
||||||
|
|
||||||
|
class JiebaTestCase(unittest.TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
reload(jieba)
|
||||||
|
|
||||||
|
def tearDown(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def testDefaultCut(self):
|
||||||
|
for content in test_contents:
|
||||||
|
result = jieba.cut(content)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
|
||||||
|
print(" , ".join(result), file=sys.stderr)
|
||||||
|
print("testDefaultCut", file=sys.stderr)
|
||||||
|
|
||||||
|
def testCutAll(self):
|
||||||
|
for content in test_contents:
|
||||||
|
result = jieba.cut(content, cut_all=True)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test CutAll error on content: %s" % content
|
||||||
|
print(" , ".join(result), file=sys.stderr)
|
||||||
|
print("testCutAll", file=sys.stderr)
|
||||||
|
|
||||||
|
def testSetDictionary(self):
|
||||||
|
jieba.set_dictionary("foobar.txt")
|
||||||
|
for content in test_contents:
|
||||||
|
result = jieba.cut(content)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
|
||||||
|
print(" , ".join(result), file=sys.stderr)
|
||||||
|
print("testSetDictionary", file=sys.stderr)
|
||||||
|
|
||||||
|
def testCutForSearch(self):
|
||||||
|
for content in test_contents:
|
||||||
|
result = jieba.cut_for_search(content)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
|
||||||
|
print(" , ".join(result), file=sys.stderr)
|
||||||
|
print("testCutForSearch", file=sys.stderr)
|
||||||
|
|
||||||
|
def testPosseg(self):
|
||||||
|
import jieba.posseg as pseg
|
||||||
|
for content in test_contents:
|
||||||
|
result = pseg.cut(content)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test Posseg error on content: %s" % content
|
||||||
|
print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
|
||||||
|
print("testPosseg", file=sys.stderr)
|
||||||
|
|
||||||
|
def testTokenize(self):
|
||||||
|
for content in test_contents:
|
||||||
|
result = jieba.tokenize(content)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
|
||||||
|
for tk in result:
|
||||||
|
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
|
||||||
|
print("testTokenize", file=sys.stderr)
|
||||||
|
|
||||||
|
def testDefaultCut_NOHMM(self):
|
||||||
|
for content in test_contents:
|
||||||
|
result = jieba.cut(content,HMM=False)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
|
||||||
|
print(" , ".join(result), file=sys.stderr)
|
||||||
|
print("testDefaultCut_NOHMM", file=sys.stderr)
|
||||||
|
|
||||||
|
def testPosseg_NOHMM(self):
|
||||||
|
import jieba.posseg as pseg
|
||||||
|
for content in test_contents:
|
||||||
|
result = pseg.cut(content,HMM=False)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test Posseg error on content: %s" % content
|
||||||
|
print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
|
||||||
|
print("testPosseg_NOHMM", file=sys.stderr)
|
||||||
|
|
||||||
|
def testTokenize_NOHMM(self):
|
||||||
|
for content in test_contents:
|
||||||
|
result = jieba.tokenize(content,HMM=False)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
|
||||||
|
for tk in result:
|
||||||
|
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
|
||||||
|
print("testTokenize_NOHMM", file=sys.stderr)
|
||||||
|
|
||||||
|
def testCutForSearch_NOHMM(self):
|
||||||
|
for content in test_contents:
|
||||||
|
result = jieba.cut_for_search(content,HMM=False)
|
||||||
|
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
|
||||||
|
result = list(result)
|
||||||
|
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
|
||||||
|
print(" , ".join(result), file=sys.stderr)
|
||||||
|
print("testCutForSearch_NOHMM", file=sys.stderr)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
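jieba_test.py above is a plain unittest module, so single cases can be run without executing the whole suite. A hedged sketch, assuming the working directory is test/ so the module imports directly:

# Hedged sketch: running one case from test/jieba_test.py programmatically.
import unittest
import jieba_test

suite = unittest.TestLoader().loadTestsFromName("JiebaTestCase.testDefaultCut", jieba_test)
unittest.TextTestRunner(verbosity=2).run(suite)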
@@ -6,7 +6,7 @@ cat abc.txt | python jiebacmd.py | sort | uniq -c | sort -nr -k1 | head -100
'''

from __future__ import unicode_literals
import sys
sys.path.append("../")

@@ -15,14 +15,14 @@ import jieba
default_encoding='utf-8'

if len(sys.argv)>1:
    default_encoding = sys.argv[1]

while True:
    line = sys.stdin.readline()
    if line=="":
        break
    line = line.strip()
    for word in jieba.cut(line):
        print(word)
test/lyric.txt (new file, 44 lines)
@@ -0,0 +1,44 @@
|
|||||||
|
我沒有心
|
||||||
|
我沒有真實的自我
|
||||||
|
我只有消瘦的臉孔
|
||||||
|
所謂軟弱
|
||||||
|
所謂的順從一向是我
|
||||||
|
的座右銘
|
||||||
|
|
||||||
|
而我
|
||||||
|
沒有那海洋的寬闊
|
||||||
|
我只要熱情的撫摸
|
||||||
|
所謂空洞
|
||||||
|
所謂不安全感是我
|
||||||
|
的墓誌銘
|
||||||
|
|
||||||
|
而你
|
||||||
|
是否和我一般怯懦
|
||||||
|
是否和我一般矯作
|
||||||
|
和我一般囉唆
|
||||||
|
|
||||||
|
而你
|
||||||
|
是否和我一般退縮
|
||||||
|
是否和我一般肌迫
|
||||||
|
一般地困惑
|
||||||
|
|
||||||
|
我沒有力
|
||||||
|
我沒有滿腔的熱火
|
||||||
|
我只有滿肚的如果
|
||||||
|
所謂勇氣
|
||||||
|
所謂的認同感是我
|
||||||
|
隨便說說
|
||||||
|
|
||||||
|
而你
|
||||||
|
是否和我一般怯懦
|
||||||
|
是否和我一般矯作
|
||||||
|
是否對你來說
|
||||||
|
只是一場遊戲
|
||||||
|
雖然沒有把握
|
||||||
|
|
||||||
|
而你
|
||||||
|
是否和我一般退縮
|
||||||
|
是否和我一般肌迫
|
||||||
|
是否對你來說
|
||||||
|
只是逼不得已
|
||||||
|
雖然沒有藉口
|
test/parallel/extract_tags.py (new file, 34 lines)
@@ -0,0 +1,34 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../../')
|
||||||
|
|
||||||
|
import jieba
|
||||||
|
jieba.enable_parallel(4)
|
||||||
|
import jieba.analyse
|
||||||
|
from optparse import OptionParser
|
||||||
|
|
||||||
|
USAGE ="usage: python extract_tags.py [file name] -k [top k]"
|
||||||
|
|
||||||
|
parser = OptionParser(USAGE)
|
||||||
|
parser.add_option("-k",dest="topK")
|
||||||
|
opt, args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if len(args) <1:
|
||||||
|
print(USAGE)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
file_name = args[0]
|
||||||
|
|
||||||
|
if opt.topK==None:
|
||||||
|
topK=10
|
||||||
|
else:
|
||||||
|
topK = int(opt.topK)
|
||||||
|
|
||||||
|
|
||||||
|
content = open(file_name,'rb').read()
|
||||||
|
|
||||||
|
tags = jieba.analyse.extract_tags(content,topK=topK)
|
||||||
|
|
||||||
|
print(",".join(tags))
|
||||||
|
|
||||||
|
|
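All of the test/parallel scripts follow the same pattern: call jieba.enable_parallel(n) once, then cut as usual. Parallel mode splits the input by newlines across worker processes and relies on fork, so it is POSIX-only. A minimal hedged sketch:

# Hedged sketch: parallel segmentation, assuming a POSIX platform
# (enable_parallel is not supported on Windows).
import jieba

jieba.enable_parallel(4)                  # pool of 4 worker processes
text = "我爱北京天安门\n永和服装饰品有限公司\n"
print(" / ".join(jieba.cut(text)))        # input is split by lines across the pool
jieba.disable_parallel()                  # back to single-process segmentation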
test/parallel/test.py (new file, 99 lines)
@@ -0,0 +1,99 @@
|
|||||||
|
#encoding=utf-8
|
||||||
|
from __future__ import print_function
|
||||||
|
import sys
|
||||||
|
sys.path.append("../../")
|
||||||
|
import jieba
|
||||||
|
jieba.enable_parallel(4)
|
||||||
|
|
||||||
|
def cuttest(test_sent):
|
||||||
|
result = jieba.cut(test_sent)
|
||||||
|
for word in result:
|
||||||
|
print(word, "/", end=' ')
|
||||||
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
|
cuttest("我不喜欢日本和服。")
|
||||||
|
cuttest("雷猴回归人间。")
|
||||||
|
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||||
|
cuttest("我需要廉租房")
|
||||||
|
cuttest("永和服装饰品有限公司")
|
||||||
|
cuttest("我爱北京天安门")
|
||||||
|
cuttest("abc")
|
||||||
|
cuttest("隐马尔可夫")
|
||||||
|
cuttest("雷猴是个好网站")
|
||||||
|
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||||
|
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||||
|
cuttest("伊藤洋华堂总府店")
|
||||||
|
cuttest("中国科学院计算技术研究所")
|
||||||
|
cuttest("罗密欧与朱丽叶")
|
||||||
|
cuttest("我购买了道具和服装")
|
||||||
|
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||||
|
cuttest("湖北省石首市")
|
||||||
|
cuttest("湖北省十堰市")
|
||||||
|
cuttest("总经理完成了这件事情")
|
||||||
|
cuttest("电脑修好了")
|
||||||
|
cuttest("做好了这件事情就一了百了了")
|
||||||
|
cuttest("人们审美的观点是不同的")
|
||||||
|
cuttest("我们买了一个美的空调")
|
||||||
|
cuttest("线程初始化时我们要注意")
|
||||||
|
cuttest("一个分子是由好多原子组织成的")
|
||||||
|
cuttest("祝你马到功成")
|
||||||
|
cuttest("他掉进了无底洞里")
|
||||||
|
cuttest("中国的首都是北京")
|
||||||
|
cuttest("孙君意")
|
||||||
|
cuttest("外交部发言人马朝旭")
|
||||||
|
cuttest("领导人会议和第四届东亚峰会")
|
||||||
|
cuttest("在过去的这五年")
|
||||||
|
cuttest("还需要很长的路要走")
|
||||||
|
cuttest("60周年首都阅兵")
|
||||||
|
cuttest("你好人们审美的观点是不同的")
|
||||||
|
cuttest("买水果然后来世博园")
|
||||||
|
cuttest("买水果然后去世博园")
|
||||||
|
cuttest("但是后来我才知道你是对的")
|
||||||
|
cuttest("存在即合理")
|
||||||
|
cuttest("的的的的的在的的的的就以和和和")
|
||||||
|
cuttest("I love你,不以为耻,反以为rong")
|
||||||
|
cuttest("因")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("很好但主要是基于网页形式")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("为什么我不能拥有想要的生活")
|
||||||
|
cuttest("后来我才")
|
||||||
|
cuttest("此次来中国是为了")
|
||||||
|
cuttest("使用了它就可以解决一些问题")
|
||||||
|
cuttest(",使用了它就可以解决一些问题")
|
||||||
|
cuttest("其实使用了它就可以解决一些问题")
|
||||||
|
cuttest("好人使用了它就可以解决一些问题")
|
||||||
|
cuttest("是因为和国家")
|
||||||
|
cuttest("老年搜索还支持")
|
||||||
|
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||||
|
cuttest("大")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("他说的确实在理")
|
||||||
|
cuttest("长春市长春节讲话")
|
||||||
|
cuttest("结婚的和尚未结婚的")
|
||||||
|
cuttest("结合成分子时")
|
||||||
|
cuttest("旅游和服务是最好的")
|
||||||
|
cuttest("这件事情的确是我的错")
|
||||||
|
cuttest("供大家参考指正")
|
||||||
|
cuttest("哈尔滨政府公布塌桥原因")
|
||||||
|
cuttest("我在机场入口处")
|
||||||
|
cuttest("邢永臣摄影报道")
|
||||||
|
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||||
|
cuttest("南京市长江大桥")
|
||||||
|
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||||
|
cuttest('长春市长春药店')
|
||||||
|
cuttest('邓颖超生前最喜欢的衣服')
|
||||||
|
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||||
|
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||||
|
cuttest('一次性交多少钱')
|
||||||
|
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||||
|
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||||
|
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||||
|
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||||
|
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||||
|
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||||
|
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
test/parallel/test2.py (new file, 95 lines)
@@ -0,0 +1,95 @@
|
|||||||
|
#encoding=utf-8
|
||||||
|
from __future__ import print_function
|
||||||
|
import sys
|
||||||
|
sys.path.append("../../")
|
||||||
|
import jieba
|
||||||
|
jieba.enable_parallel(4)
|
||||||
|
|
||||||
|
def cuttest(test_sent):
|
||||||
|
result = jieba.cut(test_sent,cut_all=True)
|
||||||
|
for word in result:
|
||||||
|
print(word, "/", end=' ')
|
||||||
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
|
cuttest("我不喜欢日本和服。")
|
||||||
|
cuttest("雷猴回归人间。")
|
||||||
|
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||||
|
cuttest("我需要廉租房")
|
||||||
|
cuttest("永和服装饰品有限公司")
|
||||||
|
cuttest("我爱北京天安门")
|
||||||
|
cuttest("abc")
|
||||||
|
cuttest("隐马尔可夫")
|
||||||
|
cuttest("雷猴是个好网站")
|
||||||
|
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||||
|
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||||
|
cuttest("伊藤洋华堂总府店")
|
||||||
|
cuttest("中国科学院计算技术研究所")
|
||||||
|
cuttest("罗密欧与朱丽叶")
|
||||||
|
cuttest("我购买了道具和服装")
|
||||||
|
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||||
|
cuttest("湖北省石首市")
|
||||||
|
cuttest("湖北省十堰市")
|
||||||
|
cuttest("总经理完成了这件事情")
|
||||||
|
cuttest("电脑修好了")
|
||||||
|
cuttest("做好了这件事情就一了百了了")
|
||||||
|
cuttest("人们审美的观点是不同的")
|
||||||
|
cuttest("我们买了一个美的空调")
|
||||||
|
cuttest("线程初始化时我们要注意")
|
||||||
|
cuttest("一个分子是由好多原子组织成的")
|
||||||
|
cuttest("祝你马到功成")
|
||||||
|
cuttest("他掉进了无底洞里")
|
||||||
|
cuttest("中国的首都是北京")
|
||||||
|
cuttest("孙君意")
|
||||||
|
cuttest("外交部发言人马朝旭")
|
||||||
|
cuttest("领导人会议和第四届东亚峰会")
|
||||||
|
cuttest("在过去的这五年")
|
||||||
|
cuttest("还需要很长的路要走")
|
||||||
|
cuttest("60周年首都阅兵")
|
||||||
|
cuttest("你好人们审美的观点是不同的")
|
||||||
|
cuttest("买水果然后来世博园")
|
||||||
|
cuttest("买水果然后去世博园")
|
||||||
|
cuttest("但是后来我才知道你是对的")
|
||||||
|
cuttest("存在即合理")
|
||||||
|
cuttest("的的的的的在的的的的就以和和和")
|
||||||
|
cuttest("I love你,不以为耻,反以为rong")
|
||||||
|
cuttest("因")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("很好但主要是基于网页形式")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("为什么我不能拥有想要的生活")
|
||||||
|
cuttest("后来我才")
|
||||||
|
cuttest("此次来中国是为了")
|
||||||
|
cuttest("使用了它就可以解决一些问题")
|
||||||
|
cuttest(",使用了它就可以解决一些问题")
|
||||||
|
cuttest("其实使用了它就可以解决一些问题")
|
||||||
|
cuttest("好人使用了它就可以解决一些问题")
|
||||||
|
cuttest("是因为和国家")
|
||||||
|
cuttest("老年搜索还支持")
|
||||||
|
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||||
|
cuttest("大")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("他说的确实在理")
|
||||||
|
cuttest("长春市长春节讲话")
|
||||||
|
cuttest("结婚的和尚未结婚的")
|
||||||
|
cuttest("结合成分子时")
|
||||||
|
cuttest("旅游和服务是最好的")
|
||||||
|
cuttest("这件事情的确是我的错")
|
||||||
|
cuttest("供大家参考指正")
|
||||||
|
cuttest("哈尔滨政府公布塌桥原因")
|
||||||
|
cuttest("我在机场入口处")
|
||||||
|
cuttest("邢永臣摄影报道")
|
||||||
|
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||||
|
cuttest("南京市长江大桥")
|
||||||
|
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||||
|
cuttest('长春市长春药店')
|
||||||
|
cuttest('邓颖超生前最喜欢的衣服')
|
||||||
|
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||||
|
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||||
|
cuttest('一次性交多少钱')
|
||||||
|
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||||
|
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||||
|
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
test/parallel/test_cut_for_search.py (new file, 95 lines)
@@ -0,0 +1,95 @@
|
|||||||
|
#encoding=utf-8
|
||||||
|
from __future__ import print_function
|
||||||
|
import sys
|
||||||
|
sys.path.append("../../")
|
||||||
|
import jieba
|
||||||
|
jieba.enable_parallel(4)
|
||||||
|
|
||||||
|
def cuttest(test_sent):
|
||||||
|
result = jieba.cut_for_search(test_sent)
|
||||||
|
for word in result:
|
||||||
|
print(word, "/", end=' ')
|
||||||
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
|
cuttest("我不喜欢日本和服。")
|
||||||
|
cuttest("雷猴回归人间。")
|
||||||
|
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||||
|
cuttest("我需要廉租房")
|
||||||
|
cuttest("永和服装饰品有限公司")
|
||||||
|
cuttest("我爱北京天安门")
|
||||||
|
cuttest("abc")
|
||||||
|
cuttest("隐马尔可夫")
|
||||||
|
cuttest("雷猴是个好网站")
|
||||||
|
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||||
|
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||||
|
cuttest("伊藤洋华堂总府店")
|
||||||
|
cuttest("中国科学院计算技术研究所")
|
||||||
|
cuttest("罗密欧与朱丽叶")
|
||||||
|
cuttest("我购买了道具和服装")
|
||||||
|
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||||
|
cuttest("湖北省石首市")
|
||||||
|
cuttest("湖北省十堰市")
|
||||||
|
cuttest("总经理完成了这件事情")
|
||||||
|
cuttest("电脑修好了")
|
||||||
|
cuttest("做好了这件事情就一了百了了")
|
||||||
|
cuttest("人们审美的观点是不同的")
|
||||||
|
cuttest("我们买了一个美的空调")
|
||||||
|
cuttest("线程初始化时我们要注意")
|
||||||
|
cuttest("一个分子是由好多原子组织成的")
|
||||||
|
cuttest("祝你马到功成")
|
||||||
|
cuttest("他掉进了无底洞里")
|
||||||
|
cuttest("中国的首都是北京")
|
||||||
|
cuttest("孙君意")
|
||||||
|
cuttest("外交部发言人马朝旭")
|
||||||
|
cuttest("领导人会议和第四届东亚峰会")
|
||||||
|
cuttest("在过去的这五年")
|
||||||
|
cuttest("还需要很长的路要走")
|
||||||
|
cuttest("60周年首都阅兵")
|
||||||
|
cuttest("你好人们审美的观点是不同的")
|
||||||
|
cuttest("买水果然后来世博园")
|
||||||
|
cuttest("买水果然后去世博园")
|
||||||
|
cuttest("但是后来我才知道你是对的")
|
||||||
|
cuttest("存在即合理")
|
||||||
|
cuttest("的的的的的在的的的的就以和和和")
|
||||||
|
cuttest("I love你,不以为耻,反以为rong")
|
||||||
|
cuttest("因")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("很好但主要是基于网页形式")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("为什么我不能拥有想要的生活")
|
||||||
|
cuttest("后来我才")
|
||||||
|
cuttest("此次来中国是为了")
|
||||||
|
cuttest("使用了它就可以解决一些问题")
|
||||||
|
cuttest(",使用了它就可以解决一些问题")
|
||||||
|
cuttest("其实使用了它就可以解决一些问题")
|
||||||
|
cuttest("好人使用了它就可以解决一些问题")
|
||||||
|
cuttest("是因为和国家")
|
||||||
|
cuttest("老年搜索还支持")
|
||||||
|
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||||
|
cuttest("大")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("他说的确实在理")
|
||||||
|
cuttest("长春市长春节讲话")
|
||||||
|
cuttest("结婚的和尚未结婚的")
|
||||||
|
cuttest("结合成分子时")
|
||||||
|
cuttest("旅游和服务是最好的")
|
||||||
|
cuttest("这件事情的确是我的错")
|
||||||
|
cuttest("供大家参考指正")
|
||||||
|
cuttest("哈尔滨政府公布塌桥原因")
|
||||||
|
cuttest("我在机场入口处")
|
||||||
|
cuttest("邢永臣摄影报道")
|
||||||
|
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||||
|
cuttest("南京市长江大桥")
|
||||||
|
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||||
|
cuttest('长春市长春药店')
|
||||||
|
cuttest('邓颖超生前最喜欢的衣服')
|
||||||
|
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||||
|
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||||
|
cuttest('一次性交多少钱')
|
||||||
|
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||||
|
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||||
|
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
test/parallel/test_disable_hmm.py (new file, 95 lines)
@@ -0,0 +1,95 @@
|
|||||||
|
#encoding=utf-8
|
||||||
|
from __future__ import print_function
|
||||||
|
import sys
|
||||||
|
sys.path.append("../../")
|
||||||
|
import jieba
|
||||||
|
jieba.enable_parallel(4)
|
||||||
|
|
||||||
|
def cuttest(test_sent):
|
||||||
|
result = jieba.cut(test_sent, HMM=False)
|
||||||
|
for word in result:
|
||||||
|
print(word, "/", end=' ')
|
||||||
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
|
cuttest("我不喜欢日本和服。")
|
||||||
|
cuttest("雷猴回归人间。")
|
||||||
|
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||||
|
cuttest("我需要廉租房")
|
||||||
|
cuttest("永和服装饰品有限公司")
|
||||||
|
cuttest("我爱北京天安门")
|
||||||
|
cuttest("abc")
|
||||||
|
cuttest("隐马尔可夫")
|
||||||
|
cuttest("雷猴是个好网站")
|
||||||
|
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||||
|
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||||
|
cuttest("伊藤洋华堂总府店")
|
||||||
|
cuttest("中国科学院计算技术研究所")
|
||||||
|
cuttest("罗密欧与朱丽叶")
|
||||||
|
cuttest("我购买了道具和服装")
|
||||||
|
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||||
|
cuttest("湖北省石首市")
|
||||||
|
cuttest("湖北省十堰市")
|
||||||
|
cuttest("总经理完成了这件事情")
|
||||||
|
cuttest("电脑修好了")
|
||||||
|
cuttest("做好了这件事情就一了百了了")
|
||||||
|
cuttest("人们审美的观点是不同的")
|
||||||
|
cuttest("我们买了一个美的空调")
|
||||||
|
cuttest("线程初始化时我们要注意")
|
||||||
|
cuttest("一个分子是由好多原子组织成的")
|
||||||
|
cuttest("祝你马到功成")
|
||||||
|
cuttest("他掉进了无底洞里")
|
||||||
|
cuttest("中国的首都是北京")
|
||||||
|
cuttest("孙君意")
|
||||||
|
cuttest("外交部发言人马朝旭")
|
||||||
|
cuttest("领导人会议和第四届东亚峰会")
|
||||||
|
cuttest("在过去的这五年")
|
||||||
|
cuttest("还需要很长的路要走")
|
||||||
|
cuttest("60周年首都阅兵")
|
||||||
|
cuttest("你好人们审美的观点是不同的")
|
||||||
|
cuttest("买水果然后来世博园")
|
||||||
|
cuttest("买水果然后去世博园")
|
||||||
|
cuttest("但是后来我才知道你是对的")
|
||||||
|
cuttest("存在即合理")
|
||||||
|
cuttest("的的的的的在的的的的就以和和和")
|
||||||
|
cuttest("I love你,不以为耻,反以为rong")
|
||||||
|
cuttest("因")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("很好但主要是基于网页形式")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("为什么我不能拥有想要的生活")
|
||||||
|
cuttest("后来我才")
|
||||||
|
cuttest("此次来中国是为了")
|
||||||
|
cuttest("使用了它就可以解决一些问题")
|
||||||
|
cuttest(",使用了它就可以解决一些问题")
|
||||||
|
cuttest("其实使用了它就可以解决一些问题")
|
||||||
|
cuttest("好人使用了它就可以解决一些问题")
|
||||||
|
cuttest("是因为和国家")
|
||||||
|
cuttest("老年搜索还支持")
|
||||||
|
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||||
|
cuttest("大")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("他说的确实在理")
|
||||||
|
cuttest("长春市长春节讲话")
|
||||||
|
cuttest("结婚的和尚未结婚的")
|
||||||
|
cuttest("结合成分子时")
|
||||||
|
cuttest("旅游和服务是最好的")
|
||||||
|
cuttest("这件事情的确是我的错")
|
||||||
|
cuttest("供大家参考指正")
|
||||||
|
cuttest("哈尔滨政府公布塌桥原因")
|
||||||
|
cuttest("我在机场入口处")
|
||||||
|
cuttest("邢永臣摄影报道")
|
||||||
|
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||||
|
cuttest("南京市长江大桥")
|
||||||
|
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||||
|
cuttest('长春市长春药店')
|
||||||
|
cuttest('邓颖超生前最喜欢的衣服')
|
||||||
|
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||||
|
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||||
|
cuttest('一次性交多少钱')
|
||||||
|
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||||
|
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||||
|
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
test/parallel/test_file.py (new file, 20 lines)
@@ -0,0 +1,20 @@
import sys
import time
sys.path.append("../../")
import jieba

jieba.enable_parallel()

url = sys.argv[1]
content = open(url,"rb").read()
t1 = time.time()
words = "/ ".join(jieba.cut(content))

t2 = time.time()
tm_cost = t2-t1

log_f = open("1.log","wb")
log_f.write(words.encode('utf-8'))

print('speed %s bytes/second' % (len(content)/tm_cost))
test/parallel/test_pos.py (new file, 100 lines)
@@ -0,0 +1,100 @@
|
|||||||
|
#encoding=utf-8
|
||||||
|
from __future__ import print_function
|
||||||
|
import sys
|
||||||
|
sys.path.append("../../")
|
||||||
|
import jieba
|
||||||
|
jieba.enable_parallel(4)
|
||||||
|
import jieba.posseg as pseg
|
||||||
|
|
||||||
|
def cuttest(test_sent):
|
||||||
|
result = pseg.cut(test_sent)
|
||||||
|
for w in result:
|
||||||
|
print(w.word, "/", w.flag, ", ", end=' ')
|
||||||
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
|
cuttest("我不喜欢日本和服。")
|
||||||
|
cuttest("雷猴回归人间。")
|
||||||
|
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||||
|
cuttest("我需要廉租房")
|
||||||
|
cuttest("永和服装饰品有限公司")
|
||||||
|
cuttest("我爱北京天安门")
|
||||||
|
cuttest("abc")
|
||||||
|
cuttest("隐马尔可夫")
|
||||||
|
cuttest("雷猴是个好网站")
|
||||||
|
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||||
|
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||||
|
cuttest("伊藤洋华堂总府店")
|
||||||
|
cuttest("中国科学院计算技术研究所")
|
||||||
|
cuttest("罗密欧与朱丽叶")
|
||||||
|
cuttest("我购买了道具和服装")
|
||||||
|
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||||
|
cuttest("湖北省石首市")
|
||||||
|
cuttest("湖北省十堰市")
|
||||||
|
cuttest("总经理完成了这件事情")
|
||||||
|
cuttest("电脑修好了")
|
||||||
|
cuttest("做好了这件事情就一了百了了")
|
||||||
|
cuttest("人们审美的观点是不同的")
|
||||||
|
cuttest("我们买了一个美的空调")
|
||||||
|
cuttest("线程初始化时我们要注意")
|
||||||
|
cuttest("一个分子是由好多原子组织成的")
|
||||||
|
cuttest("祝你马到功成")
|
||||||
|
cuttest("他掉进了无底洞里")
|
||||||
|
cuttest("中国的首都是北京")
|
||||||
|
cuttest("孙君意")
|
||||||
|
cuttest("外交部发言人马朝旭")
|
||||||
|
cuttest("领导人会议和第四届东亚峰会")
|
||||||
|
cuttest("在过去的这五年")
|
||||||
|
cuttest("还需要很长的路要走")
|
||||||
|
cuttest("60周年首都阅兵")
|
||||||
|
cuttest("你好人们审美的观点是不同的")
|
||||||
|
cuttest("买水果然后来世博园")
|
||||||
|
cuttest("买水果然后去世博园")
|
||||||
|
cuttest("但是后来我才知道你是对的")
|
||||||
|
cuttest("存在即合理")
|
||||||
|
cuttest("的的的的的在的的的的就以和和和")
|
||||||
|
cuttest("I love你,不以为耻,反以为rong")
|
||||||
|
cuttest("因")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("很好但主要是基于网页形式")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("为什么我不能拥有想要的生活")
|
||||||
|
cuttest("后来我才")
|
||||||
|
cuttest("此次来中国是为了")
|
||||||
|
cuttest("使用了它就可以解决一些问题")
|
||||||
|
cuttest(",使用了它就可以解决一些问题")
|
||||||
|
cuttest("其实使用了它就可以解决一些问题")
|
||||||
|
cuttest("好人使用了它就可以解决一些问题")
|
||||||
|
cuttest("是因为和国家")
|
||||||
|
cuttest("老年搜索还支持")
|
||||||
|
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||||
|
cuttest("大")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("他说的确实在理")
|
||||||
|
cuttest("长春市长春节讲话")
|
||||||
|
cuttest("结婚的和尚未结婚的")
|
||||||
|
cuttest("结合成分子时")
|
||||||
|
cuttest("旅游和服务是最好的")
|
||||||
|
cuttest("这件事情的确是我的错")
|
||||||
|
cuttest("供大家参考指正")
|
||||||
|
cuttest("哈尔滨政府公布塌桥原因")
|
||||||
|
cuttest("我在机场入口处")
|
||||||
|
cuttest("邢永臣摄影报道")
|
||||||
|
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||||
|
cuttest("南京市长江大桥")
|
||||||
|
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||||
|
cuttest('长春市长春药店')
|
||||||
|
cuttest('邓颖超生前最喜欢的衣服')
|
||||||
|
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||||
|
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||||
|
cuttest('一次性交多少钱')
|
||||||
|
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||||
|
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||||
|
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||||
|
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||||
|
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||||
|
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||||
|
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
test/parallel/test_pos_file.py (new file, 22 lines)
@@ -0,0 +1,22 @@
from __future__ import print_function
import sys,time
import sys
sys.path.append("../../")
import jieba
import jieba.posseg as pseg

jieba.enable_parallel(4)

url = sys.argv[1]
content = open(url,"rb").read()
t1 = time.time()
words = list(pseg.cut(content))

t2 = time.time()
tm_cost = t2-t1

log_f = open("1.log","w")
log_f.write(' / '.join(map(str, words)))

print('speed' , len(content)/tm_cost, " bytes/second")
test/test.py (177 changes)
@@ -3,91 +3,100 @@
sys.path.append("../")
import jieba


def cuttest(test_sent):
    result = jieba.cut(test_sent)
    print(" / ".join(result))


if __name__ == "__main__":
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
cuttest("我不喜欢日本和服。")
|
cuttest("我不喜欢日本和服。")
|
||||||
cuttest("雷猴回归人间。")
|
cuttest("雷猴回归人间。")
|
||||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||||
cuttest("我需要廉租房")
|
cuttest("我需要廉租房")
|
||||||
cuttest("永和服装饰品有限公司")
|
cuttest("永和服装饰品有限公司")
|
||||||
cuttest("我爱北京天安门")
|
cuttest("我爱北京天安门")
|
||||||
cuttest("abc")
|
cuttest("abc")
|
||||||
cuttest("隐马尔可夫")
|
cuttest("隐马尔可夫")
|
||||||
cuttest("雷猴是个好网站")
|
cuttest("雷猴是个好网站")
|
||||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||||
cuttest("伊藤洋华堂总府店")
|
cuttest("伊藤洋华堂总府店")
|
||||||
cuttest("中国科学院计算技术研究所")
|
cuttest("中国科学院计算技术研究所")
|
||||||
cuttest("罗密欧与朱丽叶")
|
cuttest("罗密欧与朱丽叶")
|
||||||
cuttest("我购买了道具和服装")
|
cuttest("我购买了道具和服装")
|
||||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||||
cuttest("湖北省石首市")
|
cuttest("湖北省石首市")
|
||||||
cuttest("湖北省十堰市")
|
cuttest("湖北省十堰市")
|
||||||
cuttest("总经理完成了这件事情")
|
cuttest("总经理完成了这件事情")
|
||||||
cuttest("电脑修好了")
|
cuttest("电脑修好了")
|
||||||
cuttest("做好了这件事情就一了百了了")
|
cuttest("做好了这件事情就一了百了了")
|
||||||
cuttest("人们审美的观点是不同的")
|
cuttest("人们审美的观点是不同的")
|
||||||
cuttest("我们买了一个美的空调")
|
cuttest("我们买了一个美的空调")
|
||||||
cuttest("线程初始化时我们要注意")
|
cuttest("线程初始化时我们要注意")
|
||||||
cuttest("一个分子是由好多原子组织成的")
|
cuttest("一个分子是由好多原子组织成的")
|
||||||
cuttest("祝你马到功成")
|
cuttest("祝你马到功成")
|
||||||
cuttest("他掉进了无底洞里")
|
cuttest("他掉进了无底洞里")
|
||||||
cuttest("中国的首都是北京")
|
cuttest("中国的首都是北京")
|
||||||
cuttest("孙君意")
|
cuttest("孙君意")
|
||||||
cuttest("外交部发言人马朝旭")
|
cuttest("外交部发言人马朝旭")
|
||||||
cuttest("领导人会议和第四届东亚峰会")
|
cuttest("领导人会议和第四届东亚峰会")
|
||||||
cuttest("在过去的这五年")
|
cuttest("在过去的这五年")
|
||||||
cuttest("还需要很长的路要走")
|
cuttest("还需要很长的路要走")
|
||||||
cuttest("60周年首都阅兵")
|
cuttest("60周年首都阅兵")
|
||||||
cuttest("你好人们审美的观点是不同的")
|
cuttest("你好人们审美的观点是不同的")
|
||||||
cuttest("买水果然后来世博园")
|
cuttest("买水果然后来世博园")
|
||||||
cuttest("买水果然后去世博园")
|
cuttest("买水果然后去世博园")
|
||||||
cuttest("但是后来我才知道你是对的")
|
cuttest("但是后来我才知道你是对的")
|
||||||
cuttest("存在即合理")
|
cuttest("存在即合理")
|
||||||
cuttest("的的的的的在的的的的就以和和和")
|
cuttest("的的的的的在的的的的就以和和和")
|
||||||
cuttest("I love你,不以为耻,反以为rong")
|
cuttest("I love你,不以为耻,反以为rong")
|
||||||
cuttest("因")
|
cuttest("因")
|
||||||
cuttest("")
|
cuttest("")
|
||||||
cuttest("hello你好人们审美的观点是不同的")
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
cuttest("很好但主要是基于网页形式")
|
cuttest("很好但主要是基于网页形式")
|
||||||
cuttest("hello你好人们审美的观点是不同的")
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
cuttest("为什么我不能拥有想要的生活")
|
cuttest("为什么我不能拥有想要的生活")
|
||||||
cuttest("后来我才")
|
cuttest("后来我才")
|
||||||
cuttest("此次来中国是为了")
|
cuttest("此次来中国是为了")
|
||||||
cuttest("使用了它就可以解决一些问题")
|
cuttest("使用了它就可以解决一些问题")
|
||||||
cuttest(",使用了它就可以解决一些问题")
|
cuttest(",使用了它就可以解决一些问题")
|
||||||
cuttest("其实使用了它就可以解决一些问题")
|
cuttest("其实使用了它就可以解决一些问题")
|
||||||
cuttest("好人使用了它就可以解决一些问题")
|
cuttest("好人使用了它就可以解决一些问题")
|
||||||
cuttest("是因为和国家")
|
cuttest("是因为和国家")
|
||||||
cuttest("老年搜索还支持")
|
cuttest("老年搜索还支持")
|
||||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||||
cuttest("大")
|
cuttest("大")
|
||||||
cuttest("")
|
cuttest("")
|
||||||
cuttest("他说的确实在理")
|
cuttest("他说的确实在理")
|
||||||
cuttest("长春市长春节讲话")
|
cuttest("长春市长春节讲话")
|
||||||
cuttest("结婚的和尚未结婚的")
|
cuttest("结婚的和尚未结婚的")
|
||||||
cuttest("结合成分子时")
|
cuttest("结合成分子时")
|
||||||
cuttest("旅游和服务是最好的")
|
cuttest("旅游和服务是最好的")
|
||||||
cuttest("这件事情的确是我的错")
|
cuttest("这件事情的确是我的错")
|
||||||
cuttest("供大家参考指正")
|
cuttest("供大家参考指正")
|
||||||
cuttest("哈尔滨政府公布塌桥原因")
|
cuttest("哈尔滨政府公布塌桥原因")
|
||||||
cuttest("我在机场入口处")
|
cuttest("我在机场入口处")
|
||||||
cuttest("邢永臣摄影报道")
|
cuttest("邢永臣摄影报道")
|
||||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||||
cuttest("南京市长江大桥")
|
cuttest("南京市长江大桥")
|
||||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||||
cuttest('长春市长春药店')
|
cuttest('长春市长春药店')
|
||||||
cuttest('邓颖超生前最喜欢的衣服')
|
cuttest('邓颖超生前最喜欢的衣服')
|
||||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||||
cuttest('一次性交多少钱')
|
cuttest('一次性交多少钱')
|
||||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
cuttest('枪杆子中出政权')
cuttest('张三风同学走上了不归路')
cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
jieba.del_word('很赞')
cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?')
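The tail of test/test.py now calls jieba.del_word('很赞') before re-cutting a sentence. A hedged sketch of the add_word / del_word / suggest_freq trio this exercises (the frequency and tag values are illustrative):

# Hedged sketch: runtime dictionary tuning as exercised at the end of test/test.py.
import jieba

print("/".join(jieba.cut("看上去iphone8手机样式很赞")))   # '很赞' may come out as one token
jieba.del_word("很赞")                                     # drop the word from the dictionary
print("/".join(jieba.cut("看上去iphone8手机样式很赞")))   # likely split afterwards
jieba.add_word("iphone8", freq=1000, tag="nz")             # register a new word with freq and POS
jieba.suggest_freq(("中", "将"), tune=True)                # tune frequency so the pair stays split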
@@ -1,93 +0,0 @@
|
|||||||
#encoding=utf-8
|
|
||||||
import sys
|
|
||||||
sys.path.append("../")
|
|
||||||
import jieba
|
|
||||||
|
|
||||||
def cuttest(test_sent):
|
|
||||||
result = jieba.cut(test_sent,cut_all=True)
|
|
||||||
for word in result:
|
|
||||||
print word, "/",
|
|
||||||
print ""
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
|
||||||
cuttest("我不喜欢日本和服。")
|
|
||||||
cuttest("雷猴回归人间。")
|
|
||||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
|
||||||
cuttest("我需要廉租房")
|
|
||||||
cuttest("永和服装饰品有限公司")
|
|
||||||
cuttest("我爱北京天安门")
|
|
||||||
cuttest("abc")
|
|
||||||
cuttest("隐马尔可夫")
|
|
||||||
cuttest("雷猴是个好网站")
|
|
||||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
|
||||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
|
||||||
cuttest("伊藤洋华堂总府店")
|
|
||||||
cuttest("中国科学院计算技术研究所")
|
|
||||||
cuttest("罗密欧与朱丽叶")
|
|
||||||
cuttest("我购买了道具和服装")
|
|
||||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
|
||||||
cuttest("湖北省石首市")
|
|
||||||
cuttest("湖北省十堰市")
|
|
||||||
cuttest("总经理完成了这件事情")
|
|
||||||
cuttest("电脑修好了")
|
|
||||||
cuttest("做好了这件事情就一了百了了")
|
|
||||||
cuttest("人们审美的观点是不同的")
|
|
||||||
cuttest("我们买了一个美的空调")
|
|
||||||
cuttest("线程初始化时我们要注意")
|
|
||||||
cuttest("一个分子是由好多原子组织成的")
|
|
||||||
cuttest("祝你马到功成")
|
|
||||||
cuttest("他掉进了无底洞里")
|
|
||||||
cuttest("中国的首都是北京")
|
|
||||||
cuttest("孙君意")
|
|
||||||
cuttest("外交部发言人马朝旭")
|
|
||||||
cuttest("领导人会议和第四届东亚峰会")
|
|
||||||
cuttest("在过去的这五年")
|
|
||||||
cuttest("还需要很长的路要走")
|
|
||||||
cuttest("60周年首都阅兵")
|
|
||||||
cuttest("你好人们审美的观点是不同的")
|
|
||||||
cuttest("买水果然后来世博园")
|
|
||||||
cuttest("买水果然后去世博园")
|
|
||||||
cuttest("但是后来我才知道你是对的")
|
|
||||||
cuttest("存在即合理")
|
|
||||||
cuttest("的的的的的在的的的的就以和和和")
|
|
||||||
cuttest("I love你,不以为耻,反以为rong")
|
|
||||||
cuttest("因")
|
|
||||||
cuttest("")
|
|
||||||
cuttest("hello你好人们审美的观点是不同的")
|
|
||||||
cuttest("很好但主要是基于网页形式")
|
|
||||||
cuttest("hello你好人们审美的观点是不同的")
|
|
||||||
cuttest("为什么我不能拥有想要的生活")
|
|
||||||
cuttest("后来我才")
|
|
||||||
cuttest("此次来中国是为了")
|
|
||||||
cuttest("使用了它就可以解决一些问题")
|
|
||||||
cuttest(",使用了它就可以解决一些问题")
|
|
||||||
cuttest("其实使用了它就可以解决一些问题")
|
|
||||||
cuttest("好人使用了它就可以解决一些问题")
|
|
||||||
cuttest("是因为和国家")
|
|
||||||
cuttest("老年搜索还支持")
|
|
||||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
|
||||||
cuttest("大")
|
|
||||||
cuttest("")
|
|
||||||
cuttest("他说的确实在理")
|
|
||||||
cuttest("长春市长春节讲话")
|
|
||||||
cuttest("结婚的和尚未结婚的")
|
|
||||||
cuttest("结合成分子时")
|
|
||||||
cuttest("旅游和服务是最好的")
|
|
||||||
cuttest("这件事情的确是我的错")
|
|
||||||
cuttest("供大家参考指正")
|
|
||||||
cuttest("哈尔滨政府公布塌桥原因")
|
|
||||||
cuttest("我在机场入口处")
|
|
||||||
cuttest("邢永臣摄影报道")
|
|
||||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
|
||||||
cuttest("南京市长江大桥")
|
|
||||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
|
||||||
cuttest('长春市长春药店')
|
|
||||||
cuttest('邓颖超生前最喜欢的衣服')
|
|
||||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
|
||||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
|
||||||
cuttest('一次性交多少钱')
|
|
||||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
|
||||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
|
||||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
|
test/test_bug.py (new file, 10 lines)
@@ -0,0 +1,10 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../")
import jieba
import jieba.posseg as pseg
words=pseg.cut("又跛又啞")
for w in words:
    print(w.word,w.flag)
test/test_change_dictpath.py (new file, 28 lines)
@@ -0,0 +1,28 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../")
import jieba

def cuttest(test_sent):
    result = jieba.cut(test_sent)
    print(" ".join(result))

def testcase():
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
    cuttest("雷猴回归人间。")
    cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
    cuttest("我需要廉租房")
    cuttest("永和服装饰品有限公司")
    cuttest("我爱北京天安门")
    cuttest("abc")
    cuttest("隐马尔可夫")
    cuttest("雷猴是个好网站")

if __name__ == "__main__":
    testcase()
    jieba.set_dictionary("foobar.txt")
    print("================================")
    testcase()
@@ -1,93 +1,98 @@
|
|||||||
#encoding=utf-8
|
#encoding=utf-8
|
||||||
|
from __future__ import print_function
|
||||||
import sys
|
import sys
|
||||||
sys.path.append("../")
|
sys.path.append("../")
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = jieba.cut_for_search(test_sent)
|
result = jieba.cut_for_search(test_sent)
|
||||||
for word in result:
|
for word in result:
|
||||||
print word, "/",
|
print(word, "/", end=' ')
|
||||||
print ""
|
print("")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
cuttest("我不喜欢日本和服。")
|
cuttest("我不喜欢日本和服。")
|
||||||
cuttest("雷猴回归人间。")
|
cuttest("雷猴回归人间。")
|
||||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||||
cuttest("我需要廉租房")
|
cuttest("我需要廉租房")
|
||||||
cuttest("永和服装饰品有限公司")
|
cuttest("永和服装饰品有限公司")
|
||||||
cuttest("我爱北京天安门")
|
cuttest("我爱北京天安门")
|
||||||
cuttest("abc")
|
cuttest("abc")
|
||||||
cuttest("隐马尔可夫")
|
cuttest("隐马尔可夫")
|
||||||
cuttest("雷猴是个好网站")
|
cuttest("雷猴是个好网站")
|
||||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||||
cuttest("伊藤洋华堂总府店")
|
cuttest("伊藤洋华堂总府店")
|
||||||
cuttest("中国科学院计算技术研究所")
|
cuttest("中国科学院计算技术研究所")
|
||||||
cuttest("罗密欧与朱丽叶")
|
cuttest("罗密欧与朱丽叶")
|
||||||
cuttest("我购买了道具和服装")
|
cuttest("我购买了道具和服装")
|
||||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||||
cuttest("湖北省石首市")
|
cuttest("湖北省石首市")
|
||||||
cuttest("湖北省十堰市")
|
cuttest("湖北省十堰市")
|
||||||
cuttest("总经理完成了这件事情")
|
cuttest("总经理完成了这件事情")
|
||||||
cuttest("电脑修好了")
|
cuttest("电脑修好了")
|
||||||
cuttest("做好了这件事情就一了百了了")
|
cuttest("做好了这件事情就一了百了了")
|
||||||
cuttest("人们审美的观点是不同的")
|
cuttest("人们审美的观点是不同的")
|
||||||
cuttest("我们买了一个美的空调")
|
cuttest("我们买了一个美的空调")
|
||||||
cuttest("线程初始化时我们要注意")
|
cuttest("线程初始化时我们要注意")
|
||||||
cuttest("一个分子是由好多原子组织成的")
|
cuttest("一个分子是由好多原子组织成的")
|
||||||
cuttest("祝你马到功成")
|
cuttest("祝你马到功成")
|
||||||
cuttest("他掉进了无底洞里")
|
cuttest("他掉进了无底洞里")
|
||||||
cuttest("中国的首都是北京")
|
cuttest("中国的首都是北京")
|
||||||
cuttest("孙君意")
|
cuttest("孙君意")
|
||||||
cuttest("外交部发言人马朝旭")
|
cuttest("外交部发言人马朝旭")
|
||||||
cuttest("领导人会议和第四届东亚峰会")
|
cuttest("领导人会议和第四届东亚峰会")
|
||||||
cuttest("在过去的这五年")
|
cuttest("在过去的这五年")
|
||||||
cuttest("还需要很长的路要走")
|
cuttest("还需要很长的路要走")
|
||||||
cuttest("60周年首都阅兵")
|
cuttest("60周年首都阅兵")
|
||||||
cuttest("你好人们审美的观点是不同的")
|
cuttest("你好人们审美的观点是不同的")
|
||||||
cuttest("买水果然后来世博园")
|
cuttest("买水果然后来世博园")
|
||||||
cuttest("买水果然后去世博园")
|
cuttest("买水果然后去世博园")
|
||||||
cuttest("但是后来我才知道你是对的")
|
cuttest("但是后来我才知道你是对的")
|
||||||
cuttest("存在即合理")
|
cuttest("存在即合理")
|
||||||
cuttest("的的的的的在的的的的就以和和和")
|
cuttest("的的的的的在的的的的就以和和和")
|
||||||
cuttest("I love你,不以为耻,反以为rong")
|
cuttest("I love你,不以为耻,反以为rong")
|
||||||
cuttest("因")
|
cuttest("因")
|
||||||
cuttest("")
|
cuttest("")
|
||||||
cuttest("hello你好人们审美的观点是不同的")
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
cuttest("很好但主要是基于网页形式")
|
cuttest("很好但主要是基于网页形式")
|
||||||
cuttest("hello你好人们审美的观点是不同的")
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
cuttest("为什么我不能拥有想要的生活")
|
cuttest("为什么我不能拥有想要的生活")
|
||||||
cuttest("后来我才")
|
cuttest("后来我才")
|
||||||
cuttest("此次来中国是为了")
|
cuttest("此次来中国是为了")
|
||||||
cuttest("使用了它就可以解决一些问题")
|
cuttest("使用了它就可以解决一些问题")
|
||||||
cuttest(",使用了它就可以解决一些问题")
|
cuttest(",使用了它就可以解决一些问题")
|
||||||
cuttest("其实使用了它就可以解决一些问题")
|
cuttest("其实使用了它就可以解决一些问题")
|
||||||
cuttest("好人使用了它就可以解决一些问题")
|
cuttest("好人使用了它就可以解决一些问题")
|
||||||
cuttest("是因为和国家")
|
cuttest("是因为和国家")
|
||||||
cuttest("老年搜索还支持")
|
cuttest("老年搜索还支持")
|
||||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||||
cuttest("大")
|
cuttest("大")
|
||||||
cuttest("")
|
cuttest("")
|
||||||
cuttest("他说的确实在理")
|
cuttest("他说的确实在理")
|
||||||
cuttest("长春市长春节讲话")
|
cuttest("长春市长春节讲话")
|
||||||
cuttest("结婚的和尚未结婚的")
|
cuttest("结婚的和尚未结婚的")
|
||||||
cuttest("结合成分子时")
|
cuttest("结合成分子时")
|
||||||
cuttest("旅游和服务是最好的")
|
cuttest("旅游和服务是最好的")
|
||||||
cuttest("这件事情的确是我的错")
|
cuttest("这件事情的确是我的错")
|
||||||
cuttest("供大家参考指正")
|
cuttest("供大家参考指正")
|
||||||
cuttest("哈尔滨政府公布塌桥原因")
|
cuttest("哈尔滨政府公布塌桥原因")
|
||||||
cuttest("我在机场入口处")
|
cuttest("我在机场入口处")
|
||||||
cuttest("邢永臣摄影报道")
|
cuttest("邢永臣摄影报道")
|
||||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||||
cuttest("南京市长江大桥")
|
cuttest("南京市长江大桥")
|
||||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||||
cuttest('长春市长春药店')
|
cuttest('长春市长春药店')
|
||||||
cuttest('邓颖超生前最喜欢的衣服')
|
cuttest('邓颖超生前最喜欢的衣服')
|
||||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||||
cuttest('一次性交多少钱')
|
cuttest('一次性交多少钱')
|
||||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||||
|
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||||
|
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||||
|
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||||
|
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||||
|
test/test_cutall.py (new file, 101 lines)
@@ -0,0 +1,101 @@
|
|||||||
|
#encoding=utf-8
|
||||||
|
from __future__ import print_function
|
||||||
|
import sys
|
||||||
|
sys.path.append("../")
|
||||||
|
import jieba
|
||||||
|
|
||||||
|
def cuttest(test_sent):
|
||||||
|
result = jieba.cut(test_sent,cut_all=True)
|
||||||
|
for word in result:
|
||||||
|
print(word, "/", end=' ')
|
||||||
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
|
cuttest("我不喜欢日本和服。")
|
||||||
|
cuttest("雷猴回归人间。")
|
||||||
|
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||||
|
cuttest("我需要廉租房")
|
||||||
|
cuttest("永和服装饰品有限公司")
|
||||||
|
cuttest("我爱北京天安门")
|
||||||
|
cuttest("abc")
|
||||||
|
cuttest("隐马尔可夫")
|
||||||
|
cuttest("雷猴是个好网站")
|
||||||
|
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||||
|
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||||
|
cuttest("伊藤洋华堂总府店")
|
||||||
|
cuttest("中国科学院计算技术研究所")
|
||||||
|
cuttest("罗密欧与朱丽叶")
|
||||||
|
cuttest("我购买了道具和服装")
|
||||||
|
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||||
|
cuttest("湖北省石首市")
|
||||||
|
cuttest("湖北省十堰市")
|
||||||
|
cuttest("总经理完成了这件事情")
|
||||||
|
cuttest("电脑修好了")
|
||||||
|
cuttest("做好了这件事情就一了百了了")
|
||||||
|
cuttest("人们审美的观点是不同的")
|
||||||
|
cuttest("我们买了一个美的空调")
|
||||||
|
cuttest("线程初始化时我们要注意")
|
||||||
|
cuttest("一个分子是由好多原子组织成的")
|
||||||
|
cuttest("祝你马到功成")
|
||||||
|
cuttest("他掉进了无底洞里")
|
||||||
|
cuttest("中国的首都是北京")
|
||||||
|
cuttest("孙君意")
|
||||||
|
cuttest("外交部发言人马朝旭")
|
||||||
|
cuttest("领导人会议和第四届东亚峰会")
|
||||||
|
cuttest("在过去的这五年")
|
||||||
|
cuttest("还需要很长的路要走")
|
||||||
|
cuttest("60周年首都阅兵")
|
||||||
|
cuttest("你好人们审美的观点是不同的")
|
||||||
|
cuttest("买水果然后来世博园")
|
||||||
|
cuttest("买水果然后去世博园")
|
||||||
|
cuttest("但是后来我才知道你是对的")
|
||||||
|
cuttest("存在即合理")
|
||||||
|
cuttest("的的的的的在的的的的就以和和和")
|
||||||
|
cuttest("I love你,不以为耻,反以为rong")
|
||||||
|
cuttest("因")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("很好但主要是基于网页形式")
|
||||||
|
cuttest("hello你好人们审美的观点是不同的")
|
||||||
|
cuttest("为什么我不能拥有想要的生活")
|
||||||
|
cuttest("后来我才")
|
||||||
|
cuttest("此次来中国是为了")
|
||||||
|
cuttest("使用了它就可以解决一些问题")
|
||||||
|
cuttest(",使用了它就可以解决一些问题")
|
||||||
|
cuttest("其实使用了它就可以解决一些问题")
|
||||||
|
cuttest("好人使用了它就可以解决一些问题")
|
||||||
|
cuttest("是因为和国家")
|
||||||
|
cuttest("老年搜索还支持")
|
||||||
|
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||||
|
cuttest("大")
|
||||||
|
cuttest("")
|
||||||
|
cuttest("他说的确实在理")
|
||||||
|
cuttest("长春市长春节讲话")
|
||||||
|
cuttest("结婚的和尚未结婚的")
|
||||||
|
cuttest("结合成分子时")
|
||||||
|
cuttest("旅游和服务是最好的")
|
||||||
|
cuttest("这件事情的确是我的错")
|
||||||
|
cuttest("供大家参考指正")
|
||||||
|
cuttest("哈尔滨政府公布塌桥原因")
|
||||||
|
cuttest("我在机场入口处")
|
||||||
|
cuttest("邢永臣摄影报道")
|
||||||
|
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||||
|
cuttest("南京市长江大桥")
|
||||||
|
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||||
|
cuttest('长春市长春药店')
|
||||||
|
cuttest('邓颖超生前最喜欢的衣服')
|
||||||
|
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||||
|
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||||
|
cuttest('一次性交多少钱')
|
||||||
|
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||||
|
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||||
|
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||||
|
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||||
|
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||||
|
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||||
|
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||||
|
jieba.add_word('超敏C反应蛋白')
|
||||||
|
cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')
|
||||||
|
cuttest('steel健身爆发力运动兴奋补充剂')
|
@@ -1,20 +1,21 @@
-import urllib2
-import sys,time
+import time
 import sys
 sys.path.append("../")
 import jieba
+jieba.initialize()

 url = sys.argv[1]
 content = open(url,"rb").read()
 t1 = time.time()
-words = list(jieba.cut(content))
+words = "/ ".join(jieba.cut(content))

 t2 = time.time()
 tm_cost = t2-t1

 log_f = open("1.log","wb")
-for w in words:
-    print >> log_f, w.encode("gbk"), "/" ,
+log_f.write(words.encode('utf-8'))
+log_f.close()

-print 'speed' , len(content)/tm_cost, " bytes/second"
+print('cost ' + str(tm_cost))
+print('speed %s bytes/second' % (len(content)/tm_cost))
42  test/test_lock.py  Normal file
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import jieba
import threading

def inittokenizer(tokenizer, group):
    print('===> Thread %s:%s started' % (group, threading.current_thread().ident))
    tokenizer.initialize()
    print('<=== Thread %s:%s finished' % (group, threading.current_thread().ident))

tokrs1 = [jieba.Tokenizer() for n in range(5)]
tokrs2 = [jieba.Tokenizer('../extra_dict/dict.txt.small') for n in range(5)]

thr1 = [threading.Thread(target=inittokenizer, args=(tokr, 1)) for tokr in tokrs1]
thr2 = [threading.Thread(target=inittokenizer, args=(tokr, 2)) for tokr in tokrs2]
for thr in thr1:
    thr.start()
for thr in thr2:
    thr.start()
for thr in thr1:
    thr.join()
for thr in thr2:
    thr.join()

del tokrs1, tokrs2

print('='*40)

tokr1 = jieba.Tokenizer()
tokr2 = jieba.Tokenizer('../extra_dict/dict.txt.small')

thr1 = [threading.Thread(target=inittokenizer, args=(tokr1, 1)) for n in range(5)]
thr2 = [threading.Thread(target=inittokenizer, args=(tokr2, 2)) for n in range(5)]
for thr in thr1:
    thr.start()
for thr in thr2:
    thr.start()
for thr in thr1:
    thr.join()
for thr in thr2:
    thr.join()
29  test/test_multithread.py  Normal file
@@ -0,0 +1,29 @@
#encoding=utf-8
import sys
import threading
sys.path.append("../")

import jieba

class Worker(threading.Thread):
    def run(self):
        seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
        print("Full Mode:" + "/ ".join(seg_list)) #全模式

        seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
        print("Default Mode:" + "/ ".join(seg_list)) #默认模式

        seg_list = jieba.cut("他来到了网易杭研大厦")
        print(", ".join(seg_list))

        seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
        print(", ".join(seg_list))

workers = []
for i in range(10):
    worker = Worker()
    workers.append(worker)
    worker.start()

for worker in workers:
    worker.join()
100  test/test_no_hmm.py  Normal file
@@ -0,0 +1,100 @@
#encoding=utf-8
import sys
sys.path.append("../")
import jieba


def cuttest(test_sent):
    result = jieba.cut(test_sent,HMM=False)
    print(" / ".join(result))


if __name__ == "__main__":
    # ... (the shared battery of cuttest() test sentences, as in test/test_cutall.py) ...
    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
    cuttest('枪杆子中出政权')
    cuttest('张三风同学走上了不归路')
    cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
    cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
102  test/test_paddle.py  Normal file
@@ -0,0 +1,102 @@
#encoding=utf-8
import sys
sys.path.append("../")
import jieba
jieba.enable_paddle()


def cuttest(test_sent):
    result = jieba.cut(test_sent, use_paddle=True)
    print(" / ".join(result))


if __name__ == "__main__":
    # ... (the shared battery of cuttest() test sentences, as in test/test_cutall.py) ...
    cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
    cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
    jieba.del_word('很赞')
    cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?')
102  test/test_paddle_postag.py  Normal file
@@ -0,0 +1,102 @@
#encoding=utf-8
import sys
sys.path.append("../")
import jieba.posseg as pseg
import jieba
jieba.enable_paddle()


def cuttest(test_sent):
    result = pseg.cut(test_sent, use_paddle=True)
    for word, flag in result:
        print('%s %s' % (word, flag))


if __name__ == "__main__":
    # ... (the shared battery of cuttest() test sentences, as in test/test_cutall.py) ...
    cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
    cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
174  test/test_pos.py
@@ -1,93 +1,99 @@
 #encoding=utf-8
+from __future__ import print_function
 import sys
 sys.path.append("../")
 import jieba.posseg as pseg


 def cuttest(test_sent):
     result = pseg.cut(test_sent)
-    for w in result:
-        print w.word, "/", w.flag, ", ",
-    print ""
+    for word, flag in result:
+        print(word, "/", flag, ", ", end=' ')
+    print("")


 if __name__ == "__main__":
     # ... (unchanged: the shared battery of cuttest() test sentences, as in test/test_cutall.py) ...
     cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
+    cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
+    cuttest('AT&T是一件不错的公司,给你发offer了吗?')
+    cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
+    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
+    cuttest('枪杆子中出政权')
@@ -1,7 +1,9 @@
-import urllib2
-import sys,time
+from __future__ import print_function
 import sys
+import time
 sys.path.append("../")
+import jieba
+jieba.initialize()
 import jieba.posseg as pseg

 url = sys.argv[1]
@@ -12,9 +14,8 @@ words = list(pseg.cut(content))
 t2 = time.time()
 tm_cost = t2-t1

-log_f = open("1.log","wb")
-for w in words:
-    print >> log_f, w.encode("gbk"), "/" ,
+log_f = open("1.log","w")
+log_f.write(' / '.join(map(str, words)))

-print 'speed' , len(content)/tm_cost, " bytes/second"
+print('speed' , len(content)/tm_cost, " bytes/second")
99  test/test_pos_no_hmm.py  Normal file
@@ -0,0 +1,99 @@
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../")
import jieba.posseg as pseg


def cuttest(test_sent):
    result = pseg.cut(test_sent, HMM=False)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")


if __name__ == "__main__":
    # ... (the shared battery of cuttest() test sentences, as in test/test_cutall.py) ...
    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
    cuttest('枪杆子中出政权')
106  test/test_tokenize.py  Normal file
@@ -0,0 +1,106 @@
#encoding=utf-8
from __future__ import print_function,unicode_literals
import sys
sys.path.append("../")
import jieba

g_mode="default"

def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))


if __name__ == "__main__":
    for m in ("default","search"):
        g_mode = m
        # ... (the shared battery of cuttest() test sentences, as in test/test_cutall.py) ...
        cuttest('枪杆子中出政权')
        cuttest('张三风同学走上了不归路')
        cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
        cuttest('在1号店能买到小S和大S八卦的书。')
106  test/test_tokenize_no_hmm.py  Normal file
@@ -0,0 +1,106 @@
#encoding=utf-8
from __future__ import print_function,unicode_literals
import sys
sys.path.append("../")
import jieba

g_mode="default"

def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))


if __name__ == "__main__":
    for m in ("default","search"):
        g_mode = m
        # ... (the shared battery of cuttest() test sentences, as in test/test_cutall.py) ...
        cuttest('枪杆子中出政权')
        cuttest('张三风同学走上了不归路')
        cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
        cuttest('在1号店能买到小S和大S八卦的书。')
48  test/test_userdict.py  Normal file
@@ -0,0 +1,48 @@
#encoding=utf-8
from __future__ import print_function, unicode_literals
import sys
sys.path.append("../")
import jieba
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg

jieba.add_word('石墨烯')
jieba.add_word('凱特琳')
jieba.del_word('自定义词')

test_sent = (
    "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
    "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
    "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
)
words = jieba.cut(test_sent)
print('/'.join(words))

print("="*40)

result = pseg.cut(test_sent)

for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')

print("\n" + "="*40)

terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))

print("="*40)
# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-"*40)
64  test/test_whoosh.py  Normal file
@@ -0,0 +1,64 @@
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys,os
sys.path.append("../")
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse.analyzer import ChineseAnalyzer

analyzer = ChineseAnalyzer()

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
    os.mkdir("tmp")

ix = create_in("tmp", schema) # for create new index
#ix = open_dir("tmp") # for read only
writer = ix.writer()

writer.add_document(
    title="document1",
    path="/a",
    content="This is the first document we’ve added!"
)

writer.add_document(
    title="document2",
    path="/b",
    content="The second one 你 中文测试中文 is even more interesting! 吃水果"
)

writer.add_document(
    title="document3",
    path="/c",
    content="买水果然后来世博园。"
)

writer.add_document(
    title="document4",
    path="/c",
    content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
)

writer.add_document(
    title="document4",
    path="/c",
    content="咱俩交换一下吧。"
)

writer.commit()
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)

for keyword in ("水果世博园","你","first","中文","交换机","交换"):
    print("result of ",keyword)
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print(hit.highlights("content"))
    print("="*10)

for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
    print(t.text)
Some files were not shown because too many files have changed in this diff.