mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
merge change from chao78787
This commit is contained in:
commit
9ea14a8a54
@ -144,7 +144,7 @@ Output:
|
|||||||
* 实验结果:在4核3.4GHz Linux机器上,对金庸全集进行精确分词,获得了1MB/s的速度,是单进程版的3.3倍。
|
* 实验结果:在4核3.4GHz Linux机器上,对金庸全集进行精确分词,获得了1MB/s的速度,是单进程版的3.3倍。
|
||||||
|
|
||||||
|
|
||||||
功能 5) : Tokenize:返回词语在原文的起始位置
|
功能 6) : Tokenize:返回词语在原文的起始位置
|
||||||
============================================
|
============================================
|
||||||
* 注意,输入参数只接受unicode
|
* 注意,输入参数只接受unicode
|
||||||
* 默认模式
|
* 默认模式
|
||||||
@ -181,7 +181,7 @@ word 有限公司 start: 6 end:10
|
|||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
功能 6) : ChineseAnalyzer for Whoosh搜索引擎
|
功能 7) : ChineseAnalyzer for Whoosh搜索引擎
|
||||||
============================================
|
============================================
|
||||||
* 引用: `from jieba.analyse import ChineseAnalyzer `
|
* 引用: `from jieba.analyse import ChineseAnalyzer `
|
||||||
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py
|
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py
|
||||||
|
@ -264,15 +264,22 @@ def load_userdict(f):
|
|||||||
if line_no==1:
|
if line_no==1:
|
||||||
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
|
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
|
||||||
if len(tup)==3:
|
if len(tup)==3:
|
||||||
user_word_tag_tab[word]=tup[2].strip()
|
add_word(word, freq, tup[2])
|
||||||
|
else:
|
||||||
|
add_word(word, freq)
|
||||||
|
|
||||||
|
def add_word(word, freq, tag=None):
|
||||||
|
global FREQ, trie, total, user_word_tag_tab
|
||||||
freq = float(freq)
|
freq = float(freq)
|
||||||
FREQ[word] = log(freq / total)
|
FREQ[word] = log(freq / total)
|
||||||
|
if tag is not None:
|
||||||
|
user_word_tag_tab[word] = tag.strip()
|
||||||
p = trie
|
p = trie
|
||||||
for c in word:
|
for c in word:
|
||||||
if not c in p:
|
if not c in p:
|
||||||
p[c] ={}
|
p[c] = {}
|
||||||
p = p[c]
|
p = p[c]
|
||||||
p['']='' #ending flag
|
p[''] = '' # ending flag
|
||||||
|
|
||||||
__ref_cut = cut
|
__ref_cut = cut
|
||||||
__ref_cut_for_search = cut_for_search
|
__ref_cut_for_search = cut_for_search
|
||||||
|
2
setup.py
2
setup.py
@ -1,6 +1,6 @@
|
|||||||
from distutils.core import setup
|
from distutils.core import setup
|
||||||
setup(name='jieba',
|
setup(name='jieba',
|
||||||
version='0.30',
|
version='0.31.alpha',
|
||||||
description='Chinese Words Segementation Utilities',
|
description='Chinese Words Segementation Utilities',
|
||||||
author='Sun, Junyi',
|
author='Sun, Junyi',
|
||||||
author_email='ccnusjy@gmail.com',
|
author_email='ccnusjy@gmail.com',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user