mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Use CR/LF line breaks (instead of any whitespace) as the separator when splitting text into chunks in parallel mode
This commit is contained in:
parent
6b83593b5a
commit
b46166f768
@ -233,7 +233,7 @@ def enable_parallel(processnum):
|
|||||||
pool = Pool(processnum)
|
pool = Pool(processnum)
|
||||||
|
|
||||||
def pcut(sentence,cut_all=False):
|
def pcut(sentence,cut_all=False):
|
||||||
parts = re.compile('(\s+)').split(sentence)
|
parts = re.compile('([\r\n]+)').split(sentence)
|
||||||
if cut_all:
|
if cut_all:
|
||||||
result = pool.map(__lcut_all,parts)
|
result = pool.map(__lcut_all,parts)
|
||||||
else:
|
else:
|
||||||
@ -243,7 +243,7 @@ def enable_parallel(processnum):
|
|||||||
yield w
|
yield w
|
||||||
|
|
||||||
def pcut_for_search(sentence):
|
def pcut_for_search(sentence):
|
||||||
parts = re.compile('(\s+)').split(sentence)
|
parts = re.compile('([\r\n]+)').split(sentence)
|
||||||
result = pool.map(__lcut_for_search,parts)
|
result = pool.map(__lcut_for_search,parts)
|
||||||
for r in result:
|
for r in result:
|
||||||
for w in r:
|
for w in r:
|
||||||
|
@ -155,7 +155,7 @@ def cut(sentence):
|
|||||||
for w in __cut_internal(sentence):
|
for w in __cut_internal(sentence):
|
||||||
yield w
|
yield w
|
||||||
else:
|
else:
|
||||||
parts = re.compile('(\s+)').split(sentence)
|
parts = re.compile('([\r\n]+)').split(sentence)
|
||||||
result = jieba.pool.map(__lcut_internal,parts)
|
result = jieba.pool.map(__lcut_internal,parts)
|
||||||
for r in result:
|
for r in result:
|
||||||
for w in r:
|
for w in r:
|
||||||
|
34
test/parallel/extract_tags.py
Normal file
34
test/parallel/extract_tags.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
"""Print the top-K keywords of a text file, using jieba in parallel mode.

Usage: python extract_tags.py [file name] -k [top k]

The keywords are written to stdout as a single comma-separated line.
When -k is omitted, 10 keywords are extracted.
"""
import sys
sys.path.append('../../')  # make the jieba package two directories up importable

import jieba
jieba.enable_parallel(4)  # segment with a pool of 4 worker processes
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()

# The input file name is the only required positional argument.
if len(args) < 1:
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(USAGE)
    sys.exit(1)

file_name = args[0]

# -k arrives as a string (or None when not given); default to 10 keywords.
topK = 10 if opt.topK is None else int(opt.topK)

# Read the raw bytes and make sure the handle is closed even on error.
with open(file_name, 'rb') as source:
    content = source.read()

tags = jieba.analyse.extract_tags(content, topK=topK)

print(",".join(tags))
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user