mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-24 00:00:05 +08:00
Merge pull request #75 from chao787/feature_richard
Refactoring jieba/__init__.py
This commit is contained in:
commit
a1ad2cbd55
@ -1,8 +1,7 @@
|
|||||||
from __future__ import with_statement
|
from __future__ import with_statement
|
||||||
import re
|
import re
|
||||||
import math
|
import os
|
||||||
import os,sys
|
import sys
|
||||||
import pprint
|
|
||||||
import finalseg
|
import finalseg
|
||||||
import time
|
import time
|
||||||
import tempfile
|
import tempfile
|
||||||
@ -103,16 +102,18 @@ def initialize(*args):
|
|||||||
|
|
||||||
|
|
||||||
def require_initialized(fn):
|
def require_initialized(fn):
|
||||||
global initialized,DICTIONARY
|
global initialized,DICTIONARY
|
||||||
|
|
||||||
@wraps(fn)
|
@wraps(fn)
|
||||||
def wrapped(*args, **kwargs):
|
def wrapped(*args, **kwargs):
|
||||||
if initialized:
|
if initialized:
|
||||||
return fn(*args, **kwargs)
|
return fn(*args, **kwargs)
|
||||||
else:
|
else:
|
||||||
initialize(DICTIONARY)
|
initialize(DICTIONARY)
|
||||||
return fn(*args, **kwargs)
|
return fn(*args, **kwargs)
|
||||||
return wrapped
|
|
||||||
|
return wrapped
|
||||||
|
|
||||||
|
|
||||||
def __cut_all(sentence):
|
def __cut_all(sentence):
|
||||||
dag = get_DAG(sentence)
|
dag = get_DAG(sentence)
|
||||||
@ -211,18 +212,18 @@ def cut(sentence,cut_all=False):
|
|||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\s+)", re.U)
|
||||||
if cut_all:
|
if cut_all:
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
cut_block = __cut_DAG
|
cut_block = __cut_DAG
|
||||||
if cut_all:
|
if cut_all:
|
||||||
cut_block = __cut_all
|
cut_block = __cut_all
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
if re_han.match(blk):
|
if re_han.match(blk):
|
||||||
#pprint.pprint(__cut_DAG(blk))
|
#pprint.pprint(__cut_DAG(blk))
|
||||||
for word in cut_block(blk):
|
for word in cut_block(blk):
|
||||||
yield word
|
yield word
|
||||||
else:
|
else:
|
||||||
tmp = re_skip.split(blk)
|
tmp = re_skip.split(blk)
|
||||||
for x in tmp:
|
for x in tmp:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user