Merge pull request #75 from chao787/feature_richard

Refactoring jieba/__init__.py
This commit is contained in:
Sun Junyi 2013-07-10 01:34:43 -07:00
commit a1ad2cbd55

View File

@@ -1,8 +1,7 @@
from __future__ import with_statement from __future__ import with_statement
import re import re
import math import os
import os,sys import sys
import pprint
import finalseg import finalseg
import time import time
import tempfile import tempfile
@@ -103,16 +102,18 @@ def initialize(*args):
def require_initialized(fn): def require_initialized(fn):
global initialized,DICTIONARY global initialized,DICTIONARY
@wraps(fn) @wraps(fn)
def wrapped(*args, **kwargs): def wrapped(*args, **kwargs):
if initialized: if initialized:
return fn(*args, **kwargs) return fn(*args, **kwargs)
else: else:
initialize(DICTIONARY) initialize(DICTIONARY)
return fn(*args, **kwargs) return fn(*args, **kwargs)
return wrapped
return wrapped
def __cut_all(sentence): def __cut_all(sentence):
dag = get_DAG(sentence) dag = get_DAG(sentence)
@@ -211,18 +212,18 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8') sentence = sentence.decode('utf-8')
except UnicodeDecodeError: except UnicodeDecodeError:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\s+)", re.U)
if cut_all: if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
cut_block = __cut_DAG cut_block = __cut_DAG
if cut_all: if cut_all:
cut_block = __cut_all cut_block = __cut_all
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
#pprint.pprint(__cut_DAG(blk)) #pprint.pprint(__cut_DAG(blk))
for word in cut_block(blk): for word in cut_block(blk):
yield word yield word
else: else:
tmp = re_skip.split(blk) tmp = re_skip.split(blk)
for x in tmp: for x in tmp: