mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
commit
2726a7c89b
@ -243,11 +243,22 @@ def __cut_DAG(sentence):
|
|||||||
yield elem
|
yield elem
|
||||||
|
|
||||||
def cut(sentence,cut_all=False,HMM=True):
|
def cut(sentence,cut_all=False,HMM=True):
|
||||||
|
'''The main function that segments an entire sentence that contains
|
||||||
|
Chinese characters into separated words.
|
||||||
|
Parameter:
|
||||||
|
- sentence: The String to be segmented
|
||||||
|
- cut_all: Mode. True means full mode, False means accurate mode.
|
||||||
|
- HMM: Whether to use the Hidden Markov Model.
|
||||||
|
'''
|
||||||
if not isinstance(sentence, unicode):
|
if not isinstance(sentence, unicode):
|
||||||
try:
|
try:
|
||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
|
'''
|
||||||
|
\u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
|
||||||
|
\r\n|\s : whitespace characters. Will not be handled.
|
||||||
|
'''
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
|
||||||
if cut_all:
|
if cut_all:
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
|
||||||
@ -292,6 +303,15 @@ def cut_for_search(sentence,HMM=True):
|
|||||||
|
|
||||||
@require_initialized
|
@require_initialized
|
||||||
def load_userdict(f):
|
def load_userdict(f):
|
||||||
|
''' Load a personalized dict to improve the detection rate.
|
||||||
|
Parameter:
|
||||||
|
- f : A plain text file that contains words and their occurrences.
|
||||||
|
Structure of dict file:
|
||||||
|
word1 freq1 word_type1
|
||||||
|
word2 freq2 word_type2
|
||||||
|
...
|
||||||
|
Word type may be ignored
|
||||||
|
'''
|
||||||
global trie,total,FREQ
|
global trie,total,FREQ
|
||||||
if isinstance(f, (str, unicode)):
|
if isinstance(f, (str, unicode)):
|
||||||
f = open(f, 'rb')
|
f = open(f, 'rb')
|
||||||
@ -302,6 +322,7 @@ def load_userdict(f):
|
|||||||
if line.rstrip()=='': continue
|
if line.rstrip()=='': continue
|
||||||
tup =line.split(" ")
|
tup =line.split(" ")
|
||||||
word,freq = tup[0],tup[1]
|
word,freq = tup[0],tup[1]
|
||||||
|
if freq.isdigit() is False: continue
|
||||||
if line_no==1:
|
if line_no==1:
|
||||||
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
|
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
|
||||||
if len(tup)==3:
|
if len(tup)==3:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user