From 5b2ec920ed20952095bd67a7dc8175d73cfb2bed Mon Sep 17 00:00:00 2001
From: davidlihm
Date: Thu, 15 May 2014 07:55:11 +0800
Subject: [PATCH] Update __init__.py

---
 jieba/__init__.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/jieba/__init__.py b/jieba/__init__.py
index 0c7ee24..6ba3b22 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -243,11 +243,22 @@ def __cut_DAG(sentence):
                 yield elem
 
 def cut(sentence,cut_all=False,HMM=True):
+    '''The main function that segments an entire sentence containing
+    Chinese characters into separate words.
+    Parameters:
+        - sentence: the str/unicode to be segmented.
+        - cut_all: segmentation mode; True for full mode, False for accurate mode.
+        - HMM: whether to use the Hidden Markov Model to recognize unknown words.
+    '''
     if not isinstance(sentence, unicode):
         try:
             sentence = sentence.decode('utf-8')
         except UnicodeDecodeError:
             sentence = sentence.decode('gbk','ignore')
+    # Two patterns used below:
+    #   \u4E00-\u9FA5a-zA-Z0-9+#&\._ : non-space characters; blocks matched by re_han are segmented.
+    #   \r\n|\s : whitespace characters; matched by re_skip and yielded without segmentation.
+    # (Both patterns are replaced with looser ones in full mode.)
     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
     if cut_all:
         re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
@@ -292,6 +303,15 @@ def cut_for_search(sentence,HMM=True):
 
 @require_initialized
 def load_userdict(f):
+    '''Load a personalized dictionary to improve the detection rate of domain words.
+    Parameter:
+        - f: a plain-text file (path or file object) containing words and their frequencies.
+    Structure of the dict file:
+        word1 freq1 word_type1
+        word2 freq2 word_type2
+        ...
+    The word type may be omitted.
+    '''
     global trie,total,FREQ
     if isinstance(f, (str, unicode)):
         f = open(f, 'rb')
@@ -302,6 +322,7 @@
         if line.rstrip()=='': continue
         tup =line.split(" ")
         word,freq = tup[0],tup[1]
+        if not freq.isdigit(): continue  # skip lines whose freq field is not numeric
        if line_no==1:
             word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
         if len(tup)==3:
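
A minimal usage sketch of cut() as documented above, assuming the Python 2
jieba API of this era, where cut() returns a generator of unicode tokens:

    # -*- coding: utf-8 -*-
    import jieba

    s = u"我来到北京清华大学"
    print "/".join(jieba.cut(s))                # accurate mode (default)
    print "/".join(jieba.cut(s, cut_all=True))  # full mode: every dictionary word found
    print "/".join(jieba.cut(s, HMM=False))     # no HMM: unknown words fall back to single characters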
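
To see how the two patterns documented in the new comment partition the
input, a small standalone sketch of the split step, independent of jieba
itself:

    # -*- coding: utf-8 -*-
    import re

    re_han = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U)
    # Splitting on a capturing group keeps the matched blocks in the result:
    # [u'', u'Python', u' ', u'3.0', u' ', u'心得分享', u'']
    print re_han.split(u"Python 3.0 心得分享")
    # Blocks matched by re_han go through the DAG segmenter; the remaining
    # pieces are tested against re_skip and yielded without segmentation.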
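
And a sketch of the dictionary format described in the load_userdict()
docstring; the file name and entries are illustrative, and the third entry
shows the kind of malformed line the new isdigit() guard now skips:

    # -*- coding: utf-8 -*-
    import jieba

    # userdict.txt, one entry per line: word freq [word_type]
    #   云计算 5
    #   创新办 3 i
    #   凱特琳 nz        <- freq field is not numeric; the guard skips this line
    jieba.load_userdict("userdict.txt")
    print "/".join(jieba.cut(u"小明是创新办主任"))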