suggest word frequency, support passing str to add_word

Dingyuan Wang 2015-03-14 12:44:19 +08:00
parent 1b4721ebb8
commit 4a552ca94f
3 changed files with 109 additions and 24 deletions

jieba/__init__.py

@@ -240,8 +240,10 @@ re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
 def cut(sentence, cut_all=False, HMM=True):
-    '''The main function that segments an entire sentence that contains
+    '''
+    The main function that segments an entire sentence that contains
     Chinese characters into seperated words.
+
     Parameter:
         - sentence: The str(unicode) to be segmented.
         - cut_all: Model type. True for full pattern, False for accurate pattern.
@@ -284,6 +286,9 @@ def cut(sentence, cut_all=False, HMM=True):
 def cut_for_search(sentence, HMM=True):
+    """
+    Finer segmentation for search engines.
+    """
     words = cut(sentence, HMM=HMM)
     for w in words:
         if len(w) > 2:
@@ -301,9 +306,12 @@ def cut_for_search(sentence, HMM=True):
 @require_initialized
 def load_userdict(f):
-    ''' Load personalized dict to improve detect rate.
+    '''
+    Load personalized dict to improve detect rate.
+
     Parameter:
         - f : A plain text file contains words and their ocurrences.
+
     Structure of dict file:
     word1 freq1 word_type1
     word2 freq2 word_type2
@@ -315,18 +323,32 @@ def load_userdict(f):
     content = f.read().decode('utf-8').lstrip('\ufeff')
     line_no = 0
     for line in content.splitlines():
-        line_no += 1
-        if not line.rstrip():
-            continue
-        tup = line.strip().split(" ")
-        if tup[1].isdigit():
-            add_word(*tup)
+        try:
+            line_no += 1
+            line = line.strip()
+            if not line:
+                continue
+            tup = line.split(" ")
+            add_word(*tup)
+        except Exception as e:
+            logger.debug('%s at line %s %s' % (f_name, lineno, line))
+            raise e


 @require_initialized
-def add_word(word, freq, tag=None):
+def add_word(word, freq=None, tag=None):
+    """
+    Add a word to dictionary.
+
+    freq and tag can be omitted, freq defaults to be a calculated value
+    that ensures the word can be cut out.
+    """
     global FREQ, total, user_word_tag_tab
-    freq = int(freq)
+    word = strdecode(word)
+    if freq is None:
+        freq = suggest_freq(word, False)
+    else:
+        freq = int(freq)
     FREQ[word] = freq
     total += freq
     if tag is not None:
@@ -336,6 +358,46 @@ def add_word(word, freq, tag=None):
         if wfrag not in FREQ:
             FREQ[wfrag] = 0


+def del_word(word):
+    """
+    Convenient function for deleting a word.
+    """
+    add_word(word, 0)
+
+
+@require_initialized
+def suggest_freq(segment, tune=False):
+    """
+    Suggest word frequency to force the characters in a word to be
+    joined or splitted.
+
+    Parameter:
+        - segment : The segments that the word is expected to be cut into,
+                    If the word should be treated as a whole, use a str.
+        - tune : If True, tune the word frequency.
+
+    Note that HMM may affect the final result. If the result doesn't change,
+    set HMM=False.
+    """
+    ftotal = float(total)
+    freq = 1
+    if isinstance(segment, string_types):
+        word = segment
+        for seg in cut(word, HMM=False):
+            freq *= FREQ.get(seg, 1) / ftotal
+        freq = max(int(freq*total) + 1, FREQ.get(word, 1))
+    else:
+        segment = tuple(map(strdecode, segment))
+        word = ''.join(segment)
+        for seg in segment:
+            freq *= FREQ.get(seg, 1) / ftotal
+        freq = min(int(freq*total), FREQ.get(word, 0))
+    if tune:
+        add_word(word, freq)
+    return freq
+
+
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
@@ -402,8 +464,8 @@ def set_dictionary(dictionary_path):
     global initialized, DICTIONARY
     with DICT_LOCK:
         abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
-        if not os.path.exists(abs_path):
-            raise Exception("jieba: path does not exist: " + abs_path)
+        if not os.path.isfile(abs_path):
+            raise Exception("jieba: file does not exist: " + abs_path)
         DICTIONARY = abs_path
         initialized = False
@@ -413,7 +475,9 @@ def get_abs_path_dict():
 def tokenize(unicode_sentence, mode="default", HMM=True):
-    """Tokenize a sentence and yields tuples of (word, start, end)
+    """
+    Tokenize a sentence and yields tuples of (word, start, end)
+
     Parameter:
         - sentence: the str(unicode) to be segmented.
         - mode: "default" or "search", "search" is for finer segmentation.
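Taken together, the jieba/__init__.py changes let callers add a word without hand-picking a frequency, and nudge frequencies to force a join or a split. A minimal sketch of the new surface as this commit defines it (the sample words are the ones the test below exercises; any utf-8 words work):

    import jieba

    jieba.add_word('石墨烯')                      # freq omitted -> suggest_freq(word, False)
    jieba.add_word('凱特琳', freq=42, tag='nz')   # explicit freq and tag still accepted
    jieba.del_word('自定义词')                    # sugar for add_word(word, 0)

    # str segment: raise the word's frequency just enough to keep it whole
    jieba.suggest_freq('台中', tune=True)
    # tuple segment: lower the joined word's frequency so it cuts as given
    jieba.suggest_freq(('中', '将'), tune=True)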

test/test_userdict.py

@@ -1,28 +1,48 @@
 #encoding=utf-8
-from __future__ import print_function
+from __future__ import print_function, unicode_literals
 import sys
 sys.path.append("../")
 import jieba
 jieba.load_userdict("userdict.txt")
 import jieba.posseg as pseg

-test_sent = "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿"
-test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
+jieba.add_word('石墨烯')
+jieba.add_word('凱特琳')
+jieba.del_word('自定义词')
+
+test_sent = (
+"李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
+"例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类\n"
+"「台中」正確應該不會被切開。mac上可分出「石墨烯」此時又可以分出來凱特琳了。"
+)
 words = jieba.cut(test_sent)
-for w in words:
-    print(w)
+print('/'.join(words))
+
+print("="*40)

 result = pseg.cut(test_sent)

 for w in result:
     print(w.word, "/", w.flag, ", ", end=' ')

-print("\n========")
+print("\n" + "="*40)

 terms = jieba.cut('easy_install is great')
-for t in terms:
-    print(t)
-print('-------------------------')
+print('/'.join(terms))
 terms = jieba.cut('python 的正则表达式是好用的')
-for t in terms:
-    print(t)
+print('/'.join(terms))
+
+print("="*40)
+# test frequency tune
+testlist = [
+('今天天气不错', ('今天', '天气')),
+('如果放到post中将出错。', ('中', '将')),
+('我们中出了一个叛徒', ('中', '出')),
+]
+
+for sent, seg in testlist:
+    print('/'.join(jieba.cut(sent, HMM=False)))
+    word = ''.join(seg)
+    print('%s Before: %s, After: %s' % (word, jieba.FREQ[word], jieba.suggest_freq(seg, True)))
+    print('/'.join(jieba.cut(sent, HMM=False)))
+    print("-"*40)

test/userdict.txt

@@ -4,4 +4,5 @@
 easy_install 3 eng
 好用 300
 韩玉赏鉴 3 nz
 八一双鹿 3 nz
+台中
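Because add_word now computes a default frequency, a user dictionary line may omit freq and tag, as the new 台中 entry does: load_userdict splits each line on spaces and hands the fields straight to add_word(*tup). A quick check, assuming userdict.txt sits in the working directory as in the test above:

    import jieba

    jieba.load_userdict("userdict.txt")   # each line: word [freq [tag]]
    # '台中' carried no frequency in the file, so it received a suggested
    # one that keeps it in one piece:
    print('/'.join(jieba.cut('「台中」正確應該不會被切開', HMM=False)))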