mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
suggest word frequency, support passing str to add_word
This commit is contained in:
parent
1b4721ebb8
commit
4a552ca94f
@ -240,8 +240,10 @@ re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
||||
|
||||
|
||||
def cut(sentence, cut_all=False, HMM=True):
|
||||
'''The main function that segments an entire sentence that contains
|
||||
'''
|
||||
The main function that segments an entire sentence that contains
|
||||
Chinese characters into separated words.
|
||||
|
||||
Parameter:
|
||||
- sentence: The str(unicode) to be segmented.
|
||||
- cut_all: Model type. True for full pattern, False for accurate pattern.
|
||||
@ -284,6 +286,9 @@ def cut(sentence, cut_all=False, HMM=True):
|
||||
|
||||
|
||||
def cut_for_search(sentence, HMM=True):
|
||||
"""
|
||||
Finer segmentation for search engines.
|
||||
"""
|
||||
words = cut(sentence, HMM=HMM)
|
||||
for w in words:
|
||||
if len(w) > 2:
|
||||
@ -301,9 +306,12 @@ def cut_for_search(sentence, HMM=True):
|
||||
|
||||
@require_initialized
|
||||
def load_userdict(f):
|
||||
''' Load personalized dict to improve detect rate.
|
||||
'''
|
||||
Load personalized dict to improve detect rate.
|
||||
|
||||
Parameter:
|
||||
- f : A plain text file that contains words and their occurrences.
|
||||
|
||||
Structure of dict file:
|
||||
word1 freq1 word_type1
|
||||
word2 freq2 word_type2
|
||||
@ -315,18 +323,32 @@ def load_userdict(f):
|
||||
content = f.read().decode('utf-8').lstrip('\ufeff')
|
||||
line_no = 0
|
||||
for line in content.splitlines():
|
||||
line_no += 1
|
||||
if not line.rstrip():
|
||||
continue
|
||||
tup = line.strip().split(" ")
|
||||
if tup[1].isdigit():
|
||||
try:
|
||||
line_no += 1
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
tup = line.split(" ")
|
||||
add_word(*tup)
|
||||
except Exception as e:
|
||||
logger.debug('%s at line %s %s' % (f_name, lineno, line))
|
||||
raise e
|
||||
|
||||
|
||||
@require_initialized
|
||||
def add_word(word, freq, tag=None):
|
||||
def add_word(word, freq=None, tag=None):
|
||||
"""
|
||||
Add a word to dictionary.
|
||||
|
||||
freq and tag can be omitted, freq defaults to be a calculated value
|
||||
that ensures the word can be cut out.
|
||||
"""
|
||||
global FREQ, total, user_word_tag_tab
|
||||
freq = int(freq)
|
||||
word = strdecode(word)
|
||||
if freq is None:
|
||||
freq = suggest_freq(word, False)
|
||||
else:
|
||||
freq = int(freq)
|
||||
FREQ[word] = freq
|
||||
total += freq
|
||||
if tag is not None:
|
||||
@ -336,6 +358,46 @@ def add_word(word, freq, tag=None):
|
||||
if wfrag not in FREQ:
|
||||
FREQ[wfrag] = 0
|
||||
|
||||
|
||||
def del_word(word):
    """
    Convenience wrapper for deleting a word.

    Parameter:
        - word : The word to delete.

    The entry is not removed from the dictionary; the call is delegated
    to add_word with a frequency of 0.
    """
    removed_freq = 0
    add_word(word, removed_freq)
|
||||
|
||||
|
||||
@require_initialized
def suggest_freq(segment, tune=False):
    """
    Suggest a word frequency that forces the characters of a word to be
    joined together or split apart.

    Parameter:
        - segment : The segments that the word is expected to be cut into.
                    If the word should be treated as a whole, pass a str;
                    otherwise pass a sequence of its pieces.
        - tune : If True, also store the suggested frequency by calling
                 add_word with it.

    Returns the suggested frequency.

    Note that HMM may affect the final result. If the result doesn't change,
    set HMM=False.
    """
    denominator = float(total)
    keep_whole = isinstance(segment, string_types)

    if keep_whole:
        word = segment
        # Pieces are whatever the segmenter currently produces without HMM.
        pieces = cut(word, HMM=False)
    else:
        pieces = tuple(map(strdecode, segment))
        word = ''.join(pieces)

    # Multiply the relative frequencies of all pieces; a piece missing
    # from FREQ counts as frequency 1.
    probability = 1
    for piece in pieces:
        probability *= FREQ.get(piece, 1) / denominator
    scaled = int(probability * total)

    if keep_whole:
        # One more than the count implied by the current pieces, and never
        # less than the word's existing frequency (1 when absent).
        suggested = max(scaled + 1, FREQ.get(word, 1))
    else:
        # Never more than the joined word's existing frequency
        # (0 when absent).
        suggested = min(scaled, FREQ.get(word, 0))

    if tune:
        add_word(word, suggested)
    return suggested
|
||||
|
||||
|
||||
__ref_cut = cut
|
||||
__ref_cut_for_search = cut_for_search
|
||||
|
||||
@ -402,8 +464,8 @@ def set_dictionary(dictionary_path):
|
||||
global initialized, DICTIONARY
|
||||
with DICT_LOCK:
|
||||
abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
|
||||
if not os.path.exists(abs_path):
|
||||
raise Exception("jieba: path does not exist: " + abs_path)
|
||||
if not os.path.isfile(abs_path):
|
||||
raise Exception("jieba: file does not exist: " + abs_path)
|
||||
DICTIONARY = abs_path
|
||||
initialized = False
|
||||
|
||||
@ -413,7 +475,9 @@ def get_abs_path_dict():
|
||||
|
||||
|
||||
def tokenize(unicode_sentence, mode="default", HMM=True):
|
||||
"""Tokenize a sentence and yields tuples of (word, start, end)
|
||||
"""
|
||||
Tokenize a sentence and yields tuples of (word, start, end)
|
||||
|
||||
Parameter:
|
||||
- sentence: the str(unicode) to be segmented.
|
||||
- mode: "default" or "search", "search" is for finer segmentation.
|
||||
|
@ -1,28 +1,48 @@
|
||||
#encoding=utf-8
|
||||
from __future__ import print_function
|
||||
from __future__ import print_function, unicode_literals
|
||||
import sys
|
||||
sys.path.append("../")
|
||||
import jieba
|
||||
jieba.load_userdict("userdict.txt")
|
||||
import jieba.posseg as pseg
|
||||
|
||||
test_sent = "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿"
|
||||
test_sent += "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
jieba.add_word('石墨烯')
|
||||
jieba.add_word('凱特琳')
|
||||
jieba.del_word('自定义词')
|
||||
|
||||
test_sent = (
|
||||
"李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
|
||||
"例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
|
||||
"「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
|
||||
)
|
||||
words = jieba.cut(test_sent)
|
||||
for w in words:
|
||||
print(w)
|
||||
print('/'.join(words))
|
||||
|
||||
print("="*40)
|
||||
|
||||
result = pseg.cut(test_sent)
|
||||
|
||||
for w in result:
|
||||
print(w.word, "/", w.flag, ", ", end=' ')
|
||||
|
||||
print("\n========")
|
||||
print("\n" + "="*40)
|
||||
|
||||
terms = jieba.cut('easy_install is great')
|
||||
for t in terms:
|
||||
print(t)
|
||||
print('-------------------------')
|
||||
print('/'.join(terms))
|
||||
terms = jieba.cut('python 的正则表达式是好用的')
|
||||
for t in terms:
|
||||
print(t)
|
||||
print('/'.join(terms))
|
||||
|
||||
print("="*40)
|
||||
# test frequency tune
|
||||
testlist = [
|
||||
('今天天气不错', ('今天', '天气')),
|
||||
('如果放到post中将出错。', ('中', '将')),
|
||||
('我们中出了一个叛徒', ('中', '出')),
|
||||
]
|
||||
|
||||
for sent, seg in testlist:
|
||||
print('/'.join(jieba.cut(sent, HMM=False)))
|
||||
word = ''.join(seg)
|
||||
print('%s Before: %s, After: %s' % (word, jieba.FREQ[word], jieba.suggest_freq(seg, True)))
|
||||
print('/'.join(jieba.cut(sent, HMM=False)))
|
||||
print("-"*40)
|
||||
|
@ -4,4 +4,5 @@
|
||||
easy_install 3 eng
|
||||
好用 300
|
||||
韩玉赏鉴 3 nz
|
||||
八一双鹿 3 nz
|
||||
八一双鹿 3 nz
|
||||
台中
|
||||
|
Loading…
x
Reference in New Issue
Block a user