From 4a552ca94fbc7b7842add0f9b8d63c5ab16cab37 Mon Sep 17 00:00:00 2001
From: Dingyuan Wang
Date: Sat, 14 Mar 2015 12:44:19 +0800
Subject: [PATCH 1/2] suggest word frequency, support passing str to add_word

---
 jieba/__init__.py     | 88 +++++++++++++++++++++++++++++++++++++------
 test/test_userdict.py | 42 +++++++++++++++------
 test/userdict.txt     |  3 +-
 3 files changed, 109 insertions(+), 24 deletions(-)

diff --git a/jieba/__init__.py b/jieba/__init__.py
index 2188a8c..e24f137 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -240,8 +240,10 @@ re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
 
 
 def cut(sentence, cut_all=False, HMM=True):
-    '''The main function that segments an entire sentence that contains
+    '''
+    The main function that segments an entire sentence that contains
     Chinese characters into seperated words.
+
     Parameter:
     - sentence: The str(unicode) to be segmented.
     - cut_all: Model type. True for full pattern, False for accurate pattern.
@@ -284,6 +286,9 @@
 
 
 def cut_for_search(sentence, HMM=True):
+    """
+    Finer segmentation for search engines.
+    """
     words = cut(sentence, HMM=HMM)
     for w in words:
         if len(w) > 2:
@@ -301,9 +306,12 @@ def cut_for_search(sentence, HMM=True):
 
 @require_initialized
 def load_userdict(f):
-    ''' Load personalized dict to improve detect rate.
+    '''
+    Load personalized dict to improve detect rate.
+
     Parameter:
     - f : A plain text file contains words and their ocurrences.
+
     Structure of dict file:
     word1 freq1 word_type1
     word2 freq2 word_type2
@@ -315,18 +323,32 @@ def load_userdict(f):
     content = f.read().decode('utf-8').lstrip('\ufeff')
     line_no = 0
     for line in content.splitlines():
-        line_no += 1
-        if not line.rstrip():
-            continue
-        tup = line.strip().split(" ")
-        if tup[1].isdigit():
+        try:
+            line_no += 1
+            line = line.strip()
+            if not line:
+                continue
+            tup = line.split(" ")
             add_word(*tup)
+        except Exception as e:
+            logger.debug('invalid dictionary entry at line %s: %s' % (line_no, line))
+            raise e
 
 @require_initialized
-def add_word(word, freq, tag=None):
+def add_word(word, freq=None, tag=None):
+    """
+    Add a word to the dictionary.
+
+    freq and tag can be omitted; freq defaults to a calculated value
+    that ensures the word can be cut out.
+    """
     global FREQ, total, user_word_tag_tab
-    freq = int(freq)
+    word = strdecode(word)
+    if freq is None:
+        freq = suggest_freq(word, False)
+    else:
+        freq = int(freq)
     FREQ[word] = freq
     total += freq
     if tag is not None:
@@ -336,6 +358,46 @@
         if wfrag not in FREQ:
             FREQ[wfrag] = 0
 
+
+def del_word(word):
+    """
+    Convenience function for deleting a word.
+    """
+    add_word(word, 0)
+
+
+@require_initialized
+def suggest_freq(segment, tune=False):
+    """
+    Suggest word frequency to force the characters in a word to be
+    joined or split.
+
+    Parameter:
+    - segment : The segments that the word is expected to be cut into.
+                If the word should be treated as a whole, use a str.
+    - tune : If True, tune the word frequency.
+
+    Note that HMM may affect the final result. If the result doesn't change,
+    set HMM=False.
+ """ + ftotal = float(total) + freq = 1 + if isinstance(segment, string_types): + word = segment + for seg in cut(word, HMM=False): + freq *= FREQ.get(seg, 1) / ftotal + freq = max(int(freq*total) + 1, FREQ.get(word, 1)) + else: + segment = tuple(map(strdecode, segment)) + word = ''.join(segment) + for seg in segment: + freq *= FREQ.get(seg, 1) / ftotal + freq = min(int(freq*total), FREQ.get(word, 0)) + if tune: + add_word(word, freq) + return freq + + __ref_cut = cut __ref_cut_for_search = cut_for_search @@ -402,8 +464,8 @@ def set_dictionary(dictionary_path): global initialized, DICTIONARY with DICT_LOCK: abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path)) - if not os.path.exists(abs_path): - raise Exception("jieba: path does not exist: " + abs_path) + if not os.path.isfile(abs_path): + raise Exception("jieba: file does not exist: " + abs_path) DICTIONARY = abs_path initialized = False @@ -413,7 +475,9 @@ def get_abs_path_dict(): def tokenize(unicode_sentence, mode="default", HMM=True): - """Tokenize a sentence and yields tuples of (word, start, end) + """ + Tokenize a sentence and yields tuples of (word, start, end) + Parameter: - sentence: the str(unicode) to be segmented. - mode: "default" or "search", "search" is for finer segmentation. diff --git a/test/test_userdict.py b/test/test_userdict.py index ccb9758..2810afa 100644 --- a/test/test_userdict.py +++ b/test/test_userdict.py @@ -1,28 +1,48 @@ #encoding=utf-8 -from __future__ import print_function +from __future__ import print_function, unicode_literals import sys sys.path.append("../") import jieba jieba.load_userdict("userdict.txt") import jieba.posseg as pseg -test_sent = "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿" -test_sent += "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型" +jieba.add_word('石墨烯') +jieba.add_word('凱特琳') +jieba.del_word('自定义词') + +test_sent = ( +"李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n" +"例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n" +"「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。" +) words = jieba.cut(test_sent) -for w in words: - print(w) +print('/'.join(words)) + +print("="*40) result = pseg.cut(test_sent) for w in result: print(w.word, "/", w.flag, ", ", end=' ') -print("\n========") +print("\n" + "="*40) terms = jieba.cut('easy_install is great') -for t in terms: - print(t) -print('-------------------------') +print('/'.join(terms)) terms = jieba.cut('python 的正则表达式是好用的') -for t in terms: - print(t) +print('/'.join(terms)) + +print("="*40) +# test frequency tune +testlist = [ +('今天天气不错', ('今天', '天气')), +('如果放到post中将出错。', ('中', '将')), +('我们中出了一个叛徒', ('中', '出')), +] + +for sent, seg in testlist: + print('/'.join(jieba.cut(sent, HMM=False))) + word = ''.join(seg) + print('%s Before: %s, After: %s' % (word, jieba.FREQ[word], jieba.suggest_freq(seg, True))) + print('/'.join(jieba.cut(sent, HMM=False))) + print("-"*40) diff --git a/test/userdict.txt b/test/userdict.txt index fc73d5c..4b4ec51 100644 --- a/test/userdict.txt +++ b/test/userdict.txt @@ -4,4 +4,5 @@ easy_install 3 eng 好用 300 韩玉赏鉴 3 nz -八一双鹿 3 nz \ No newline at end of file +八一双鹿 3 nz +台中 From 4fa2728fb68d15ce6977e715e464928236bbbdf7 Mon Sep 17 00:00:00 2001 From: Dingyuan Wang Date: Sat, 14 Mar 2015 12:44:49 +0800 Subject: [PATCH 2/2] update README about new features --- README.md | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ed699cc..c2bd9ae 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ jieba * 支持繁体分词 * 支持自定义词典 +* MIT 授权协议 在线演示 ========= @@ -93,9 +94,13 @@ 
print(", ".join(seg_list)) 2) :添加自定义词典 ---------------- +### 载入词典 + * 开发者可以指定自己自定义的词典,以便包含 jieba 词库里没有的词。虽然 jieba 有新词识别能力,但是自行添加新词可以保证更高的正确率 * 用法: jieba.load_userdict(file_name) # file_name 为自定义词典的路径 -* 词典格式和`dict.txt`一样,一个词占一行;每一行分三部分,一部分为词语,另一部分为词频,最后为词性(可省略),用空格隔开 +* 词典格式和`dict.txt`一样,一个词占一行;每一行分三部分,一部分为词语,另一部分为词频(可省略),最后为词性(可省略),用空格隔开 +* 词频可省略,使用计算出的能保证分出该词的词频 + * 范例: * 自定义词典:https://github.com/fxsjy/jieba/blob/master/test/userdict.txt @@ -107,6 +112,29 @@ print(", ".join(seg_list)) * 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / +### 调整词典 + +* 使用 `add_word(word, freq=None, tag=None)` 和 `del_word(word)` 可在程序中动态修改词典。 +* 使用 `suggest_freq(segment, tune=True)` 可调节单个词语的词频,使其能(或不能)被分出来。 + +* 注意:自动计算的词频在使用 HMM 新词发现功能时可能无效。 + +代码示例: + +```pycon +>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False))) +如果/放到/post/中将/出错/。 +>>> jieba.suggest_freq(('中', '将'), True) +494 +>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False))) +如果/放到/post/中/将/出错/。 +>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False))) +「/台/中/」/正确/应该/不会/被/切开 +>>> jieba.suggest_freq('台中', True) +69 +>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False))) +「/台中/」/正确/应该/不会/被/切开 +``` * "通过用户自定义词典来增强歧义纠错能力" --- https://github.com/fxsjy/jieba/issues/14 @@ -362,10 +390,35 @@ https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big 常见问题 ========= -1. 模型的数据是如何生成的?https://github.com/fxsjy/jieba/issues/7 -2. 这个库的授权是? https://github.com/fxsjy/jieba/issues/2 -* 更多问题请点击:https://github.com/fxsjy/jieba/issues?sort=updated&state=closed +## 1. 模型的数据是如何生成的? + +详见: https://github.com/fxsjy/jieba/issues/7 + +## 2. “台中”总是被切成“台 中”?(以及类似情况) + +P(台中) < P(台)×P(中),“台中”词频不够导致其成词概率较低 + +解决方法:强制调高词频 + +`jieba.add_word('台中')` 或者 `jieba.suggest_freq('台中', True)` + +## 3. “今天天气 不错”应该被切成“今天 天气 不错”?(以及类似情况) + +解决方法:强制调低词频 + +`jieba.suggest_freq(('今天', '天气'), True)` + +或者直接删除该词 `jieba.del_word('今天天气')` + +## 4. 切出了词典中没有的词语,效果不理想? + +解决方法:关闭新词发现 + +`jieba.cut('丰田太省了', HMM=False)` +`jieba.cut('我们中出了一个叛徒', HMM=False)` + +**更多问题请点击**:https://github.com/fxsjy/jieba/issues?sort=updated&state=closed 修订历史 ========== @@ -380,9 +433,15 @@ jieba Features ======== * Support three types of segmentation mode: -* 1) Accurate Mode attempts to cut the sentence into the most accurate segmentations, which is suitable for text analysis. -* 2) Full Mode gets all the possible words from the sentence. Fast but not accurate. -* 3) Search Engine Mode, based on the Accurate Mode, attempts to cut long words into several short words, which can raise the recall rate. Suitable for search engines. + +1. Accurate Mode attempts to cut the sentence into the most accurate segmentations, which is suitable for text analysis. +2. Full Mode gets all the possible words from the sentence. Fast but not accurate. +3. Search Engine Mode, based on the Accurate Mode, attempts to cut long words into several short words, which can raise the recall rate. Suitable for search engines. + +* Supports Traditional Chinese +* Supports customized dictionaries +* MIT License + Online demo ========= @@ -446,6 +505,8 @@ Output: 2) : Add a custom dictionary ---------------------------- +### Load dictionary + * Developers can specify their own custom dictionary to be included in the jieba default dictionary. Jieba is able to identify new words, but adding your own new words can ensure a higher accuracy. 
 * Usage: `jieba.load_userdict(file_name) # file_name is the path of the custom dictionary`
 * The dictionary format is the same as that of `analyse/idf.txt`: one word per line; each line is divided into two parts, the first is the word itself, the other is the word frequency, separated by a space
 * Example:
 
         云计算 5
         李小福 2
         创新办 3
 
         [Before]: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
 
         [After]: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
 
+
+### Modify dictionary
+
+* Use `add_word(word, freq=None, tag=None)` and `del_word(word)` to modify the dictionary dynamically in programs.
+* Use `suggest_freq(segment, tune=True)` to adjust the frequency of a single word so that it can (or cannot) be segmented.
+
+* Note that HMM may affect the final result.
+
+Example:
+
+```pycon
+>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
+如果/放到/post/中将/出错/。
+>>> jieba.suggest_freq(('中', '将'), True)
+494
+>>> print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
+如果/放到/post/中/将/出错/。
+>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
+「/台/中/」/正确/应该/不会/被/切开
+>>> jieba.suggest_freq('台中', True)
+69
+>>> print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
+「/台中/」/正确/应该/不会/被/切开
+```
+
 3) : Keyword Extraction
 -----------------------
 * `jieba.analyse.extract_tags(sentence,topK,withWeight) # needs to first import jieba.analyse`
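The `add_word` / `del_word` / `suggest_freq` calls documented above can also be combined into a single script. The following is a rough, self-contained sketch of that workflow, not part of the patch itself; it assumes a jieba installation that already includes these two commits, and the sentences are the same ones exercised in `test/test_userdict.py` and the README examples (actual output depends on the dictionary that is loaded).

```python
# Rough usage sketch; assumes a jieba build that already provides
# add_word/del_word/suggest_freq as added by this patch series.
import jieba

jieba.add_word('石墨烯')        # freq omitted: a frequency is calculated automatically
jieba.del_word('自定义词')      # shorthand for add_word(word, 0)

# Force '中将' to be split into '中' / '将' (HMM disabled so tuning is visible).
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))
jieba.suggest_freq(('中', '将'), tune=True)
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))

# Force '台中' to stay joined.
jieba.suggest_freq('台中', tune=True)
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))
```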
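The value returned by `suggest_freq` comes from the heuristic added in patch 1/2: the frequencies of the fragments a word would otherwise be cut into are multiplied together as probabilities and scaled back by the dictionary total, then clamped against the word's current frequency. Below is a minimal standalone sketch of that calculation with a toy dictionary; `freq_dict` and `total` are made-up illustrative numbers, not jieba's real data, and the two helper functions are hypothetical names, not jieba API.

```python
# Toy illustration of the frequency-suggestion heuristic (not jieba's actual code).
freq_dict = {'台': 1000, '中': 2000, '台中': 3}   # hypothetical word -> frequency
total = float(sum(freq_dict.values()))            # total of all frequencies


def suggest_join_freq(word, segments):
    """Frequency high enough that `word` beats cutting it into `segments`."""
    p = 1.0
    for seg in segments:
        p *= freq_dict.get(seg, 1) / total        # treat each fragment count as a probability
    return max(int(p * total) + 1, freq_dict.get(word, 1))


def suggest_split_freq(segments):
    """Frequency low enough that the joined word no longer beats `segments`."""
    word = ''.join(segments)
    p = 1.0
    for seg in segments:
        p *= freq_dict.get(seg, 1) / total
    return min(int(p * total), freq_dict.get(word, 0))


print(suggest_join_freq('台中', ['台', '中']))     # raise '台中' so it stays joined
print(suggest_split_freq(('今天', '天气')))        # lower '今天天气' so it splits
```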