mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Support user-defined word tags
This commit is contained in:
parent
44e19a2e27
commit
58c363655c
@ -11,7 +11,7 @@ import random
|
||||
|
||||
FREQ = {}
|
||||
total =0.0
|
||||
|
||||
user_word_tag_tab={}
|
||||
|
||||
def gen_trie(f_name):
|
||||
lfreq = {}
|
||||
@ -191,9 +191,16 @@ def load_userdict(f):
|
||||
if isinstance(f, (str, unicode)):
|
||||
f = open(f, 'rb')
|
||||
content = f.read().decode('utf-8')
|
||||
line_no = 0
|
||||
for line in content.split("\n"):
|
||||
line_no+=1
|
||||
if line.rstrip()=='': continue
|
||||
word,freq = line.split(" ")
|
||||
tup =line.split(" ")
|
||||
word,freq = tup[0],tup[1]
|
||||
if line_no==1:
|
||||
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
|
||||
if len(tup)==3:
|
||||
user_word_tag_tab[word]=tup[2].strip()
|
||||
freq = float(freq)
|
||||
FREQ[word] = log(freq / total)
|
||||
p = trie
|
||||
|
@ -27,6 +27,9 @@ prob_emit = load_model("prob_emit.py")
|
||||
char_state_tab = load_model("char_state_tab.py")
|
||||
word_tag_tab = load_model("../dict.txt")
|
||||
|
||||
if jieba.user_word_tag_tab:
|
||||
word_tag_tab.update(jieba.user_word_tag_tab)
|
||||
|
||||
class pair(object):
|
||||
def __init__(self,word,flag):
|
||||
self.word = word
|
||||
|
16
test/test_userdict.py
Normal file
16
test/test_userdict.py
Normal file
@ -0,0 +1,16 @@
|
||||
#encoding=utf-8
|
||||
import sys
|
||||
sys.path.append("../")
|
||||
import jieba
|
||||
jieba.load_userdict("userdict.txt")
|
||||
import jieba.posseg as pseg
|
||||
|
||||
test_sent = "李小福是创新办主任也是云计算方面的专家"
|
||||
words = jieba.cut(test_sent)
|
||||
for w in words:
|
||||
print w
|
||||
|
||||
result = pseg.cut(test_sent)
|
||||
|
||||
for w in result:
|
||||
print w.word, "/", w.flag, ", ",
|
3
test/userdict.txt
Normal file
3
test/userdict.txt
Normal file
@ -0,0 +1,3 @@
|
||||
云计算 5
|
||||
李小福 2 nr
|
||||
创新办 3 i
|
Loading…
x
Reference in New Issue
Block a user