Support user-defined word tags

This commit is contained in:
Sun Junyi 2013-03-25 17:28:37 +08:00
parent 44e19a2e27
commit 58c363655c
4 changed files with 31 additions and 2 deletions

View File

@ -11,7 +11,7 @@ import random
FREQ = {}
total =0.0
user_word_tag_tab={}
def gen_trie(f_name):
lfreq = {}
@ -191,9 +191,16 @@ def load_userdict(f):
if isinstance(f, (str, unicode)):
f = open(f, 'rb')
content = f.read().decode('utf-8')
line_no = 0
for line in content.split("\n"):
line_no+=1
if line.rstrip()=='': continue
word,freq = line.split(" ")
tup =line.split(" ")
word,freq = tup[0],tup[1]
if line_no==1:
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
if len(tup)==3:
user_word_tag_tab[word]=tup[2].strip()
freq = float(freq)
FREQ[word] = log(freq / total)
p = trie

View File

@ -27,6 +27,9 @@ prob_emit = load_model("prob_emit.py")
char_state_tab = load_model("char_state_tab.py")
word_tag_tab = load_model("../dict.txt")
if jieba.user_word_tag_tab:
word_tag_tab.update(jieba.user_word_tag_tab)
class pair(object):
def __init__(self,word,flag):
self.word = word

16
test/test_userdict.py Normal file
View File

@ -0,0 +1,16 @@
#encoding=utf-8
import sys
sys.path.append("../")
import jieba
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg
test_sent = "李小福是创新办主任也是云计算方面的专家"
words = jieba.cut(test_sent)
for w in words:
print w
result = pseg.cut(test_sent)
for w in result:
print w.word, "/", w.flag, ", ",

3
test/userdict.txt Normal file
View File

@ -0,0 +1,3 @@
云计算 5
李小福 2 nr
创新办 3 i