mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
support user defined word tag
This commit is contained in:
parent
44e19a2e27
commit
58c363655c
@ -11,7 +11,7 @@ import random
|
|||||||
|
|
||||||
FREQ = {}
|
FREQ = {}
|
||||||
total =0.0
|
total =0.0
|
||||||
|
user_word_tag_tab={}
|
||||||
|
|
||||||
def gen_trie(f_name):
|
def gen_trie(f_name):
|
||||||
lfreq = {}
|
lfreq = {}
|
||||||
@ -191,9 +191,16 @@ def load_userdict(f):
|
|||||||
if isinstance(f, (str, unicode)):
|
if isinstance(f, (str, unicode)):
|
||||||
f = open(f, 'rb')
|
f = open(f, 'rb')
|
||||||
content = f.read().decode('utf-8')
|
content = f.read().decode('utf-8')
|
||||||
|
line_no = 0
|
||||||
for line in content.split("\n"):
|
for line in content.split("\n"):
|
||||||
|
line_no+=1
|
||||||
if line.rstrip()=='': continue
|
if line.rstrip()=='': continue
|
||||||
word,freq = line.split(" ")
|
tup =line.split(" ")
|
||||||
|
word,freq = tup[0],tup[1]
|
||||||
|
if line_no==1:
|
||||||
|
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
|
||||||
|
if len(tup)==3:
|
||||||
|
user_word_tag_tab[word]=tup[2].strip()
|
||||||
freq = float(freq)
|
freq = float(freq)
|
||||||
FREQ[word] = log(freq / total)
|
FREQ[word] = log(freq / total)
|
||||||
p = trie
|
p = trie
|
||||||
|
@ -27,6 +27,9 @@ prob_emit = load_model("prob_emit.py")
|
|||||||
char_state_tab = load_model("char_state_tab.py")
|
char_state_tab = load_model("char_state_tab.py")
|
||||||
word_tag_tab = load_model("../dict.txt")
|
word_tag_tab = load_model("../dict.txt")
|
||||||
|
|
||||||
|
if jieba.user_word_tag_tab:
|
||||||
|
word_tag_tab.update(jieba.user_word_tag_tab)
|
||||||
|
|
||||||
class pair(object):
|
class pair(object):
|
||||||
def __init__(self,word,flag):
|
def __init__(self,word,flag):
|
||||||
self.word = word
|
self.word = word
|
||||||
|
16
test/test_userdict.py
Normal file
16
test/test_userdict.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#encoding=utf-8
|
||||||
|
import sys
|
||||||
|
sys.path.append("../")
|
||||||
|
import jieba
|
||||||
|
jieba.load_userdict("userdict.txt")
|
||||||
|
import jieba.posseg as pseg
|
||||||
|
|
||||||
|
test_sent = "李小福是创新办主任也是云计算方面的专家"
|
||||||
|
words = jieba.cut(test_sent)
|
||||||
|
for w in words:
|
||||||
|
print w
|
||||||
|
|
||||||
|
result = pseg.cut(test_sent)
|
||||||
|
|
||||||
|
for w in result:
|
||||||
|
print w.word, "/", w.flag, ", ",
|
3
test/userdict.txt
Normal file
3
test/userdict.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
云计算 5
|
||||||
|
李小福 2 nr
|
||||||
|
创新办 3 i
|
Loading…
x
Reference in New Issue
Block a user