diff --git a/jieba/__init__.py b/jieba/__init__.py
index 75c3ee8..4717da5 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -11,7 +11,7 @@ import random
 
 FREQ = {}
 total =0.0
-
+user_word_tag_tab={}
 
 def gen_trie(f_name):
     lfreq = {}
@@ -191,9 +191,16 @@ def load_userdict(f):
     if isinstance(f, (str, unicode)):
         f = open(f, 'rb')
     content = f.read().decode('utf-8')
+    line_no = 0
     for line in content.split("\n"):
+        line_no+=1
         if line.rstrip()=='': continue
-        word,freq = line.split(" ")
+        tup =line.split(" ")
+        word,freq = tup[0],tup[1]
+        if line_no==1:
+            word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
+        if len(tup)==3:
+            user_word_tag_tab[word]=tup[2].strip()
         freq = float(freq)
         FREQ[word] = log(freq / total)
         p = trie
diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py
index bdbdb3e..c3cae77 100644
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -27,6 +27,9 @@ prob_emit = load_model("prob_emit.py")
 char_state_tab = load_model("char_state_tab.py")
 word_tag_tab = load_model("../dict.txt")
 
+if jieba.user_word_tag_tab:
+    word_tag_tab.update(jieba.user_word_tag_tab)
+
 class pair(object):
     def __init__(self,word,flag):
         self.word = word
diff --git a/test/test_userdict.py b/test/test_userdict.py
new file mode 100644
index 0000000..aac4730
--- /dev/null
+++ b/test/test_userdict.py
@@ -0,0 +1,16 @@
+#encoding=utf-8
+import sys
+sys.path.append("../")
+import jieba
+jieba.load_userdict("userdict.txt")
+import jieba.posseg as pseg
+
+test_sent = "李小福是创新办主任也是云计算方面的专家"
+words = jieba.cut(test_sent)
+for w in words:
+    print w
+
+result = pseg.cut(test_sent)
+
+for w in result:
+    print w.word, "/", w.flag, ", ",
diff --git a/test/userdict.txt b/test/userdict.txt
new file mode 100644
index 0000000..56e1f3d
--- /dev/null
+++ b/test/userdict.txt
@@ -0,0 +1,3 @@
+云计算 5
+李小福 2 nr
+创新办 3 i
\ No newline at end of file