use only one dictionary

2025-07-10 00:01:33 +08:00 · 2012-11-06 11:01:31 +08:00 · 2012-11-06 11:01:31 +08:00 · 193bfee1d4
commit 193bfee1d4
parent 6c0bcf6557
4 changed files with 367441 additions and 389074 deletions
--- a/README.md
+++ b/README.md
@@ -78,6 +78,22 @@ Output:

 	https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py

+功能 4) : 词性标注
+================
+* 标注句子分词后每个词的词性，采用和ictclas兼容的标记法
+* 用法示例
+
+		>>> import jieba.posseg as pseg
+		>>> words =pseg.cut("我爱北京天安门")
+		>>> for w in words:
+		...    print w.word,w.flag
+		...
+		我 r
+		爱 v
+		北京 ns
+		天安门 ns
+
+

 分词速度
 =========
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -16,7 +16,7 @@ def gen_trie(f_name):
 	ltotal = 0.0
 	content = open(f_name,'rb').read().decode('utf-8')
 	for line in content.split("\n"):
-		word,freq = line.split(" ")
+		word,freq,_ = line.split(" ")
 		freq = float(freq)
 		lfreq[word] = freq
 		ltotal+=freq
--- a/jieba/dict.txt
+++ b/jieba/dict.txt
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -15,7 +15,7 @@ def load_model(f_name):
 		for line in open(prob_p_path,"rb"):
 			line = line.strip()
 			if line=="":continue
-			word, tag = line.split(' ')
+			word, _, tag = line.split(' ')
 			result[word.decode('utf-8')]=tag
 		return result

@@ -24,7 +24,7 @@ prob_start = load_model("prob_start.py")
 prob_trans = load_model("prob_trans.py")
 prob_emit = load_model("prob_emit.py")
 char_state_tab = load_model("char_state_tab.py")
-word_tag_tab = load_model("tags.txt")
+word_tag_tab = load_model("../dict.txt")

 class pair(object):
 	def __init__(self,word,flag):