Mirror of https://github.com/fxsjy/jieba.git
fix merge conflict
This commit is contained in:
parent c01680c6a8
commit b9b1f1a418
@@ -82,6 +82,7 @@ def initialize(*args):
         FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
         min_freq = min(FREQ.values())
         print("dumping model to file cache " + cache_file, file=sys.stderr)
+        try:
             tmp_suffix = "."+str(random.random())
             with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
                 marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
@@ -91,7 +92,10 @@ def initialize(*args):
             else:
                 replace_file = os.rename
             replace_file(cache_file+tmp_suffix,cache_file)
+        except:
+            import traceback
+            print("dump cache file failed.",file=sys.stderr)
+            print(traceback.format_exc(),file=sys.stderr)
     initialized = True
 
     print("loading model cost ", time.time() - t1, "seconds.",file=sys.stderr)
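Note on the two hunks above: they wrap the model-cache dump in a try/except so a failed write only logs to stderr instead of breaking initialize(), and they keep the write atomic by dumping to a temporary file and then renaming it into place. A minimal standalone sketch of that pattern, using only the standard library (the function name dump_cache and the example payload are illustrative, not part of the commit):

import marshal
import os
import random
import sys
import tempfile
import traceback

def dump_cache(obj, cache_file):
    # Write to a uniquely named temp file first, then rename it over the
    # real cache, so a crash mid-write never leaves a truncated cache file.
    tmp_suffix = "." + str(random.random())
    try:
        with open(cache_file + tmp_suffix, 'wb') as temp_cache_file:
            marshal.dump(obj, temp_cache_file)
        os.rename(cache_file + tmp_suffix, cache_file)  # use os.replace if the target may exist on Windows
    except:
        print("dump cache file failed.", file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)

# Illustrative usage: cache a small marshal-able tuple in the temp directory.
dump_cache(({"word": -8.25}, 60101967, -18.0),
           os.path.join(tempfile.gettempdir(), "example.cache"))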
@@ -263,15 +267,22 @@ def load_userdict(f):
         if line_no==1:
             word = word.replace('\ufeff',"") #remove bom flag if it exists
         if len(tup)==3:
-            user_word_tag_tab[word]=tup[2].strip()
+            add_word(word, freq, tup[2])
+        else:
+            add_word(word, freq)
+
+def add_word(word, freq, tag=None):
+    global FREQ, trie, total, user_word_tag_tab
     freq = float(freq)
     FREQ[word] = log(freq / total)
+    if tag is not None:
+        user_word_tag_tab[word] = tag.strip()
     p = trie
     for c in word:
         if not c in p:
-            p[c] ={}
+            p[c] = {}
         p = p[c]
-    p['']='' #ending flag
+    p[''] = '' # ending flag
 
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
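The refactor above routes every user-dictionary line through a single add_word(word, freq, tag=None) helper, which also makes it usable as a small runtime hook. A hedged usage sketch, assuming the module-level names shown in the hunk (the sample words and frequencies are made up):

import jieba

# Register extra words without a dictionary file; each call takes the same
# path that load_userdict() now takes for a "word freq [tag]" line.
jieba.add_word("创新办", 3, "i")   # word, frequency, optional POS tag
jieba.add_word("云计算", 5)        # tag defaults to None

print("/".join(jieba.cut("小明来到了创新办实习，研究云计算方面的问题")))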
@@ -333,3 +344,29 @@ def get_abs_path_dict():
     _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
     abs_path = os.path.join(_curpath,DICTIONARY)
     return abs_path
+
+def tokenize(unicode_sentence,mode="default"):
+    #mode ("default" or "search")
+    if not isinstance(unicode_sentence, str):
+        raise Exception("jieba: the input parameter should string.")
+    start = 0
+    if mode=='default':
+        for w in cut(unicode_sentence):
+            width = len(w)
+            yield (w,start,start+width)
+            start+=width
+    else:
+        for w in cut(unicode_sentence):
+            width = len(w)
+            if len(w)>2:
+                for i in range(len(w)-1):
+                    gram2 = w[i:i+2]
+                    if gram2 in FREQ:
+                        yield (gram2,start+i,start+i+2)
+            if len(w)>3:
+                for i in range(len(w)-2):
+                    gram3 = w[i:i+3]
+                    if gram3 in FREQ:
+                        yield (gram3,start+i,start+i+3)
+            yield (w,start,start+width)
+            start+=width
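The new tokenize() yields (word, start, end) triples over the original string; in "search" mode, words longer than two characters additionally yield any in-vocabulary 2- and 3-character sub-grams, mirroring cut_for_search. A hedged usage sketch (the sample sentence is illustrative):

import jieba

sentence = "永和服装饰品有限公司"

# Default mode: one triple per segmented word.
for tk in jieba.tokenize(sentence):
    print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))

# Search mode: long words also emit their dictionary sub-grams.
for tk in jieba.tokenize(sentence, mode="search"):
    print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))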
@@ -150,5 +150,15 @@ class JiebaTestCase(unittest.TestCase):
             print(" , ".join([w.word + " / " + w.flag for w in result]),file=sys.stderr)
         print("testPosseg",file=sys.stderr)
 
+    def testTokenize(self):
+        for content in test_contents:
+            result = jieba.tokenize(content)
+            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
+            result = list(result)
+            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
+            for tk in result:
+                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
+        print("testTokenize",file=sys.stderr)
+
 if __name__ == "__main__":
     unittest.main()
@@ -7,10 +7,9 @@ g_mode="default"
 
 def cuttest(test_sent):
     global g_mode
-    test_sent = test_sent.decode('utf-8')
     result = jieba.tokenize(test_sent,mode=g_mode)
     for tk in result:
-        print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 
 
 if __name__ == "__main__":
@@ -15,40 +15,40 @@ ix = create_in("tmp", schema) # for create new index
 writer = ix.writer()
 
 writer.add_document(
-    title=u"document1",
-    path=u"/a",
-    content=u"This is the first document we’ve added!"
+    title="document1",
+    path="/a",
+    content="This is the first document we’ve added!"
 )
 
 writer.add_document(
-    title=u"document2",
-    path=u"/b",
-    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
+    title="document2",
+    path="/b",
+    content="The second one 你 中文测试中文 is even more interesting! 吃水果"
 )
 
 writer.add_document(
-    title=u"document3",
-    path=u"/c",
-    content=u"买水果然后来世博园。"
+    title="document3",
+    path="/c",
+    content="买水果然后来世博园。"
 )
 
 writer.add_document(
-    title=u"document4",
-    path=u"/c",
-    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
+    title="document4",
+    path="/c",
+    content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
 )
 
 writer.add_document(
-    title=u"document4",
-    path=u"/c",
-    content=u"咱俩交换一下吧。"
+    title="document4",
+    path="/c",
+    content="咱俩交换一下吧。"
 )
 
 writer.commit()
 searcher = ix.searcher()
 parser = QueryParser("content", schema=ix.schema)
 
-for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
+for keyword in ("水果世博园","你","first","中文","交换机","交换"):
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
@@ -56,5 +56,5 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
         print hit.highlights("content")
     print "="*10
 
-for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
+for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
     print t.text
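The whoosh test hunks only drop the now-redundant u"" prefixes; the schema, analyzer, and QueryParser they rely on sit above the first hunk and are not part of this diff. A hedged sketch of that setup, based on how jieba's ChineseAnalyzer is normally wired into whoosh (field names chosen to match the add_document() calls above):

import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()          # tokenizes Chinese text with jieba
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
    os.mkdir("tmp")                   # create_in() needs an existing directory
ix = create_in("tmp", schema)         # for create new index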