diff --git a/jieba/__init__.py b/jieba/__init__.py index 5006705..df3faff 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -17,21 +17,26 @@ def gen_trie(f_name): lfreq = {} trie = {} ltotal = 0.0 - content = open(f_name,'rb').read().decode('utf-8') - for line in content.split("\n"): - word,freq,_ = line.split(" ") - freq = float(freq) - lfreq[word] = freq - ltotal+=freq - p = trie - for c in word: - if not c in p: - p[c] ={} - p = p[c] - p['']='' #ending flag + with open(f_name, 'rb') as f: + lineno = 1 + for line in f.readlines(): + lineno += 1 + try: + word,freq,_ = line.decode('utf-8').split(' ') + freq = float(freq) + lfreq[word] = freq + ltotal+=freq + p = trie + for c in word: + if not c in p: + p[c] ={} + p = p[c] + p['']='' #ending flag + except ValueError, e: + print >> sys.stderr, ' at line', lineno, line + raise e return trie, lfreq,ltotal - _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) print >> sys.stderr, "Building Trie..." diff --git a/jieba/dict.txt b/jieba/dict.txt index 3fbcb4a..c7d5f8d 100644 --- a/jieba/dict.txt +++ b/jieba/dict.txt @@ -367427,4 +367427,4 @@ C++ 3 nz c++ 3 nz C# 3 nz c# 3 nz -AT&T 3 nz \ No newline at end of file +AT&T 3 nz