Mirror of https://github.com/fxsjy/jieba.git (synced 2025-07-10 00:01:33 +08:00)

Commit 22bcf8be7a: Merge master and jieba3k, make the code Python 2/3 compatible

README.md (28 changed lines)
@@ -68,16 +68,16 @@ python setup.py install
 import jieba
 
 seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode: " + "/ ".join(seg_list)) # 全模式
 
 seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode: " + "/ ".join(seg_list)) # 精确模式
 
 seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
 
 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
 ```
 
 输出:
@@ -174,7 +174,7 @@ jieba.analyse.textrank(raw_text)
 >>> import jieba.posseg as pseg
 >>> words = pseg.cut("我爱北京天安门")
 >>> for w in words:
-...    print w.word, w.flag
+...    print('%s %s' % (w.word, w.flag))
 ...
 我 r
 爱 v
@@ -203,7 +203,7 @@ jieba.analyse.textrank(raw_text)
 ```python
 result = jieba.tokenize(u'永和服装饰品有限公司')
 for tk in result:
-    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 ```
 
 ```
@@ -219,7 +219,7 @@ word 有限公司 start: 6 end:10
 ```python
 result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
 for tk in result:
-    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 ```
 
 ```
@@ -413,16 +413,16 @@ Main Functions
 import jieba
 
 seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode: " + "/ ".join(seg_list)) # 全模式
 
 seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
 
 seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))
 
 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
 ```
 
 Output:
@@ -488,7 +488,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
 >>> import jieba.posseg as pseg
 >>> words = pseg.cut("我爱北京天安门")
 >>> for w in words:
-...    print w.word, w.flag
+...    print('%s %s' % (w.word, w.flag))
 ...
 我 r
 爱 v
@@ -517,7 +517,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
 ```python
 result = jieba.tokenize(u'永和服装饰品有限公司')
 for tk in result:
-    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 ```
 
 ```
@@ -533,7 +533,7 @@ word 有限公司 start: 6 end:10
 ```python
 result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
 for tk in result:
-    print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+    print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 ```
 
 ```
jieba/__init__.py

@@ -1,20 +1,20 @@
-from __future__ import with_statement
+from __future__ import absolute_import, unicode_literals
 __version__ = '0.35'
 __license__ = 'MIT'
 
 import re
 import os
 import sys
-import finalseg
 import time
 import tempfile
 import marshal
 from math import log
-import random
 import threading
 from functools import wraps
 import logging
 from hashlib import md5
+from ._compat import *
+from . import finalseg
 
 DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
@@ -51,13 +51,13 @@ def gen_pfdict(f_name):
                 ltotal += freq
                 for ch in xrange(len(word)):
                     pfdict.add(word[:ch+1])
-            except ValueError, e:
+            except ValueError as e:
                 logger.debug('%s at line %s %s' % (f_name, lineno, line))
-                raise ValueError, e
+                raise e
     return pfdict, lfreq, ltotal
 
 def initialize(dictionary=None):
-    global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
+    global pfdict, FREQ, total, initialized, DICTIONARY, DICT_LOCK
     if not dictionary:
         dictionary = DICTIONARY
     with DICT_LOCK:
@@ -121,7 +121,7 @@ def require_initialized(fn):
 def __cut_all(sentence):
     dag = get_DAG(sentence)
     old_j = -1
-    for k,L in dag.iteritems():
+    for k,L in iteritems(dag):
         if len(L) == 1 and k > old_j:
             yield sentence[k:L[0]+1]
             old_j = L[0]
@@ -158,13 +158,13 @@ def get_DAG(sentence):
     return DAG
 
 def __cut_DAG_NO_HMM(sentence):
-    re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
+    re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
     DAG = get_DAG(sentence)
     route = {}
     calc(sentence, DAG, route)
     x = 0
     N = len(sentence)
-    buf = u''
+    buf = ''
     while x < N:
         y = route[x][1] + 1
         l_word = sentence[x:y]
@@ -174,19 +174,19 @@ def __cut_DAG_NO_HMM(sentence):
         else:
             if buf:
                 yield buf
-                buf = u''
+                buf = ''
             yield l_word
         x = y
     if buf:
         yield buf
-        buf = u''
+        buf = ''
 
 def __cut_DAG(sentence):
     DAG = get_DAG(sentence)
     route = {}
     calc(sentence, DAG, route=route)
     x = 0
-    buf = u''
+    buf = ''
     N = len(sentence)
     while x < N:
         y = route[x][1]+1
@@ -197,7 +197,7 @@ def __cut_DAG(sentence):
             if buf:
                 if len(buf) == 1:
                     yield buf
-                    buf = u''
+                    buf = ''
                 else:
                     if buf not in FREQ:
                         recognized = finalseg.cut(buf)
@@ -206,7 +206,7 @@ def __cut_DAG(sentence):
                     else:
                         for elem in buf:
                             yield elem
-                buf = u''
+                buf = ''
             yield l_word
         x = y
 
@@ -225,23 +225,19 @@ def cut(sentence, cut_all=False, HMM=True):
     '''The main function that segments an entire sentence that contains
     Chinese characters into seperated words.
     Parameter:
-    - sentence: The str/unicode to be segmented.
+    - sentence: The str(unicode) to be segmented.
     - cut_all: Model type. True for full pattern, False for accurate pattern.
     - HMM: Whether to use the Hidden Markov Model.
     '''
-    if not isinstance(sentence, unicode):
-        try:
-            sentence = sentence.decode('utf-8')
-        except UnicodeDecodeError:
-            sentence = sentence.decode('gbk', 'ignore')
+    sentence = strdecode(sentence)
 
     # \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
     # \r\n|\s : whitespace characters. Will not be handled.
 
     if cut_all:
-        re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
+        re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
     else:
-        re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
+        re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
     blocks = re_han.split(sentence)
     if cut_all:
         cut_block = __cut_all
@@ -292,9 +288,9 @@ def load_userdict(f):
     ...
     Word type may be ignored
     '''
-    if isinstance(f, (str, unicode)):
+    if isinstance(f, string_types):
         f = open(f, 'rb')
-    content = f.read().decode('utf-8').lstrip(u'\ufeff')
+    content = f.read().decode('utf-8').lstrip('\ufeff')
     line_no = 0
     for line in content.split("\n"):
         line_no += 1
@@ -333,15 +329,13 @@ def enable_parallel(processnum=None):
     global pool, cut, cut_for_search
     if os.name == 'nt':
         raise Exception("jieba: parallel mode only supports posix system")
-    if sys.version_info[0]==2 and sys.version_info[1]<6:
-        raise Exception("jieba: the parallel feature needs Python version>2.5")
    from multiprocessing import Pool, cpu_count
     if processnum is None:
         processnum = cpu_count()
     pool = Pool(processnum)
 
     def pcut(sentence,cut_all=False,HMM=True):
-        parts = re.compile('([\r\n]+)').split(sentence)
+        parts = strdecode(sentence).split('\n')
         if cut_all:
             result = pool.map(__lcut_all, parts)
         elif HMM:
@@ -353,7 +347,7 @@ def enable_parallel(processnum=None):
                 yield w
 
     def pcut_for_search(sentence):
-        parts = re.compile('([\r\n]+)').split(sentence)
+        parts = strdecode(sentence).split('\n')
         result = pool.map(__lcut_for_search, parts)
         for r in result:
             for w in r:
@@ -385,11 +379,11 @@ def get_abs_path_dict():
 def tokenize(unicode_sentence, mode="default", HMM=True):
     """Tokenize a sentence and yields tuples of (word, start, end)
     Parameter:
-    - sentence: the unicode to be segmented.
+    - sentence: the str(unicode) to be segmented.
     - mode: "default" or "search", "search" is for finer segmentation.
     - HMM: whether to use the Hidden Markov Model.
     """
-    if not isinstance(unicode_sentence, unicode):
+    if not isinstance(unicode_sentence, text_type):
         raise Exception("jieba: the input parameter should be unicode.")
     start = 0
     if mode == 'default':
jieba/__main__.py

@@ -25,7 +25,7 @@ args = parser.parse_args()
 
 if args.quiet:
     jieba.setLogLevel(60)
-delim = unicode(args.delimiter)
+delim = text_type(args.delimiter)
 cutall = args.cutall
 hmm = args.hmm
 fp = open(args.filename, 'r') if args.filename else sys.stdin
@@ -40,7 +40,10 @@ if args.user_dict:
 ln = fp.readline()
 while ln:
     l = ln.rstrip('\r\n')
-    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
+    result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))
+    if PY2:
+        result = result.encode(default_encoding)
+    print(result)
     ln = fp.readline()
 
 fp.close()
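The command-line hunk above now routes output through print() and only encodes on Python 2, where stdout generally expects bytes. A minimal standalone sketch of that pattern (not part of the commit; the emit() helper name is made up for illustration):

```python
# -*- coding: utf-8 -*-
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

def emit(text):
    # Python 2: encode unicode to the local encoding before printing bytes.
    # Python 3: print() accepts text directly.
    if PY2:
        text = text.encode(default_encoding)
    print(text)

emit(u"我来到北京清华大学")
```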
jieba/_compat.py (new file, 31 lines)

@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+import sys
+
+PY2 = sys.version_info[0] == 2
+
+default_encoding = sys.getfilesystemencoding()
+
+if PY2:
+    text_type = unicode
+    string_types = (str, unicode)
+
+    iterkeys = lambda d: d.iterkeys()
+    itervalues = lambda d: d.itervalues()
+    iteritems = lambda d: d.iteritems()
+
+else:
+    text_type = str
+    string_types = (str,)
+    xrange = range
+
+    iterkeys = lambda d: iter(d.keys())
+    itervalues = lambda d: iter(d.values())
+    iteritems = lambda d: iter(d.items())
+
+def strdecode(sentence):
+    if not isinstance(sentence, text_type):
+        try:
+            sentence = sentence.decode('utf-8')
+        except UnicodeDecodeError:
+            sentence = sentence.decode('gbk', 'ignore')
+    return sentence
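For orientation, a small usage sketch of the new helpers (my own example, not part of the commit; it assumes jieba with this _compat module is importable):

```python
# -*- coding: utf-8 -*-
from jieba._compat import text_type, strdecode, iteritems

# strdecode() accepts bytes (UTF-8, falling back to GBK) or text and
# always returns text_type (unicode on Python 2, str on Python 3).
s = strdecode(b'\xe6\x88\x91\xe7\x88\xb1\xe5\x8c\x97\xe4\xba\xac')  # UTF-8 bytes for 我爱北京
assert isinstance(s, text_type)

# iteritems() papers over the dict.iteritems() vs dict.items() split.
freq = {'北京': 3, '天安门': 1}
for word, count in iteritems(freq):
    print('%s %d' % (word, count))
```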
jieba/analyse/__init__.py

@@ -1,13 +1,14 @@
 #encoding=utf-8
+from __future__ import absolute_import
 import jieba
 import jieba.posseg
 import os
 from operator import itemgetter
 try:
-    from analyzer import ChineseAnalyzer
+    from .analyzer import ChineseAnalyzer
 except ImportError:
     pass
-from textrank import textrank
+from .textrank import textrank
 
 _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 abs_path = os.path.join(_curpath, "idf.txt")
jieba/analyse/analyzer.py

@@ -1,4 +1,5 @@
-##encoding=utf-8
+#encoding=utf-8
+from __future__ import unicode_literals
 from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
 from whoosh.analysis import Tokenizer,Token
 from whoosh.lang.porter import stem
@@ -10,9 +11,9 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                         'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                         'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                         'to', 'us', 'we', 'when', 'will', 'with', 'yet',
-                        'you', 'your', u'的', u'了', u'和'))
+                        'you', 'your', '的', '了', '和'))
 
-accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
+accepted_chars = re.compile(r"[\u4E00-\u9FA5]+")
 
 class ChineseTokenizer(Tokenizer):
     def __call__(self, text, **kargs):
jieba/analyse/textrank.py

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from __future__ import unicode_literals
 import sys
 import collections
 from operator import itemgetter
@@ -35,7 +36,7 @@ class UndirectWeightedGraph:
 
         (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
 
-        for w in ws.itervalues():
+        for w in itervalues(ws):
             if w < min_rank:
                 min_rank = w
             elif w > max_rank:
@@ -88,4 +89,4 @@ def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
 if __name__ == '__main__':
     s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
     for x, w in textrank(s, withWeight=True):
-        print x, w
+        print('%s %s' % (x, w))
jieba/finalseg/__init__.py

@@ -1,8 +1,9 @@
-from __future__ import with_statement
+from __future__ import absolute_import, unicode_literals
 import re
 import os
 import marshal
 import sys
+from .._compat import *
 
 MIN_FLOAT = -3.14e100
 
@@ -41,9 +42,9 @@ def load_model():
 if sys.platform.startswith("java"):
     start_P, trans_P, emit_P = load_model()
 else:
-    from prob_start import P as start_P
-    from prob_trans import P as trans_P
-    from prob_emit import P as emit_P
+    from .prob_start import P as start_P
+    from .prob_trans import P as trans_P
+    from .prob_emit import P as emit_P
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
@@ -85,12 +86,8 @@ def __cut(sentence):
     yield sentence[nexti:]
 
 def cut(sentence):
-    if not isinstance(sentence, unicode):
-        try:
-            sentence = sentence.decode('utf-8')
-        except UnicodeDecodeError:
-            sentence = sentence.decode('gbk', 'ignore')
-    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
+    sentence = strdecode(sentence)
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
     blocks = re_han.split(sentence)
     for blk in blocks:
         if re_han.match(blk):

(File diff suppressed because it is too large.)
jieba/posseg/__init__.py

@@ -1,13 +1,12 @@
-from __future__ import with_statement
+from __future__ import absolute_import, unicode_literals
 import re
 import os
-import viterbi
 import jieba
 import sys
 import marshal
 from functools import wraps
-default_encoding = sys.getfilesystemencoding()
+from .._compat import *
+from .viterbi import viterbi
 
 PROB_START_P = "prob_start.p"
 PROB_TRANS_P = "prob_trans.p"
@@ -18,13 +17,14 @@ def load_model(f_name, isJython=True):
     _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 
     result = {}
-    with open(f_name, "r") as f:
+    with open(f_name, "rb") as f:
         for line in f:
             line = line.strip()
             if not line:
                 continue
-            word, _, tag = line.split(' ')
-            result[word.decode('utf-8')] = tag
+            line = line.decode("utf-8")
+            word, _, tag = line.split(" ")
+            result[word] = tag
 
     if not isJython:
         return result
@@ -55,10 +55,10 @@ def load_model(f_name, isJython=True):
 if sys.platform.startswith("java"):
     char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
 else:
-    from char_state_tab import P as char_state_tab_P
-    from prob_start import P as start_P
-    from prob_trans import P as trans_P
-    from prob_emit import P as emit_P
+    from .char_state_tab import P as char_state_tab_P
+    from .prob_start import P as start_P
+    from .prob_trans import P as trans_P
+    from .prob_emit import P as emit_P
 
     word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
 
@@ -79,20 +79,23 @@ class pair(object):
         self.flag = flag
 
     def __unicode__(self):
-        return u'%s/%s' % (self.word, self.flag)
+        return '%s/%s' % (self.word, self.flag)
 
     def __repr__(self):
         return self.__str__()
 
     def __str__(self):
-        return self.__unicode__().encode(default_encoding)
+        if PY2:
+            return self.__unicode__().encode(default_encoding)
+        else:
+            return self.__unicode__()
 
     def encode(self,arg):
         return self.__unicode__().encode(arg)
 
 def __cut(sentence):
-    prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
-    begin, next = 0, 0
+    prob, pos_list = viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
+    begin, nexti = 0, 0
 
     for i,char in enumerate(sentence):
         pos = pos_list[i][0]
@@ -100,16 +103,16 @@ def __cut(sentence):
             begin = i
         elif pos == 'E':
             yield pair(sentence[begin:i+1], pos_list[i][1])
-            next = i+1
+            nexti = i+1
         elif pos == 'S':
             yield pair(char, pos_list[i][1])
-            next = i+1
-    if next < len(sentence):
-        yield pair(sentence[next:], pos_list[next][1])
+            nexti = i+1
+    if nexti < len(sentence):
+        yield pair(sentence[nexti:], pos_list[nexti][1])
 
 def __cut_detail(sentence):
-    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
-    re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+    re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
     blocks = re_han.split(sentence)
     for blk in blocks:
         if re_han.match(blk):
@@ -132,8 +135,8 @@ def __cut_DAG_NO_HMM(sentence):
     jieba.calc(sentence, DAG, route)
     x = 0
     N = len(sentence)
-    buf = u''
-    re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
+    buf = ''
+    re_eng = re.compile('[a-zA-Z0-9]',re.U)
     while x < N:
         y = route[x][1]+1
         l_word = sentence[x:y]
@@ -143,12 +146,12 @@ def __cut_DAG_NO_HMM(sentence):
         else:
             if buf:
                 yield pair(buf,'eng')
-                buf = u''
+                buf = ''
             yield pair(l_word, word_tag_tab.get(l_word, 'x'))
         x = y
     if buf:
         yield pair(buf,'eng')
-        buf = u''
+        buf = ''
 
 def __cut_DAG(sentence):
     DAG = jieba.get_DAG(sentence)
@@ -157,7 +160,7 @@ def __cut_DAG(sentence):
     jieba.calc(sentence, DAG, route)
 
     x = 0
-    buf = u''
+    buf = ''
     N = len(sentence)
     while x < N:
         y = route[x][1]+1
@@ -175,7 +178,7 @@ def __cut_DAG(sentence):
         else:
             for elem in buf:
                 yield pair(elem, word_tag_tab.get(elem, 'x'))
-            buf = u''
+            buf = ''
         yield pair(l_word, word_tag_tab.get(l_word, 'x'))
         x = y
 
@@ -191,13 +194,9 @@ def __cut_DAG(sentence):
                 yield pair(elem, word_tag_tab.get(elem, 'x'))
 
 def __cut_internal(sentence, HMM=True):
-    if not isinstance(sentence, unicode):
-        try:
-            sentence = sentence.decode('utf-8')
-        except UnicodeDecodeError:
-            sentence = sentence.decode('gbk', 'ignore')
-    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
-    re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
+    sentence = strdecode(sentence)
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+    re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
     blocks = re_han.split(sentence)
     if HMM:
         __cut_blk = __cut_DAG
@@ -234,7 +233,7 @@ def cut(sentence, HMM=True):
         for w in __cut_internal(sentence, HMM=HMM):
             yield w
     else:
-        parts = re.compile('([\r\n]+)').split(sentence)
+        parts = strdecode(sentence).split('\n')
        if HMM:
             result = jieba.pool.map(__lcut_internal, parts)
         else:

(File diff suppressed because it is too large.)

jieba/posseg/prob_emit.py (178,556 lines): file diff suppressed because it is too large.
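The `pair` class above now keeps `__unicode__` as the single formatter and makes `__str__` return native text on Python 3 while still returning encoded bytes on Python 2. A stripped-down illustration of that idiom (a standalone sketch, not the actual jieba class):

```python
# -*- coding: utf-8 -*-
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

class Pair(object):
    def __init__(self, word, flag):
        self.word, self.flag = word, flag

    def __unicode__(self):
        return u'%s/%s' % (self.word, self.flag)

    def __str__(self):
        # Python 2: str() must yield bytes; Python 3: str() is text.
        if PY2:
            return self.__unicode__().encode(default_encoding)
        return self.__unicode__()

print(Pair(u'北京', 'ns'))   # prints 北京/ns on both Python 2 and 3
```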
jieba/posseg/viterbi.py

@@ -1,7 +1,11 @@
+import sys
 import operator
 MIN_FLOAT = -3.14e100
 MIN_INF = float("-inf")
 
+if sys.version_info[0] > 2:
+    xrange = range
+
 def get_top_states(t_state_v, K=4):
     return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]
 
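On Python 3 this module simply aliases `xrange = range` at import time so the Viterbi code below it can keep calling `xrange()` unchanged. A tiny self-contained sketch of the same aliasing trick:

```python
import sys

if sys.version_info[0] > 2:
    xrange = range   # Python 3: range is already lazy, so the alias is enough

total = sum(i for i in xrange(5))   # works unchanged on Python 2 and 3
assert total == 10
```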
setup.py (8 changed lines)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 from distutils.core import setup
-LONGDOC = u"""
+LONGDOC = """
 jieba
 =====
 
@@ -75,6 +75,12 @@ setup(name='jieba',
         'Natural Language :: Chinese (Traditional)',
         'Programming Language :: Python',
         'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
         'Topic :: Text Processing',
         'Topic :: Text Processing :: Indexing',
         'Topic :: Text Processing :: Linguistic',

test/2to3.diff (522 lines, deleted)

@@ -1,522 +0,0 @@ (entire file deleted; its former contents, the old manual 2to3 patch, follow)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800
@@ -26,7 +26,7 @@

def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -93,7 +93,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total

if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)

wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)

for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w

- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

@@ -72,12 +72,12 @@
continue
cm[(words[i].word, words[j].word)] += 1

- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)

nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
@@ -89,7 +88,7 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800
+++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.35'
__license__ = 'MIT'

@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal

def initialize(dictionary=None):
@@ -229,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -243,9 +242,9 @@
# \r\n|\s : whitespace characters. Will not be handled.

if cut_all:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@@ -339,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -393,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800
+++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()

fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result

@@ -105,8 +104,8 @@
yield pair(sentence[next:], pos_list[next][1])

def __cut_detail(sentence):
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@@ -130,7 +129,7 @@
x = 0
N = len(sentence)
buf = ''
- re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+ re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@@ -195,8 +194,8 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800
@@ -8,7 +8,7 @@
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -16,9 +16,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]

- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next

if not obs_states:
@@ -29,7 +29,7 @@
V[t][y] = prob
mem_path[t][y] = state

- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-29 15:46:08.487925926 +0800
+++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._

+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本

特点
========
@@ -68,16 +71,16 @@
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```

输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -408,16 +411,16 @@
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式

seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```

Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-29 15:46:46.379925565 +0800
+++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800
@@ -11,7 +11,7 @@

完整文档见 ``README.md``

-GitHub: https://github.com/fxsjy/jieba
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k

特点
====
@@ -34,17 +34,11 @@
Python 2.x
----------

-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba``
-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
- python setup.py install
-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
-- 通过 ``import jieba`` 来引用
+见 https://pypi.python.org/pypi/jieba/

Python 3.x
----------

-见 https://pypi.python.org/pypi/jieba3k/
-
- 目前 master 分支是只支持 Python 2.x 的
- Python 3.x 版本的分支也已经基本可用:
https://github.com/fxsjy/jieba/tree/jieba3k
@@ -59,13 +53,13 @@

"""

-setup(name='jieba',
+setup(name='jieba3k',
version='0.35.1',
description='Chinese Words Segementation Utilities',
long_description=LONGDOC,
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',
- url='https://github.com/fxsjy/jieba',
+ url='https://github.com/fxsjy/jieba/tree/jieba3k',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
@@ -73,9 +67,8 @@
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
- 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800
+++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800
@@ -51,13 +51,13 @@
print("training...")

nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))

# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)

diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800
+++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800
@@ -152,7 +152,7 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@

class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)

def tearDown(self):
pass
@@ -151,7 +152,7 @@

def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -181,7 +181,7 @@

def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@

def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@

def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
@ -1,34 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Set 2to3 path.
|
|
||||||
PYTHON2TO3=2to3
|
|
||||||
# Copy the python2 version.
|
|
||||||
echo Jieba 2to3 manual conversion tool
|
|
||||||
echo
|
|
||||||
if ! git rev-parse; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo Copying working directory to ../jieba2
|
|
||||||
if [ -d ../jieba2 ]; then
|
|
||||||
echo Found existing ../jieba2
|
|
||||||
read -p "Replace it with new one? (y/n) " -r
|
|
||||||
if ! [[ $REPLY =~ ^[Yy]$ ]]; then
|
|
||||||
echo Cancelled.
|
|
||||||
exit
|
|
||||||
else
|
|
||||||
rm -rf ../jieba2
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
if ! git checkout jieba3k; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
cp -r . ../jieba2
|
|
||||||
cd ../jieba2
|
|
||||||
if ! git checkout master; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
# Here starts auto conversion.
|
|
||||||
echo Converting jieba2 to Python3 ...
|
|
||||||
find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} +
|
|
||||||
find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \;
|
|
||||||
patch -p0 -s <../jieba/test/2to3.diff
|
|
||||||
echo Done. Compare jieba and jieba2 to manually port.
|
|
17
test/demo.py
17
test/demo.py
@@ -1,17 +1,18 @@
 #encoding=utf-8
+from __future__ import unicode_literals
 import sys
 sys.path.append("../")
 
 import jieba
 
-seg_list = jieba.cut(u"我来到北京清华大学", cut_all=True)
-print u"Full Mode:", u"/ ".join(seg_list) # 全模式
+seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
+print("Full Mode: " + "/ ".join(seg_list)) # 全模式
 
-seg_list = jieba.cut(u"我来到北京清华大学", cut_all=False)
-print u"Default Mode:", u"/ ".join(seg_list) # 默认模式
+seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
+print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
 
-seg_list = jieba.cut(u"他来到了网易杭研大厦")
-print u", ".join(seg_list)
+seg_list = jieba.cut("他来到了网易杭研大厦")
+print(", ".join(seg_list))
 
-seg_list = jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print u", ".join(seg_list)
+seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
+print(", ".join(seg_list))
@@ -13,7 +13,7 @@ opt, args = parser.parse_args()
 
 
 if len(args) < 1:
-    print USAGE
+    print(USAGE)
     sys.exit(1)
 
 file_name = args[0]
@@ -27,4 +27,4 @@ content = open(file_name, 'rb').read()
 
 tags = jieba.analyse.extract_tags(content, topK=topK)
 
-print ",".join(tags)
+print(",".join(tags))
@@ -13,7 +13,7 @@ opt, args = parser.parse_args()
 
 
 if len(args) < 1:
-    print USAGE
+    print(USAGE)
     sys.exit(1)
 
 file_name = args[0]
@@ -29,4 +29,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
 
 tags = jieba.analyse.extract_tags(content, topK=topK)
 
-print ",".join(tags)
+print(",".join(tags))
@@ -13,7 +13,7 @@ opt, args = parser.parse_args()
 
 
 if len(args) < 1:
-    print USAGE
+    print(USAGE)
     sys.exit(1)
 
 file_name = args[0]
@@ -30,4 +30,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
 
 tags = jieba.analyse.extract_tags(content, topK=topK)
 
-print ",".join(tags)
+print(",".join(tags))
@@ -14,7 +14,7 @@ opt, args = parser.parse_args()
 
 
 if len(args) < 1:
-    print USAGE
+    print(USAGE)
     sys.exit(1)
 
 file_name = args[0]
@@ -38,6 +38,6 @@ tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
 
 if withWeight is True:
     for tag in tags:
-        print "tag: %s\t\t weight: %f" % (tag[0],tag[1])
+        print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
 else:
-    print ",".join(tags)
+    print(",".join(tags))
@@ -12,7 +12,7 @@ import os
 import random
 
 if len(sys.argv)<2:
-    print "usage: extract_topic.py directory [n_topic] [n_top_words]"
+    print("usage: extract_topic.py directory [n_topic] [n_top_words]")
     sys.exit(0)
 
 n_topic = 10
@@ -28,27 +28,27 @@ count_vect = CountVectorizer()
 docs = []
 
 pattern = os.path.join(sys.argv[1],"*.txt")
-print "read "+pattern
+print("read "+pattern)
 
 for f_name in glob.glob(pattern):
     with open(f_name) as f:
-        print "read file:", f_name
+        print("read file:", f_name)
         for line in f: #one line as a document
             words = " ".join(jieba.cut(line))
             docs.append(words)
 
 random.shuffle(docs)
 
-print "read done."
+print("read done.")
 
-print "transform"
+print("transform")
 counts = count_vect.fit_transform(docs)
 tfidf = TfidfTransformer().fit_transform(counts)
-print tfidf.shape
+print(tfidf.shape)
 
 
 t0 = time.time()
-print "training..."
+print("training...")
 
 nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
 print("done in %0.3fs." % (time.time() - t0))
@@ -1,9 +1,13 @@
 #-*-coding: utf-8 -*-
+from __future__ import unicode_literals, print_function
 import sys
 sys.path.append("../")
 import unittest
 import types
 import jieba
+if sys.version_info[0] > 2:
+    from imp import reload
+
 jieba.initialize()
 
 
@@ -108,8 +112,8 @@ class JiebaTestCase(unittest.TestCase):
             assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
             result = list(result)
             assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print >> sys.stderr, "testDefaultCut"
+            print(" , ".join(result), file=sys.stderr)
+        print("testDefaultCut", file=sys.stderr)
 
     def testCutAll(self):
         for content in test_contents:
@@ -117,8 +121,8 @@ class JiebaTestCase(unittest.TestCase):
             assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
             result = list(result)
             assert isinstance(result, list), "Test CutAll error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print >> sys.stderr, "testCutAll"
+            print(" , ".join(result), file=sys.stderr)
+        print("testCutAll", file=sys.stderr)
 
     def testSetDictionary(self):
         jieba.set_dictionary("foobar.txt")
@@ -127,8 +131,8 @@ class JiebaTestCase(unittest.TestCase):
             assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
             result = list(result)
             assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print >> sys.stderr, "testSetDictionary"
+            print(" , ".join(result), file=sys.stderr)
+        print("testSetDictionary", file=sys.stderr)
 
     def testCutForSearch(self):
         for content in test_contents:
@@ -136,8 +140,8 @@ class JiebaTestCase(unittest.TestCase):
            assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
             result = list(result)
             assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print >> sys.stderr, "testCutForSearch"
+            print(" , ".join(result), file=sys.stderr)
+        print("testCutForSearch", file=sys.stderr)
 
     def testPosseg(self):
         import jieba.posseg as pseg
@@ -146,18 +150,18 @@ class JiebaTestCase(unittest.TestCase):
             assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
             result = list(result)
             assert isinstance(result, list), "Test Posseg error on content: %s" % content
-            print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
-        print >> sys.stderr, "testPosseg"
+            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
+        print("testPosseg", file=sys.stderr)
 
     def testTokenize(self):
         for content in test_contents:
-            result = jieba.tokenize(content.decode('utf-8'))
+            result = jieba.tokenize(content)
             assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
             result = list(result)
             assert isinstance(result, list), "Test Tokenize error on content: %s" % content
             for tk in result:
-                print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
-        print >> sys.stderr, "testTokenize"
+                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
+        print("testTokenize", file=sys.stderr)
 
     def testDefaultCut_NOHMM(self):
         for content in test_contents:
@@ -165,8 +169,8 @@ class JiebaTestCase(unittest.TestCase):
             assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
             result = list(result)
             assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print >> sys.stderr, "testDefaultCut_NOHMM"
+            print(" , ".join(result), file=sys.stderr)
+        print("testDefaultCut_NOHMM", file=sys.stderr)
 
     def testPosseg_NOHMM(self):
         import jieba.posseg as pseg
@@ -175,18 +179,18 @@ class JiebaTestCase(unittest.TestCase):
             assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
             result = list(result)
             assert isinstance(result, list), "Test Posseg error on content: %s" % content
-            print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
-        print >> sys.stderr, "testPosseg_NOHMM"
+            print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
+        print("testPosseg_NOHMM", file=sys.stderr)
 
     def testTokenize_NOHMM(self):
         for content in test_contents:
-            result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+            result = jieba.tokenize(content,HMM=False)
             assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
             result = list(result)
             assert isinstance(result, list), "Test Tokenize error on content: %s" % content
             for tk in result:
-                print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
-        print >> sys.stderr, "testTokenize_NOHMM"
+                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
+        print("testTokenize_NOHMM", file=sys.stderr)
 
     def testCutForSearch_NOHMM(self):
         for content in test_contents:
@@ -194,8 +198,8 @@ class JiebaTestCase(unittest.TestCase):
             assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
             result = list(result)
             assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
-            print >> sys.stderr, " , ".join(result)
-        print >> sys.stderr, "testCutForSearch_NOHMM"
+            print(" , ".join(result), file=sys.stderr)
+        print("testCutForSearch_NOHMM", file=sys.stderr)
 
 if __name__ == "__main__":
     unittest.main()
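The recurring change in the test hunks above is that `jieba.tokenize` (and the cut variants) now receive unicode text directly instead of byte strings decoded with `.decode('utf-8')` at the call site. A minimal sketch of the resulting call, illustrative only and reusing a sentence that appears in these tests:

```python
# Minimal sketch, not from the repository: tokenize expects unicode text,
# which "from __future__ import unicode_literals" provides on Python 2 as well.
from __future__ import unicode_literals, print_function
import jieba

for tk in jieba.tokenize("我来到北京清华大学"):
    # each item is a (word, start, end) tuple
    print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
```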
@@ -6,7 +6,7 @@ cat abc.txt | python jiebacmd.py | sort | uniq -c | sort -nr -k1 | head -100
 
 '''
 
+from __future__ import unicode_literals
 import sys
 sys.path.append("../")
 
@@ -23,6 +23,6 @@ while True:
         break
     line = line.strip()
     for word in jieba.cut(line):
-        print word.encode(default_encoding)
+        print(word)
 
 
@@ -14,7 +14,7 @@ opt, args = parser.parse_args()
 
 
 if len(args) <1:
-    print USAGE
+    print(USAGE)
     sys.exit(1)
 
 file_name = args[0]
@@ -29,6 +29,6 @@ content = open(file_name,'rb').read()
 
 tags = jieba.analyse.extract_tags(content,topK=topK)
 
-print ",".join(tags)
+print(",".join(tags))
 
 
@@ -1,4 +1,5 @@
 #encoding=utf-8
+from __future__ import print_function
 import sys
 sys.path.append("../../")
 import jieba
@@ -7,8 +8,8 @@ jieba.enable_parallel(4)
 def cuttest(test_sent):
     result = jieba.cut(test_sent)
     for word in result:
-        print word, "/",
-    print ""
+        print(word, "/", end=' ')
+    print("")
 
 
 if __name__ == "__main__":
@@ -1,4 +1,5 @@
 #encoding=utf-8
+from __future__ import print_function
 import sys
 sys.path.append("../../")
 import jieba
@@ -7,8 +8,8 @@ jieba.enable_parallel(4)
 def cuttest(test_sent):
     result = jieba.cut(test_sent,cut_all=True)
     for word in result:
-        print word, "/",
-    print ""
+        print(word, "/", end=' ')
+    print("")
 
 
 if __name__ == "__main__":
@@ -1,4 +1,5 @@
 #encoding=utf-8
+from __future__ import print_function
 import sys
 sys.path.append("../../")
 import jieba
@@ -7,8 +8,8 @@ jieba.enable_parallel(4)
 def cuttest(test_sent):
     result = jieba.cut_for_search(test_sent)
     for word in result:
-        print word, "/",
-    print ""
+        print(word, "/", end=' ')
+    print("")
 
 
 if __name__ == "__main__":
@@ -1,6 +1,5 @@
-import urllib2
-import sys,time
 import sys
+import time
 sys.path.append("../../")
 import jieba
 
@@ -17,5 +16,5 @@ tm_cost = t2-t1
 log_f = open("1.log","wb")
 log_f.write(words.encode('utf-8'))
 
-print 'speed' , len(content)/tm_cost, " bytes/second"
+print('speed %s bytes/second' % (len(content)/tm_cost))
 
@@ -1,4 +1,5 @@
 #encoding=utf-8
+from __future__ import print_function
 import sys
 sys.path.append("../../")
 import jieba
@@ -8,8 +9,8 @@ import jieba.posseg as pseg
 def cuttest(test_sent):
     result = pseg.cut(test_sent)
     for w in result:
-        print w.word, "/", w.flag, ", ",
-    print ""
+        print(w.word, "/", w.flag, ", ", end=' ')
+    print("")
 
 
 if __name__ == "__main__":
@@ -1,9 +1,10 @@
-import urllib2
+from __future__ import print_function
 import sys,time
 import sys
 sys.path.append("../../")
 import jieba
 import jieba.posseg as pseg
 
 jieba.enable_parallel(4)
 
 url = sys.argv[1]
@@ -14,9 +15,8 @@ words = list(pseg.cut(content))
 t2 = time.time()
 tm_cost = t2-t1
 
-log_f = open("1.log","wb")
-for w in words:
-    print >> log_f, w.encode("utf-8"), "/" ,
+log_f = open("1.log","w")
+log_f.write(' / '.join(map(str, words)))
 
-print 'speed' , len(content)/tm_cost, " bytes/second"
+print('speed' , len(content)/tm_cost, " bytes/second")
 
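Worth noting in the hunk above: the Python-2-only `print >> log_f, ...` redirection is replaced by a plain file write of the joined tokens. A rough sketch of the portable alternatives, illustrative only (the sample words are hypothetical):

```python
# Illustrative only, not code from the commit.
from __future__ import print_function

words = ["北京", "清华大学"]  # hypothetical sample tokens
with open("1.log", "w") as log_f:
    log_f.write(' / '.join(map(str, words)))  # plain write works on Python 2 and 3
    print("done", file=log_f)                 # print() can also target a file object
```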
@@ -6,7 +6,7 @@ import jieba
 
 def cuttest(test_sent):
     result = jieba.cut(test_sent)
-    print " / ".join(result)
+    print(" / ".join(result))
 
 
 if __name__ == "__main__":
@@ -5,5 +5,5 @@ import jieba
 import jieba.posseg as pseg
 words=pseg.cut("又跛又啞")
 for w in words:
-    print w.word,w.flag
+    print(w.word,w.flag)
 
@@ -5,7 +5,7 @@ import jieba
 
 def cuttest(test_sent):
     result = jieba.cut(test_sent)
-    print " ".join(result)
+    print(" ".join(result))
 
 def testcase():
     cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
@@ -22,6 +22,6 @@ def testcase():
 if __name__ == "__main__":
     testcase()
     jieba.set_dictionary("foobar.txt")
-    print "================================"
+    print("================================")
     testcase()
 
@@ -6,8 +6,8 @@ import jieba
 def cuttest(test_sent):
     result = jieba.cut_for_search(test_sent)
     for word in result:
-        print word, "/",
-    print ""
+        print(word, "/", end=' ')
+    print("")
 
 
 if __name__ == "__main__":
@@ -6,8 +6,8 @@ import jieba
 def cuttest(test_sent):
     result = jieba.cut(test_sent,cut_all=True)
     for word in result:
-        print word, "/",
-    print ""
+        print(word, "/", end=' ')
+    print("")
 
 
 if __name__ == "__main__":
@@ -1,4 +1,3 @@
-import urllib2
 import sys,time
 import sys
 sys.path.append("../")
@@ -17,6 +16,6 @@ log_f = open("1.log","wb")
 log_f.write(words.encode('utf-8'))
 log_f.close()
 
-print 'cost',tm_cost
-print 'speed' , len(content)/tm_cost, " bytes/second"
+print('cost ' + tm_cost)
+print('speed %s bytes/second' % (len(content)/tm_cost))
 
@@ -8,18 +8,18 @@ import jieba
 class Worker(threading.Thread):
     def run(self):
         seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
-        print "Full Mode:" + "/ ".join(seg_list) #全模式
+        print("Full Mode:" + "/ ".join(seg_list)) #全模式
 
         seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
-        print "Default Mode:" + "/ ".join(seg_list) #默认模式
+        print("Default Mode:" + "/ ".join(seg_list)) #默认模式
 
         seg_list = jieba.cut("他来到了网易杭研大厦")
-        print ", ".join(seg_list)
+        print(", ".join(seg_list))
 
         seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
-        print ", ".join(seg_list)
+        print(", ".join(seg_list))
 workers = []
-for i in xrange(10):
+for i in range(10):
     worker = Worker()
     workers.append(worker)
     worker.start()
@@ -6,7 +6,7 @@ import jieba
 
 def cuttest(test_sent):
     result = jieba.cut(test_sent,HMM=False)
-    print " / ".join(result)
+    print(" / ".join(result))
 
 
 if __name__ == "__main__":
@@ -1,4 +1,5 @@
 #encoding=utf-8
+from __future__ import print_function
 import sys
 sys.path.append("../")
 import jieba.posseg as pseg
@@ -6,8 +7,8 @@ import jieba.posseg as pseg
 def cuttest(test_sent):
     result = pseg.cut(test_sent)
     for w in result:
-        print w.word, "/", w.flag, ", ",
-    print ""
+        print(w.word, "/", w.flag, ", ", end=' ')
+    print("")
 
 
 if __name__ == "__main__":
@@ -1,6 +1,6 @@
-import urllib2
-import sys,time
+from __future__ import print_function
 import sys
+import time
 sys.path.append("../")
 import jieba
 jieba.initialize()
@@ -14,9 +14,8 @@ words = list(pseg.cut(content))
 t2 = time.time()
 tm_cost = t2-t1
 
-log_f = open("1.log","wb")
-for w in words:
-    print >> log_f, w.encode("utf-8"), "/" ,
+log_f = open("1.log","w")
+log_f.write(' / '.join(map(str, words)))
 
-print 'speed' , len(content)/tm_cost, " bytes/second"
+print('speed' , len(content)/tm_cost, " bytes/second")
 
@@ -6,8 +6,8 @@ import jieba.posseg as pseg
 def cuttest(test_sent):
     result = pseg.cut(test_sent,HMM=False)
     for w in result:
-        print w.word, "/", w.flag, ", ",
-    print ""
+        print(w.word, "/", w.flag, ", ", end=' ')
+    print("")
 
 
 if __name__ == "__main__":
@@ -7,10 +7,9 @@ g_mode="default"
 
 def cuttest(test_sent):
     global g_mode
-    test_sent = test_sent.decode('utf-8')
     result = jieba.tokenize(test_sent,mode=g_mode)
     for tk in result:
         print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 
 
 if __name__ == "__main__":
@@ -7,10 +7,9 @@ g_mode="default"
 
 def cuttest(test_sent):
     global g_mode
-    test_sent = test_sent.decode('utf-8')
     result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
     for tk in result:
         print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 
 
 if __name__ == "__main__":
@@ -9,19 +9,19 @@ test_sent = "李小福是创新办主任也是云计算方面的专家; 什么
 test_sent += "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
 words = jieba.cut(test_sent)
 for w in words:
-    print w
+    print(w)
 
 result = pseg.cut(test_sent)
 
 for w in result:
-    print w.word, "/", w.flag, ", ",
+    print(w.word, "/", w.flag, ", ", end=' ')
 
-print "\n========"
+print("\n========")
 
 terms = jieba.cut('easy_install is great')
 for t in terms:
-    print t
-print '-------------------------'
+    print(t)
+print('-------------------------')
 terms = jieba.cut('python 的正则表达式是好用的')
 for t in terms:
-    print t
+    print(t)
@@ -1,4 +1,5 @@
 # -*- coding: UTF-8 -*-
+from __future__ import unicode_literals
 import sys,os
 sys.path.append("../")
 from whoosh.index import create_in,open_dir
@@ -18,46 +19,46 @@ ix = create_in("tmp", schema) # for create new index
 writer = ix.writer()
 
 writer.add_document(
-    title=u"document1",
-    path=u"/a",
-    content=u"This is the first document we’ve added!"
+    title="document1",
+    path="/a",
+    content="This is the first document we’ve added!"
 )
 
 writer.add_document(
-    title=u"document2",
-    path=u"/b",
-    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
+    title="document2",
+    path="/b",
+    content="The second one 你 中文测试中文 is even more interesting! 吃水果"
 )
 
 writer.add_document(
-    title=u"document3",
-    path=u"/c",
-    content=u"买水果然后来世博园。"
+    title="document3",
+    path="/c",
+    content="买水果然后来世博园。"
 )
 
 writer.add_document(
-    title=u"document4",
-    path=u"/c",
-    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
+    title="document4",
+    path="/c",
+    content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
 )
 
 writer.add_document(
-    title=u"document4",
-    path=u"/c",
-    content=u"咱俩交换一下吧。"
+    title="document4",
+    path="/c",
+    content="咱俩交换一下吧。"
 )
 
 writer.commit()
 searcher = ix.searcher()
 parser = QueryParser("content", schema=ix.schema)
 
-for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
-    print "result of ",keyword
+for keyword in ("水果世博园","你","first","中文","交换机","交换"):
+    print("result of ",keyword)
     q = parser.parse(keyword)
     results = searcher.search(q)
     for hit in results:
-        print hit.highlights("content")
-    print "="*10
+        print(hit.highlights("content"))
+    print("="*10)
 
-for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
-    print t.text
+for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
+    print(t.text)
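The whoosh test changes above are almost entirely mechanical: once `from __future__ import unicode_literals` is in effect, the explicit `u""` prefixes can be dropped without changing what the literals are at runtime. A tiny illustration, not part of the commit:

```python
# Illustration only: with unicode_literals, a bare literal is unicode text
# on Python 2 as well as Python 3, so u"..." and "..." are interchangeable.
from __future__ import unicode_literals

title = "document1"
assert isinstance(title, type(u"")), "bare literal is already unicode"
```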
@@ -1,4 +1,5 @@
 # -*- coding: UTF-8 -*-
+from __future__ import unicode_literals
 import sys
 import os
 sys.path.append("../")
@@ -23,8 +24,8 @@ with open(file_name,"rb") as inf:
     for line in inf:
         i+=1
         writer.add_document(
-            title=u"line"+str(i),
-            path=u"/a",
+            title="line"+str(i),
+            path="/a",
             content=line.decode('gbk','ignore')
         )
 writer.commit()
@@ -32,10 +33,10 @@ writer.commit()
 searcher = ix.searcher()
 parser = QueryParser("content", schema=ix.schema)
 
-for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"):
-    print "result of ",keyword
+for keyword in ("水果小姐","你","first","中文","交换机","交换"):
+    print("result of " + keyword)
     q = parser.parse(keyword)
     results = searcher.search(q)
     for hit in results:
-        print hit.highlights("content")
-    print "="*10
+        print(hit.highlights("content"))
+    print("="*10)
@@ -1,4 +1,5 @@
 # -*- coding: UTF-8 -*-
+from __future__ import unicode_literals
 import sys
 import os
 sys.path.append("../")
@@ -18,10 +19,10 @@ ix = open_dir("tmp")
 searcher = ix.searcher()
 parser = QueryParser("content", schema=ix.schema)
 
-for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"):
-    print "result of ",keyword
+for keyword in ("水果小姐","你","first","中文","交换机","交换","少林","乔峰"):
+    print("result of ",keyword)
     q = parser.parse(keyword)
     results = searcher.search(q)
     for hit in results:
-        print hit.highlights("content")
-    print "="*10
+        print(hit.highlights("content"))
+    print("="*10)