mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
fix self.FREQ in cut_for_search; make pair object iterable
This commit is contained in:
parent
3b76328f2a
commit
ceb5c26be4
@@ -200,8 +200,8 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
|
|||||||
```pycon
|
```pycon
|
||||||
>>> import jieba.posseg as pseg
|
>>> import jieba.posseg as pseg
|
||||||
>>> words = pseg.cut("我爱北京天安门")
|
>>> words = pseg.cut("我爱北京天安门")
|
||||||
>>> for w in words:
|
>>> for word, flag in words:
|
||||||
... print('%s %s' % (w.word, w.flag))
|
... print('%s %s' % (word, flag))
|
||||||
...
|
...
|
||||||
我 r
|
我 r
|
||||||
爱 v
|
爱 v
|
||||||
|
@@ -310,12 +310,12 @@ class Tokenizer(object):
|
|||||||
if len(w) > 2:
|
if len(w) > 2:
|
||||||
for i in xrange(len(w) - 1):
|
for i in xrange(len(w) - 1):
|
||||||
gram2 = w[i:i + 2]
|
gram2 = w[i:i + 2]
|
||||||
if FREQ.get(gram2):
|
if self.FREQ.get(gram2):
|
||||||
yield gram2
|
yield gram2
|
||||||
if len(w) > 3:
|
if len(w) > 3:
|
||||||
for i in xrange(len(w) - 2):
|
for i in xrange(len(w) - 2):
|
||||||
gram3 = w[i:i + 3]
|
gram3 = w[i:i + 3]
|
||||||
if FREQ.get(gram3):
|
if self.FREQ.get(gram3):
|
||||||
yield gram3
|
yield gram3
|
||||||
yield w
|
yield w
|
||||||
|
|
||||||
|
@@ -70,7 +70,7 @@ class pair(object):
|
|||||||
return '%s/%s' % (self.word, self.flag)
|
return '%s/%s' % (self.word, self.flag)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return self.__str__()
|
return 'pair(%r, %r)' % (self.word, self.flag)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if PY2:
|
if PY2:
|
||||||
@@ -78,6 +78,9 @@ class pair(object):
|
|||||||
else:
|
else:
|
||||||
return self.__unicode__()
|
return self.__unicode__()
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return iter((self.word, self.flag))
|
||||||
|
|
||||||
def encode(self, arg):
|
def encode(self, arg):
|
||||||
return self.__unicode__().encode(arg)
|
return self.__unicode__().encode(arg)
|
||||||
|
|
||||||
|
@@ -62,8 +62,8 @@ print('4. 词性标注')
|
|||||||
print('-'*40)
|
print('-'*40)
|
||||||
|
|
||||||
words = jieba.posseg.cut("我爱北京天安门")
|
words = jieba.posseg.cut("我爱北京天安门")
|
||||||
for w in words:
|
for word, flag in words:
|
||||||
print('%s %s' % (w.word, w.flag))
|
print('%s %s' % (word, flag))
|
||||||
|
|
||||||
print('='*40)
|
print('='*40)
|
||||||
print('6. Tokenize: 返回词语在原文的起止位置')
|
print('6. Tokenize: 返回词语在原文的起止位置')
|
||||||
|
@@ -6,8 +6,8 @@ import jieba.posseg as pseg
|
|||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = pseg.cut(test_sent)
|
result = pseg.cut(test_sent)
|
||||||
for w in result:
|
for word, flag in result:
|
||||||
print(w.word, "/", w.flag, ", ", end=' ')
|
print(word, "/", flag, ", ", end=' ')
|
||||||
print("")
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
@@ -5,9 +5,9 @@ sys.path.append("../")
|
|||||||
import jieba.posseg as pseg
|
import jieba.posseg as pseg
|
||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = pseg.cut(test_sent,HMM=False)
|
result = pseg.cut(test_sent, HMM=False)
|
||||||
for w in result:
|
for word, flag in result:
|
||||||
print(w.word, "/", w.flag, ", ", end=' ')
|
print(word, "/", flag, ", ", end=' ')
|
||||||
print("")
|
print("")
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user