mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Build a stable sort for graph iteration so that we get stable results.
This commit is contained in:
parent
49657c976d
commit
1152db7736
@ -3,10 +3,12 @@
|
|||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import sys
|
import sys
|
||||||
|
import json
|
||||||
import collections
|
import collections
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
import jieba.posseg as pseg
|
import jieba.posseg as pseg
|
||||||
|
|
||||||
|
|
||||||
class UndirectWeightedGraph:
|
class UndirectWeightedGraph:
|
||||||
d = 0.85
|
d = 0.85
|
||||||
|
|
||||||
@ -18,17 +20,23 @@ class UndirectWeightedGraph:
|
|||||||
self.graph[start].append((start, end, weight))
|
self.graph[start].append((start, end, weight))
|
||||||
self.graph[end].append((end, start, weight))
|
self.graph[end].append((end, start, weight))
|
||||||
|
|
||||||
|
def refactor(self):
|
||||||
|
for n, _ in self.graph.items():
|
||||||
|
self.graph[n].sort()
|
||||||
|
|
||||||
def rank(self):
|
def rank(self):
|
||||||
ws = collections.defaultdict(float)
|
ws = collections.defaultdict(float)
|
||||||
outSum = collections.defaultdict(float)
|
outSum = collections.defaultdict(float)
|
||||||
|
|
||||||
|
giter = list(self.graph.items()) # these two lines for build stable iteration
|
||||||
|
giter.sort()
|
||||||
wsdef = 1.0 / (len(self.graph) or 1.0)
|
wsdef = 1.0 / (len(self.graph) or 1.0)
|
||||||
for n, out in self.graph.items():
|
for n, out in giter:
|
||||||
ws[n] = wsdef
|
ws[n] = wsdef
|
||||||
outSum[n] = sum((e[2] for e in out), 0.0)
|
outSum[n] = sum((e[2] for e in out), 0.0)
|
||||||
|
|
||||||
for x in xrange(10): # 10 iters
|
for x in range(10): # 10 iters
|
||||||
for n, inedges in self.graph.items():
|
for n, inedges in giter:
|
||||||
s = 0
|
s = 0
|
||||||
for e in inedges:
|
for e in inedges:
|
||||||
s += e[2] / outSum[e[1]] * ws[e[1]]
|
s += e[2] / outSum[e[1]] * ws[e[1]]
|
||||||
@ -36,7 +44,7 @@ class UndirectWeightedGraph:
|
|||||||
|
|
||||||
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
|
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
|
||||||
|
|
||||||
for w in itervalues(ws):
|
for _, w in ws.items():
|
||||||
if w < min_rank:
|
if w < min_rank:
|
||||||
min_rank = w
|
min_rank = w
|
||||||
elif w > max_rank:
|
elif w > max_rank:
|
||||||
@ -64,9 +72,9 @@ def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v'
|
|||||||
cm = collections.defaultdict(int)
|
cm = collections.defaultdict(int)
|
||||||
span = 5
|
span = 5
|
||||||
words = list(pseg.cut(sentence))
|
words = list(pseg.cut(sentence))
|
||||||
for i in xrange(len(words)):
|
for i in range(len(words)):
|
||||||
if words[i].flag in pos_filt:
|
if words[i].flag in pos_filt:
|
||||||
for j in xrange(i + 1, i + span):
|
for j in range(i + 1, i + span):
|
||||||
if j >= len(words):
|
if j >= len(words):
|
||||||
break
|
break
|
||||||
if words[j].flag not in pos_filt:
|
if words[j].flag not in pos_filt:
|
||||||
@ -75,7 +83,6 @@ def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v'
|
|||||||
|
|
||||||
for terms, w in cm.items():
|
for terms, w in cm.items():
|
||||||
g.addEdge(terms[0], terms[1], w)
|
g.addEdge(terms[0], terms[1], w)
|
||||||
|
|
||||||
nodes_rank = g.rank()
|
nodes_rank = g.rank()
|
||||||
if withWeight:
|
if withWeight:
|
||||||
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
|
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user