根據jieba textrank演算法的思路,手動復現textrank演算法。
思路:1. 分詞,並確定共現視窗大小。 2. 根據視窗大小,統計視窗內的共現詞對及其出現頻率,頻率即為共現權重。 trick:每條邊同時記錄正向與反向,使圖成為無向圖。 3. 根據 TextRank 每個詞權重的迭代公式,以雙層迴圈(寫法類似氣泡排序)走訪每個詞的所有共現詞,將它們的權重代入公式。 4. 迭代 10 次,使每個詞的權重趨於收斂。 5. 根據權重由高到低排序,輸出 top words。
import collections
import sys
import jieba
import jieba.posseg as psg
from operator import itemgetter
class UndirectWeightedGraph:
    """Undirected weighted graph whose nodes are scored with TextRank.

    Edges are kept per node in ``self.edges`` as ``(node, neighbour, weight)``
    tuples; each undirected edge is stored once under each endpoint.
    """

    # Damping factor of the TextRank update rule.
    d = 0.85

    def __init__(self):
        # node -> list of (node, neighbour, weight) tuples leaving that node
        self.edges = collections.defaultdict(list)

    def add_edge(self, start, end, weight):
        """Add an undirected edge by recording it under both endpoints.

        Args:
            start: One endpoint (any hashable object).
            end: The other endpoint (any hashable object).
            weight: Edge weight; ``rank`` divides by the per-node sum of
                these weights, so a node whose weights sum to zero makes
                ``rank`` raise ``ZeroDivisionError``.
        """
        self.edges[start].append((start, end, weight))
        self.edges[end].append((end, start, weight))

    def rank(self):
        """Run 10 TextRank iterations and return the normalised node scores.

        Each pass updates the scores in place, node by node, in the insertion
        order of ``self.edges``:

            score(n) = (1 - d) + d * sum(w(n, m) / out_sum(m) * score(m))

        Returns:
            A ``defaultdict(float)`` mapping every node to its score, scaled
            so that the highest-scoring node is exactly 1.0. The mapping is
            empty when the graph has no edges.
        """
        ws = collections.defaultdict(float)
        out_sum = collections.defaultdict(float)

        # Every node starts with the same share of the total score.
        initial = 1.0 / (len(self.edges) or 1.0)
        for node, node_edges in self.edges.items():
            out_sum[node] = sum(edge[2] for edge in node_edges)
            ws[node] = initial

        for _ in range(10):
            for node, node_edges in self.edges.items():
                s = 0
                for _, neighbour, weight in node_edges:
                    s += weight / out_sum[neighbour] * ws[neighbour]
                # Apply the damping factor; the previous code stored the raw
                # sum and never used ``d``.
                ws[node] = (1 - self.d) + self.d * s

        if not ws:
            return ws

        min_rank = min(ws.values())
        max_rank = max(ws.values())

        # Normalise the weight itself (the previous code subtracted from the
        # node key).  Only ``min_rank`` is divided by 10, as in jieba's
        # reference formula, so the lowest node keeps a non-zero score and
        # the denominator does not collapse when all scores are equal.
        offset = min_rank / 10.0
        scale = max_rank - offset
        for node in ws:
            ws[node] = (ws[node] - offset) / scale
        return ws
class TextRank(object):
    """Keyword extractor that ranks co-occurring words of a text with TextRank."""

    def __init__(self):
        # Lower-case words that must never become graph nodes.
        self.stopwords = []
        # POS flags a token must carry to be kept; while this is empty no
        # token passes ``pairfilter``.
        self.pos_filter = []
        # Co-occurrence window: a kept token is paired with kept tokens among
        # the ``span - 1`` tokens that follow it.
        self.span = 5

    def pairfilter(self, wp):
        """Return True when token ``wp`` may be used as a graph node.

        Args:
            wp: A token exposing ``.word`` and ``.flag`` attributes.

        Returns:
            True only if the flag is in ``self.pos_filter``, the word has at
            least two characters, and its lower-cased form is not in
            ``self.stopwords``.
        """
        if wp.flag not in self.pos_filter:
            return False
        if len(wp.word) < 2:
            return False
        # ``lower`` must be called; comparing the bound method itself against
        # the stopwords never matches, which disabled stopword filtering.
        return wp.word.lower() not in self.stopwords

    def textrank(self, sentence, topk=20):
        """Rank the words of ``sentence`` and return the best ``topk``.

        Args:
            sentence: Text to segment with ``psg.lcut``.
            topk: Maximum number of entries to return.

        Returns:
            A list of ``(token, score)`` tuples sorted by score, highest
            first. The tokens are the objects produced by ``psg.lcut``.
        """
        uwg = UndirectWeightedGraph()
        words = psg.lcut(sentence)

        # Evaluate the filter once per token rather than once per window pair.
        keep = [self.pairfilter(wp) for wp in words]

        # (token, later token) -> number of co-occurrences inside the window
        cooccurrence = collections.defaultdict(int)
        for i, wp in enumerate(words):
            if not keep[i]:
                continue
            # Honour ``self.span`` instead of a hard-coded window of 5.
            window_end = min(i + self.span, len(words))
            for j in range(i + 1, window_end):
                if keep[j]:
                    cooccurrence[(wp, words[j])] += 1

        for (first, second), count in cooccurrence.items():
            uwg.add_edge(first, second, count)

        scores = uwg.rank()
        ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)
        return ranked[:topk]