


首先我想说的是,通过 embedding(GloVe 或 word2vec)这样的技术去计算 Similarity Matrix,可能确实会在方方面面(准确率、算法效率、时间复杂度、空间复杂度等等)带来不错的性能提升,但我还没来得及研究。下面咱们一步一步研究一下仓库一的核心代码 Summarize.py:

from math import log10

from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from .commons import build_graph as _build_graph
from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
def _set_graph_edge_weights(graph):
    """Add a similarity-weighted edge between every pair of distinct nodes.

    Each node is a sentence token string. An edge is only added when the
    pair has no edge yet and its similarity is non-zero.
    """
    for sentence_1 in graph.nodes():
        for sentence_2 in graph.nodes():
            edge = (sentence_1, sentence_2)
            if sentence_1 != sentence_2 and not graph.has_edge(edge):
                similarity = _get_similarity(sentence_1, sentence_2)
                if similarity != 0:
                    graph.add_edge(edge, similarity)

    # Handles the case in which all similarities are zero (this is also true
    # when no edge was added at all, because all() of nothing is True).
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)


def _create_valid_graph(graph):
    """Connect every ordered pair of distinct nodes with an edge of weight 1.

    Any edge that already exists for a pair is deleted and re-added so that
    its weight becomes 1.
    """
    nodes = graph.nodes()
    for i, first in enumerate(nodes):
        for j, second in enumerate(nodes):
            if i == j:
                continue

            edge = (first, second)
            if graph.has_edge(edge):
                graph.del_edge(edge)
            graph.add_edge(edge, 1)


def _get_similarity(s1, s2):
    """Return the word-overlap similarity of two whitespace-separated strings.

    The score is the number of distinct words the two strings share, divided
    by ``log10(len(words_1)) + log10(len(words_2))``.

    Returns 0 when either string contains no words (``log10(0)`` is
    undefined and would raise ``ValueError``) or when the denominator is
    zero, i.e. both strings consist of exactly one word.
    """
    words_sentence_one = s1.split()
    words_sentence_two = s2.split()

    # An empty sentence shares no words with anything; bail out before log10(0).
    if not words_sentence_one or not words_sentence_two:
        return 0

    common_word_count = _count_common_words(words_sentence_one, words_sentence_two)

    log_s1 = log10(len(words_sentence_one))
    log_s2 = log10(len(words_sentence_two))

    if log_s1 + log_s2 == 0:
        return 0

    return common_word_count / (log_s1 + log_s2)


def _count_common_words(words_sentence_one, words_sentence_two):
    """Return how many distinct words appear in both word lists."""
    return len(set(words_sentence_one) & set(words_sentence_two))


def _format_results(extracted_sentences, split, score):
    """Shape the extracted sentences according to the output flags.

    Returns a list of ``(text, score)`` tuples when ``score`` is truthy,
    otherwise a list of texts when ``split`` is truthy, otherwise a single
    string with the texts joined by newlines. ``score`` takes precedence
    over ``split``.
    """
    if score:
        return [(sentence.text, sentence.score) for sentence in extracted_sentences]
    if split:
        return [sentence.text for sentence in extracted_sentences]
    return "\n".join([sentence.text for sentence in extracted_sentences])


def _add_scores_to_sentences(sentences, scores):
    """Set ``sentence.score`` on every sentence, in place.

    The score is looked up in ``scores`` by ``sentence.token``; sentences
    whose token is missing from ``scores`` get a score of 0.
    """
    for sentence in sentences:
        sentence.score = scores.get(sentence.token, 0)


def _get_sentences_with_word_count(sentences, words):
    """Given a list of sentences, returns a list of sentences with a
    total word count similar to the word count provided.

    Sentences are taken in the order given; selection stops at the first
    sentence whose inclusion would move the running total further away
    from ``words``.
    """
    word_count = 0
    selected_sentences = []

    # Loops until the word count is reached.
    for sentence in sentences:
        words_in_sentence = len(sentence.text.split())

        # Checks if the inclusion of the sentence gives a better approximation
        # to the word parameter.
        if abs(words - word_count - words_in_sentence) > abs(words - word_count):
            return selected_sentences

        selected_sentences.append(sentence)
        word_count += words_in_sentence

    return selected_sentences


def _extract_most_important_sentences(sentences, ratio, words):
    """Return the highest-scored sentences, limited by ``ratio`` or ``words``.

    Note that ``sentences`` is sorted in place by descending score.
    """
    sentences.sort(key=lambda s: s.score, reverse=True)

    # If no "words" option is selected, the number of sentences is
    # reduced by the provided ratio.
    if words is None:
        length = len(sentences) * ratio
        return sentences[:int(length)]

    # Else, the ratio is ignored.
    return _get_sentences_with_word_count(sentences, words)


def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    """Build an extractive summary of ``text`` by ranking its sentences.

    Args:
        text: The text to summarize; must be a ``str``.
        ratio: Fraction of the sentences to keep when ``words`` is None.
        words: Approximate word count of the summary; when given, ``ratio``
            is ignored.
        language: Passed through to the sentence cleaner.
        split: Return a list of sentence texts instead of one string.
        scores: Return a list of ``(text, score)`` tuples; takes precedence
            over ``split``.
        additional_stopwords: Passed through to the sentence cleaner.

    Returns:
        The selected sentences in their original order, shaped as described
        above. When the graph ends up with no nodes the result is ``[]`` if
        ``split`` is truthy and ``""`` otherwise.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edges weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)


def get_graph(text, language="english"):
    """Return the sentence graph built from ``text`` with its edge weights set.

    No unreachable-node removal and no ranking is performed here.
    """
    sentences = _clean_text_by_sentences(text, language)

    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph



The design of the Olympic medals will be unveiled tonight in a live ceremony from Trafalgar Square. Over at the brand new Aquatics Centre, Britain's star diver Tom Daley is going to perform an official launch dive into the Olympic pool. With this building, the organisers have attempted to give London a landmark to rival Beijing's Water Cube from 2008. It was designed by the prestigious architect Zaha Hadid and has a wave-like roof that is 160 metres long. Today's special events are designed to arouse interest in the Olympics around the world and to encourage British fans too. Many failed to get Olympic tickets in the recent sales process.


    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language, additional_stopwords)



   # Creates the graph and calculates the similarity coefficient for every pair of nodes.
   graph = _build_graph([sentence.token for sentence in sentences])
   _set_graph_edge_weights(graph)




# Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

# Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

# Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

return _format_results(extracted_sentences, split, scores)



第二个问题,提问者也很有礼貌,问的也正是我的疑问:我们看第一张图里是有把 sentence 转化为 vector 这个步骤的,但这个仓库没有用这样的方法(gensim 用到了)。owner 也回答了,并给出了论文链接:TextRank paper,还有 “Variations of the Similarity”。说实话,我还没来得及去看。


def pagerank_weighted(graph, initial_value=None, damping=0.85):
    """Calculates PageRank for an undirected graph.

    Scores are refined iteratively, for at most 100 sweeps over the nodes.
    Each node's score is updated in place, so later nodes in the same sweep
    already see the new values of earlier ones. Iteration stops early once
    every node changed by no more than CONVERGENCE_THRESHOLD in a sweep.

    Args:
        graph: Object exposing ``nodes()``, ``neighbors(node)`` and
            ``edge_weight((a, b))``.
        initial_value: Starting score for every node; defaults to
            ``1 / number_of_nodes``.
        damping: Damping factor of the PageRank formula.

    Returns:
        dict mapping each node to its score; empty for a graph without nodes.
    """
    nodes = graph.nodes()

    # Nothing to rank; this also avoids dividing by zero just below.
    if len(nodes) == 0:
        return {}

    if initial_value is None:
        initial_value = 1.0 / len(nodes)
    scores = dict.fromkeys(nodes, initial_value)

    # The graph is not modified while ranking, so the total edge weight of
    # each node is computed once instead of once per neighbour visit.
    weight_totals = {
        node: sum(graph.edge_weight((node, k)) for k in graph.neighbors(node))
        for node in nodes
    }

    for _ in range(100):
        convergence_achieved = 0
        for i in nodes:
            rank = 1 - damping
            for j in graph.neighbors(i):
                neighbors_sum = weight_totals[j]
                # A neighbour whose edges all weigh zero has nothing to
                # distribute; skipping it avoids a ZeroDivisionError.
                if neighbors_sum == 0:
                    continue
                rank += damping * scores[j] * graph.edge_weight((j, i)) / neighbors_sum

            if abs(scores[i] - rank) <= CONVERGENCE_THRESHOLD:
                convergence_achieved += 1

            scores[i] = rank

        if convergence_achieved == len(nodes):
            break

    return scores
def pagerank_weighted_scipy(graph, damping=0.85):
    """Calculates PageRank from the left eigenvectors of the PageRank matrix.

    The PageRank matrix is ``damping * adjacency + (1 - damping) * probability``
    where both matrices are built from ``graph`` by helpers of this module.

    Args:
        graph: Graph accepted by ``build_adjacency_matrix``,
            ``build_probability_matrix`` and ``process_results``.
        damping: Damping factor used to mix the two matrices.

    Returns:
        Whatever ``process_results(graph, vecs)`` returns.
        NOTE(review): presumably a dict of node -> score, as with
        ``pagerank_weighted`` — confirm against ``process_results``.
    """
    adjacency_matrix = build_adjacency_matrix(graph)
    probability_matrix = build_probability_matrix(graph)

    # Suppress deprecation warnings from numpy.
    # See https://github.com/summanlp/textrank/issues/57
    import warnings

    # NumPy 2.0 removed VisibleDeprecationWarning from the top-level
    # namespace; it lives in numpy.exceptions since NumPy 1.25.
    try:
        from numpy.exceptions import VisibleDeprecationWarning
    except ImportError:
        from numpy import VisibleDeprecationWarning

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
        warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
        pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix

    # Only the left eigenvectors are needed; the eigenvalues are discarded.
    _, vecs = eig(pagerank_matrix, left=True, right=False)
    return process_results(graph, vecs)


GitHub - summanlp/textrank: TextRank implementation for Python 3.


TextRank算法详细讲解与代码实现(完整) - 方格田 - 博客园





summa vs gensim · Issue #78 · summanlp/textrank · GitHub


Markov chains

Markov chains


The Markov transform matrix is a stochastic matrix. The Perron–Frobenius theorem ensures that every irreducible stochastic matrix has a stationary vector, and that the largest absolute value of an eigenvalue is always 1. You can find explanations of the proof in the two previous links or just google more simplified explanations. Thank you for your interest!



Why did not you use iterative method here? · Issue #77 · summanlp/textrank · GitHub


