TextRank-based keyword extraction is a graph-based ranking algorithm. The steps are as follows:
1. Text preprocessing: clean the text by removing irrelevant characters, punctuation, stop words, and so on, then tokenize it into individual words.

2. Build the graph (nodes and edges): use the words of the text as graph nodes, and build edges from the co-occurrence relations between words, typically connecting words that appear together within a window. Concretely, this step breaks down into the following stages:

2.1 Tokenization: split the text into individual words.

2.2 Establish co-occurrence within a window: set a window size and slide it over the words of the text; words inside the same window are treated as co-occurring. The window size can be tuned to the task and the characteristics of the text.

2.3 Build the nodes and edges: make each word in the text a node, and add an edge between words that co-occur within a window, so that two words appearing in the same window are connected. For example, consider the text "TextRank is an algorithm for keyword extraction." Tokenization yields the sequence ["TextRank", "is", "an", "algorithm", "for", "keyword", "extraction"]. With a window size of 2 (each word connected to the next two words), the edges are: ("TextRank", "is"), ("TextRank", "an"), ("is", "an"), ("is", "algorithm"), ("an", "algorithm"), ("an", "for"), ("algorithm", "for"), ("algorithm", "keyword"), ("for", "keyword"), ("for", "extraction"), ("keyword", "extraction"). Each word corresponds to a node, and edges connect words that co-occur within the window. These co-occurrence relations give the graph its nodes and edges and provide the foundation for the TextRank algorithm: during its iteration, node weights are adjusted according to the co-occurrence relations, producing the final keyword ranking. (A runnable sketch of this construction follows the list.)

2.4 Compute the weights between nodes: weight the edges of the graph, usually by the co-occurrence count of the two words or by some other relatedness measure.

2.5 Iterate with the TextRank algorithm: initialize the node weights, then update them iteratively until convergence. The update rule mirrors PageRank, propagating weight between connected nodes; for a node Vi it is WS(Vi) = (1 - d) + d * Σ over neighbors Vj of [ w_ji / Σ over neighbors Vk of Vj of w_jk ] * WS(Vj), where d is a damping factor (typically 0.85) and w_ji is the edge weight.

2.6 Sort the nodes by weight: rank the nodes by their final weight values.

2.7 Extract keywords: take the top-ranked words as the keywords; the number of keywords extracted can be chosen as needed.
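To make steps 2.2 through 2.6 concrete, here is a minimal self-contained sketch that reproduces the edge list above and runs the PageRank-style update. The window size, damping factor d = 0.85, and fixed iteration count are illustrative choices, not taken from any particular library:

from collections import defaultdict

# Example sentence from step 2.3, already split into tokens.
words = ["TextRank", "is", "an", "algorithm", "for", "keyword", "extraction"]

# Steps 2.2/2.3: connect each word to the next `window` words; window = 2
# reproduces exactly the edge list shown above.
window = 2
cooccur = defaultdict(int)
for i in range(len(words)):
    for j in range(i + 1, min(i + 1 + window, len(words))):
        cooccur[(words[i], words[j])] += 1  # step 2.4: co-occurrence count as edge weight

# Undirected adjacency map: node -> {neighbor: weight}.
graph = defaultdict(dict)
for (a, b), w in cooccur.items():
    graph[a][b] = graph[a].get(b, 0) + w
    graph[b][a] = graph[b].get(a, 0) + w

# Step 2.5: PageRank-style update,
# WS(Vi) = (1 - d) + d * sum_j( w_ji / sum_k(w_jk) * WS(Vj) )
d = 0.85
ws = {node: 1.0 for node in graph}
for _ in range(30):  # a fixed iteration count stands in for a convergence test
    for node in graph:
        s = sum(w / sum(graph[nb].values()) * ws[nb]
                for nb, w in graph[node].items())
        ws[node] = (1 - d) + d * s

# Steps 2.6/2.7: sort by weight and take the top words as keywords.
for word, score in sorted(ws.items(), key=lambda x: x[1], reverse=True):
    print(word, round(score, 3))

On real text you would first remove stop words, as step 1 describes, so that fillers like "is" and "an" never enter the graph.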
The whole pipeline consists of building a graph and iterating a graph-based algorithm over it. Because TextRank takes the relations between words into account, it captures some contextual information and is therefore quite effective for keyword extraction. This graph-based method is not limited to a single document; it can also be applied to keyword extraction across multiple documents, improving the overall understanding of the text.
A basic TextRank implementation:
For the underlying concepts, this write-up is very detailed: https://zhuanlan.zhihu.com/p/359232044
import networkx as nx
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag

# NLTK resources required (download once): nltk.download('punkt'),
# nltk.download('stopwords'), nltk.download('averaged_perceptron_tagger')

# Example text
text = "TextRank is an algorithm for keyword extraction. It is based on graph theory and used for ranking words in a text."

# Tokenization and POS tagging
def preprocess_text(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]  # drop non-alphabetic tokens
    words = [word for word in words if word not in stopwords.words("english")]  # drop stop words
    words_pos = pos_tag(words)  # POS-tag the remaining words
    return words_pos

# Build the co-occurrence graph.
# Note: unlike the window-based construction described above, this toy version
# links every pair of words that share the same POS tag.
def build_graph(words_pos):
    G = nx.Graph()
    G.add_nodes_from(set(words_pos))
    for i in range(len(words_pos) - 1):
        for j in range(i + 1, len(words_pos)):
            if words_pos[i][1] == words_pos[j][1]:  # only connect words with the same POS
                if not G.has_edge(words_pos[i], words_pos[j]):
                    G.add_edge(words_pos[i], words_pos[j], weight=1)
                else:
                    G[words_pos[i]][words_pos[j]]['weight'] += 1
    return G

# TextRank iteration
def textrank(G, max_iter=100, tol=1e-4, damping_factor=0.85):
    nodes = list(G.nodes)
    n = len(nodes)
    p = {node: 1 / n for node in nodes}  # uniform initial scores
    for _ in range(max_iter):
        new_p = {node: (1 - damping_factor) / n + damping_factor * sum(
                     G[neighbor][node]['weight'] * p[neighbor] /
                     sum(G[neighbor][neighbor2]['weight'] for neighbor2 in G.neighbors(neighbor))
                     for neighbor in G.neighbors(node))
                 for node in nodes}
        diff = sum(abs(new_p[node] - p[node]) for node in nodes)
        p = new_p
        if diff < tol:  # converged
            break
    return p

# Extract keywords
def get_keywords(text, top_n=5):
    words_pos = preprocess_text(text)
    G = build_graph(words_pos)
    scores = textrank(G)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    keywords = [word[0][0] for word in sorted_scores[:top_n]]  # each node is a (word, POS) tuple
    return keywords

# Print the keywords
keywords = get_keywords(text)
print("Keywords:", keywords)
The official code (from the jieba library's source):
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, unicode_literals
import sys
from operator import itemgetter
from collections import defaultdict
import jieba.posseg
from .tfidf import KeywordExtractor
from .._compat import *


class UndirectWeightedGraph:
    d = 0.85

    def __init__(self):
        self.graph = defaultdict(list)

    def addEdge(self, start, end, weight):
        # use a tuple (start, end, weight) instead of a Edge object
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        ws = defaultdict(float)
        outSum = defaultdict(float)

        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[2] for e in out), 0.0)

        # this line for build stable iteration
        sorted_keys = sorted(self.graph.keys())
        for x in xrange(10):  # 10 iters
            for n in sorted_keys:
                s = 0
                for e in self.graph[n]:
                    s += e[2] / outSum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s

        (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
        for w in itervalues(ws):
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w

        for n, w in ws.items():
            # to unify the weights, don't *100.
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

        return ws


class TextRank(KeywordExtractor):

    def __init__(self):
        self.tokenizer = self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
        self.span = 5

    def pairfilter(self, wp):
        return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
                and wp.word.lower() not in self.stop_words)

    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1

        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags

    extract_tags = textrank
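In practice this class is rarely instantiated directly; the usual entry point is jieba.analyse.textrank, which the script below also uses. A minimal usage sketch (the sentence is a made-up example, and the parameters mirror the defaults in the source above):

import jieba.analyse

sentence = "TextRank算法可以用于从中文文本中提取关键词,也可以用于抽取摘要句。"  # arbitrary example
# Defaults in the source above: topK=20, allowPOS=('ns', 'n', 'vn', 'v')
for word, weight in jieba.analyse.textrank(sentence, topK=5, withWeight=True):
    print(word, weight)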
Below is an implementation that uses the packaged API:
# coding=utf-8
import pandas as pd
import jieba.analyse

# Process titles and abstracts, extracting keywords
def words_textrank(data, topK):
    idList, titleList, abstractList = data['id'], data['title'], data['abstract']
    ids, titles, keys = [], [], []
    for index in range(len(idList)):
        # Concatenate title and abstract
        text = '%s。%s' % (titleList[index], abstractList[index])
        # Load a custom stop-word list (could be hoisted out of the loop;
        # jieba only needs it set once)
        jieba.analyse.set_stop_words("data/stopWord.txt")
        print("\"", titleList[index], "\"", " 10 Keywords - TextRank :")
        # TextRank keyword extraction with POS filtering
        keywords = jieba.analyse.textrank(text, topK=topK,
                                          allowPOS=('n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd'))
        word_split = " ".join(keywords)
        keys.append(word_split)
        ids.append(idList[index])
        titles.append(titleList[index])
    result = pd.DataFrame({"id": ids, "title": titles, "key": keys},
                          columns=['id', 'title', 'key'])
    return result

if __name__ == '__main__':
    dataFile = 'data/text.csv'
    data = pd.read_csv(dataFile)
    result = words_textrank(data, 10)
    result.to_csv("result/textrank.csv", index=False)
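For a dry run, the script assumes a data/text.csv with id, title, and abstract columns, a data/stopWord.txt stop-word file, and a result/ output directory; none of these are provided with the post, so the following setup snippet is made up purely for testing:

import os
import pandas as pd

os.makedirs("data", exist_ok=True)
os.makedirs("result", exist_ok=True)  # the script writes result/textrank.csv here

# Made-up sample rows that match the columns the script reads.
pd.DataFrame({
    "id": [1, 2],
    "title": ["TextRank关键词提取", "基于图的排序算法"],
    "abstract": ["TextRank是一种基于图的排序算法,常用于关键词提取。",
                 "基于图的排序算法通过节点之间的边传播权重。"],
}).to_csv("data/text.csv", index=False)

# An empty stop-word file is enough for jieba.analyse.set_stop_words to load.
open("data/stopWord.txt", "w", encoding="utf-8").close()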