# Scraped-page artifacts preserved from the source ("赞" = upvote, "踩" = downvote buttons).
"""Keyword extraction via a simplified TextRank over adjacent-word co-occurrence."""
from collections import defaultdict


def textrank(text, top_k=10, words=None):
    """Extract keywords from *text* with a simplified TextRank.

    Builds an undirected word graph whose edges connect adjacent tokens
    (i.e. a co-occurrence window of 2), spreads weight over the graph for
    a fixed number of iterations, then prints the top keywords.

    Args:
        text: Input text; tokenized with jieba when *words* is None.
        top_k: Number of keywords to print.
        words: Optional pre-tokenized token list. When given, *text* is
            ignored and jieba is not required (useful for testing or for
            callers with their own tokenizer).

    Returns:
        List of (word, weight) pairs sorted by descending weight
        (the full ranking, not just the printed top_k).
    """
    if words is None:
        # Lazy import: callers that supply their own tokens don't need jieba.
        import jieba
        words = list(jieba.cut(text))

    # Optional stop-word filtering (supply your own list):
    # stopwords = {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人'}
    # words = [w for w in words if w not in stopwords]

    # Build the co-occurrence graph in one O(n) pass over adjacent pairs.
    # (The original nested i/j loop was O(n^2) but only ever used j == i + 1.)
    graph = defaultdict(set)
    for left, right in zip(words, words[1:]):
        graph[left].add(right)
        graph[right].add(left)

    # Every node starts with unit weight.
    weights = defaultdict(float)
    for node in graph:
        weights[node] = 1.0

    # Propagate weights: each node redistributes its weight evenly among
    # its neighbors. 10 iterations is a fixed budget; tune as needed.
    for _ in range(10):
        new_weights = defaultdict(float)
        for node in graph:
            for neighbor in graph[node]:
                new_weights[node] += weights[neighbor] / len(graph[neighbor])
        weights = new_weights

    # Rank by weight, highest first (stable sort keeps insertion order on ties).
    ranked_words = sorted(weights.items(), key=lambda x: x[1], reverse=True)

    print("Keywords:")
    for word, weight in ranked_words[:top_k]:
        print(f"{word}: {weight}")
    return ranked_words


if __name__ == "__main__":
    # Demo, guarded so importing this module has no side effects.
    text = "这是一个关于TextRank算法的示例文本,我们将使用这段文本来提取关键词。"
    textrank(text)
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。 (scraped-page footer, kept as a comment)