赞
踩
jieba.analyse.textrank()算法支持提取关键词,但是因为太通用了,所以对某些词频低但是重要性强的词语无法提取出来
如我的毕设对政府工作报告的文本分析中,词频最高的是“发展”和“建设”,但是这两个词太宽泛了,反而不如词频相对较低的“科技”、“民生”、“生态”等更能体现政府工作,所以对其进行了修改
主要是进入这个模块,把它的核心类以及相关类、变量、import都给复制到本地即可。
按住ctrl点进jieba.analyse,找到class TextRank,复制进来
看到它需要KeywordExtractor类,按住ctrl点进去,把这个复制进来
里面又创建了UndirectWeightedGraph对象,点进去再复制进来
# TextRank: keyword extraction with topic-aware co-occurrence weighting.
class TextRank(KeywordExtractor):

    def __init__(self):
        self.tokenizer = self.postokenizer = jieba.posseg.dt
        self.span = 5  # size of the co-occurrence window

    def textrank(self, sentence, dealwordsList, bestRank=18, betterRank=14,
                 normalRank=5, topK=50, withWeight=False):
        """Rank the words of *dealwordsList* with a weighted TextRank.

        Edges whose endpoints appear in the module-level topic lists
        (``topicMain`` / ``topicGroup``) get their weight boosted by
        ``bestRank`` / ``betterRank`` / ``normalRank`` respectively, so that
        domain words outrank generic high-frequency words.
        Returns the topK words, or (word, weight) pairs when withWeight.
        """
        graph = UndirectWeightedGraph()          # undirected weighted word graph
        edge_weight = defaultdict(int)           # (word_i, word_j) -> weight

        for i, word in enumerate(dealwordsList):
            if len(word.strip()) < 2:
                continue                         # skip blank / 1-char tokens
            window_end = min(i + self.span, len(dealwordsList))
            for j in range(i + 1, window_end):
                other = dealwordsList[j]
                pair = (word, other)
                # Strongest boost: either endpoint is a main topic word.
                if word in topicMain or other in topicMain:
                    edge_weight[pair] += bestRank
                for group in topicGroup:
                    # group[1:2]: the group's key word; group[3:]: related words.
                    if word in group[1:2] or other in group[1:2]:
                        edge_weight[pair] += betterRank
                    if word in group[3:] or other in group[3:]:
                        edge_weight[pair] += normalRank
                edge_weight[pair] += 1           # plain co-occurrence count

        # Feed every accumulated edge into the graph, then run TextRank.
        for (w1, w2), weight in edge_weight.items():
            graph.addEdge(w1, w2, weight)
        nodes_rank = graph.rank()

        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
        return tags[:topK] if topK else tags

    extract_tags = textrank
# KeywordExtractor: base class that loads and manages the stop-word list.
class KeywordExtractor(object):
    # Loaded once at class-definition time from a file next to the script.
    # NOTE(review): this raises at import time if 'stopwords.txt' is missing
    # — confirm the file is always shipped alongside this module.
    with open('stopwords.txt', encoding='utf-8') as _f:
        STOP_WORDSDICT = {}.fromkeys(line.rstrip() for line in _f)
    del _f  # don't leave the (closed) file handle behind as a class attribute
    STOP_WORDS = list(STOP_WORDSDICT.keys())

    def set_stop_words(self, stop_words_path):
        """Load extra stop words from *stop_words_path* (UTF-8, one per line).

        Raises Exception when the resolved path is not an existing file.
        """
        abs_path = _get_abs_path(stop_words_path)
        if not os.path.isfile(abs_path):
            raise Exception("jieba: file does not exist: " + abs_path)
        # Close the file deterministically instead of leaking the handle.
        with open(abs_path, 'rb') as f:
            content = f.read().decode('utf-8')
        # Bug fix: the class only defines STOP_WORDS (a list), so the original
        # 'self.stop_words.add(...)' raised AttributeError. Seed an
        # instance-level set from the class list on first use.
        if not hasattr(self, 'stop_words'):
            self.stop_words = set(self.STOP_WORDS)
        for line in content.splitlines():
            self.stop_words.add(line)

    def extract_tags(self, *args, **kwargs):
        # Subclasses (e.g. TextRank) must provide the actual extraction.
        raise NotImplementedError
# UndirectWeightedGraph: undirected weighted word graph with an iterative
# TextRank-style rank() method.
class UndirectWeightedGraph:
    d = 0.85  # damping factor from the PageRank/TextRank papers

    def __init__(self):
        # node -> list of (start, end, weight) edges touching that node
        self.graph = defaultdict(list)

    def addEdge(self, start, end, weight):
        """Record an undirected edge by storing it under both endpoints."""
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        """Run 10 TextRank iterations; return {node: normalized weight}."""
        ws = defaultdict(float)      # current rank of each node
        outSum = defaultdict(float)  # total outgoing edge weight per node
        wsdef = 1.0 / (len(self.graph) or 1.0)  # uniform initial rank
        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[2] for e in out), 0.0)

        # Iterate the keys in sorted order so results are deterministic.
        sorted_keys = sorted(self.graph.keys())
        for _ in range(10):  # fixed iteration count, as in jieba
            for n in sorted_keys:
                s = 0
                for e in self.graph[n]:
                    # contribution = co-occurrence weight / neighbour's total
                    # out-weight, scaled by the neighbour's current rank
                    s += e[2] / outSum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s

        # Named attributes instead of the original sys.float_info[0]/[3]
        # magic indices (same values: float max / smallest positive float).
        min_rank, max_rank = sys.float_info.max, sys.float_info.min
        for w in ws.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w

        # Normalize (jieba's original formula, kept for identical output).
        for n, w in ws.items():
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
        return ws
# Module imports and helpers, reformatted from a single collapsed line:
# one import per line, stdlib before third-party (PEP 8).
import os
import sys
from collections import defaultdict
from operator import itemgetter

import jieba
import jieba.posseg

# Resolve *path* relative to this module's directory.
_get_module_path = lambda path: os.path.normpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__), path))
_get_abs_path = jieba._get_abs_path

# Python-2/3 compatibility aliases, kept so the code copied out of jieba
# runs unchanged on Python 3.
text_type = str
string_types = (str,)
xrange = range
iterkeys = lambda d: iter(d.keys())
itervalues = lambda d: iter(d.values())
iteritems = lambda d: iter(d.items())
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。