(2).TextRank 算法是一种用于文本的基于图的排序算法,通过把文本分割成若干组成单元(句子/词),构建节点连接图,用句子(词)之间的相似度作为边的权重,通过循环迭代计算句子(词)的TextRank值,最后抽取排名高的句子(词)组合成文本摘要。
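TextRank 中一个单词 i 的权重取决于指向 i 的各个单词 j 与 i 组成的边 (j, i) 的权重,以及 j 到其他所有单词的边的权重之和,即 $WS(V_i) = (1-d) + d \sum_{V_j \in In(V_i)} \frac{w_{ji}}{\sum_{V_k \in Out(V_j)} w_{jk}} WS(V_j)$,其中 d 为阻尼系数(一般取 0.85)。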
此处 NER 使用的模块为 pyltp。语言技术平台(LTP)是由哈工大社会计算与信息检索研究中心研发的一整套中文语言处理系统,提供分词、词性标注、命名实体识别、依存句法分析等中文处理模块,以及基于动态链接库(Dynamic Link Library, DLL)的应用程序接口和可视化工具,并且能够以网络服务(Web Service)的形式进行使用。pyltp 即 Python 版的 LTP。
- from sentence_parser import *
- import re
- from collections import Counter
- from GraphShow import *
- from keywords_textrank import *
class CrimeMining:
    """Event mining: pull entities, keywords and subject-verb events out of text.

    Every finding is stored as a ``[source, target]`` pair and the collected
    pairs are handed to ``GraphShow`` for rendering.
    """

    def __init__(self):
        self.textranker = TextRank()
        self.parser = LtpParser()
        # Part-of-speech tags that mark a token as a named entity.
        self.ners = ['nh', 'ni', 'ns']
        # Graph category shown for each entity tag.
        self.ner_dict = {
            'nh': '人物',
            'ni': '机构',
            'ns': '地名',
        }
        self.graph_shower = GraphShow()

    def remove_noisy(self, content):
        """Return content with parenthesised asides removed.

        Both full-width and ASCII parentheses are handled. The full-width
        pair is written with escapes so that punctuation normalisation of
        this file cannot turn the pattern into a plain capture group again
        (which would delete the whole text).
        """
        fullwidth_parens = re.compile(r'\uff08[^\uff09]*\uff09')
        ascii_parens = re.compile(r'\([^\)]*\)')
        return ascii_parens.sub('', fullwidth_parens.sub('', content))

    def collect_ners(self, words, postags):
        """Return ``word/pos`` strings for every token tagged as an entity."""
        return [word + '/' + pos
                for word, pos in zip(words, postags)
                if pos in self.ners]

    def seg_content(self, content):
        """Split content into sentences on terminal punctuation and newlines.

        ASCII and full-width variants of ``? ! ; :`` plus the ideographic
        full stop are all treated as delimiters; empty pieces are dropped.
        """
        pieces = re.split(r'[?\uff1f!\uff01\u3002;\uff1b:\uff1a\n\r]', content)
        return [piece for piece in pieces if piece]

    def process_sent(self, sent):
        """Segment and POS-tag one sentence via the parser wrapper."""
        words, postags = self.parser.basic_process(sent)
        return words, postags

    def collect_coexist(self, ner_sents, ners):
        """Count ordered entity pairs that occur in the same sentence.

        ner_sents is a list of ``[words, postags]`` pairs and ners a list of
        ``word/pos`` strings. Returns a dict mapping ``'a/pos@b/pos'`` to its
        count, or an empty list when no pair was found (kept for
        compatibility with existing callers).
        """
        known = set(ners)
        co_list = []
        for sent in ner_sents:
            tagged = {word + '/' + pos for word, pos in zip(sent[0], sent[1])}
            co_list += self.combination(list(known & tagged))
        if not co_list:
            return []
        return dict(Counter(co_list).most_common())

    def combination(self, a):
        """Return every ordered pair of distinct items of a joined with '@'."""
        return ['@'.join([left, right])
                for left in a
                for right in a
                if left != right]

    def extract_triples(self, words, postags):
        """Return ``[subject, predicate]`` pairs built from SBV arcs.

        The predicate is the head verb, with its first VOB object appended
        when one exists.
        """
        svo = []
        arcs, child_dict_list = self.parser.parser_main(words, postags)
        for arc in arcs:
            if arc[-1] != 'SBV':
                continue
            # arc[1] is the dependent word, arc[3] the head word.
            subj, verb = arc[1], arc[3]
            obj = self.complete_VOB(verb, child_dict_list)
            svo.append([subj, verb + obj] if obj else [subj, verb])
        return svo

    def filter_triples(self, triples, ners):
        """Return the triples that contain one of the given entities."""
        return [triple
                for ner in ners
                for triple in triples
                if ner in triple]

    def complete_VOB(self, verb, child_dict_list):
        """Return the first VOB object word attached to verb, or ''."""
        for child in child_dict_list:
            if child[0] != verb:
                continue
            relations = child[3]
            if 'VOB' not in relations:
                continue
            # First VOB arc; index 1 of an arc is the dependent word.
            return relations['VOB'][0][1]
        return ''

    def extract_keywords(self, words_list):
        """Return the top 10 ``(word, score)`` keywords from the ranker."""
        return self.textranker.extract_keywords(words_list, 10)

    def rel_entity_keyword(self, ners, keyword, subsent):
        """Link entities to keywords that share a short sentence with them.

        ners is a list of ``word/pos`` strings. keyword may hold plain
        keyword strings or ``(word, score)`` pairs. subsent is a list of
        token lists. Returns ``[entity, keyword]`` pairs without duplicates,
        in first-seen order.
        """
        names = list(dict.fromkeys(item.split('/')[0] for item in ners))
        # main() passes plain strings; taking item[0] of a string would keep
        # only its first character, so only unpack real pairs.
        keywords = {item if isinstance(item, str) else item[0] for item in keyword}
        vocabulary = keywords.union(names)

        # Keep only sentences that mention at least two relevant tokens.
        sents = []
        for sent in subsent:
            hits = [wd for wd in sent if wd in vocabulary]
            if len(hits) > 1:
                sents.append(hits)

        pairs = {}
        for name in names:
            for sent in sents:
                if name not in sent:
                    continue
                for wd in sent:
                    if wd in keywords and wd != name and len(wd) > 1:
                        pairs[(name, wd)] = None
        return [[name, wd] for name, wd in pairs]

    def seg_short_content(self, content):
        """Split content into short clauses on punctuation and whitespace.

        Same delimiters as seg_content plus commas (ASCII and full-width),
        tab and space; empty pieces are dropped.
        """
        pieces = re.split(
            r'[,\uff0c?\uff1f!\uff01\u3002;\uff1b:\uff1a\n\r\t ]', content)
        return [piece for piece in pieces if piece]

    def main(self, content):
        """Run the whole pipeline on content and render the event graph.

        Returns the list of ``[source, target]`` events ([] for empty
        content). The events are also printed and passed to
        ``GraphShow.create_page``.
        """
        if not content:
            return []
        # Drop parenthesised noise, then cut into short clauses.
        content = self.remove_noisy(content)
        subsents = self.seg_short_content(content)

        subsents_seg = []  # tokens of every clause
        words_list = []    # [word, pos] for the whole article
        ner_sents = []     # [words, postags] of clauses that yielded triples
        ners = []          # 'word/pos' entity strings
        triples = []       # [subject, predicate] pairs
        events = []

        for sent in subsents:
            words, postags = self.process_sent(sent)
            words_list += [[word, pos] for word, pos in zip(words, postags)]
            subsents_seg.append([word for word, _ in zip(words, postags)])
            ner = self.collect_ners(words, postags)
            if not ner:
                continue
            triple = self.extract_triples(words, postags)
            if not triple:
                # Entities are only kept for clauses that also give a triple.
                continue
            triples += triple
            ners += ner
            ner_sents.append([words, postags])

        # Keywords of the article.
        keywords = [item[0] for item in self.extract_keywords(words_list)]
        events += [[kw, '关键词'] for kw in keywords]

        # Triples that touch a keyword and whose two parts are longer than one char.
        for t in triples:
            if (t[0] in keywords or t[1] in keywords) and len(t[0]) > 1 and len(t[1]) > 1:
                events.append([t[0], t[1]])

        # Ten most frequent multi-character nouns and verbs.
        frequent = Counter(
            item[0] for item in words_list
            if item[1][0] in ['n', 'v'] and len(item[0]) > 1
        ).most_common(10)
        events += [[word, '高频词'] for word, _ in frequent]

        # Named entities with their category.
        ner_dict = dict(Counter(ners).most_common())
        for ner in ner_dict:
            parts = ner.split('/')
            events.append([parts[0], self.ner_dict[parts[1]]])

        # Entity co-occurrence network.
        co_dict = self.collect_coexist(ner_sents, list(ner_dict.keys()))
        events += [
            [pair.split('@')[0].split('/')[0], pair.split('@')[1].split('/')[0]]
            for pair in co_dict
        ]

        # Entity -> keyword relations.
        events += self.rel_entity_keyword(ners, keywords, subsents_seg)

        print(events)
        # Render the event network.
        self.graph_shower.create_page(events)
        return events
if __name__ == '__main__':
    # Demo run: mine a sample essay and write the resulting graph page.
    sample_article = '''
身为一名学习委员,同学的作业要立马交了就立马送到老师办公室。
因为我们是两个语文学习文员,但共同干同一件事——收全班同学的语文作业。
当时竞选班长,我被选上当语文学习委员,而另一个同学曾经已经是语文学习委员了,
又被选上了,所以她所得的经验会比我多一些。但渐渐的我发现了一个有趣的现象……
这位和我担任同样职位的人,名叫李鹂歌。她收作业从来不慌不忙,
这星期的作业可以收到下星期也没上交给班主任。我真的是挺佩服她的。
做事从来不紧不慢。因为当时我刚上任,也不太了解要做些啥,我只知道跟着李鹂歌做就行了。
但却没想到她收的作业,可以收那么久。
或许因为我当时刚上任,小组组长交作业都已经习惯交给李鹂歌了,记名也是她记,也够累的。
至于,其实现在也是这样。当时我只负责送作业,好轻松啊!
李鹂歌收作业慢慢吞吞,而我正好相反,今天做的作业,巴不得一来到学校就收齐交给班主任。
所以最近,一做完操我就开始向每一组的小组长催语文作业,没写完的也不等了,直接记名,
上报给班主任,没写完的下午留堂。所以,现在收的作业可以第一节课还没上就送到老师办公室。
但偶尔有时候,作业过多,收得乱七八糟,又有好多人没做作业,我们又记不过来,
经常有人浑水摸鱼就过了关。因为我渐渐发现,只要在第一节课上课之前没收完作业,
你就甭想再把作业收好。一下课大家都跑去疯玩了,教室里乱糟糟的。所以在第一节课上课之前,
我都会催组长交作业。再把镜头转向李鹂歌,人家正在优哉游哉的吃早餐呢,谁叫我不在学校吃,
在家里早早的就吃完了,不过,她也是有工作的,记名。有些交的慢的组长,直接把名字告诉我,
由我来汇报给李鹂歌,由李鹂歌来把名字填写在黑名单里。我怕忘了谁的名字,就一直再念,
没想到李鹂歌刚拿出本子就不紧不慢的说:“今天几号?”我一回答,把刚才记得名字全给忘了。
'''
    miner = CrimeMining()
    miner.main(sample_article)
- import os
- from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
class LtpParser:
    """Wrapper around the pyltp segmentor, POS tagger, parser and NER models."""

    def __init__(self):
        # Model files are expected under ./ltp_data relative to the working dir.
        LTP_DIR = "./ltp_data"
        self.segmentor = Segmentor(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer(os.path.join(LTP_DIR, "ner.model"))

    def basic_parser(self, words):
        """Return ``(postags, netags)`` for an already segmented sentence."""
        postags = list(self.postagger.postag(words))
        netags = self.recognizer.recognize(words, postags)
        return postags, netags

    def get_postag(self, words):
        """Return the POS tags of words as a list."""
        return list(self.postagger.postag(words))

    def format_entity(self, words, netags, postags):
        """Group NER tags into person / organisation / place entity records.

        A tag starting with 'O' is skipped. 'S' closes a one-token entity,
        'B' and 'I' extend the pending entity of that type, and any other
        prefix (presumably 'E' - confirm against the tag set) closes it. The
        type is read from the last two characters; anything that is not
        'Nh' or 'Ni' is treated as a place.

        Returns a dict with keys 'nhs', 'nis' and 'nss', each holding the
        records built by modify_entity.
        """
        finished = {'Nh': [], 'Ni': [], 'Ns': []}
        pending = {'Nh': '', 'Ni': '', 'Ns': ''}
        for index, (word, ntag) in enumerate(zip(words, netags)):
            prefix = ntag[0]
            if prefix == "O":
                continue
            kind = ntag[-2:]
            if kind not in ('Nh', 'Ni'):
                kind = 'Ns'
            # Each token is stored as 'word_index ' so spans can be rebuilt.
            piece = word + '_%s ' % index
            if prefix == "S":
                finished[kind].append(piece)
            elif prefix in ("B", "I"):
                pending[kind] += piece
            else:
                finished[kind].append(pending[kind] + piece)
                pending[kind] = ''
        return {
            'nhs': self.modify_entity(finished['Nh'], words, postags, 'nh'),
            'nis': self.modify_entity(finished['Ni'], words, postags, 'ni'),
            'nss': self.modify_entity(finished['Ns'], words, postags, 'ns'),
        }

    def modify_entity(self, entity_list, words, postags, tag):
        """Turn 'word_index ' strings into entity records.

        Each record has 'stat_index' and 'end_index' (index strings),
        'consist' (the ``word/pos`` tokens covered) and 'name' (the merged
        words plus ``/tag``). The index is taken from the right of the last
        underscore so words that contain '_' themselves are handled.
        """
        entity_modify = []
        for entity in entity_list:
            subs = entity.split(' ')[:-1]
            start_index = subs[0].rsplit('_', 1)[1]
            end_index = subs[-1].rsplit('_', 1)[1]
            consist = [words[i] + '/' + postags[i]
                       for i in range(int(start_index), int(end_index) + 1)]
            entity_modify.append({
                'stat_index': start_index,
                'end_index': end_index,
                'consist': consist,
                'name': ''.join(sub.rsplit('_', 1)[0] for sub in subs) + '/' + tag,
            })
        return entity_modify

    def rebuild_wordspostags(self, name_entity_dist, words, postags):
        """Merge multi-token entities and return the new ``(words, postags)``.

        The sentence is joined into one 'word/pos word/pos ...' string and
        each entity's token run is replaced by its merged name. Tokens that
        do not split into exactly one word and one tag are dropped.
        """
        merged = ' '.join(word + '/' + pos for word, pos in zip(words, postags))
        for infos in name_entity_dist.values():
            for info in infos or []:
                merged = merged.replace(' '.join(info['consist']), info['name'])
        tokens = [token for token in merged.split(' ')
                  if len(token.split('/')) == 2 and token.split('/')[0]]
        new_words = [token.split('/')[0] for token in tokens]
        new_postags = [token.split('/')[1] for token in tokens]
        return new_words, new_postags

    def syntax_parser(self, words, postags):
        """Return one dependency record per word.

        Each record is ``[position, word, pos, head_word, head_pos,
        head_position, relation]`` with 1-based positions; position 0 is the
        artificial 'Root' token.
        """
        arcs = self.parser.parse(words, postags)
        rooted_words = ['Root'] + words
        rooted_postags = ['w'] + postags
        tuples = []
        for index in range(len(words)):
            # Each arc is indexed as (head position, relation).
            head = arcs[index][0]
            relation = arcs[index][1]
            tuples.append([
                index + 1,
                rooted_words[index + 1],
                rooted_postags[index + 1],
                rooted_words[head],
                rooted_postags[head],
                head,
                relation,
            ])
        return tuples

    def build_parse_child_dict(self, words, postags, tuples):
        """Return ``[word, pos, index, children]`` for every word.

        children maps a relation name to the dependency records whose head
        is this word. Dependents are matched by head position (record field
        5), not by head text, so a word that occurs twice in the sentence
        does not receive the other occurrence's dependents.
        """
        by_head = {}
        for arc in tuples:
            by_head.setdefault(arc[5], {}).setdefault(arc[-1], []).append(arc)
        # Record positions are 1-based, list indices 0-based.
        return [[word, postags[index], index, by_head.get(index + 1, {})]
                for index, word in enumerate(words)]

    def parser_main(self, words, postags):
        """Return ``(dependency records, per-word child dictionaries)``."""
        tuples = self.syntax_parser(words, postags)
        child_dict_list = self.build_parse_child_dict(words, postags, tuples)
        return tuples, child_dict_list

    def basic_process(self, sentence):
        """Segment, tag and entity-merge a raw sentence; return ``(words, postags)``."""
        words = list(self.segmentor.segment(sentence))
        postags, netags = self.basic_parser(words)
        name_entity_dist = self.format_entity(words, netags, postags)
        return self.rebuild_wordspostags(name_entity_dist, words, postags)
- import jieba.posseg as pseg
- from collections import defaultdict
- import sys
class textrank_graph:
    """Undirected weighted graph with a TextRank-style node ranking."""

    def __init__(self):
        # node -> list of (node, neighbour, weight) edges
        self.graph = defaultdict(list)
        self.d = 0.85          # damping factor
        self.min_diff = 1e-5   # convergence threshold on the total weight

    def addEdge(self, start, end, weight):
        """Add an undirected edge by storing it under both endpoints."""
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        """Return a dict mapping each node to its normalised rank.

        Every node starts at ``1 / node_count``. Each sweep recomputes, in
        sorted node order, ``(1 - d) + d * sum(w / out_sum(nb) * rank(nb))``
        over the node's edges. Iteration stops once the total weight changes
        by at most ``min_diff`` between sweeps, or after 999 sweeps. The
        result is rescaled with ``(w - min/10) / (max - min/10)``, so the
        top node gets exactly 1.0.
        """
        default_weight = 1.0 / (len(self.graph) or 1.0)
        nodeweight_dict = defaultdict(float)   # current rank per node
        outsum_node_dict = defaultdict(float)  # total outgoing weight per node
        for node, out_edges in self.graph.items():
            nodeweight_dict[node] = default_weight
            outsum_node_dict[node] = sum((edge[2] for edge in out_edges), 0.0)

        # Fixed visiting order keeps the result deterministic.
        sorted_keys = sorted(self.graph.keys())
        previous_total = 0
        for _ in range(1, 1000):
            for node in sorted_keys:
                s = 0
                for _start, neighbour, weight in self.graph[node]:
                    out_sum = outsum_node_dict[neighbour]
                    # A neighbour whose edges all weigh 0 contributes nothing;
                    # skipping it avoids a division by zero.
                    if out_sum:
                        s += weight / out_sum * nodeweight_dict[neighbour]
                nodeweight_dict[node] = (1 - self.d) + self.d * s
            total = sum(nodeweight_dict.values())
            if abs(total - previous_total) <= self.min_diff:
                break
            previous_total = total

        # Rescale against the extreme ranks (not a z-score: no mean or
        # deviation is involved).
        min_rank = min(nodeweight_dict.values(), default=sys.float_info.max)
        max_rank = max(nodeweight_dict.values(), default=sys.float_info.min)
        floor = min_rank / 10.0
        for node, weight in nodeweight_dict.items():
            nodeweight_dict[node] = (weight - floor) / (max_rank - floor)
        return nodeweight_dict
class TextRank:
    """Keyword extraction with the TextRank graph algorithm."""

    def __init__(self):
        self.candi_pos = ['n', 'v']  # first letter of POS tags that may be keywords
        self.stop_pos = ['nt']       # full POS tags that are never keywords
        self.span = 5                # co-occurrence window, counted from the word itself

    def _is_candidate(self, item):
        """Return True if the ``(word, pos)`` item may become a graph node."""
        word, pos = item[0], item[1]
        # pos[:1] instead of pos[0] so an empty tag is rejected, not an error.
        return (pos[:1] in self.candi_pos
                and pos not in self.stop_pos
                and len(word) > 1)

    def extract_keywords(self, word_list, num_keywords):
        """Return up to num_keywords ``(word, score)`` pairs, best first.

        word_list is a sequence of ``(word, pos)`` items. Two candidate
        words are linked when the second follows the first within
        ``span - 1`` positions; the edge weight is the number of such
        co-occurrences. The same candidate filter is applied to both ends
        of the window, so stop-tagged words never enter the graph.
        """
        g = textrank_graph()
        cm = defaultdict(int)
        total = len(word_list)
        for i, item in enumerate(word_list):
            if not self._is_candidate(item):
                continue
            for j in range(i + 1, min(i + self.span, total)):
                other = word_list[j]
                if self._is_candidate(other):
                    cm[(item[0], other[0])] += 1
        for (start, end), weight in cm.items():
            g.addEdge(start, end, weight)
        ranked = sorted(g.rank().items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:num_keywords]
class GraphShow():
    """Render a list of ``[source, target]`` events as a vis.js network page."""

    def __init__(self):
        # Page template; the bare tokens data_nodes / data_edges are replaced
        # with the serialised node and edge lists in create_html.
        self.base = '''
<html>
<head>
  <script type="text/javascript" src="VIS/dist/vis.js"></script>
  <link href="VIS/dist/vis.css" rel="stylesheet" type="text/css">
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
<div id="VIS_draw"></div>
<script type="text/javascript">
  var nodes = data_nodes;
  var edges = data_edges;
  var container = document.getElementById("VIS_draw");
  var data = {
    nodes: nodes,
    edges: edges
  };
  var options = {
      nodes: {
          shape: 'circle',
          size: 15,
          font: {
              size: 15
          }
      },
      edges: {
          font: {
              size: 10,
              align: 'center'
          },
          color: 'red',
          arrows: {
              to: {enabled: true, scaleFactor: 1.2}
          },
          smooth: {enabled: true}
      },
      physics: {
          enabled: true
      }
  };
  var network = new vis.Network(container, data, options);
</script>
</body>
</html>
'''

    def create_page(self, events):
        """Build node and edge records from events and write the HTML page.

        Each distinct label becomes one node. Its id is the position of the
        label's last occurrence in the flattened event list, so ids are
        unique but not contiguous.
        """
        labels = []
        for event in events:
            labels.append(event[0])
            labels.append(event[1])
        node_dict = {label: index for index, label in enumerate(labels)}

        data_nodes = [
            {"group": 'Event', "id": node_id, "label": label}
            for label, node_id in node_dict.items()
        ]
        data_edges = [
            {'from': node_dict.get(edge[0]), 'label': '', 'to': node_dict.get(edge[1])}
            for edge in events
        ]
        self.create_html(data_nodes, data_edges)
        return

    def create_html(self, data_nodes, data_edges):
        """Fill the template, print it and write it to graph_show_02.html."""
        html = self.base.replace('data_nodes', str(data_nodes)).replace('data_edges', str(data_edges))
        print(html)
        # The context manager closes the file even if the write fails.
        with open('graph_show_02.html', 'w+', encoding='utf-8') as f:
            f.write(html)
本文随机选用了网上的一篇文章text = '''身为一名学习委员,同学的作业要立马交了就立马送到老师办公室。因为我们是两个语文学习文员,但共同干同一件事——收全班同学的语文作业。当时竞选班长,我被选上当语文学习委员,而另一个同学曾经已经是语文学习委员了,又被选上了,所以她所得的经验会比我多一些。但渐渐的我发现了一个有趣的现象……这位和我担任同样职位的人,名叫李鹂歌。她收作业从来不慌不忙,这星期的作业可以收到下星期也没上交给班主任。我真的是挺佩服她的。做事从来不紧不慢。因为当时我刚上任,也不太了解要做些啥,我只知道跟着李鹂歌做就行了。但却没想到她收的作业,可以收那么久。或许因为我当时刚上任,小组组长交作业都已经习惯交给李鹂歌了,记名也是她记,也够累的。至于,其实现在也是这样。当时我只负责送作业,好轻松啊!李鹂歌收作业慢慢吞吞,而我正好相反,今天做的作业,巴不得一来到学校就收齐交给班主任。所以最近,一做完操我就开始向每一组的小组长催语文作业,没写完的也不等了,直接记名,上报给班主任,没写完的下午留堂。所以,现在收的作业可以第一节课还没上就送到老师办公室。但偶尔有时候,作业过多,收得乱七八糟,又有好多人没做作业,我们又记不过来,经常有人浑水摸鱼就过了关。因为我渐渐发现,只要在第一节课上课之前没收完作业,你就甭想再把作业收好。一下课大家都跑去疯玩了,教室里乱糟糟的。所以在第一节课上课之前,我都会催组长交作业。再把镜头转向李鹂歌,人家正在优哉游哉的吃早餐呢,谁叫我不在学校吃,在家里早早的就吃完了,不过,她也是有工作的,记名。有些交的慢的组长,直接把名字告诉我,由我来汇报给李鹂歌,由李鹂歌来把名字填写在黑名单里。我怕忘了谁的名字,就一直再念,没想到李鹂歌刚拿出本子就不紧不慢的说:“今天几号?”我一回答,把刚才记得名字全给忘了。'''
代码参考刘焕勇老师的项目,链接如下:https://huangyong.github.io (Author: lhy <lhy_in_blcu@126.com>)
