Implementation steps: split the document into sentences, embed each sentence with ALBERT (mean pooling over token embeddings), cluster the embeddings with k-means, and take the sentence closest to each cluster centroid, in original document order, as the summary.
import re
import torch
from transformers import BertTokenizer    # Chinese tokenizer
from transformers import AlbertModel      # ALBERT pretrained model, used to obtain embeddings
from nltk.cluster import KMeansClusterer  # k-means clustering
from scipy.spatial import distance_matrix # distance computation
import nltk
import pandas as pd

content = """
内容
"""
title = '摘要'

# ********** Split into sentences and strip punctuation ********** #
def split_document(para):
    # split on Chinese sentence-ending punctuation, commas and newlines
    line_split = re.split(r'[|。|!|;|?|]|\n|,', para.strip())
    # drop any remaining punctuation, then drop empty strings
    _seg_sents = [re.sub(r'[^\w\s]', '', sent) for sent in line_split]
    _seg_sents = [sent for sent in _seg_sents if sent != '']
    return _seg_sents

# sentences = ['新冠肺炎疫情暴发以来', '频繁出现的无症状感染者病例', '再次引起恐慌', '近日', ...]
sentences = split_document(content)

# ********** Compute sentence vectors ********** #
# Mean pooling: take the attention mask into account so padding tokens do not skew the average
def mean_pooling(model_output, attention_mask):
    # token embeddings of all sentences: [bs, sentence_len+2, hidden_dim] = [2, 15, 312]
    token_embeddings = model_output[0]
    # expand the attention mask: [bs, sentence_len+2] -> [bs, sentence_len+2, hidden_dim]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Download the pretrained model
tokenizer = BertTokenizer.from_pretrained('clue/albert_chinese_tiny')
model = AlbertModel.from_pretrained('clue/albert_chinese_tiny')

# Compute an embedding for every sentence
def _get_sentence_embeddings(sentences):
    # Tokenize sentences. For sentences = ['新冠肺炎疫情暴发以来', '频繁出现的无症状感染者病例'],
    # encoded_input holds three tensors ([CLS]/[SEP] added, the shorter sentence padded):
    # 'input_ids':      tensor([[101, 3173, 1094, 5511, 4142, 4554, 2658, 3274, 1355, 809, 3341, 102, 0, 0, 0],
    #                           [101, 7574, 5246, 1139, 4385, 4638, 3187, 4568, 4307, 2697, 3381, 5442, 4567, 891, 102]])
    # 'token_type_ids': tensor([[0, 0, ..., 0], [0, 0, ..., 0]])
    # 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
    #                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    # Compute token embeddings.
    # torch.no_grad(): every tensor computed inside this block has requires_grad=False.
    # model_output holds two tensors:
    #   last_hidden_state: [bs, sentence_len+2, hidden_dim] = [2, 15, 312]
    #   pooler_output:     [bs, hidden_dim] = [2, 312]
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Mean pooling with the attention mask -> [bs, hidden_dim]
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings

# sentence embeddings: [bs, hidden_dim]
sentence_embeddings = _get_sentence_embeddings(sentences)

# ********** Cluster the sentence vectors directly ********** #
NUM_CLUSTERS = 10  # number of clusters (= number of summary sentences)
iterations = 25    # number of k-means repetitions

X = sentence_embeddings.numpy()

# k-means clustering with cosine distance
kclusterer = KMeansClusterer(NUM_CLUSTERS,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=iterations,
                             avoid_empty_clusters=True)
# assigned_clusters = [6, 6, 4, 8, 6, 8, 6, 2, ...]  (cluster id of every sentence)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

# ********** Distance of every sentence to its cluster centroid ********** #
# data:
#    sentence                   embedding         cluster  centroid            distance_from_centroid
# 0  新冠肺炎疫情暴发以来         [-0.2, 0.3, ...]  2        [-0.17, 0.20, ...]  3.476364
# 1  频繁出现的无症状感染者病例   [-0.2, 0.1, ...]  9        [-0.19, -0.16, ...] 3.096487
data = pd.DataFrame(sentences)
data.columns = ['sentence']
data['embedding'] = sentence_embeddings.numpy().tolist()  # .tolist() converts the array to nested lists
# cluster assignment of each sentence (10 clusters)
data['cluster'] = pd.Series(assigned_clusters, index=data.index)
# centroid vector of each cluster (the mean of its members)
data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])

# Euclidean distance between a sentence embedding and its cluster centroid
def distance_from_centroid(row):
    return distance_matrix([row['embedding']], [row['centroid'].tolist()])[0][0]

data['distance_from_centroid'] = data.apply(distance_from_centroid, axis=1)

# ********** Build the summary ********** #
# 1. group the sentences by cluster
# 2. sort each group by distance to the centroid
# 3. keep the closest sentence of each cluster, in original document order
summary = data.sort_values('distance_from_centroid', ascending=True).groupby('cluster').head(1).sort_index()['sentence'].tolist()
print(summary)
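One detail worth noting: the k-means step clusters with cosine distance, while distance_from_centroid ranks sentences by Euclidean distance (scipy's distance_matrix). If you want the ranking to use the same metric as the clustering, a minimal sketch of a cosine-based variant could look like the following (cosine_from_centroid and summary_cos are illustrative names, not part of the original script):

from nltk.cluster.util import cosine_distance

# Rank each sentence by cosine distance to its cluster centroid,
# matching the metric used by KMeansClusterer above.
def cosine_from_centroid(row):
    return cosine_distance(row['embedding'], row['centroid'])

data['cosine_from_centroid'] = data.apply(cosine_from_centroid, axis=1)
summary_cos = data.sort_values('cosine_from_centroid', ascending=True).groupby('cluster').head(1).sort_index()['sentence'].tolist()

With well-separated clusters the two rankings usually pick the same sentences, but they can differ when embeddings of very different norms fall into the same cluster.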
The output is a 10-sentence summary: one sentence per cluster, kept in original document order.
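One practical caveat: NUM_CLUSTERS is fixed at 10, so the script implicitly assumes the document splits into at least 10 sentences. For shorter inputs, a simple guard (a suggestion, not part of the original code) is to cap the cluster count by the number of sentences:

# Never ask k-means for more clusters than there are sentences.
NUM_CLUSTERS = min(10, len(sentences))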