Code reference: [Practical Text Analysis] BERTopic, a handy tool for short-text topic modeling
• https://github.com/MaartenGr/BERTopic
(paper: https://arxiv.org/abs/2203.05794)
• Deep semantic vectors + a traditional clustering pipeline (a minimal end-to-end sketch follows this list):
(1) Compute deep semantic sentence vectors with BERT
(2) Cluster the vectors with HDBSCAN
(3) Adjust the cluster granularity and extract topic keywords with c-TF-IDF
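For comparison with the manual pipeline built step by step below, here is a minimal sketch of the same three stages using the BERTopic library directly. It assumes `docs` is a list of (already segmented) Chinese documents and reuses the same multilingual SBERT model that appears later in this post:

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# docs: a list of preprocessed Chinese documents -- assumed to exist
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# BERTopic chains embedding -> UMAP -> HDBSCAN -> c-TF-IDF internally
topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)

# Inspect the discovered topics and their keywords
print(topic_model.get_topic_info().head())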
Link: https://pan.baidu.com/s/1I8HLhNvFQCj5ogNpa0qDyw
Extraction code: 0lwt
2.5 million news articles (raw data 9 GB, compressed file 3.6 GB; the articles span 2014-2016 and come from 63,000 media outlets)
JSON format: title (headline), content (body), keywords, desc (description), source
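As a quick sanity check of this schema before the full preprocessing in step 1 below, a minimal sketch that prints the fields of the first record (assuming the archive has already been extracted to ./data/):

import json

# Read the first line of the line-delimited JSON file and show its fields
with open('./data/news2016zh_train.json', 'r', encoding='utf-8') as f:
    first = json.loads(f.readline())

print(first.keys())         # expected: title, content, keywords, desc, source
print(first['title'][:50])  # first 50 characters of the headline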
1. Data preprocessing: read the JSON file, segment the text, remove punctuation and stop words, and save the result to a CSV file (keeping the "keywords", "title", "desc" and "content" fields)
# Unzip the dataset
import zipfile
zf = zipfile.ZipFile('./data/new2016zh.zip')
print(zf.namelist())
zf.extractall()
zf.close()

# Read the line-delimited JSON file
import json
with open('./data/news2016zh_train.json', 'r', encoding="utf-8") as f:
    lines = f.readlines()

# Preprocess the data and save it to a CSV file
import csv
import os
import jieba
import re

stopwords = [i.strip() for i in open('./cn_stop_words.txt', "r", encoding="utf-8").readlines()]

# Segment a sentence and remove stop words
def pretty_cut(sentence):
    # Keep only Chinese characters, then segment with jieba (full mode)
    cut_list = jieba.lcut(''.join(re.findall('[\u4e00-\u9fa5]', sentence)), cut_all=True)
    for i in range(len(cut_list) - 1, -1, -1):
        if cut_list[i] in stopwords:
            del cut_list[i]
    return cut_list

with open(os.path.join("./data/news2016zh_valid.csv"), "w", encoding="utf-8", newline='') as g:
    writer = csv.writer(g)
    writer.writerow(["keywords", "title", "desc", "content"])
    for line in lines:
        news = json.loads(line)
        keywords = news["keywords"].strip(" ")
        title = news["title"].strip(" ")
        desc = news["desc"].strip(" ")
        content = news["content"].strip(" ")
        cut_keywords = " ".join(pretty_cut(keywords))
        cut_title = " ".join(pretty_cut(title))
        cut_desc = " ".join(pretty_cut(desc))
        cut_content = " ".join(pretty_cut(content))
        writer.writerow([cut_keywords, cut_title, cut_desc, cut_content])
2. Import the required libraries
import sys
import numpy as np
import pandas as pd
import jieba
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt
3. Read the preprocessed text
data = pd.read_csv('/content/drive/MyDrive/news2016zh_valid.csv')
data.head()
4. Handle anomalous values: nulls and floats
# Handle anomalous values
print("There are %d null values in the 'content' column." % data['content'].isnull().sum())
data[data.isnull().values==True]  # isnull returns a boolean array
data = data[pd.notnull(data['content'])]  # keep only rows whose content is not null
data['content'] = data['content'].astype(str)  # cast everything to str
5. Create sentence embeddings
# Create sentence embeddings
#model = SBert('paraphrase-multilingual-MiniLM-L12-v2')
%%time
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
embeddings = model.encode(data['content'].tolist(), show_progress_bar=True)
embeddings.shape
#(75787, 384)
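Encoding ~75k documents takes a while, so it is worth caching the embeddings to disk instead of re-encoding them in every session. A minimal sketch using NumPy (the file name embeddings.npy is just an example):

import numpy as np

# Save the embeddings once ...
np.save('embeddings.npy', embeddings)

# ... and reload them later instead of re-running model.encode
embeddings = np.load('embeddings.npy')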
6. Reduce the dimensionality of the sentence embeddings
sys.setrecursionlimit(1000000)
umap_embeddings = umap.UMAP(n_neighbors=25,
n_components=10,
min_dist=0.00,
metric='cosine',
random_state=2020).fit_transform(embeddings)
7. Cluster the documents with HDBSCAN
cluster = hdbscan.HDBSCAN(min_cluster_size=100,
                          metric='euclidean',
                          cluster_selection_method='eom',
                          prediction_data=True).fit(umap_embeddings)

# Prepare data: reduce the embeddings to 2D for plotting
umap_data = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters
fig, ax = plt.subplots(figsize=(25, 15))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
plt.colorbar()
plt.savefig("result1.png", dpi=300)
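Before moving on, it helps to check how many clusters HDBSCAN found and how many documents it marked as noise (label -1). A small sketch using pandas:

labels = pd.Series(cluster.labels_)

# Number of clusters, excluding the noise label -1
n_clusters = labels[labels != -1].nunique()
print("clusters:", n_clusters)

# Share of documents assigned to the noise cluster
print("noise ratio: %.2f%%" % (100 * (labels == -1).mean()))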
8. c-TF-IDF
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """ Calculate a class-based TF-IDF where m is the number of total documents. """
    my_stopwords = [i.strip() for i in open('/content/drive/MyDrive/cn_stop_words.txt', encoding='utf-8').readlines()]
    count = CountVectorizer(ngram_range=ngram_range,
                            #tokenizer = lambda x : ' '.join(jieba.lcut(x)),
                            stop_words=my_stopwords).fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)
    return tf_idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1]
                   for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .Doc
                     .count()
                     .reset_index()
                     .rename({"Topic": "Topic", "Doc": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes
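To make c_tf_idf above easier to read, this is the score it computes (a restatement of the code, in the spirit of the class-based TF-IDF from the BERTopic paper): all documents of a topic are concatenated into a single class document, and for term t in class c

W_{t,c} = \frac{f_{t,c}}{\sum_{t'} f_{t',c}} \cdot \log\frac{m}{\sum_{c'} f_{t,c'}}

where f_{t,c} is the frequency of term t in the concatenated document of class c and m is the total number of original documents. The first factor is the within-class term frequency (tf in the code), the second the inverse document frequency (idf in the code).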
9. Compute the top topic words for each topic
docs_df = pd.DataFrame(data['content'].tolist(), columns=["Doc"])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})
# Number of topics produced
len(docs_per_topic.Doc.tolist())
#238
# Extract the words with the highest c-TF-IDF scores in each cluster as that topic's topic words
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m = len(data))
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)
10. Inspect the topic words under the topic with index 10:
top_n_words[10]
11. Inspect all current topics and their topic-word lists
# The cluster with index -1 is the "noise" cluster; it is excluded from the per-topic listing below, but its words can still be inspected:
top_n_words[-1]
# Inspect all current topics and their topic-word lists
from pprint import pprint
for i in list(range(len(top_n_words) - 1)):
    print('Most 20 Important words in TOPIC {} :\n'.format(i))
    pprint(top_n_words[i])
    pprint('***'*20)
12. Topic merging
for i in tqdm(range(20)):
    # Calculate cosine similarity between topic c-TF-IDF vectors
    similarities = cosine_similarity(tf_idf.T)
    np.fill_diagonal(similarities, 0)

    # Extract the label to merge and the topic to merge it into
    topic_sizes = docs_df.groupby(['Topic']).count().sort_values("Doc", ascending=False).reset_index()
    topic_to_merge = topic_sizes.iloc[-1].Topic
    topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1

    # Adjust topics
    docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
    old_topics = docs_df.sort_values("Topic").Topic.unique()
    map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
    docs_df.Topic = docs_df.Topic.map(map_topics)
    docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})

    # Calculate new topic words
    m = len(data)
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

# Check the number of topics after merging
len(docs_per_topic.Doc.tolist())
#218
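Once the merging loop has finished, the per-document topic labels in docs_df can be joined back to the original dataframe and saved for later analysis. A minimal sketch, assuming data and docs_df still have the same length and order (the output file name is just an example):

# Attach the final topic label of every document to the original dataframe
data = data.reset_index(drop=True)
data['Topic'] = docs_df['Topic'].values

# Persist the assignments, e.g. title plus topic id
data[['title', 'Topic']].to_csv('topic_assignments.csv', index=False)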