# (scraped-page artifact, not part of the script: 赞 / 踩 vote buttons)
"""Extract topic keywords from a corpus of Laos news articles.

Combines two signals into one score per word:
  * raw token frequency, counted with spaCy (stop words / punctuation removed)
  * corpus-wide TF-IDF mass, computed with scikit-learn
and prints the top-N words by the weighted combination.
"""
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

# Load spaCy's English pipeline; raise the length cap for long articles.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1500000  # adjust to the longest document in the corpus

# Read the Excel file.
file_path = 'cleaned_Laos_news.xlsx'
sheet_name = 'Sheet1'  # change to the actual worksheet name
data = pd.read_excel(file_path, sheet_name=sheet_name)

# Replace NaN values with empty strings.
# FIX: column-level `fillna(..., inplace=True)` is deprecated in pandas 2.x
# and is a silent no-op under copy-on-write; assign the result back instead.
data['cleaned_content'] = data['cleaned_content'].fillna('')

# Collect all article texts.
all_texts = data['cleaned_content'].tolist()

# Tokenize every article (batched nlp.pipe is faster than per-doc nlp()).
docs = list(nlp.pipe(all_texts))

# Count token frequencies, skipping stop words, punctuation and 1-char tokens.
# FIX: lowercase the tokens so the counts line up with TfidfVectorizer's
# vocabulary (which is lowercased by default) — previously every capitalized
# word looked up a frequency of 0 in the combined score below.
word_counts = Counter(
    token.text.lower()
    for doc in docs
    for token in doc
    if not token.is_stop and len(token.text) > 1 and not token.is_punct
)

# Compute TF-IDF weights over the same corpus.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Vocabulary words, in column order of the TF-IDF matrix.
vocabulary = tfidf_vectorizer.get_feature_names_out()

# FIX: compute all per-word TF-IDF totals in one vectorized call instead of
# slicing one sparse column per vocabulary word inside the loop (that per-word
# `tfidf_matrix[:, i].sum()` was the hot spot for large vocabularies).
tfidf_sums = tfidf_matrix.sum(axis=0).A1  # 1-D array, one total per word

# Relative weights of the two signals in the combined score.
tfidf_weight = 0.7       # TF-IDF component
word_count_weight = 0.3  # raw-frequency component

# Combined score per vocabulary word (Counter returns 0 for unseen words).
topic_scores = {
    word: tfidf_weight * tfidf_sums[i] + word_count_weight * word_counts[word]
    for i, word in enumerate(vocabulary)
}

# Select the top-N words by combined score as topic keywords.
top_n = 20
topic_keywords = sorted(topic_scores, key=topic_scores.get, reverse=True)[:top_n]
print("提取的综合得分高频关键词作为话题词:")
print(topic_keywords)
# (scraped-page artifact, not part of the script: Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。)