在查阅了大量文献后,我们发现大部分研究采用 NVivo 12 软件对政策文本内容进行编码。但该软件下载失败,需要补丁,寻觅未果,遂放弃,改用下文的 Python 脚本完成分词、词频统计与聚类分析。
# Step 1: count word frequencies per corpus folder and save the top 30 words.
import os
import glob
import pandas as pd
import jieba
import re
from collections import Counter

# Folders whose *.txt files are analysed; one result workbook is written per folder.
folder_paths = ["E:\\大学课程相关\\大二下学期\\1 毛概\\1",
                "E:\\大学课程相关\\大二下学期\\1 毛概\\2",
                "E:\\大学课程相关\\大二下学期\\1 毛概\\3"]

# Load the stop-word list (one word per line).
stopwords_file = "stopwords.txt"
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# Register domain-specific terms so jieba keeps them as single tokens.
specialized_words = ['非物质文化遗产', '政策文件', '项目', '名录', '保护', '省级', '县级']
for word in specialized_words:
    jieba.add_word(word)

# Matches every run of characters outside the CJK range U+4E00..U+9FA5.
_NON_CHINESE_RUN = re.compile(r'[^\u4e00-\u9fa5]+')


def extract_top_words(folder_path):
    """Return the 30 most frequent words over all ``*.txt`` files in a folder.

    Each file is read as UTF-8, reduced to Chinese characters, segmented with
    jieba and filtered against the module-level ``stopwords`` set.

    Args:
        folder_path: Directory that is searched (non-recursively) for
            ``*.txt`` files.

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common(30)``; empty when the folder has no text files.
    """
    word_counter = Counter()
    for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # Replace non-Chinese runs with a space instead of deleting them:
        # deleting would glue the text on both sides of punctuation, digits
        # and line breaks together and let the segmenter form words that
        # span a sentence boundary.
        text = _NON_CHINESE_RUN.sub(' ', text)
        tokens = jieba.lcut(text)
        # isalnum() also discards the whitespace tokens created by the
        # separators inserted above.
        word_counter.update(
            token.lower()
            for token in tokens
            if token.isalnum() and token.lower() not in stopwords
        )
    return word_counter.most_common(30)


# Process every folder and write its top words to a separate Excel file.
for folder_path in folder_paths:
    folder_name = os.path.basename(folder_path)
    top_words = extract_top_words(folder_path)
    result_df = pd.DataFrame(top_words, columns=['Top Word', 'Frequency'])
    output_file = f"E:\\大学课程相关\\大二下学期\\1 毛概\\高频词统计结果_{folder_name}.xlsx"
    result_df.to_excel(output_file, index=False)
    print(f"{folder_name} 文件夹的高频词统计结果已保存到文件:", output_file)

# Step 2: derive a word-pair table from each top-word workbook of step 1.
import pandas as pd
from collections import defaultdict
from itertools import combinations
import os

# Top-word workbooks written by step 1.
top_words_files = [
    r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_1.xlsx",
    r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_2.xlsx",
    r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_3.xlsx"
]
# Directory that receives the pair tables.
output_folder = r"E:\大学课程相关\大二下学期\1 毛概"


def build_pair_table(words, frequencies):
    """Build one row per unordered pair of words.

    The ``Co-occurrence`` value of a pair is the smaller of the two words'
    individual frequencies. That is only an upper bound on how often the two
    words can appear together; it is not a count measured on the texts,
    because the source documents are not read here.

    Args:
        words: Words in the order they appear in the workbook (assumed to be
            unique, as produced by a frequency count).
        frequencies: Frequency of each word, aligned with ``words``.

    Returns:
        A DataFrame with columns ``Word 1``, ``Word 2`` and ``Co-occurrence``;
        it has no rows when fewer than two words are given.
    """
    # One dict lookup per word replaces a boolean scan of the whole table
    # for every pair.
    frequency_of = dict(zip(words, frequencies))
    rows = [
        (first, second, min(frequency_of[first], frequency_of[second]))
        for first, second in combinations(words, 2)
    ]
    return pd.DataFrame(rows, columns=['Word 1', 'Word 2', 'Co-occurrence'])


def write_pair_tables():
    """Read every top-word workbook and save its pair table next to it."""
    for file_path in top_words_files:
        df = pd.read_excel(file_path)
        pair_table = build_pair_table(df['Top Word'].tolist(), df['Frequency'].tolist())
        # "高频词统计结果_1.xlsx" -> "1": reuse the folder suffix in the output name.
        folder_name = os.path.basename(file_path).split('_')[-1].split('.')[0]
        output_file = os.path.join(output_folder, f"共现矩阵结果_{folder_name}.xlsx")
        pair_table.to_excel(output_file, index=False)
        print(f"共现矩阵结果已保存到文件: {output_file}")


if __name__ == "__main__":
    write_pair_tables()

# Step 3: turn each top-word workbook into an Ochiai-based dissimilarity matrix.
import pandas as pd
from itertools import combinations
import numpy as np
import os

# Top-word workbooks written by step 1.
top_words_files = [
    r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_1.xlsx",
    r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_2.xlsx",
    r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_3.xlsx"
]
# Directory that receives the dissimilarity matrices.
output_folder = r"E:\大学课程相关\大二下学期\1 毛概"


def build_dissimilarity_matrix(words, frequencies):
    """Return the square dissimilarity matrix ``1 - Ochiai`` for the words.

    Every pair of distinct words is given a co-occurrence count of 1 (only
    the word list is available here, not the texts), so the Ochiai
    coefficient of a pair reduces to ``1 / sqrt(freq_a * freq_b)``. A word
    compared with itself has similarity 1 and therefore dissimilarity 0.

    Args:
        words: Labels used for both the index and the columns.
        frequencies: Frequency of each word, aligned with ``words``.

    Returns:
        A float DataFrame, symmetric, with zeros on the diagonal.
    """
    counts = np.asarray(frequencies, dtype=float)
    size = len(words)
    # 1 for every pair of distinct words, 0 on the diagonal.
    co_occurrence = np.ones((size, size)) - np.eye(size)
    # Computed in floating point from the start: writing float coefficients
    # into an integer-typed frame cell by cell relies on implicit upcasting.
    ochiai = co_occurrence / np.sqrt(np.outer(counts, counts))
    # Self-similarity is 1, so the dissimilarity diagonal becomes 0.
    np.fill_diagonal(ochiai, 1.0)
    return pd.DataFrame(1.0 - ochiai, index=words, columns=words)


def write_dissimilarity_matrices():
    """Read every top-word workbook and save its dissimilarity matrix."""
    for file_path in top_words_files:
        df = pd.read_excel(file_path)
        dissimilarity_matrix = build_dissimilarity_matrix(
            df['Top Word'].tolist(), df['Frequency'].tolist()
        )
        # "高频词统计结果_1.xlsx" -> "1": reuse the folder suffix in the output name.
        folder_name = os.path.basename(file_path).split('_')[-1].split('.')[0]
        output_file = os.path.join(output_folder, f"相异矩阵结果_{folder_name}.xlsx")
        # The index is kept so the words label the rows in the workbook.
        dissimilarity_matrix.to_excel(output_file)
        print("相异矩阵结果已保存到文件:", output_file)


if __name__ == "__main__":
    write_dissimilarity_matrices()

# Step 4: hierarchical clustering of each dissimilarity matrix, shown as a dendrogram.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

# Use a font that contains CJK glyphs so the Chinese labels render.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# Dissimilarity workbooks written by step 3.
file_paths = [
    r"E:\大学课程相关\大二下学期\1 毛概\相异矩阵结果_1.xlsx",
    r"E:\大学课程相关\大二下学期\1 毛概\相异矩阵结果_2.xlsx",
    r"E:\大学课程相关\大二下学期\1 毛概\相异矩阵结果_3.xlsx"
]

for i, file_path in enumerate(file_paths, start=1):
    # The first column of the workbook holds the row labels (the words).
    df = pd.read_excel(file_path, index_col=0)
    # Work on a private float copy so the in-place edit below cannot touch df.
    data = df.to_numpy(dtype=float, copy=True)
    # linkage() interprets a 2-D array as observation vectors, not as
    # distances. To cluster on the stored dissimilarities the matrix must be
    # passed in condensed (upper-triangle) form, which requires a zero
    # diagonal; checks=False tolerates tiny asymmetries from the Excel
    # round trip.
    np.fill_diagonal(data, 0.0)
    condensed = squareform(data, checks=False)
    Z = hierarchy.linkage(condensed, method='average')
    # Large canvas and small leaf font keep the word labels legible.
    plt.figure(figsize=(12, 10))
    dn = hierarchy.dendrogram(Z, labels=df.index.tolist(), orientation='left', leaf_font_size=8)
    plt.xlabel('相异度', fontsize=12)
    plt.ylabel('样本', fontsize=12)
    plt.title(f'树状图 {i}', fontsize=14)
    plt.grid(True)
    plt.show()

# LDA topic modelling over all text files of the three corpus folders.
import os
import glob
import pandas as pd
import jieba
import re
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Folders whose *.txt files form the corpus.
folder_paths = ["E:\\大学课程相关\\大二下学期\\1 毛概\\1",
                "E:\\大学课程相关\\大二下学期\\1 毛概\\2",
                "E:\\大学课程相关\\大二下学期\\1 毛概\\3"]
stopwords_file = "stopwords.txt"

# LdaMulticore trains in worker processes. Where workers are started by
# re-importing the main script (the default on Windows), unguarded top-level
# code would run again in every worker, so all work lives under this guard.
if __name__ == "__main__":
    # Load the stop-word list (one word per line).
    with open(stopwords_file, 'r', encoding='utf-8') as f:
        stopwords = set(f.read().splitlines())

    # One token list per document.
    texts = []
    for folder_path in folder_paths:
        for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            # Replace non-Chinese runs with a space instead of deleting
            # them, so words cannot be formed across punctuation or line
            # breaks.
            text = re.sub(r'[^\u4e00-\u9fa5]+', ' ', text)
            # Drop the whitespace tokens created by the separators, then
            # the stop words.
            words = [
                word for word in jieba.lcut(text)
                if word.strip() and word not in stopwords
            ]
            texts.append(words)

    # Map tokens to ids and convert every document to bag-of-words form.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Train the topic model.
    num_topics = 5  # number of topics to extract
    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics)

    # Print the weighted word distribution of every topic.
    for idx, topic in lda_model.print_topics(-1):
        print("主题 {}: {}".format(idx, topic))

    # Print the ten highest-weighted words of every topic without weights.
    topics_words = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)
    for topic_num, word_weights in topics_words:
        words = [word for word, _weight in word_weights]
        print("主题 {} 的关键词:{}".format(topic_num, words))

# K-means clustering of the top words using TF-IDF features.
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import re

# Workbook with a 'Top Word' column; each cell is treated as one document.
excel_file = "E:\\大学课程相关\\大二下学期\\1 毛概\\高频词统计结果.xlsx"
data = pd.read_excel(excel_file)
texts = data['Top Word'].tolist()

# Load the stop-word list (one word per line).
stopwords_file = "stopwords.txt"
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# Matches every run of characters outside the CJK range U+4E00..U+9FA5.
_NON_CHINESE_RUN = re.compile(r'[^\u4e00-\u9fa5]+')


def text_preprocessing(text):
    """Return ``text`` as space-separated jieba tokens without stop words.

    Non-Chinese runs are replaced by a single space before segmentation so
    that the remaining Chinese fragments stay separated.

    Args:
        text: Raw string to normalise.

    Returns:
        The surviving tokens joined with single spaces, ready for a
        whitespace/word-boundary based vectorizer.
    """
    chinese_only = _NON_CHINESE_RUN.sub(' ', text)
    tokens = jieba.lcut(chinese_only)
    return " ".join(token for token in tokens if token not in stopwords)


preprocessed_texts = [text_preprocessing(text) for text in texts]

# NOTE: TfidfVectorizer's default token pattern only keeps tokens of two or
# more characters, so single-character words do not become features.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_texts)

# Cluster the documents. n_init and random_state are pinned so the labels
# written to the workbook are the same on every run and do not depend on the
# library's version-specific n_init default.
num_clusters = 3  # number of clusters
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Attach each row's cluster label to the table.
data['Cluster'] = kmeans.labels_

# Print the ten features with the largest centroid weight for every cluster.
cluster_centers = kmeans.cluster_centers_
feature_names = vectorizer.get_feature_names_out()
for i, cluster_center in enumerate(cluster_centers):
    top_keywords_idx = cluster_center.argsort()[-10:][::-1]  # descending by weight
    top_keywords = [feature_names[idx] for idx in top_keywords_idx]
    print("Cluster {} 的关键词:{}".format(i, top_keywords))

# Save the labelled table.
output_file = "E:\\大学课程相关\\大二下学期\\1 毛概\\文档聚类结果.xlsx"
data.to_excel(output_file, index=False)
print("文档聚类结果已保存到文件:", output_file)

