在查阅了大量文献后,我们发现大部分研究采用 NVivo 12 软件对政策文本内容进行编码。但该软件下载失败,安装还需要补丁,我们多方寻找未果,遂放弃该方案,改用下文的 Python 代码完成文本分析。
















  # Step 1: count the most frequent words in each corpus folder and save one
  # Excel sheet per folder.
  import os
  import glob
  import pandas as pd
  import jieba
  import re
  from collections import Counter

  # Corpus folders; every *.txt file directly inside a folder is one document.
  folder_paths = ["E:\\大学课程相关\\大二下学期\\1 毛概\\1",
                  "E:\\大学课程相关\\大二下学期\\1 毛概\\2",
                  "E:\\大学课程相关\\大二下学期\\1 毛概\\3"]
  # Stop-word list, one word per line (resolved against the working directory).
  stopwords_file = "stopwords.txt"
  # Domain terms that jieba must keep as single tokens.
  specialized_words = ['非物质文化遗产', '政策文件', '项目', '名录', '保护', '省级', '县级']

  # Matches every run of characters outside the range U+4E00-U+9FA5.
  _NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]+')


  def load_stopwords(path):
      """Return the set of stop words listed one per line in *path*.

      Lines are stripped and blank lines ignored. The file is decoded with
      ``utf-8-sig`` so that a leading byte-order mark, if present, does not
      end up glued to the first stop word.
      """
      with open(path, 'r', encoding='utf-8-sig') as f:
          return {line.strip() for line in f if line.strip()}


  def extract_top_words(folder_path, stopwords=frozenset(), top_n=30):
      """Return the ``top_n`` most frequent words in a folder of text files.

      Args:
          folder_path: Folder whose ``*.txt`` files (UTF-8) are read.
          stopwords: Collection of words to exclude from the count.
          top_n: Number of ``(word, count)`` pairs to return.

      Returns:
          A list of ``(word, count)`` tuples as produced by
          ``Counter.most_common``.
      """
      word_counter = Counter()
      # Sorted so that ties in most_common() do not depend on directory order.
      for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
          with open(file_path, 'r', encoding='utf-8') as file:
              text = file.read()
          # Replace non-Chinese runs with a space instead of deleting them:
          # deleting would join the text on both sides of punctuation or a
          # line break, letting the segmenter form words across the boundary.
          text = _NON_CHINESE.sub(' ', text)
          word_counter.update(
              word for word in jieba.lcut(text)
              if word.strip() and word not in stopwords
          )
      return word_counter.most_common(top_n)


  def main():
      """Run Step 1 for every folder in ``folder_paths``."""
      stopwords = load_stopwords(stopwords_file)
      for word in specialized_words:
          jieba.add_word(word)

      for folder_path in folder_paths:
          folder_name = os.path.basename(folder_path)
          top_words = extract_top_words(folder_path, stopwords)
          result_df = pd.DataFrame(top_words, columns=['Top Word', 'Frequency'])
          output_file = f"E:\\大学课程相关\\大二下学期\\1 毛概\\高频词统计结果_{folder_name}.xlsx"
          result_df.to_excel(output_file, index=False)
          print(f"{folder_name} 文件夹的高频词统计结果已保存到文件:", output_file)


  if __name__ == "__main__":
      main()


  # Step 2: build a word-pair table from each Step 1 top-word file.
  import pandas as pd
  from collections import defaultdict
  from itertools import combinations
  import os

  # Top-word files written by Step 1.
  top_words_files = [
      r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_1.xlsx",
      r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_2.xlsx",
      r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_3.xlsx"
  ]
  # Folder that receives the pair tables.
  output_folder = r"E:\大学课程相关\大二下学期\1 毛概"


  def build_pair_table(df):
      """Return one row per unordered pair of words in *df*.

      Args:
          df: DataFrame with a ``'Top Word'`` and a ``'Frequency'`` column.
              Words are assumed to be unique, as in the Step 1 output.

      Returns:
          DataFrame with columns ``'Word 1'``, ``'Word 2'`` and
          ``'Co-occurrence'``, in ``itertools.combinations`` order.

      NOTE(review): the ``'Co-occurrence'`` value is the smaller of the two
      words' frequencies. It is derived from the frequency table only and is
      not a co-occurrence count observed in the source texts; computing a
      real count would require re-reading the corpus.
      """
      words = df['Top Word'].tolist()
      # One dictionary lookup per word replaces a full DataFrame scan per pair.
      frequency = dict(zip(words, df['Frequency'].tolist()))
      rows = [
          (first, second, min(frequency[first], frequency[second]))
          for first, second in combinations(words, 2)
      ]
      return pd.DataFrame(rows, columns=['Word 1', 'Word 2', 'Co-occurrence'])


  def main():
      """Write one pair table per file in ``top_words_files``."""
      for file_path in top_words_files:
          co_occurrence_df = build_pair_table(pd.read_excel(file_path))
          # Reuse the trailing "_<name>" of the input file for the output name.
          folder_name = os.path.basename(file_path).split('_')[-1].split('.')[0]
          output_file = os.path.join(output_folder, f"共现矩阵结果_{folder_name}.xlsx")
          co_occurrence_df.to_excel(output_file, index=False)
          print(f"共现矩阵结果已保存到文件: {output_file}")


  if __name__ == "__main__":
      main()


  # Step 3: turn each Step 1 top-word file into an Ochiai-based dissimilarity
  # matrix.
  import pandas as pd
  from itertools import combinations
  import numpy as np
  import os

  # Top-word files written by Step 1.
  top_words_files = [
      r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_1.xlsx",
      r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_2.xlsx",
      r"E:\大学课程相关\大二下学期\1 毛概\高频词统计结果_3.xlsx"
  ]
  # Folder that receives the dissimilarity matrices.
  output_folder = r"E:\大学课程相关\大二下学期\1 毛概"


  def build_dissimilarity_matrix(df):
      """Return the square dissimilarity matrix ``1 - Ochiai`` for *df*.

      Args:
          df: DataFrame with a ``'Top Word'`` and a ``'Frequency'`` column.

      Returns:
          A float DataFrame indexed and labelled by the words. Off-diagonal
          entries are ``1 - 1 / sqrt(freq_i * freq_j)``; the diagonal is 0,
          i.e. every word has zero dissimilarity to itself.

      NOTE(review): the pair count in the Ochiai numerator is fixed at 1 for
      every pair of distinct words (each pair is counted once from the word
      list itself), so the result depends on the frequencies only. Replacing
      it with counts observed in the corpus would require the source texts.
      """
      words = df['Top Word'].tolist()
      counts = df['Frequency'].to_numpy(dtype=float)
      # Float arithmetic throughout: writing float coefficients into an
      # integer-typed frame relies on implicit dtype upcasting.
      ochiai = 1.0 / np.sqrt(np.outer(counts, counts))
      # A word is maximally similar to itself; leaving the diagonal at
      # similarity 0 would give a self-dissimilarity of 1.
      np.fill_diagonal(ochiai, 1.0)
      return pd.DataFrame(1.0 - ochiai, index=words, columns=words)


  def main():
      """Write one dissimilarity matrix per file in ``top_words_files``."""
      for file_path in top_words_files:
          dissimilarity_matrix = build_dissimilarity_matrix(pd.read_excel(file_path))
          # Reuse the trailing "_<name>" of the input file for the output name.
          folder_name = os.path.basename(file_path).split('_')[-1].split('.')[0]
          output_file = os.path.join(output_folder, f"相异矩阵结果_{folder_name}.xlsx")
          dissimilarity_matrix.to_excel(output_file)
          print("相异矩阵结果已保存到文件:", output_file)


  if __name__ == "__main__":
      main()


  # Step 4: hierarchical clustering of each Step 3 dissimilarity matrix,
  # shown as a dendrogram.
  import pandas as pd
  import numpy as np
  from scipy.cluster import hierarchy
  from scipy.spatial.distance import squareform

  # Dissimilarity matrices written by Step 3.
  file_paths = [
      r"E:\大学课程相关\大二下学期\1 毛概\相异矩阵结果_1.xlsx",
      r"E:\大学课程相关\大二下学期\1 毛概\相异矩阵结果_2.xlsx",
      r"E:\大学课程相关\大二下学期\1 毛概\相异矩阵结果_3.xlsx"
  ]


  def compute_linkage(dissimilarity):
      """Return the average-linkage matrix for a square dissimilarity matrix.

      Args:
          dissimilarity: Square, symmetric array-like (ndarray or DataFrame)
              of pairwise dissimilarities. It is not modified.

      Returns:
          The linkage matrix produced by ``scipy.cluster.hierarchy.linkage``.

      Raises:
          ValueError: If the matrix is not symmetric (raised by
              ``squareform``).
      """
      square = np.array(dissimilarity, dtype=float)  # private copy
      # Self-dissimilarity is zero by definition; squareform() requires it,
      # and input files may carry a non-zero diagonal.
      np.fill_diagonal(square, 0.0)
      # linkage() reads a 2-D array as observation vectors and would compute
      # Euclidean distances between its rows. The condensed form makes it use
      # the stored values as the pairwise distances.
      return hierarchy.linkage(squareform(square), method='average')


  def main():
      """Plot one dendrogram per file in ``file_paths``."""
      # Imported here so compute_linkage() stays usable without a plotting
      # backend installed.
      import matplotlib.pyplot as plt

      plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
      for i, file_path in enumerate(file_paths, start=1):
          df = pd.read_excel(file_path, index_col=0)
          Z = compute_linkage(df)
          plt.figure(figsize=(12, 10))
          hierarchy.dendrogram(Z, labels=df.index.tolist(), orientation='left',
                               leaf_font_size=8)
          plt.xlabel('相异度', fontsize=12)
          plt.ylabel('样本', fontsize=12)
          plt.title(f'树状图 {i}', fontsize=14)
          plt.grid(True)
          plt.show()


  if __name__ == "__main__":
      main()





  # LDA topic modelling over all documents of the three corpus folders.
  import os
  import glob
  import pandas as pd
  import jieba
  import re
  from gensim import corpora
  from gensim.models import LdaModel
  from gensim.models.ldamulticore import LdaMulticore

  # Corpus folders; every *.txt file directly inside a folder is one document.
  folder_paths = ["E:\\大学课程相关\\大二下学期\\1 毛概\\1",
                  "E:\\大学课程相关\\大二下学期\\1 毛概\\2",
                  "E:\\大学课程相关\\大二下学期\\1 毛概\\3"]
  # Stop-word list, one word per line (resolved against the working directory).
  stopwords_file = "stopwords.txt"
  # Number of topics to fit.
  num_topics = 5

  # Matches every run of characters outside the range U+4E00-U+9FA5.
  _NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]+')


  def load_documents(folders, stopwords):
      """Return one token list per ``*.txt`` file found in *folders*.

      Each file is read as UTF-8, reduced to Chinese characters, segmented
      with jieba and filtered against *stopwords*.
      """
      texts = []
      for folder_path in folders:
          for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
              with open(file_path, 'r', encoding='utf-8') as file:
                  text = file.read()
              # Replace non-Chinese runs with a space instead of deleting
              # them, so words are not formed across punctuation or line
              # breaks; the whitespace tokens are dropped below.
              text = _NON_CHINESE.sub(' ', text)
              texts.append([word for word in jieba.lcut(text)
                            if word.strip() and word not in stopwords])
      return texts


  def main():
      """Fit the LDA model and print the topics."""
      with open(stopwords_file, 'r', encoding='utf-8') as f:
          stopwords = set(f.read().splitlines())

      texts = load_documents(folder_paths, stopwords)
      dictionary = corpora.Dictionary(texts)
      corpus = [dictionary.doc2bow(text) for text in texts]

      # Seeded so that repeated runs start from the same state.
      # NOTE(review): worker scheduling may still cause small differences
      # between runs; confirm if exact reproducibility matters.
      lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics,
                               random_state=42)

      # Word distribution of every topic.
      for idx, topic in lda_model.print_topics(-1):
          print("主题 {}: {}".format(idx, topic))

      # Ten highest-weighted words of every topic.
      topics_words = lda_model.show_topics(num_topics=num_topics, num_words=10,
                                           formatted=False)
      for topic_num, word_weights in topics_words:
          words = [word for word, _ in word_weights]
          print("主题 {} 的关键词:{}".format(topic_num, words))


  # LdaMulticore starts worker processes. On platforms that spawn workers by
  # re-importing this module (e.g. Windows), training must not run at import
  # time, hence the guard.
  if __name__ == "__main__":
      main()


  # K-means clustering of the entries in the 'Top Word' column.
  import pandas as pd
  import jieba
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.cluster import KMeans
  from sklearn.metrics import silhouette_score
  import re

  # NOTE(review): the Step 1 script writes files named
  # "高频词统计结果_<folder>.xlsx"; confirm where this un-suffixed file comes from.
  excel_file = "E:\\大学课程相关\\大二下学期\\1 毛概\\高频词统计结果.xlsx"
  output_file = "E:\\大学课程相关\\大二下学期\\1 毛概\\文档聚类结果.xlsx"
  # Stop-word list, one word per line (resolved against the working directory).
  stopwords_file = "stopwords.txt"
  # Number of clusters to fit.
  num_clusters = 3

  # Matches every run of characters outside the range U+4E00-U+9FA5.
  _NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]+')


  def text_preprocessing(text, stopwords=frozenset()):
      """Return *text* as space-separated jieba tokens.

      Non-Chinese characters are replaced by spaces, the rest is segmented
      with jieba, and whitespace tokens and *stopwords* are removed.
      ``str()`` is applied first so that non-string cells (e.g. empty Excel
      cells) do not raise.
      """
      text = _NON_CHINESE.sub(' ', str(text))
      words = [word for word in jieba.lcut(text)
               if word.strip() and word not in stopwords]
      return " ".join(words)


  def main():
      """Cluster the rows of ``excel_file`` and save the labelled table."""
      data = pd.read_excel(excel_file)
      texts = data['Top Word'].tolist()

      with open(stopwords_file, 'r', encoding='utf-8') as f:
          stopwords = set(f.read().splitlines())

      preprocessed_texts = [text_preprocessing(text, stopwords) for text in texts]

      # The texts are already segmented and space-joined. The vectorizer's
      # default token pattern only keeps tokens of two or more characters,
      # which would drop single-character Chinese words.
      vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
      X = vectorizer.fit_transform(preprocessed_texts)

      # Explicit n_init and a fixed seed keep the result stable across runs.
      kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
      kmeans.fit(X)
      data['Cluster'] = kmeans.labels_

      # Ten highest-weighted features of every cluster centre.
      feature_names = vectorizer.get_feature_names_out()
      for i, cluster_center in enumerate(kmeans.cluster_centers_):
          top_keywords_idx = cluster_center.argsort()[-10:][::-1]
          top_keywords = [feature_names[idx] for idx in top_keywords_idx]
          print("Cluster {} 的关键词:{}".format(i, top_keywords))

      data.to_excel(output_file, index=False)
      print("文档聚类结果已保存到文件:", output_file)


  if __name__ == "__main__":
      main()


