当前位置:   article > 正文

【Python】自然语言处理——实现批量文本的分词、提取关键词、制作词云思路_自然语言处理文本分词与虚词过滤详细代码

自然语言处理文本分词与虚词过滤详细代码

  1. #coding=utf-8
  2. import jieba
  3. from string import punctuation
  4. import re
  5. import jieba.analyse
  6. from wordcloud import WordCloud, ImageColorGenerator
  7. import matplotlib.pyplot as plt
  8. import os
  9. import json
  10. #去除文件中特殊字符,避免对分词精准性产生影响
  11. def clean(tian_char):
  12. punc = punctuation + u'.,;《》?!“”‘’@#¥%…&×()——+【】{};;●,。&~、|\s::'
  13. tian_char_clean = re.sub(r"[{}]+".format(punc), " ", tian_char)
  14. return tian_char_clean
  15. #jieba分词并将分词文件存储
  16. def jiebacut(stopword_list, tian_char_clean, cut_file_path):
  17. seg_list = jieba.cut(tian_char_clean, cut_all=False)
  18. tian_char_by_cut = "/".join(seg_list)
  19. tian_char_by_cut = tian_char_by_cut.replace(' ', '')
  20. word_list = tian_char_by_cut.split('/')
  21. while '' in word_list:
  22. word_list.remove('')
  23. # 去除停用词
  24. if stopword_list != None:
  25. num = len(word_list)
  26. flag = 0
  27. while(flag != num):
  28. if word_list[flag] in stopword_list:
  29. del word_list[flag]
  30. num -= 1
  31. else:
  32. flag += 1
  33. file = open(cut_file_path, 'a', encoding='gb18030')
  34. for i in range(len(word_list)):
  35. s = word_list[i]
  36. s = s + '\n' # 每行末尾追加换行符
  37. file.write(s)
  38. file.close()
  39. print(cut_file_path+"分词文件保存成功")
  40. return word_list
  41. #获取文章关键词基于TF-IDF
  42. def keywords(word_list, keywords_path):
  43. tian_content = " ".join(word_list)
  44. keywords_n = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('n', 'nr', 'ns'))
  45. keywords_v = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('v', 'vd', 'vi','vn'))
  46. keywords_a = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('a', 'an', 'ad'))
  47. f = open(keywords_path, 'a')
  48. f.write("名词关键词-前200个\n")
  49. for item in keywords_n:
  50. f.write(str(item[0])+' '+str(item[1]))
  51. f.write('\n')
  52. f.write("动词关键词-前200个\n")
  53. for item in keywords_v:
  54. f.write(str(item[0])+' '+str(item[1]))
  55. f.write('\n')
  56. f.write("形容词关键词-前200个\n")
  57. for item in keywords_a:
  58. f.write(str(item[0])+' '+str(item[1]))
  59. f.write('\n')
  60. f.close()
  61. print(keywords_path+"关键词保存成功")
  62. return keywords_n, keywords_v, keywords_a
  63. #利用关键词做出词云
  64. def wordcloud(im_path,keywords,type):
  65. #backgroud_Image = plt.imread('zhengxie.jpg')
  66. path = im_path + type +'.jpg'
  67. keywords_dict = dict(keywords)
  68. wc = WordCloud(background_color='black', # 背景颜色
  69. max_words=2000, # 最大词数
  70. #mask=backgroud_Image, # 以该参数值作图绘制词云,这个参数不为空时,width和height会被忽略
  71. max_font_size=100, # 显示字体的最大值
  72. #stopwords=STOPWORDS.add(''), # 使用内置的屏蔽词,再添加其他词
  73. font_path="C:/Windows/Fonts/STFANGSO.ttf", # 解决显示口字型乱码问题,可进入C:/Windows/Fonts/目录更换字体
  74. random_state=42, # 为每个词返回一个PIL颜色
  75. width=1000, # 图片的宽
  76. height=860 #图片的长
  77. )
  78. wc.generate_from_frequencies(keywords_dict)
  79. #img_colors = ImageColorGenerator(backgroud_Image)
  80. #wc.recolor(color_func=img_colors)
  81. plt.imshow(wc) # 显示词云
  82. plt.axis('off')
  83. wc.to_file(path)
  84. print(path+"词云图保存成功")
  85. def stopword(stopword_path):
  86. stopword_list = []
  87. with open(stopword_path, "r", encoding="gb18030") as f:
  88. for line in f.readlines():
  89. line = line.strip('\n')
  90. stopword_list.append(line)
  91. return stopword_list
  92. def get_all_path(open_file_path):
  93. rootdir = open_file_path
  94. path_list = []
  95. list = os.listdir(rootdir) # 列出文件夹下所有的目录与文件
  96. for i in range(0, len(list)):
  97. com_path = os.path.join(rootdir, list[i])
  98. #print(com_path)
  99. if os.path.isfile(com_path):
  100. path_list.append(com_path)
  101. if os.path.isdir(com_path):
  102. path_list.extend(get_all_path(com_path))
  103. #print(path_list)
  104. return path_list
  105. def im_com_path(im_path, open_file_path):
  106. in_com_name = str(os.path.join(im_path, os.path.basename(open_file_path)))[:-3]
  107. return in_com_name
  108. #判断输入的存储文件路径是否存在,若不存在则创建
  109. def judge_path(File_Path):
  110. if not os.path.exists(File_Path):
  111. os.makedirs(File_Path)
  112. return File_Path
  113. def flow_path(open_file_path, stopword_path, cut_file_name, im_name, keywords_name):
  114. f = open(open_file_path, encoding='gb18030')
  115. txt_data = f.read()
  116. f.close()
  117. tian_char = str(txt_data)
  118. tian_char_clean = clean(tian_char)
  119. if stopword_path == None:
  120. stopword_list = None
  121. else:
  122. stopword_list = stopword(stopword_path)
  123. word_list = jiebacut(stopword_list, tian_char_clean, cut_file_name)
  124. keywords_n, keywords_v, keywords_a = keywords(word_list, keywords_name)
  125. wordcloud(im_name, keywords_n, 'n')
  126. wordcloud(im_name, keywords_v, 'v')
  127. wordcloud(im_name, keywords_a, 'a')
  128. #主函数,输入打开文件路径,分词结果保存路径,词云图保存路径,输出分词结果及词云图
  129. def main(open_file_path, stopword_path, cut_file_path, im_path, keywords_path):
  130. if os.path.isfile(open_file_path):
  131. cut_file_path_exist = judge_path(cut_file_path)
  132. im_path_exist = judge_path(im_path)
  133. keywords_path_exist = judge_path(keywords_path)
  134. im_com_name = im_com_path(im_path_exist,open_file_path)
  135. flow_path(open_file_path,stopword_path,os.path.join(cut_file_path_exist, os.path.basename(open_file_path)),im_com_name,os.path.join(keywords_path_exist, os.path.basename(open_file_path)))
  136. if os.path.isdir(open_file_path):
  137. path_list = get_all_path(open_file_path)
  138. cut_file_path_exist = judge_path(cut_file_path)
  139. im_path_exist = judge_path(im_path)
  140. keywords_path_exist = judge_path(keywords_path)
  141. for com_open_path in path_list:
  142. im_com_name = im_com_path(im_path_exist, com_open_path)
  143. flow_path(com_open_path,stopword_path,os.path.join(cut_file_path_exist, os.path.basename(com_open_path)),im_com_name,os.path.join(keywords_path_exist, os.path.basename(com_open_path)))
  144. if __name__ == '__main__':
  145. main()

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/天景科技苑/article/detail/878625
推荐阅读
相关标签
  

闽ICP备14008679号