# coding=utf-8
import os
import re
from string import punctuation

import jieba
import jieba.analyse
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# Strip punctuation and special characters that would otherwise hurt segmentation accuracy
def clean(tian_char):
    # ASCII punctuation plus common full-width Chinese punctuation
    punc = punctuation + u'.,;《》?!“”‘’@#¥%…&×()——+【】{};;●,。&~、|::'
    # re.escape keeps regex metacharacters in punc literal; \s also collapses whitespace runs
    tian_char_clean = re.sub(u"[%s\\s]+" % re.escape(punc), " ", tian_char)
    return tian_char_clean
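# e.g. (hypothetical input) clean(u"你好,世界!hello, world") -> u"你好 世界 hello world"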

# Segment the cleaned text with jieba and save the tokens, one per line
def jiebacut(stopword_list, tian_char_clean, cut_file_path):
    # Precise mode (cut_all=False): each character belongs to exactly one token
    seg_list = jieba.cut(tian_char_clean, cut_all=False)
    tian_char_by_cut = "/".join(seg_list).replace(' ', '')
    word_list = [w for w in tian_char_by_cut.split('/') if w]  # drop empty tokens

    # Remove stopwords; a set makes each membership test O(1)
    if stopword_list is not None:
        stopword_set = set(stopword_list)
        word_list = [w for w in word_list if w not in stopword_set]

    with open(cut_file_path, 'a', encoding='gb18030') as file:
        for s in word_list:
            file.write(s + '\n')  # one token per line
    print(cut_file_path + " segmentation file saved")

    return word_list
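# For reference, jieba's documented precise-mode behavior:
#   "/".join(jieba.cut(u"我来到北京清华大学", cut_all=False)) -> u"我/来到/北京/清华大学"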

# Extract keywords with jieba's TF-IDF ranking, split by part of speech
def keywords(word_list, keywords_path):
    tian_content = " ".join(word_list)
    # allowPOS filters by POS tag: nouns (n/nr/ns), verbs (v/vd/vi/vn), adjectives (a/an/ad)
    keywords_n = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('n', 'nr', 'ns'))
    keywords_v = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('v', 'vd', 'vi', 'vn'))
    keywords_a = jieba.analyse.extract_tags(tian_content, topK=200, withWeight=True, allowPOS=('a', 'an', 'ad'))

    # gb18030 keeps the output encoding consistent with the other files
    with open(keywords_path, 'a', encoding='gb18030') as f:
        f.write("Top 200 noun keywords\n")
        for word, weight in keywords_n:
            f.write('{} {}\n'.format(word, weight))

        f.write("Top 200 verb keywords\n")
        for word, weight in keywords_v:
            f.write('{} {}\n'.format(word, weight))

        f.write("Top 200 adjective keywords\n")
        for word, weight in keywords_a:
            f.write('{} {}\n'.format(word, weight))

    print(keywords_path + " keywords saved")

    return keywords_n, keywords_v, keywords_a
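# Note: with withWeight=True, extract_tags returns (word, weight) pairs sorted by
# descending TF-IDF weight, e.g. [(u'经济', 0.87), ...] -- values here are illustrative only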

# Render a word cloud from (keyword, weight) pairs and save it as a .jpg
def wordcloud(im_path, keywords, pos_type):
    # backgroud_Image = plt.imread('zhengxie.jpg')
    path = im_path + pos_type + '.jpg'
    keywords_dict = dict(keywords)
    wc = WordCloud(background_color='black',  # background color
                   max_words=2000,            # maximum number of words drawn
                   # mask=backgroud_Image,    # when a mask is given, width and height are ignored
                   max_font_size=100,         # largest font size
                   # stopwords=STOPWORDS.add(''),  # built-in stopword set, extendable
                   font_path="C:/Windows/Fonts/STFANGSO.ttf",  # a Chinese font avoids tofu boxes; any font under C:/Windows/Fonts/ works
                   random_state=42,           # fixes the color assigned to each word
                   width=1000,                # image width
                   height=860)                # image height
    wc.generate_from_frequencies(keywords_dict)
    # img_colors = ImageColorGenerator(backgroud_Image)
    # wc.recolor(color_func=img_colors)
    plt.imshow(wc)  # draw the cloud into the current matplotlib figure
    plt.axis('off')
    wc.to_file(path)
    print(path + " word cloud saved")
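# To shape the cloud with an image, as the commented-out lines above sketch -- one
# possible variant, assuming a local mask image 'zhengxie.jpg' exists:
#   backgroud_Image = plt.imread('zhengxie.jpg')
#   wc = WordCloud(mask=backgroud_Image, font_path="C:/Windows/Fonts/STFANGSO.ttf")
#   wc.generate_from_frequencies(keywords_dict)
#   wc.recolor(color_func=ImageColorGenerator(backgroud_Image))  # color words from the image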

# Load a stopword file (one word per line) into a list
def stopword(stopword_path):
    stopword_list = []
    with open(stopword_path, "r", encoding="gb18030") as f:
        for line in f:
            stopword_list.append(line.strip('\n'))
    return stopword_list
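# Expected stopword file shape -- one word per line, e.g.:
#   的
#   了
#   是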

# Recursively collect the paths of every file under a directory
def get_all_path(open_file_path):
    path_list = []
    for name in os.listdir(open_file_path):  # every entry, files and subdirectories
        com_path = os.path.join(open_file_path, name)
        if os.path.isfile(com_path):
            path_list.append(com_path)
        if os.path.isdir(com_path):
            path_list.extend(get_all_path(com_path))  # recurse into subdirectories
    return path_list
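# An equivalent without explicit recursion, using the standard library's os.walk:
#   [os.path.join(root, f) for root, _, files in os.walk(open_file_path) for f in files]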

# Output prefix for the word-cloud images: image dir + source file name without its extension
def im_com_path(im_path, open_file_path):
    base = os.path.splitext(os.path.basename(open_file_path))[0]
    return os.path.join(im_path, base + '.')  # wordcloud() appends the POS tag and '.jpg'

# Create the given output directory if it does not exist yet
def judge_path(file_path):
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    return file_path
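# On Python 3.2+, os.makedirs(file_path, exist_ok=True) achieves the same in one call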

# Full pipeline for one file: read -> clean -> segment -> keywords -> three word clouds
def flow_path(open_file_path, stopword_path, cut_file_name, im_name, keywords_name):
    with open(open_file_path, encoding='gb18030') as f:
        tian_char = f.read()
    tian_char_clean = clean(tian_char)
    if stopword_path is None:
        stopword_list = None
    else:
        stopword_list = stopword(stopword_path)
    word_list = jiebacut(stopword_list, tian_char_clean, cut_file_name)
    keywords_n, keywords_v, keywords_a = keywords(word_list, keywords_name)
    wordcloud(im_name, keywords_n, 'n')  # nouns
    wordcloud(im_name, keywords_v, 'v')  # verbs
    wordcloud(im_name, keywords_a, 'a')  # adjectives

# Entry point: takes the input file or directory, the stopword file, and the output
# directories for segmentation files, word-cloud images, and keyword files
def main(open_file_path, stopword_path, cut_file_path, im_path, keywords_path):
    if os.path.isfile(open_file_path):
        cut_file_path_exist = judge_path(cut_file_path)
        im_path_exist = judge_path(im_path)
        keywords_path_exist = judge_path(keywords_path)
        im_com_name = im_com_path(im_path_exist, open_file_path)
        flow_path(open_file_path, stopword_path,
                  os.path.join(cut_file_path_exist, os.path.basename(open_file_path)),
                  im_com_name,
                  os.path.join(keywords_path_exist, os.path.basename(open_file_path)))

    if os.path.isdir(open_file_path):
        path_list = get_all_path(open_file_path)
        cut_file_path_exist = judge_path(cut_file_path)
        im_path_exist = judge_path(im_path)
        keywords_path_exist = judge_path(keywords_path)
        for com_open_path in path_list:
            im_com_name = im_com_path(im_path_exist, com_open_path)
            flow_path(com_open_path, stopword_path,
                      os.path.join(cut_file_path_exist, os.path.basename(com_open_path)),
                      im_com_name,
                      os.path.join(keywords_path_exist, os.path.basename(com_open_path)))

if __name__ == '__main__':
    # Example invocation; these paths are placeholders -- point them at your own
    # input file/directory, stopword file (or None), and output directories
    main(open_file_path='./data',
         stopword_path='./stopwords.txt',
         cut_file_path='./cut_result/',
         im_path='./wordcloud/',
         keywords_path='./keywords/')