赞
踩
Hello,各位小伙伴大家好啊!这个专栏用来分享数据处理和数据可视化的一些常见操作,以及我自己的一些学习笔记,希望能给大家带来帮助呀!感兴趣的小伙伴也欢迎私信或者在评论区交流呀!
使用Python编程读取至少一篇PDF文档,并编程实现以下功能:
①实现其中的热词统计分析。
②绘制热词统计分析的词云。
我选择的PDF中文字内容如下:
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pdfplumber  # PDF text extraction
from wordcloud import WordCloud
# Hot-word statistics for a PDF document.
#
# Pipeline: extract the text of every PDF page into a UTF-8 text file,
# segment it with jieba, count the multi-character words, draw a bar chart
# of the most frequent ones, and render a word cloud of the whole text.

PDF_PATH = '中华文化.pdf'
TXT_PATH = '中华文化.txt'
BAR_CHART_PATH = "热词统计分析.png"
WORD_CLOUD_PATH = '词云.png'
TOP_N = 9  # number of bars drawn in the hot-word chart


def configure_matplotlib():
    """Set the matplotlib options needed to render Chinese labels."""
    # SimHei provides CJK glyphs for titles and tick labels.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Draw the minus sign with the ASCII hyphen, which SimHei can render.
    plt.rcParams['axes.unicode_minus'] = False


def extract_pdf_text(pdf_path, txt_path):
    """Write the text of every page of *pdf_path* into *txt_path*.

    The text file is opened once in write mode, so running the script again
    replaces the previous extraction instead of appending a duplicate copy
    (which would inflate every word count).
    """
    with pdfplumber.open(pdf_path) as pdf, \
            open(txt_path, mode='w', encoding='utf-8') as txt_file:
        for page in pdf.pages:
            # NOTE(review): extract_text() is reported to return None for a
            # page without a text layer in some pdfplumber versions; treat
            # that as an empty page rather than crashing in write().
            txt_file.write(page.extract_text() or "")


def count_tokens(tokens):
    """Separate word counts from single-character tokens.

    Args:
        tokens: iterable of strings produced by the word segmenter.

    Returns:
        A ``(counts, single_chars)`` tuple. ``counts`` is a ``Counter`` of
        every token whose length is not 1; ``single_chars`` is the set of
        tokens of length 1 (punctuation, whitespace, particles), which are
        excluded from the statistics and used as word-cloud stop words.
    """
    counts = Counter()
    single_chars = set()
    for token in tokens:
        if len(token) == 1:
            single_chars.add(token)
        else:
            counts[token] += 1
    return counts, single_chars


def top_words(counts, limit):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Ties keep the order in which the words first appeared (the sort is
    stable). Fewer than *limit* pairs are returned when *counts* is small.
    """
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]


def plot_hot_words(ranked, image_path):
    """Draw a bar chart of *ranked* ``(word, count)`` pairs and save it."""
    labels = [word for word, _ in ranked]
    heights = [count for _, count in ranked]
    positions = list(range(len(ranked)))
    plt.bar(x=positions, height=heights, width=0.3)
    plt.xticks(positions, labels, rotation=30)
    plt.title('PDF文件中热词统计分析')
    # Save before show(): the figure may be cleared once its window closes.
    plt.savefig(image_path)
    plt.show()


def build_word_cloud(tokens, stopwords, image_path):
    """Render a word cloud from *tokens* and export it to *image_path*.

    Args:
        tokens: segmented words; they are joined with spaces so that
            WordCloud can split the text on whitespace.
        stopwords: set of tokens WordCloud should ignore.
        image_path: destination of the exported picture.
    """
    cloud = WordCloud(width=1000, height=700, background_color='white',
                      font_path='msyh.ttc', scale=15, stopwords=stopwords)
    cloud.generate(" ".join(tokens))
    cloud.to_file(image_path)


def main():
    """Run the whole pipeline: extract, count, plot, and build the cloud."""
    configure_matplotlib()

    extract_pdf_text(PDF_PATH, TXT_PATH)
    with open(TXT_PATH, encoding='utf-8') as txt_file:
        text = txt_file.read()

    tokens = jieba.lcut(text)
    counts, single_chars = count_tokens(tokens)

    plot_hot_words(top_words(counts, TOP_N), BAR_CHART_PATH)
    print("-------热词统计分析完成!-------")

    build_word_cloud(tokens, single_chars, WORD_CLOUD_PATH)
    print("-------热词词云生成完成!-------")


if __name__ == "__main__":
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。