```python
import json
import re
import requests
from bs4 import BeautifulSoup
import os
import time
import random
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
```
```python
# Crawl comment data
def crawlData(start, limit):  # start: offset of the first comment; limit: comments per page
    """
    Crawl comments for "The Shawshank Redemption" (Douban Top250)
    and return them as a list of strings.
    """
    comments = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'}
    url = 'https://movie.douban.com/subject/1292052/comments?start={}&limit={}&status=P&sort=new_score'.format(start, limit)
    try:
        print("Crawling data, please wait...")
        response = requests.get(url, headers=headers)
        # print(response.status_code)

        # Passing the HTML to the BeautifulSoup constructor yields a parsed document object
        soup = BeautifulSoup(response.text, 'lxml')

        # The <span> tags whose class is "short" hold the comment texts
        shorts = soup.find_all('span', {'class': 'short'})
        # print(shorts)

        for short in shorts:
            comments.append(short.get_text())
    except Exception as e:
        print(e)
    return comments
```
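A quick usage sketch (Douban serves the first comment pages without login, but may rate-limit or return empty results for anonymous clients):

```python
# Fetch one page of 20 comments as a smoke test
sample = crawlData(0, 20)
print(len(sample), "comments fetched")
```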
```python
# Save the comments as a JSON file
def saveJsonFile(comments, JsonFileName):
    jsons = []
    for id, com in enumerate(comments):  # index each comment
        jsons.append({'id': id, 'comment': com})
    # json.dump serializes the list of dicts directly, with properly
    # double-quoted keys, regardless of what characters the comments contain
    with open('work/' + JsonFileName + '.json', 'w', encoding='UTF-8') as f:
        json.dump(jsons, f, ensure_ascii=False)
    print("Data saved as %s.json" % JsonFileName)
```
Preview of the JSON file holding the 100 comments:
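The file is a single JSON array of `{id, comment}` objects; its shape (with placeholder comment text, not actual scraped data) looks like:

```json
[
  {"id": 0, "comment": "……"},
  {"id": 1, "comment": "……"}
]
```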
```python
# Parse the JSON file and return the comments as a list of strings
def getTextForJson(JsonFileName):
    # Open the JSON file and read its contents
    with open('work/' + JsonFileName + '.json', 'r', encoding='UTF-8') as file:
        comments = json.loads(file.read())  # json.loads decodes JSON into native Python types
    text = []  # list of comment strings
    for com in comments:
        text.append(com['comment'])
    return text
```
```python
# Remove special characters from the text
def clear_special_char(text):
    content = []
    for con in text:
        con = re.sub(r"\n|\t|\r|\r\n|\n\r|\x08|\\", "", con)
        content.append(con)
    return content
```
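For example, the regex strips line breaks, tabs, and backslashes:

```python
print(clear_special_char(["line one\nline two\t\\"]))  # ['line oneline two']
```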
```python
# Save the cleaned text to a txt file
def saveTxtFile(text, TxtFileName):
    contents = ''.join(text)  # concatenate all comments into one string
    with open('work/' + TxtFileName + '.txt', 'w', encoding='UTF-8') as f:
        f.write(contents)
    print("Data saved as %s.txt" % TxtFileName)
```
```python
# Read the comment string back from the txt file
def getTextForTxt(TxtFileName):
    with open('work/' + TxtFileName + '.txt', 'r', encoding='UTF-8') as file:
        text = file.read()
    return text
```
```python
# Chinese word segmentation
def fenci(text):
    words = jieba.cut(text.strip())  # returns a lazy generator of tokens
    return words
```
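Note that `jieba.cut` returns a generator that can be consumed only once; `jieba.lcut` is the eager variant that returns a plain list:

```python
print(jieba.lcut("中文分词示例"))  # e.g. ['中文', '分词', '示例']
```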
```python
# Remove stop words
def movestopwords(words):
    # Build the stop-word list (list comprehension, one word per line in the file)
    with open('data/stopwords.txt', encoding='UTF-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    # Drop stop words from the segmented tokens
    strs = ''
    for word in words:
        if word not in stopwords:
            if word != '\t':
                strs += word
                strs += " "
    return strs
```
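`movestopwords` returns the surviving tokens joined by spaces. That format matters downstream: `drawcounts` re-segments the string with `jieba.lcut`, and `WordCloud.generate` tokenizes its input on whitespace and punctuation, so both consume it directly. The file `data/stopwords.txt` is assumed to hold one entry per line, typically common Chinese function words such as 的, 了, 是 plus punctuation marks.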
```python
# Plot the word-frequency bar chart
def drawcounts(strs):
    words = jieba.lcut(strs)  # precise-mode segmentation
    counts = {}
    for word in words:
        if len(word) == 1:  # skip single-character tokens (including spaces)
            continue
        counts[word] = counts.get(word, 0) + 1
    # Sort by frequency, highest first
    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # Print and collect the top 10
    x_word = []
    y_count = []
    for word, count in counts[:10]:
        print('{:<10}{:>5}'.format(word, count))
        x_word.append(word)
        y_count.append(count)
    # Make matplotlib render Chinese glyphs
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Figure size
    plt.figure(figsize=(20, 15))
    plt.bar(range(len(y_count)), y_count, tick_label=x_word,
            facecolor='#9999ff', edgecolor='white')
    # Rotate the x-axis labels and enlarge the tick fonts
    plt.xticks(rotation=45, fontsize=20)
    plt.yticks(fontsize=20)
    plt.title('《肖申克的救赎》评论词频统计', fontsize=24)
    plt.savefig('./work/imgs/bar_result.jpg')
    plt.show()
```
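The chart assumes the SimHei font is installed system-wide. If it is not, a minimal workaround (matplotlib 3.2+, reusing the project's `fonts/simhei.ttf`) is to register the TTF file directly:

```python
from matplotlib import font_manager

# Register the bundled font with matplotlib, then select it by family name
font_manager.fontManager.addfont('fonts/simhei.ttf')
plt.rcParams['font.sans-serif'] = ['SimHei']
```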
```python
# Draw the word cloud
def drawcloud(strs):
    # Build the word-cloud object; mode="RGBA" with background_color=None
    # produces a transparent background
    wc = WordCloud(font_path="fonts/simhei.ttf", width=500, height=400, mode="RGBA",
                   background_color=None).generate(strs)
    # Display the word cloud
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # Save to file
    wc.to_file("./work/imgs/WordCloud.png")
```
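Finally, the main program ties the pipeline together: crawl the comments unless a cached JSON file already exists, clean and persist them, segment, remove stop words, then plot the bar chart and the word cloud.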
```python
if __name__ == "__main__":
    JsonFileName = 'hhh'
    TxtFileName = 'xxx'
    # Crawl only if the JSON file does not exist yet
    if not os.path.exists('work/' + JsonFileName + '.json'):
        print("File not found; starting to crawl pages.")
        # 1. Crawl 100 comments
        limit = 10  # maximum comments fetched per page
        comments = []
        for i in range(0, 10):
            comments += crawlData(i * limit, limit)
            time.sleep(random.uniform(1, 2))  # brief random pause between pages
        print("Data crawled successfully!")
        # 2. Save the JSON data
        saveJsonFile(comments, JsonFileName)
    else:
        print(JsonFileName + ".json already exists.")
    # 3. Load the comments from the JSON file
    text = getTextForJson(JsonFileName)
    # print(text)
    # Clean special characters
    text = clear_special_char(text)
    # print(text)
    # Save the cleaned strings to a txt file
    if not os.path.exists('work/' + TxtFileName + '.txt'):
        saveTxtFile(text, TxtFileName)
    else:
        print(TxtFileName + ".txt already exists.")
    text = getTextForTxt(TxtFileName)  # read the txt text back (str)
    print(text)
    # Chinese word segmentation
    words = fenci(text)
    # for word in words:
    #     print(word)
    # Remove stop words
    strs = movestopwords(words)
    print(strs)
    # Plot the word-frequency bar chart
    drawcounts(strs)
    # Draw the word cloud
    drawcloud(strs)
```
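The script assumes `work/`, `work/imgs/`, `data/stopwords.txt`, and `fonts/simhei.ttf` already exist; none of the functions create them. A minimal sketch to prepare the directories before the first run:

```python
os.makedirs('work/imgs', exist_ok=True)  # creates work/ and work/imgs/ if missing
```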