Film has gradually become an indispensable part of everyday life. Ratings and general popularity give some idea of a movie, but both are rather one-sided. A better way to understand a film is through feedback from people who have already watched it; ratings and popularity reflect that feedback only indirectly, while written reviews capture it far more faithfully, since comments convey a film's strengths, weaknesses, and character much more fully. Reviews, however, are numerous and uneven in quality, so analyzing the high-frequency words that appear in them is a convenient way to get the gist, and a word cloud is an excellent way to present those words. Based on this, the project builds a keyword word-cloud generator for a specified movie: it crawls the movie's Douban comments with a web crawler and renders the high-frequency words as a word cloud.
import urllib.request
import requests, re
from bs4 import BeautifulSoup
def getHtml(url):
    """Fetch the page at url and return its HTML as text."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    req = urllib.request.urlopen(req)
    content = req.read().decode('utf-8')
    return content
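As a quick sanity check, getHtml can be pointed at a single Douban comments URL directly; this is only a sketch, and the subject id in the URL below is a placeholder rather than a real movie:

# Minimal sketch: fetch one comments page and inspect the length of the raw HTML.
# '1234567' is a placeholder subject id, not an actual Douban entry.
sample_url = 'https://movie.douban.com/subject/1234567/comments?start=0&limit=20&sort=new_score&status=P'
print(len(getHtml(sample_url)))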
def getComment(url):
    """Parse one comments page and return the comment texts on it."""
    html = getHtml(url)
    soupComment = BeautifulSoup(html, 'html.parser')
    comments = soupComment.findAll('span', 'short')  # comment text sits in <span class="short">
    onePageComments = []
    for comment in comments:
        # print(comment.getText()+'\n')
        onePageComments.append(comment.getText() + '\n')
    return onePageComments
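What getComment relies on is that each comment on a Douban comments page sits in a span element with class "short". A tiny offline sketch, with two invented comment strings, shows what findAll('span', 'short') returns:

# Offline sketch of the extraction step; the two comment strings are invented examples.
sample_html = '<div><span class="short">画面很震撼</span><span class="short">节奏有点拖</span></div>'
spans = BeautifulSoup(sample_html, 'html.parser').findAll('span', 'short')
print([s.getText() for s in spans])  # ['画面很震撼', '节奏有点拖']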
def getid(name):
    """Search Douban for the movie name and return its numeric subject id."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    }
    movie_name = name
    params = {
        "q": movie_name
    }
    search_url = "https://www.douban.com/search"
    r = requests.get(search_url, params=params, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    # The first search result's onclick attribute contains the numeric subject id.
    first_movie_info = soup.find('a', {'class': 'nbg'})['onclick']
    pattern = re.compile(r'\d{4,}')  # the first run of 4 or more digits is the id
    sid = str(pattern.search(first_movie_info).group())
    return sid
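getid works because the onclick attribute of the first search result embeds the numeric subject id, and the \d{4,} regex pulls out the first run of four or more digits. A small offline sketch; the onclick string here is an invented example of the general shape, not real Douban data:

# Offline sketch of the id extraction; the onclick value is a made-up example.
sample_onclick = "moreurl(this,{from:'search',subject_id:'1234567'})"
print(re.compile(r'\d{4,}').search(sample_onclick).group())  # 1234567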
def get_data():
    """Crawl the first pages of comments for the movie typed into the input box."""
    with open('电影评论.txt', 'w', encoding='utf-8') as f:
        sid = getid(_input.get())
        for page in range(10):  # Douban requires verification (login) to crawl more than the first few comment pages.
            url = 'https://movie.douban.com/subject/' + sid + '/comments?start=' + str(20 * page) + '&limit=20&sort=new_score&status=P'
            for i in getComment(url):
                f.write(i)
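As the comment in the loop notes, Douban only serves the first pages of comments anonymously and asks for verification beyond that. A common workaround, offered here only as an assumption and not part of the original code, is to send the cookies of a logged-in Douban session with each request, for example by extending the headers dict used in getHtml:

# Sketch only: reuse a logged-in session's cookies so later comment pages are served.
# The cookie value is a placeholder to be copied from your own browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Cookie': 'your_douban_cookie_here',
}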
from tkinter import *
import matplotlib.pyplot as plt

app = Tk()  # main window
_input = Entry()  # input box for the movie name
_input.pack()
app.title("电影评论关键词生成器")
screenwidth = app.winfo_screenwidth()  # screen width
screenheight = app.winfo_screenheight()  # screen height
dialog_width = 400
dialog_height = 170
# The first two parameters are the window size, the last two are the window position.
app.geometry(
    "%dx%d+%d+%d" % (dialog_width, dialog_height, (screenwidth - dialog_width) / 2, (screenheight - dialog_height) / 2))  # center the window on the screen
btn = Button(text='查询', command=get_data, width=10)  # the button calls get_data, which runs the crawler
btn.place(x=155, y=80)  # intended button position (pack() below overrides it)
btn.pack()
app.mainloop()
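To make the centering arithmetic in app.geometry above concrete, assume for example a 1920x1080 screen: the x offset is (1920 - 400) / 2 = 760 and the y offset is (1080 - 170) / 2 = 455, so the geometry string becomes "400x170+760+455":

# Worked example of the geometry string for an assumed 1920x1080 screen.
print("%dx%d+%d+%d" % (400, 170, (1920 - 400) / 2, (1080 - 170) / 2))  # 400x170+760+455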
from wordcloud import WordCloud
import pandas as pd
from imageio import imread
import matplotlib.pyplot as plt  # already imported in the GUI section; repeated so this fragment runs on its own
import jieba
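The code below calls cut_words, which this fragment does not define; it is part of the full listing at the end of the post and is repeated here so the fragment is self-contained:

def cut_words(top_search):
    """Segment each line of text into words with jieba (precise mode)."""
    top_cut = []
    for top in top_search:
        top_cut.extend(list(jieba.cut(top)))
    return top_cut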
with open("电影评论.txt", "r", encoding='UTF-8') as fin1: all_words = cut_words(fin1)#读取文本 #定义停用词 stop = ['的','你','了','将','为','例',' ','多','再','有','是','等','天','次','让','在','我','也','就','这样','啊','和','都','《','》',',','看','!','什么','怎么','这么','很','给','没有','不是','说' ,'不','吗','?','!' ,'?','。' ,'...' ,'电影','主','男','女' ] words_cut = []#定义停用词 for word in all_words: if word not in stop: words_cut.append(word) word_count = pd.Series(words_cut).value_counts() back_ground = imread("F:\\flower.jpg")#自己定义图片位置 wc = WordCloud( font_path="C:\\Windows\\Fonts\\simhei.ttf", #设置字体 background_color="white", #设置词云背景颜色 max_words=400, #词云允许最大词汇数 mask=back_ground, #词云形状 max_font_size=400, #最大字体大小 random_state=90 #配色方案的种数 ) wc1 = wc.fit_words(word_count) #生成词云 plt.figure() plt.imshow(wc1) plt.axis("off") plt.show() wc.to_file("ciyun.png")
The interface looks like this:
Enter the movie name (《八佰》 is used as the example)
Returned result:
Analysis of the result:
The high-frequency words appearing in the Douban comments section for 《八佰》 were successfully rendered as a word cloud and returned.
from tkinter import *
import urllib.request
import requests, re
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import pandas as pd
from imageio import imread
import matplotlib.pyplot as plt
import jieba

def getHtml(url):
    """Fetch the page at url and return its HTML as text."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    req = urllib.request.urlopen(req)
    content = req.read().decode('utf-8')
    return content

def cut_words(top_search):
    """Segment each line of text into words with jieba (precise mode)."""
    top_cut = []
    for top in top_search:
        top_cut.extend(list(jieba.cut(top)))
    return top_cut

def getComment(url):
    """Parse one comments page and return the comment texts on it."""
    html = getHtml(url)
    soupComment = BeautifulSoup(html, 'html.parser')
    comments = soupComment.findAll('span', 'short')  # comment text sits in <span class="short">
    onePageComments = []
    for comment in comments:
        # print(comment.getText()+'\n')
        onePageComments.append(comment.getText() + '\n')
    return onePageComments

def getid(name):
    """Search Douban for the movie name and return its numeric subject id."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    }
    movie_name = name
    params = {
        "q": movie_name
    }
    search_url = "https://www.douban.com/search"
    r = requests.get(search_url, params=params, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    first_movie_info = soup.find('a', {'class': 'nbg'})['onclick']
    pattern = re.compile(r'\d{4,}')
    sid = str(pattern.search(first_movie_info).group())
    return sid

def get_data():
    """Crawl the comments, count the words, and render the word cloud."""
    with open('电影评论.txt', 'w', encoding='utf-8') as f:
        sid = getid(_input.get())
        for page in range(10):  # Douban requires verification (login) to crawl more comment pages.
            url = 'https://movie.douban.com/subject/' + sid + '/comments?start=' + str(20 * page) + '&limit=20&sort=new_score&status=P'
            for i in getComment(url):
                f.write(i)
    with open("电影评论.txt", "r", encoding='UTF-8') as fin1:
        all_words = cut_words(fin1)
    # Stop words to filter out
    stop = ['的','你','了','将','为','例',' ','多','再','有','是','等','天','次','让','在','我','也','就','这样','啊','和','都','《','》',',','看','!','什么','怎么','这么','很','给','没有','不是','说','不','吗','?','!','?','。','...','电影','主','男','女']
    words_cut = []
    for word in all_words:
        if word not in stop:
            words_cut.append(word)
    word_count = pd.Series(words_cut).value_counts()
    back_ground = imread("F:\\flower.jpg")  # set this to your own mask image path
    wc = WordCloud(
        font_path="C:\\Windows\\Fonts\\simhei.ttf",  # font
        background_color="white",  # background color of the word cloud
        max_words=400,  # maximum number of words shown
        mask=back_ground,  # shape of the word cloud
        max_font_size=400,  # maximum font size
        random_state=90  # number of color schemes
    )
    wc1 = wc.fit_words(word_count)  # generate the word cloud
    plt.figure()
    plt.imshow(wc1)
    plt.axis("off")
    plt.show()
    wc.to_file("ciyun.png")
    print('succeed!\n')

app = Tk()
_input = Entry()
# _input.place(x=113, y=80)
_input.pack()
app.title("电影评论关键词生成器")
screenwidth = app.winfo_screenwidth()
screenheight = app.winfo_screenheight()
dialog_width = 400
dialog_height = 170
# The first two parameters are the window size, the last two are the window position.
app.geometry(
    "%dx%d+%d+%d" % (dialog_width, dialog_height, (screenwidth - dialog_width) / 2, (screenheight - dialog_height) / 2))
btn = Button(text='查询', command=get_data, width=10)
btn.place(x=155, y=80)
btn.pack()
app.mainloop()
END
Schoolwork has been heavy, so I have not updated in a while; I may post updates during the winter break. Let's keep at it together.