赞
踩
转载自: CDA数据分析师
https://y.qq.com/n/yqq/album/0009C3rp3Kfwg0.html#comment_box
评论区的内容是被封装在json中
复制此条json数据,放到在线json解析中
分析数据结构
comment_list = json_data['comment']['commentlist']
# 昵称
nick_name = [i.get('nick') for i in comment_list]
# 评论内容
content = [i.get('rootcommentcontent') for i in comment_list]
# 评论时间
comment_time = [i.get('time') for i in comment_list]
# 点赞数
praise_num = [i.get('praisenum') for i in comment_list]
整个代码
# 导入包 import pandas as pd import time import requests import json from fake_useragent import UserAgent def get_qq_comment(page_num): # 存储数据 df_all = pd.DataFrame() for i in range(page_num): # 打印进度 print('我正在获取第{}页的信息'.format(i)) # 获取URL url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=GB2312¬ice=0&platform=yqq.json&needNewCode=0&cid=205360772&reqtype=2&biztype=2&topid=12924001&cmd=8&needmusiccrit=0&pagenum={}&pagesize=25'.format(i) # 添加headers headers = { 'user-agent': UserAgent().random } # 发起请求 try: r = requests.get(url, headers=headers) except Exception as e: print(e) continue # 解析网页 json_data = json.loads(r.text) # 获取数据 comment_list = json_data['comment']['commentlist'] # 昵称 nick_name = [i.get('nick') for i in comment_list] # 评论内容 content = [i.get('rootcommentcontent') for i in comment_list] # 评论时间 comment_time = [i.get('time') for i in comment_list] # 点赞数 praise_num = [i.get('praisenum') for i in comment_list] # 存储数据 df = pd.DataFrame({ 'nick_name': nick_name, 'content': content, 'comment_time': comment_time, 'praise_num': praise_num }) # 追加数据 df_all = df_all.append(df, ignore_index=True) # 休眠一秒 time.sleep(1) return df_all # 运行函数 df = get_qq_comment(page_num=5) df.to_excel('C:\\Users\\Administrator\\Desktop\\mojito.xlsx',index = False)
只要5页内容,试一下就行了
读入数据
df = pd.read_excel('C:\\Users\\Administrator\\Desktop\\mojito.xlsx')
查看重复值和空值
print(df.duplicated().sum()) # 0
print(df.isnull().sum())
'''
nick_name 0
content 0
comment_time 0
praise_num 0
dtype: int64
'''
df.info
时间格式转换
df['comment_time'] # 原格式 ''' 0 1592708649 1 1592708347 2 1592708274 3 1592708154 4 1592708011 120 1592636465 121 1592636412 122 1592636120 123 1592636047 124 1592636017 Name: comment_time, Length: 125, dtype: int64 '''
def transform_time(time_second): time_array = time.localtime(time_second) otherStyleTime = time.strftime('%Y-%m-%d %H:%M:%S', time_array) return otherStyleTime # 时间数据处理 df['comment_time'] = df['comment_time'].apply(lambda x: transform_time(x)) ''' 0 2020-06-21 11:04:09 1 2020-06-21 10:59:07 2 2020-06-21 10:57:54 3 2020-06-21 10:55:54 4 2020-06-21 10:53:31 120 2020-06-20 15:01:05 121 2020-06-20 15:00:12 122 2020-06-20 14:55:20 123 2020-06-20 14:54:07 124 2020-06-20 14:53:37 Name: comment_time, Length: 125, dtype: object '''
content 评论内容初步处理(正则—表达式)
import re
pattern = re.compile(r'\[em\](.*?)\[/em\]')
df['content'] = df.content.str.replace(pattern, '')
df.head()
按时间排序
comment_num = df.comment_time.str.split(':').str[0].value_counts().sort_index() ''' 索引 年 - 月- 日 小时 这个时点评论人数 2020-06-20 14 3 2020-06-20 15 15 2020-06-20 16 6 2020-06-20 17 15 2020-06-20 18 11 2020-06-20 19 12 2020-06-20 20 4 2020-06-20 21 10 2020-06-20 22 5 2020-06-20 23 10 2020-06-21 00 9 2020-06-21 01 1 2020-06-21 02 3 2020-06-21 05 1 2020-06-21 06 1 2020-06-21 07 1 2020-06-21 08 3 2020-06-21 09 7 2020-06-21 10 7 2020-06-21 11 1 Name: comment_time, dtype: int64 '''
产生时间序列数据
# 去掉年份
x_line1 = [i.replace('2020-','') for i in comment_num.index.to_list()]
'''
['06-20 14',
'06-20 15',
'06-20 16',
'06-20 17',
'06-20 18',
...
'''
# 这个时点评论人数
y_line1 = comment_num.values.tolist()
# [3, 15, 6, 15, 11, 12, 4, 10, 5, 10, 9, 1, 3, 1, 1, 1, 3, 7, 7, 1]
导入所需包
import jieba
import stylecloud
from pyecharts.charts import Pie, Bar, Map, Line, WordCloud, Page
from pyecharts import options as opts
from pyecharts.globals import SymbolType
Mojito评论人数走势图
c = ( Line() #初始化 .add_xaxis(x_line1) # X轴 # Y轴 .add_yaxis('', # 系列名称 y_line1,# 系列数据 # 标记点配置项 markpoint_opts=opts.MarkPointOpts(data=[ opts.MarkPointItem(type_='max', name='最大值'),# 标记最大值 opts.MarkPointItem(type_='min', name='最小值') # 标记最小值 ])) # 全局配置项 .set_global_opts( # 标题设置 title_opts=opts.TitleOpts('Mojito评论人数走势图'), # 轴标签设置 xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate='30')), # 标签旋转 ) # 系统配置项 .set_series_opts( # 标签配置项 label_opts=opts.LabelOpts(is_show=False), # 不显示标签 # 线配置项 linestyle_opts=opts.LineStyleOpts(width=3)) # 线宽3 .render("line_base.html") )
QQ音乐评论词云图
def get_cut_words(content_series): # 读入停用词表 stop_words = [] with open(r"C:\\Users\\Administrator\\Desktop\\stop_words.txt", 'r') as f: lines = f.readlines() for line in lines: stop_words.append(line.strip()) # 添加关键词 my_words = ['周杰伦', '一首歌', '好好听', '方文山', '30多岁'] for i in my_words: jieba.add_word(i) # 自定义停用词 my_stop_words = ['歌有', '真的', '这首', '一首', '一点', '反正', '一段', '一句', '首歌', '啊啊啊', '哈哈哈', '转发', '微博', '那段', '他会' ] stop_words.extend(my_stop_words) # 分词 content=';'.join([ str(c) for c in df['content'].tolist()]) word_num = jieba.lcut(content) # 条件筛选 word_num_selected = [i for i in word_num if i not in stop_words and len(i)>=2] return word_num_selected text1 = get_cut_words(content_series=df.content) stylecloud.gen_stylecloud(text=' '.join(text1), max_words=1000, collocations=False, font_path='C:\\Windows\\Fonts\\simhei.ttf', icon_name='fas fa-music', size=624, gradient='vertical' , palette='cartocolors.diverging.TealRose_2', output_name='C:\\Users\\Administrator\\Desktop\\QQ音乐评论词云图.png')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。