赞
踩
csv表格截图:
2D条形图以及3D条形图:
一个可抓取异步加载页面的浏览器自动化库:
pip install selenium (下载对应版本可用selenium == <版本号>)
两个可视化库(任选其一即可):
pip install matplotlib(下载对应版本同以上方法)
pip install pyecharts (下载对应版本同以上方法)
由于b站api参数更改了,未能够找到合适的方式进行分页爬取,故采用以下两种方式
打开开发者工具,找到图中红圈的位置
打开后查看其中的json信息找到replies,如下图所示:
每一则url会包含20条评论信息
其中包含用户的评论内容(content)评论地址(ip)用户名(uname)等信息
刷新找到多条含有url地址的保存进列表中,如以下所示
# Manually collected comment-API URLs; each page returns ~20 replies.
# NOTE(review): w_rid/wts are signed, time-limited parameters — these
# sample URLs expire and must be re-captured before reuse.
ulist = [
    "https://api.bilibili.com/x/v2/reply/wbi/main?oid=320392432&type=1&mode=3&pagination_str=%7B%2"
    "2offset%22:%22%7B%5C%22type%5C%22:1,%5C%22direction%5C%22:1,%5C%22session_id%5C%22:%5C%221734639"
    "697397073%5C%22,%5C%22data%5C%22:%7B%7D%7D%22%7D&plat=1&web_location=1315875&w_rid=0b96518e2f520"
    "2e2b4036fb3d596d4ff&wts=1693984070",
    "https://api.bilibili.com/x/v2/reply/wbi/main?oid=320392432&type=1&mode=3&pagination_str=%7B%22offset"
    "%22:%22%7B%5C%22type%5C%22:1,%5C%22direction%5C%22:1,%5C%22session_id%5C%22:%5C%221734639697397073%5C%2"
    "2,%5C%22data%5C%22:%7B%7D%7D%22%7D&plat=1&web_location=1315875&w_rid=27358d1f64a9e52b91756210beee635d&w"
    "ts=1693984100",
]
'运行
此种方式缺点是需要手动去寻找对应的url地址
首先需要获取b站的cookies文件。
首先是需要读取cookies文件
import json

# Load the exported cookies: one JSON-serialized cookie dict per line.
# encoding specified explicitly so non-ASCII cookie values parse correctly.
ListCookies = []
with open('ACookies.txt', 'r', encoding='utf-8') as fw:
    for line in fw:
        ListCookies.append(json.loads(line.strip()))
将获取的cookies文件请求一次b站
# Start a headless Chrome instance.
options = Options()
for flag in ('--headless', '--disable-gpu'):
    options.add_argument(flag)
driver = webdriver.Chrome(options=options)

# Visit the site root first so the cookie domain matches before add_cookie.
driver.get('https://www.bilibili.com/')
print("正在解析对应的评论网址~~~")

# Replay the saved cookies to restore the logged-in session.
for cookie in ListCookies:
    driver.add_cookie(cookie)

driver.get('https://www.bilibili.com/video/BV1fF411k7Vf/?spm_id_from=33'
           '3.1007.tianma.4-2-12.click&vd_source=1f033f5a233d6a47a02edcf7b98db3e8')
# Fixed wait so the page and its XHR comment requests finish loading.
time.sleep(20)
接下来只需要三个函数,
正则表达式匹配目标url:
def target_url():
    """Scan the browser's captured requests and collect comment-API URLs.

    Appends every request URL matching the reply ``/main?`` endpoint to the
    module-level ``UrlList``.  NOTE(review): ``driver.requests`` only exists
    with selenium-wire, not plain selenium — confirm the dependency.
    """
    url_pattern = r"https?://[^\s/$.?#].[^\s]*\/main\?[^\s]*"
    for request in driver.requests:
        if re.search(url_pattern, request.url):
            UrlList.append(request.url)
            print("Match the correct URL!")
            print(request.url)
            # A captured request may not have received a response yet;
            # guard instead of crashing on request.response being None.
            if request.response is not None:
                print(request.response.status_code)
'运行
时间戳转化
def trans_date(v_timestamp):
    """Convert a 10-digit Unix timestamp to a 'YYYY-MM-DD HH:MM:SS' string."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(v_timestamp))
'运行
提取评论的json数据保存为csv格式
def write_to_csv():
    """Fetch every URL in ``UrlList`` and dump the parsed replies to comments.csv.

    Each page yields up to 20 replies; for each reply we record username,
    sex, signature, IP location, comment time, like count and message text.
    Requires ``cookies`` to be set, otherwise the API omits IP-location data.
    """
    print(UrlList)
    comments_data = [
        ['uname', 'sex', 'sign', 'ip', 'time', 'like', 'content']
    ]
    for comment in UrlList:
        comment_response = requests.get(comment, cookies=cookies)
        if comment_response.status_code != 200:
            print("wrong")
            continue
        json1 = comment_response.json()
        # The API returns null instead of [] once pages run out — guard it.
        replies = json1['data']['replies'] or []
        for reply in replies:
            uname = reply['member']['uname']
            sex = reply['member']['sex']
            sign = reply['member']['sign']
            # 'location' looks like "IP属地:广东"; keep only the region.
            # reply_control / location can be absent on some replies.
            original_ip = reply.get('reply_control', {}).get('location', '')
            ip = original_ip.replace("IP属地:", '')
            time1 = trans_date(reply['ctime'])
            like = reply['like']
            content = reply['content']['message']
            comments_data.append([uname, sex, sign, ip, time1, like, content])

    # The first row already holds the header, so write with header=False.
    csv_file_path = "comments.csv"
    df = pd.DataFrame(comments_data)
    df.to_csv(csv_file_path, index=False, header=False, encoding="utf-8")

    print(f"Comments have been written to {csv_file_path}")
'运行
注意此处在使用get请求时,需要传入cookies参数。否则不能得到有含有ip地址的json文件
此种方式缺点是不能较快地获取url地址
2D条形图:
读取csv文件
# Load the scraped comments into a DataFrame.
df = pd.read_csv('comments.csv')

# Total the like counts for every (ip, gender) pair.
grouped = df.groupby(['ip', '性别'])['点赞数'].sum()
df = grouped.reset_index()
由于数据中的一个ip地址需要有三个性别才能进行堆叠。所以需要用pandas文件进行数据处理。筛选出IP只对应一个性别的数据,为其添加另外两个性别,并重新组合列表。最后使用pyecharts来实现每一个性别数据的堆叠。代码实现如下
# Count how many distinct genders each IP currently has.
gender_counts = df.groupby('ip')['性别'].nunique().reset_index()
print(gender_counts)

# Every IP needs rows for all three genders before the bars can stack.
# Collect the zero-like filler rows first and concat ONCE — calling
# pd.concat inside the loop copies the frame each iteration (quadratic).
filler_rows = []
for _, row in gender_counts[gender_counts['性别'] < 3].iterrows():
    ip = row['ip']
    filler_rows.extend(
        {"ip": ip, "性别": g, "点赞数": 0} for g in ("男", "女", "保密")
    )
if filler_rows:
    df = pd.concat([df, pd.DataFrame(filler_rows)])

# Re-aggregate so duplicate (ip, gender) rows collapse; fillers add 0.
df = df.groupby(['ip', '性别'])['点赞数'].sum().reset_index()

# Hand the chart code a list of plain record dicts.
result = df.to_dict(orient='records')
print(result)

bar = Bar()
# Distinct IPs (x axis) and genders (one stacked series each).
cities = pd.Series([i['ip'] for i in result]).drop_duplicates().tolist()
genders = pd.Series([i['性别'] for i in result]).drop_duplicates().tolist()
print(cities)
print(genders)

# Set the x axis once — calling add_xaxis per gender just re-assigns it.
bar.add_xaxis(cities)
for gender in genders:
    likes = [item['点赞数'] for item in result if item['性别'] == gender]
    bar.add_yaxis(gender, likes, stack="stack")

bar.set_global_opts(
    title_opts=opts.TitleOpts(title="各城市各性别点赞数堆叠条形图"),
    xaxis_opts=opts.AxisOpts(type_="category"),
    yaxis_opts=opts.AxisOpts(type_="value"),
)

# Render the chart to an HTML file.
bar.render("stacked_bar_chart.html")
3D直接用pyecharts就能够直接转化为3D条形图,代码如下
# pyecharts renders the aggregated frame directly as a 3D bar chart.
bar3d = Bar3D()
bar3d.add(
    '',
    df[['ip', '性别', '点赞数']].values.tolist(),
    xaxis3d_opts=opts.Axis3DOpts(df['ip'].unique().tolist(), type_="category"),
    yaxis3d_opts=opts.Axis3DOpts(df['性别'].unique().tolist(), type_="category"),
    zaxis3d_opts=opts.Axis3DOpts(type_="value"),
)
bar3d.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(max_=df['点赞数'].max()),
    title_opts=opts.TitleOpts(title="IP对应的性别和点赞的3D条形图"),
)

# Render the chart.
bar3d.render("bar3d_chart.html")
解析评论并保存为csv部分
- import requests
- import time
- import pandas as pd
# Comment-page URLs to request (filled in by hand or by target_url):
UrlList: list = [

]

# Cookies required when requesting the comment URLs — without them the
# API response lacks the IP-location field:
cookies: dict = {

}
- # 转化评论时间
def trans_date(v_timestamp):
    """Convert a 10-digit Unix timestamp to a 'YYYY-MM-DD HH:MM:SS' string."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(v_timestamp))
-
- # 将文件写入csv中
def write_to_csv():
    """Fetch each URL in ``UrlList`` and append its replies to comments.csv.

    Columns (Chinese headers, matching the visualization step): username,
    sex, signature, ip, comment time, likes, content.  Sleeps 5s between
    requests to throttle the crawl.
    """
    comments_data = [
        ['用户名', '性别', '个性签名', 'ip', '评论时间', '点赞数', '内容']
    ]
    for comment in UrlList:
        comment_response = requests.get(comment, cookies=cookies)
        if comment_response.status_code == 200:
            # Renamed from `json` so the name no longer shadows the stdlib module.
            payload = comment_response.json()
            # The API returns null instead of [] when a page is empty — guard it.
            replies = payload['data']['replies'] or []
            for reply in replies:
                uname = reply['member']['uname']
                sex = reply['member']['sex']
                sign = reply['member']['sign']
                # 'location' is "IP属地:<region>"; strip the prefix.
                # reply_control / location can be absent on some replies.
                original_ip = reply.get('reply_control', {}).get('location', '')
                ip = original_ip.replace("IP属地:", '')
                time1 = trans_date(reply['ctime'])
                like = reply['like']
                content = reply['content']['message']
                comments_data.append([uname, sex, sign, ip, time1, like, content])
        else:
            print("wrong")
        # Throttle between page requests.
        time.sleep(5)

    # The first row already contains the header, so write with header=False.
    csv_file_path = "comments.csv"
    df = pd.DataFrame(comments_data)
    df.to_csv(csv_file_path, index=False, header=False, encoding="utf-8")

    print(f"Comments have been written to {csv_file_path}")
def _initial_request():
    """Issue the first comment-page request and report its status.

    NOTE(review): the purpose of this warm-up call is not documented —
    presumably it validates the cookies before the crawl; confirm.
    """
    res = requests.get("https://api.bilibili.com/x/v2/reply/wbi/main?oid=320392432&type=1&mode=3&pagination_str=%7B%2"
                       "2offset%22:%22%7B%5C%22type%5C%22:1,%5C%22direction%5C%22:1,%5C%22session_id%5C%22:%5C%221734639"
                       "697397073%5C%22,%5C%22data%5C%22:%7B%7D%7D%22%7D&plat=1&web_location=1315875&w_rid=0b96518e2f520"
                       "2e2b4036fb3d596d4ff&wts=1693984070", cookies=cookies)
    print(res.status_code)
    print("初次请求已完成,正在写入csv数据~~~")


if __name__ == '__main__':
    # Run the warm-up request only when executed as a script — previously it
    # fired at module level, i.e. even on import.
    _initial_request()
    print(UrlList)
    write_to_csv()
将csv数据处理并可视化部分
- import pandas as pd
- from pyecharts.charts import Bar
- from pyecharts import options as opts
- from pyecharts.charts import Bar3D
-
# Load the scraped comments into a DataFrame.
df = pd.read_csv('comments.csv')

# Total the like counts for every (ip, gender) pair.
grouped = df.groupby(['ip', '性别'])['点赞数'].sum()
df = grouped.reset_index()
-
-
# Count how many distinct genders each IP currently has.
gender_counts = df.groupby('ip')['性别'].nunique().reset_index()
print(gender_counts)

# IPs with fewer than 3 genders get zero-like filler rows for all three.
# Collect the rows first and concat ONCE — pd.concat inside the loop
# copies the whole frame every iteration (quadratic).
filler_rows = []
for _, row in gender_counts[gender_counts['性别'] < 3].iterrows():
    ip = row['ip']
    filler_rows.extend(
        {"ip": ip, "性别": g, "点赞数": 0} for g in ("男", "女", "保密")
    )
if filler_rows:
    df = pd.concat([df, pd.DataFrame(filler_rows)])

# Re-aggregate; fillers contribute 0 so existing totals are unchanged.
df = df.groupby(['ip', '性别'])['点赞数'].sum().reset_index()

# Convert to a list of record dicts for the chart code.
result = df.to_dict(orient='records')
print(result)
-
bar = Bar()
# Distinct IPs (x axis) and genders (one stacked series each).
cities = pd.Series([i['ip'] for i in result]).drop_duplicates().tolist()
genders = pd.Series([i['性别'] for i in result]).drop_duplicates().tolist()
print(cities)
print(genders)

# Set the x axis once — calling add_xaxis inside the loop (as before)
# just re-assigned the same city list for every gender.
bar.add_xaxis(cities)
for gender in genders:
    likes = [item['点赞数'] for item in result if item['性别'] == gender]
    bar.add_yaxis(gender, likes, stack="stack")

# Global chart options.
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="各城市各性别点赞数堆叠条形图"),
    xaxis_opts=opts.AxisOpts(type_="category"),
    yaxis_opts=opts.AxisOpts(type_="value"),
)

# Render the chart to an HTML file.
bar.render("stacked_bar_chart.html")
-
-
-
-
# Visualize the same data as a 3D bar chart.
bar3d = Bar3D()
bar3d.add(
    '',
    df[['ip', '性别', '点赞数']].values.tolist(),
    xaxis3d_opts=opts.Axis3DOpts(df['ip'].unique().tolist(), type_="category"),
    yaxis3d_opts=opts.Axis3DOpts(df['性别'].unique().tolist(), type_="category"),
    zaxis3d_opts=opts.Axis3DOpts(type_="value"),
)
bar3d.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(max_=df['点赞数'].max()),
    title_opts=opts.TitleOpts(title="IP对应的性别和点赞的3D条形图"),
)

# Render the chart.
bar3d.render("bar3d_chart.html")
如有错误或者可改进之处,欢迎各位指出!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。