当前位置:   article > 正文

python实验四_20191318实验四 《Python程序设计》实验报告

哔哩哔哩直播间信息抓取练习 实验的实验报告

20191318 《Python程序设计》实验四报告

课程:《Python程序设计》

班级: 1913

姓名: 王泽文

学号:20191318

实验教师:王志强

实验日期:2020年6月10日

必修/选修: 公选课

1. 实验内容

Python综合应用:爬虫、数据处理、可视化、机器学习、神经网络、游戏、网络安全等。

我选择了一个爬虫,爬取bilibili弹幕网站单个视频和up主的一些信息。

2. 实验过程及结果

在这次的编程中,需要用到 re、requests、json、tkinter、xlwt、xmltodict、time 这几个 Python 库。

首先需要对B站的网页进行分析,在网上进行查询后我得知可以通过一些api的接口获得B站的数据。于是我便使用了一些api网址来简化我的网页分析。

BilibiliAPI合集

对于一些api中没有的信息,爬取便通过正则表达式获得。

爬虫分两个部分,一是对视频的爬取,二是对up主的爬取。

o_200610120858%E5%AE%9E%E9%AA%8C%E8%AF%BE41.png

o_200610120911%E5%AE%9E%E9%AA%8C%E8%AF%BE42.png

运行效果如图

以up主 崩坏三 (UUID:256667467)为例:

o_200610122301%E5%AE%9E%E9%AA%8C%E8%AF%BE43.png

o_200610122408%E5%AE%9E%E9%AA%8C%E8%AF%BE47.png

以视频BV号:BV1Yz411q7iT为例:

o_200610122315%E5%AE%9E%E9%AA%8C%E8%AF%BE44.png

o_200610122335%E5%AE%9E%E9%AA%8C%E8%AF%BE45.png

o_200610122354%E5%AE%9E%E9%AA%8C%E8%AF%BE46.png

代码如下

import tkinter as tk

from tkinter import ttk

import tkinter.scrolledtext as tks

import re

import requests

import json

import tkinter.messagebox

import xlwt

import xmltodict

import time

# Start-up banner printed to the console when the tool launches.
# A raw string is required here: in a normal string a backslash at the end
# of a line acts as a line continuation and swallows the newline, and
# sequences such as "\ ", "\_", "\|" and "\/" are invalid escapes.
# The art itself is kept exactly as found in the source.
a = r'''

//

\ \ //

\ \ //

##DDDDDDDDDDDDDDDDDDDDDD##

## DDDDDDDDDDDDDDDDDDDD ## ________ ___ ___ ___ ________ ___ ___ ___

## hh hh ## |\ __ \ |\ \ |\ \ |\ \ |\ __ \ |\ \ |\ \ |\ \

## hh // \ \ hh ## \ \ \|\ /_\ \ \ \ \ \ \ \ \ \ \ \|\ /_ \ \ \ \ \ \ \ \ \

## hh // \ \ hh ## \ \ __ \ \ \ \ \ \ \ \ \ \ \ \ __ \ \ \ \ \ \ \ \ \ \

## hh hh ## \ \ \|\ \ \ \ \ \ \ \____ \ \ \ \ \ \|\ \ \ \ \ \ \ \____ \ \ \

## hh wwww hh ## \ \______\ \ \__\ \ \_____\ \ \__\ \ \_______\ \ \_\ \ \______\ \ \__\

## hh hh ## \|_______| \|__| \|_______| \|__| \|_______| \|__| \|_______| \|__|

## MMMMMMMMMMMMMMMMMMMM ##

##MMMMMMMMMMMMMMMMMMMMMM## Bilibili Tool 1.0.1. Designed by wzwWhitecat.

\/ \/

'''

# Convert a BV id to an AV id.
def bv_to_av(bv):
    """Convert a Bilibili BV id (e.g. ``"BV17x411w7KC"``) to its numeric AV id.

    The first two characters (the prefix) are dropped. Each of the remaining
    ten characters is mapped to a digit value through the lookup table below,
    multiplied by a position-specific power of 58 and summed; the sum is then
    offset by a constant and XOR-ed with a mask.

    Args:
        bv: The BV id as a string, including its two-character prefix.
            Surrounding whitespace is ignored.

    Returns:
        The AV id as an ``int``.

    Raises:
        ValueError: If the id does not have exactly ten characters after the
            prefix, or contains a character that is not in the alphabet.
    """
    alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
    values = [13, 12, 46, 31, 43, 18, 40, 28, 5, 54, 20, 15, 8, 39, 57, 45, 36, 38, 51, 42, 49, 52, 53, 7, 4, 9, 50, 10,
              44, 34, 6, 25, 1, 26, 29, 56, 3, 24, 0, 47, 27, 22, 41, 16, 11, 37, 2, 35, 21, 17, 33, 30, 48, 23, 55, 32,
              14, 19]
    digit_value = dict(zip(alphabet, values))
    # Power of 58 applied to each of the ten characters, by position.
    exponents = (6, 2, 4, 8, 5, 9, 3, 7, 1, 0)
    offset = 100618342136696320
    xor_mask = 177451812

    payload = str(bv).strip()[2:]
    if len(payload) != len(exponents):
        raise ValueError(
            "BV id must have %d characters after its prefix: %r" % (len(exponents), bv)
        )

    total = 0
    for char, exponent in zip(payload, exponents):
        # An unmapped character must be rejected explicitly; multiplying the
        # raw character by 58**k would build an enormous string instead.
        if char not in digit_value:
            raise ValueError("invalid character %r in BV id %r" % (char, bv))
        total += digit_value[char] * 58 ** exponent

    return (total - offset) ^ xor_mask

# Handler for the video "search" button.
def searchav(var):
    """Read the id typed into ``entry3`` and refresh all three video panes.

    The panes ``t31``/``t32``/``t33`` are cleared first. The resolved AV id
    is stored in the module-level ``aid`` and passed to ``main_message``,
    ``find_danmu`` and ``find_pinglun``.

    Args:
        var: Tk variable whose ``get()`` returns 1 when the entry holds an AV
            number and 2 when it holds a BV id.
    """
    global aid
    for pane in (t31, t32, t33):
        pane.delete('1.0', 'end')

    typed = entry3.get().strip()
    if not typed:
        # Nothing to look up; avoid firing requests with an empty id.
        t31.insert('end', "Error!\n")
        return

    mode = var.get()
    if mode == 1:
        # Accept both "170001" and "av170001".
        aid = typed[2:] if typed.lower().startswith("av") else typed
    elif mode == 2:
        try:
            aid = bv_to_av(typed)
        except (ValueError, IndexError, TypeError):
            # Malformed BV id: report it instead of crashing the Tk callback.
            t31.insert('end', "Error!\n")
            return
    else:
        # Unknown mode: do not fall through with a stale or undefined aid.
        return

    main_message(aid)
    find_danmu(aid)
    find_pinglun(aid)

# Fetch and display the main information of a video.
def main_message(aid):
    """Fetch title, uploader, statistics and description of a video into ``t31``.

    Three endpoints are queried: the archive statistics API, the archive
    description API and the public video page (for title and uploader). If
    the video page yields nothing, the search page is tried for a title.

    Side effects: writes the module-level ``barrages`` (statistics JSON),
    ``barrages_2`` (description JSON) and ``response3`` (video page HTML),
    and appends text to ``t31``.

    Args:
        aid: AV id of the video (anything ``str()`` can format).
    """
    global response3, barrages, barrages_2
    headers = {"user-agent": "Mozilla/5.0"}
    timeout = 10  # seconds; without it a stalled request freezes the GUI
    url = 'http://api.bilibili.com/archive_stat/stat?aid=' + str(aid) + '&type=jsonp'
    url2 = 'https://api.bilibili.com/x/web-interface/archive/desc?&aid=' + str(aid)
    url3 = 'https://www.bilibili.com/video/av' + str(aid)

    # NOTE(review): the two original patterns were lost (their HTML tags were
    # stripped, leaving an unterminated string and an empty pattern). These
    # are reconstructions that capture the page <title> and the "author"
    # <meta> tag -- confirm against the live page markup.
    title_pattern = r'<title[^>]*>(.*?)</title>'
    author_pattern = r'<meta[^>]*name="author"[^>]*content="(.*?)"'

    try:
        response1 = requests.get(url, headers=headers, timeout=timeout).text
        response2 = requests.get(url2, headers=headers, timeout=timeout).text
        barrages = json.loads(response1)
        barrages_2 = json.loads(response2)
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON reply: nothing sensible to display.
        t31.insert('end', "Error!\n")
        return

    try:
        response3 = requests.get(url3, headers=headers, timeout=timeout).text
        text1 = re.findall(title_pattern, response3)
        text2 = re.findall(author_pattern, response3)
        t31.insert("end", " 标题:" + text1[0] + "\n" + " up:" + text2[0] + "\n")
    except (requests.RequestException, IndexError):
        # Fallback: take the first title attribute found on the search page.
        try:
            url4 = 'https://search.bilibili.com/all?keyword=' + str(aid)
            response4 = requests.get(url4, headers=headers, timeout=timeout).text
            text3 = re.findall(r'title="(.*?)"', response4)
            t31.insert("end", " 标题:" + text3[0] + "\n")
        except (requests.RequestException, IndexError):
            t31.insert('end', "Error!\n")

    try:
        stat = barrages['data']
        t31.insert(
            "end",
            f" av号:av{aid}\n"
            f" 观看:{stat['view']} 弹幕:{stat['danmaku']} 评论:{stat['reply']}"
            f" 点赞:{stat['like']} 投币:{stat['coin']} 收藏:{stat['favorite']}"
            f" 分享:{stat['share']}\n",
        )
    except (KeyError, TypeError):
        # The API answered without a usable "data" object.
        t31.insert('end', "Error!\n")

    try:
        description = barrages_2['data']
    except (KeyError, TypeError):
        description = None
    t31.insert("end", " 视频简介:" + str(description if description is not None else "") + "\n")

# Fetch the danmaku (bullet comments) of a video.
def find_danmu(aid):
    """Collect the danmaku of every part of a video and list them in ``t32``.

    The page-list API supplies one ``cid`` per part; the danmaku XML of each
    ``cid`` is parsed with ``xmltodict``. Each text is appended (with a
    trailing comma) to the module-level ``barrages_cs``, which is cleared
    first, and then printed with a running number and a total count.

    Side effect: the module-level ``barrages`` is overwritten with the list
    of ``<d>`` elements of the most recently parsed part.

    Args:
        aid: AV id of the video.
    """
    global barrages
    barrages_cs.clear()
    headers = {"user-agent": "Mozilla/5.0"}
    timeout = 10  # seconds; without it a stalled request freezes the GUI

    url = f"https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp"
    try:
        response = requests.get(url, headers=headers, timeout=timeout).text
        cid_dict_list = json.loads(response)["data"] or []
    except requests.RequestException as e:
        t32.insert('end', e)
        cid_dict_list = []
    except (ValueError, KeyError, TypeError):
        t32.insert('end', "Unknown Error!")
        cid_dict_list = []

    for part in cid_dict_list:
        cid = part["cid"]
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        print(url)
        try:
            barrages_xml = requests.get(url, headers=headers, timeout=timeout).content.decode("utf-8")
            barrages = xmltodict.parse(barrages_xml).get("i").get("d")
        except requests.RequestException as e:
            # Skip this part; otherwise stale data from an earlier request
            # would be iterated below.
            t32.insert('end', e)
            continue
        except Exception:
            t32.insert('end', "Unknown Error!")
            continue

        # xmltodict yields None for no <d>, a single mapping for exactly one
        # <d>, and a list otherwise -- normalise to a list.
        if barrages is None:
            barrages = []
        elif not isinstance(barrages, list):
            barrages = [barrages]

        for barrage in barrages:
            if isinstance(barrage, dict) and "#text" in barrage:
                barrages_cs.append(str(barrage["#text"]) + ",")

    for number, text in enumerate(barrages_cs, start=1):
        t32.insert('end', str(number) + "." + text + "\n")
    t32.insert('end', "总共" + str(len(barrages_cs)) + "条\n")

# Fetch the comments of a video.
def find_pinglun(aid):
    """Collect up to 20 pages of comments of a video and list them in ``t33``.

    The module-level ``commentlist`` is cleared and refilled: row 0 is the
    header, every further row is one comment (uid, name, sex, time, text,
    likes, reply count, signature, level). ``save_data(2)`` later writes
    these rows to a spreadsheet.

    Side effect: the module-level ``barrages`` is overwritten with the JSON
    of the last page fetched.

    Args:
        aid: AV id of the video.
    """
    global barrages
    commentlist.clear()
    # The header carries one label per column of the rows below (the last
    # two labels were missing although the rows had nine fields).
    commentlist.append(["UUID", "名字", "性别", "时间", "评论", "点赞数", "回复数", "签名", "等级"])

    headers = {"user-agent": "Mozilla/5.0"}
    for n in range(20):
        page = str(n + 1)
        # "&pn=" (not "&;pn="): with the stray semicolon the page parameter
        # is not named "pn" any more.
        url = f"http://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn={page}&type=1&oid={aid}"
        try:
            response = requests.get(url, headers=headers, timeout=10).text
            barrages = json.loads(response)
            replies = barrages['data']['replies']
        except requests.RequestException as e:
            t33.insert('end', e)
            continue
        except (ValueError, KeyError, TypeError):
            t33.insert('end', "第" + page + "页" + "Unknown Error!\n")
            continue

        if not replies:
            # No replies on this page: stop instead of requesting the rest.
            break

        for comment in replies:
            try:
                member = comment['member']
                row = [
                    member['mid'],
                    member['uname'],
                    member['sex'],
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(comment['ctime'])),
                    comment['content']['message'],
                    comment['like'],
                    comment['rcount'],
                    member['sign'],
                    member['level_info']['current_level'],
                ]
            except (KeyError, TypeError):
                t33.insert('end', "第" + page + "页" + "Unknown Error!\n")
                continue
            commentlist.append(row)

    # Skip the header row when printing.
    for row in commentlist[1:]:
        t33.insert('end',
                   str(row[3]) + "\n客户端:" + str(row[1]) + "\n评论:" + str(row[4]) + "\n")

# Handler for the uploader "search" button.
def searchup():
    """Clear both uploader panes, then look up the uid typed into ``entry4``.

    The uid string is handed unchanged to ``up_message`` and then to
    ``find_up_video``.
    """
    for pane in (t41, t43):
        pane.delete('1.0', 'end')
    wanted_uid = entry4.get()
    up_message(wanted_uid)
    find_up_video(wanted_uid)

# Fetch and display the profile information of an uploader.
def up_message(uid):
    """Fetch the name, counters and introduction of an uploader into ``t41``.

    Three endpoints are queried: the relation statistics API (following and
    follower counts), the upstat API (total views and likes) and the public
    space page (name and introduction, extracted with regular expressions).

    Args:
        uid: User id of the uploader.
    """
    url = f'https://api.bilibili.com/x/relation/stat?vmid={uid}&jsonp=jsonp'
    url2 = f'https://api.bilibili.com/x/space/upstat?mid={uid}'
    url3 = f'https://space.bilibili.com/{uid}'

    # NOTE(review): both original patterns were damaged (their HTML tags were
    # stripped, leaving an unterminated string and an empty pattern). The
    # leading "<title>" and the whole description pattern are
    # reconstructions -- confirm against the live page markup.
    text0 = r'<title>(.*?)的个人空间 - 哔哩哔哩 [(] ゜- ゜[)]つロ 乾杯~ Bilibili'
    text1 = r'<meta name="description" content="(.*?)"'

    headers = {"user-agent": "Mozilla/5.0"}
    timeout = 10  # seconds; without it a stalled request freezes the GUI
    try:
        response = requests.get(url, headers=headers, timeout=timeout).text
        response2 = requests.get(url2, headers=headers, timeout=timeout).text
        response3 = requests.get(url3, headers=headers, timeout=timeout).text
        up_message_barrages1 = json.loads(response).get("data")
        up_message_barrages2 = json.loads(response2)
        up_uid = str(up_message_barrages1["mid"])
        up_following = str(up_message_barrages1["following"])
        up_watchv = str(up_message_barrages2['data']['archive']['view'])
        up_watcha = str(up_message_barrages2["data"]["article"]["view"])
        up_likes = str(up_message_barrages2["data"]["likes"])
        up_follower = str(up_message_barrages1["follower"])
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
        # Network failure, non-JSON reply or a reply without the expected
        # fields: report it instead of crashing the Tk callback.
        t41.insert("end", "Error!\n")
        return

    up_name = re.findall(text0, response3)
    up_main = re.findall(text1, response3)
    # Fall back to an empty string when a pattern finds nothing.
    name = up_name[0] if up_name else ""
    introduction = up_main[0] if up_main else ""

    t41.insert(
        "end",
        "up:" + name + "\nuid:" + up_uid + "\n关注数:" + up_following
        + " 视频总播放量:" + up_watchv + " 文章总观看量:" + up_watcha
        + " 总点赞数:" + up_likes + " 粉丝数:" + up_follower + "\n简介:" + introduction,
    )

# Fetch the list of videos submitted by an uploader.
def find_up_video(uid):
    """Collect up to 20 pages of an uploader's videos and list them in ``t43``.

    The module-level ``videolist`` is cleared and refilled: row 0 is the
    header, every further row is one video (aid, title, author, author id,
    publish time, length, favourites, comments, danmaku count, plays,
    description). ``save_data(3)`` later writes these rows to a spreadsheet.

    Args:
        uid: User id of the uploader.
    """
    videolist.clear()
    videolist.append(["av号", "视频标题", "作者", "作者UUID", "发布时间", "视频长度",
                      "收藏数", "评论数", "弹幕数", "播放量", "视频简介"])

    headers = {"user-agent": "Mozilla/5.0"}
    for n in range(20):
        page = str(n + 1)
        url = ('http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + str(uid)
               + '&pagesize=20&page=' + page + '&jsonp=jsonp')
        try:
            response = requests.get(url, headers=headers, timeout=10).text
            v_message = json.loads(response)['data']['vlist']
        except requests.RequestException as e:
            t43.insert('end', e)
            continue
        except (ValueError, KeyError, TypeError):
            t43.insert('end', "第" + page + "页" + "Unknown Error!\n")
            continue

        if not v_message:
            # An empty page means there are no more videos.
            break

        for video in v_message:
            try:
                row = [
                    video['aid'],
                    video['title'],
                    video['author'],
                    video['mid'],
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(video['created'])),
                    video['length'],
                    video['favorites'],
                    video['comment'],
                    video['video_review'],
                    video['play'],
                    video['description'],
                ]
            except (KeyError, TypeError):
                t43.insert('end', "第" + page + "页" + "Unknown Error!\n")
                continue
            videolist.append(row)

    # Row 0 is the header, so iterate over the rest. Indexing i + 1 over the
    # full range ran one element past the end of the list.
    for number, row in enumerate(videolist[1:], start=1):
        t43.insert("end",
                   "\n" + str(number) + ".av" + str(row[0]) + "\n标题:" + str(row[1]) + "\nup主:" +
                   str(row[2]) + " UUID:" + str(row[3]) + "\n" +
                   str(row[4]) + " 时长:" + str(row[5]) + " 播放:" + str(row[9]) + " 弹幕:" + str(row[8]) +
                   " 评论:" + str(row[7]) + " 收藏:" + str(row[6]) + "\n视频简介:" +
                   str(row[10]))

# Handler for the three "save" buttons.
def _save_rows_as_xls(rows, sheet_name, file_name):
    """Write ``rows`` (a list of lists) cell by cell into a new .xls file."""
    work_book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = work_book.add_sheet(sheet_name, cell_overwrite_ok=True)
    for row_index, row in enumerate(rows):
        for col_index, value in enumerate(row):
            sheet.write(row_index, col_index, value)
    work_book.save(file_name)


def save_data(n):
    """Save the data collected by the last search to disk.

    Args:
        n: 1 appends the danmaku in ``barrages_cs`` to a UTF-8 text file,
            2 writes ``commentlist`` to an .xls file, 3 writes ``videolist``
            to an .xls file. File names are derived from the text currently
            in ``entry3`` (1 and 2) or ``entry4`` (3). Other values do
            nothing.
    """
    if n == 1:
        video_id = entry3.get()
        # Open the file once for the whole batch; the context manager closes
        # it even if a write fails. Append mode is kept from the original.
        with open(str(video_id) + "的弹幕.txt", "a", newline="\n", encoding="utf-8") as out:
            out.writelines(
                str(number) + "." + text + "\n"
                for number, text in enumerate(barrages_cs, start=1)
            )
        tkinter.messagebox.showinfo(title='Success', message='保存弹幕成功!')
    elif n == 2:
        video_id = entry3.get()
        _save_rows_as_xls(commentlist, str(video_id) + '的评论', str(video_id) + '的评论.xls')
        tkinter.messagebox.showinfo(title='Success', message='保存评论成功!')
    elif n == 3:
        up_id = entry4.get()
        _save_rows_as_xls(videolist, 'UUID' + str(up_id) + '的视频列表',
                          'UUID' + str(up_id) + '的视频列表.xls')
        tkinter.messagebox.showinfo(title='Success', message='保存视频列表成功!')

# Switch between the two pages of the window.
def changetab(a):
    """Hide both page frames, then show the one selected by ``a``.

    Args:
        a: 3 shows the video page ``fr3``, 4 shows the uploader page ``fr4``;
            any other value leaves both pages hidden.
    """
    pages = ((3, fr3), (4, fr4))
    for _, frame in pages:
        frame.pack_forget()
    for key, frame in pages:
        if a == key:
            frame.pack()

def loading():
    """Show the placeholder dialog used for features that do not exist yet."""
    dialog_options = {'title': 'Thanks', 'message': '更多功能敬请期待!'}
    tkinter.messagebox.showinfo(**dialog_options)

# Window layout: builds the GUI, creates the shared lists and starts Tk.
print(a)

win = tk.Tk()
win.title("bilibili")

# --- menu bar ---------------------------------------------------------------
menubar = tk.Menu(win)
moremenu = tk.Menu(menubar, tearoff=0)
menubar.add_command(label='视频检索', command=lambda: changetab(3))
menubar.add_command(label='up检索', command=lambda: changetab(4))
menubar.add_cascade(label='更多', menu=moremenu)
# A cascade entry is meant to open a submenu; none is attached to this item,
# so it is a plain command entry.
moremenu.add_command(label='高级功能', command=loading)
moremenu.add_separator()
moremenu.add_command(label='退出', command=win.quit)
win.config(menu=menubar)

# --- page 3: video search (shown at start-up) ---------------------------------
fr3 = ttk.Frame(win)
fr3.pack()

l31 = ttk.Label(fr3, text='请输入av号/Bv号:')
l31.grid(column=0, row=0)
# NOTE(review): font=1 is not an obvious font specification -- confirm intent.
entry3 = ttk.Entry(fr3, justify="left", font=1, width=20)
entry3.grid(row=0, column=1, columnspan=6)
b3_search = ttk.Button(fr3, text='搜索', command=lambda: searchav(var3))
b3_search.grid(row=0, column=7)

# 1 = the entry holds an AV number, 2 = it holds a BV id (read by searchav).
var3 = tk.IntVar()
var3.set(1)
# command=0 was dropped: 0 is not a callable, and the buttons only need to
# update var3.
r31 = ttk.Radiobutton(fr3, text='av号', variable=var3, value=1)
r31.grid(row=1, column=2)
r32 = ttk.Radiobutton(fr3, text='Bv号', variable=var3, value=2)
r32.grid(row=1, column=5)

l32 = ttk.Label(fr3, text='基本信息')
l32.grid(row=2, column=0, columnspan=8)
t31 = tks.ScrolledText(fr3, height=5, width=50)
t31.grid(row=3, column=0, columnspan=8, padx=10, pady=5)

l32 = ttk.Label(fr3, text='弹幕列表')
l32.grid(row=4, column=0, columnspan=4)
l32 = ttk.Label(fr3, text='视频评论')
l32.grid(row=4, column=5, columnspan=4)
t32 = tks.ScrolledText(fr3, height=20, width=25)
t32.grid(row=5, column=0, columnspan=4, padx=5, pady=5)
t33 = tks.ScrolledText(fr3, height=20, width=25)
t33.grid(row=5, column=5, columnspan=4, padx=5, pady=5)

b31 = ttk.Button(fr3, text='保存弹幕', command=lambda: save_data(1))
b31.grid(row=6, column=0, columnspan=4)
b31 = ttk.Button(fr3, text='保存评论', command=lambda: save_data(2))
b31.grid(row=6, column=5, columnspan=4)

# Shared state filled by find_pinglun / find_danmu and read by save_data.
commentlist = []
barrages_cs = []

# --- page 4: uploader search (hidden until chosen from the menu) --------------
fr4 = ttk.Frame(win)

l41 = ttk.Label(fr4, text='请输入up的uid:')
l41.grid(column=0, row=0)
entry4 = ttk.Entry(fr4, justify="left", font=1, width=20)
entry4.grid(row=0, column=1, columnspan=6)
b4_search = ttk.Button(fr4, text='搜索', command=lambda: searchup())
b4_search.grid(row=0, column=7)

l42 = ttk.Label(fr4, text='基本信息')
l42.grid(row=2, column=0, columnspan=8)
t41 = tks.ScrolledText(fr4, height=5, width=50)
t41.grid(row=3, column=0, columnspan=8, rowspan=2)

l44 = ttk.Label(fr4, text='视频列表')
l44.grid(row=5, column=0, columnspan=8)
t43 = tks.ScrolledText(fr4, height=20, width=50)
t43.grid(row=6, column=0, columnspan=8)

b41 = ttk.Button(fr4, text='保存列表', command=lambda: save_data(3))
b41.grid(row=7, column=0, columnspan=8)

# Shared state filled by find_up_video and read by save_data.
videolist = []

win.mainloop()

3. 实验过程中遇到的问题和解决过程

网络上提供的B站api接口都是以AV号为基础,但在2020年三月份时B站将AV号升级为了BV号。

解决方法:通过查询网络得知AV号与BV号之间的转换其实是一个base58编码的魔改,于是根据其转换原理编写了从BV号转为AV号的程序。

def bv_to_av(bv):
    """Convert a Bilibili BV id (e.g. ``"BV17x411w7KC"``) to its numeric AV id.

    The first two characters (the prefix) are dropped. Each of the remaining
    ten characters is mapped to a digit value through the lookup table below,
    multiplied by a position-specific power of 58 and summed; the sum is then
    offset by a constant and XOR-ed with a mask.

    Args:
        bv: The BV id as a string, including its two-character prefix.
            Surrounding whitespace is ignored.

    Returns:
        The AV id as an ``int``.

    Raises:
        ValueError: If the id does not have exactly ten characters after the
            prefix, or contains a character that is not in the alphabet.
    """
    alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"
    values = [13, 12, 46, 31, 43, 18, 40, 28, 5, 54, 20, 15, 8, 39, 57, 45, 36, 38, 51, 42, 49, 52, 53, 7, 4, 9, 50, 10,
              44, 34, 6, 25, 1, 26, 29, 56, 3, 24, 0, 47, 27, 22, 41, 16, 11, 37, 2, 35, 21, 17, 33, 30, 48, 23, 55, 32,
              14, 19]
    digit_value = dict(zip(alphabet, values))
    # Power of 58 applied to each of the ten characters, by position.
    exponents = (6, 2, 4, 8, 5, 9, 3, 7, 1, 0)
    offset = 100618342136696320
    xor_mask = 177451812

    payload = str(bv).strip()[2:]
    if len(payload) != len(exponents):
        raise ValueError(
            "BV id must have %d characters after its prefix: %r" % (len(exponents), bv)
        )

    total = 0
    for char, exponent in zip(payload, exponents):
        # An unmapped character must be rejected explicitly; multiplying the
        # raw character by 58**k would build an enormous string instead.
        if char not in digit_value:
            raise ValueError("invalid character %r in BV id %r" % (char, bv))
        total += digit_value[char] * 58 ** exponent

    return (total - offset) ^ xor_mask

4. 思考和感悟

经过一学期的Python程序设计课程学习,我对Python这门语言有了一个较为深刻的理解,对我运用编程解决问题的能力有了很大的提升。

参考资料

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/菜鸟追梦旅行/article/detail/716275
推荐阅读
相关标签
  

闽ICP备14008679号