Search keyword: 跨年 (New Year's Eve)
Fields collected from the search page: video title, publish time, video URL, uploader (up主) ID, and uploader homepage URL.
```python
import requests                 # send HTTP requests
from bs4 import BeautifulSoup   # parse HTML
import pandas as pd             # tabular data handling
```
Build the search URL (to search for something else, just change the keyword text); page=1 requests the first page of results. Set a request header that mimics a browser so the request gets past Bilibili's basic anti-crawling check.
```python
# Search URL
url = "https://search.bilibili.com/all?vt=53655423&keyword=跨年&page=1"
# Request header that mimics a browser
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
# Send the HTTP request and parse the response body
response = requests.get(url=url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
```
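The snippet above assumes the request succeeds. A minimal guard, my addition rather than part of the original post, makes a failed request fail loudly instead of quietly parsing an error page:

```python
# Raise an HTTPError for 4xx/5xx responses so we never parse an error page
response.raise_for_status()
```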
On the Bilibili search page, press F12 (or use the browser's top-right menu) to open the developer tools. As you hover over nodes in the Elements panel, the corresponding region of the page is highlighted, which tells you exactly which part of the page each node renders. Once you have located the elements that hold the data you want, you can start parsing.
```python
# Parse the search results: one info card per video
items = soup.find_all("div", class_="bili-video-card__info __scale-disable")

# Collect one dict per video, then build the DataFrame in a single pass
rows = []
for item in items:
    rows.append({
        # Video title
        '标题': item.find('h3', class_='bili-video-card__info--tit').text,
        # Publish time
        '发布时间': item.find('span', class_='bili-video-card__info--date').get_text(strip=True),
        # Video link (protocol-relative, e.g. "//www.bilibili.com/video/...")
        '网址': item.find('a', href=True)['href'],
        # Uploader ID
        'up主id': item.find('span', class_='bili-video-card__info--author').get_text(strip=True),
        # Uploader homepage link
        'up主首页': item.find('a', class_='bili-video-card__info--owner').get('href'),
    })

# All search-page fields in one DataFrame
contents_1 = pd.DataFrame(rows, columns=['标题', '发布时间', '网址', 'up主id', 'up主首页'])
```
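The URL above only covers page 1. Here is a minimal sketch for walking several result pages, assuming the page query parameter (and the vt token) keeps behaving the same way on later pages, which I have not verified:

```python
import time

all_rows = []
for page in range(1, 6):  # pages 1 through 5; adjust the range as needed
    page_url = f"https://search.bilibili.com/all?vt=53655423&keyword=跨年&page={page}"
    resp = requests.get(page_url, headers=headers)
    page_soup = BeautifulSoup(resp.content, "html.parser")
    for item in page_soup.find_all("div", class_="bili-video-card__info __scale-disable"):
        all_rows.append({
            '标题': item.find('h3', class_='bili-video-card__info--tit').text,
            '网址': item.find('a', href=True)['href'],
        })
    time.sleep(1)  # pause between requests to be polite to the server
```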
Next, open each video link collected in the search step and scrape the video's like count, coin count, favorite count, share count, view count, and danmaku (bullet-comment) count.
```python
def stat_text(soup, container_class, span_class):
    # Return the stripped text of a span inside a container div, or None
    # when either element is missing (instead of crashing, or silently
    # reusing the previous video's value)
    box = soup.find("div", class_=container_class)
    span = box.find("span", class_=span_class) if box else None
    return span.get_text(strip=True) if span else None

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# Collect one dict of stats per video, then build the DataFrame once
stat_rows = []
for www in contents_1['网址']:
    # The search page returns protocol-relative links, so prepend the scheme
    url = "https:" + www
    # Send the HTTP request and parse the video page
    response = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    stat_rows.append({
        # Like count
        '点赞量': stat_text(soup, "video-like video-toolbar-left-item", "video-like-info video-toolbar-item-text"),
        # Coin count
        '投币量': stat_text(soup, "video-coin video-toolbar-left-item", "video-coin-info video-toolbar-item-text"),
        # Favorite count
        '收藏量': stat_text(soup, "video-fav video-toolbar-left-item", "video-fav-info video-toolbar-item-text"),
        # Share count
        '转发量': stat_text(soup, "video-share-wrap video-toolbar-left-item", "video-share-info video-toolbar-item-text"),
        # View count and danmaku count sit in the same detail list
        '播放量': stat_text(soup, "video-info-detail-list", "view item"),
        '弹幕量': stat_text(soup, "video-info-detail-list", "dm item"),
    })

# All per-video stats in one DataFrame
contents_2 = pd.DataFrame(stat_rows, columns=['点赞量', '投币量', '收藏量', '转发量', '播放量', '弹幕量'])
```
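The scraped counts are display strings such as "3.5万" rather than numbers. Below is a small helper to normalize them to integers; it assumes Bilibili only abbreviates with the 万 (10,000) and 亿 (100,000,000) suffixes, which is my assumption rather than something stated in the post:

```python
def parse_count(text):
    # Convert a display count like '3.5万' or '1.2亿' to an int;
    # return None for missing or non-numeric values
    if text is None:
        return None
    text = text.strip()
    if text.endswith('万'):   # assumed suffix: ten thousand
        return int(float(text[:-1]) * 10_000)
    if text.endswith('亿'):   # assumed suffix: hundred million
        return int(float(text[:-1]) * 100_000_000)
    try:
        return int(text)
    except ValueError:
        return None

for col in ['点赞量', '投币量', '收藏量', '转发量', '播放量', '弹幕量']:
    contents_2[col] = contents_2[col].map(parse_count)
```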
Finally, merge the data scraped from the search page with the data scraped from the individual video pages.
```python
# Merge column-wise: row i of contents_2 corresponds to row i of contents_1
contents = pd.concat([contents_1, contents_2], axis=1)
```
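The post stops at the merged DataFrame. To keep the results, one export line is enough; the file name here is my own choice:

```python
# utf-8-sig adds a BOM so Excel displays the Chinese column headers correctly
contents.to_csv('bilibili_跨年.csv', index=False, encoding='utf-8-sig')
```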
Below is a partial view of the resulting table.