当前位置:   article > 正文

python爬取B站视频

python爬取B站视频

参考:https://cloud.tencent.com/developer/article/1768680

参考文章中的代码有些问题(请求头需要修改后才能正常请求),修改后的完整代码如下:

  import json
  import pprint
  import re  # regular expressions

  import requests
  from bs4 import BeautifulSoup as bs
  from moviepy.editor import AudioFileClip, VideoFileClip

  # Request headers sent with every HTTP call made by this script.
  headers = {
      # Anti-hotlinking: tells the server which page the request was
      # navigated from.
      'referer': 'https://www.bilibili.com/a',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
  }
  def send_request(url, timeout=30):
      """Send a GET request with the shared ``headers`` and return the response.

      :param url: address to request
      :param timeout: seconds to wait for the server before giving up
      :return: the ``requests.Response`` object
      :raises requests.HTTPError: if the server answers with a 4xx/5xx status
      """
      # Without a timeout a stalled connection would block the script forever.
      response = requests.get(url=url, headers=headers, timeout=timeout)
      # Fail loudly on an error status instead of letting callers store an
      # error body as if it were page or media data.
      response.raise_for_status()
      return response
  def get_video_data(html_data):
      """Parse the title and the audio/video stream URLs out of a video page.

      :param html_data: HTML text of the video page
      :return: list ``[title, audio_url, video_url]``; the title has characters
          that are invalid in file names replaced by ``_`` because callers use
          it as a file name
      :raises ValueError: if the title or the embedded play info is missing
      """
      def pick_url(stream):
          # Prefer the first backup URL, as before; fall back to the primary
          # URL when the stream carries no backup list.
          # NOTE(review): 'baseUrl' is assumed to be the primary-URL key of a
          # dash stream entry - confirm against a real play-info payload.
          backups = stream.get('backupUrl') or []
          return backups[0] if backups else stream['baseUrl']

      # Extract the video title.
      soup = bs(html_data, 'lxml')
      headings = soup.find_all(name='h1', attrs={"class": "video-title special-text-indent"})
      if not headings:
          raise ValueError('video title not found in page')
      title = re.sub(r'[\\/:*?"<>|\r\n]+', '_', headings[0].get_text()).strip()

      # Extract the JSON play info embedded in the page.
      matches = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', html_data)
      if not matches:
          raise ValueError('window.__playinfo__ not found in page')
      json_data = json.loads(matches[0])  # JSON text -> dict
      pprint.pprint(json_data)

      dash = json_data['data']['dash']
      # Address of the audio stream.
      audio_url = pick_url(dash['audio'][0])
      print('解析到的音频地址:', audio_url)
      # Address of the picture (video) stream.
      video_url = pick_url(dash['video'][0])
      print('解析到的视频地址:', video_url)
      return [title, audio_url, video_url]
  def save_data(file_name, audio_url, video_url):
      """Download the audio and video streams and write them to disk.

      The audio goes to ``file_name + '.mp3'`` and the video to
      ``file_name + '.mp4'``. Each stream is written as soon as it has been
      downloaded, so the two payloads are never held in memory together.

      :param file_name: output path without extension
      :param audio_url: address of the audio stream
      :param video_url: address of the video stream
      """
      print('正在请求音频数据')
      audio_data = send_request(audio_url).content
      print('正在保存音频数据')
      with open(file_name + '.mp3', mode='wb') as f:
          f.write(audio_data)
      del audio_data  # release the audio bytes before fetching the video

      print('正在请求视频数据')
      video_data = send_request(video_url).content
      print('正在保存视频数据')
      with open(file_name + '.mp4', mode='wb') as f:
          f.write(video_data)
  def merge_data(video_name, output_path="output.mp4"):
      """Mux ``video_name + '.mp3'`` and ``video_name + '.mp4'`` into one file.

      The longer of the two tracks is trimmed to the length of the shorter one
      before the audio is attached to the video.

      :param video_name: common path (without extension) of the two inputs
      :param output_path: file to write the merged video to
      """
      print('视频合成开始:', video_name)
      # The context managers close both clips (and their readers) even if
      # writing fails.
      with AudioFileClip(video_name + '.mp3') as audioclip, \
              VideoFileClip(video_name + '.mp4') as videoclip:
          # Durations of the picture and sound tracks.
          video_time = videoclip.duration
          audio_time = audioclip.duration
          if video_time > audio_time:
              # Video is longer than audio: cut the video.
              videoclip_new = videoclip.subclip(0, audio_time)
              audioclip_new = audioclip
          else:
              # Audio is at least as long as video: cut the audio.
              videoclip_new = videoclip
              audioclip_new = audioclip.subclip(0, video_time)
          # Attach the audio track to the video.
          video_with_new_audio = videoclip_new.set_audio(audioclip_new)
          # Write the result to the new video file.
          video_with_new_audio.write_videofile(
              output_path,
              codec='libx264',
              audio_codec='aac',
              temp_audiofile='temp-video.m4a',
              remove_temp=True,
          )
      print('视频合成结束:', video_name)
  # Download one video page, extract its streams, save them and merge them.
  url = 'https://www.bilibili.com/video/BV1bK421a7qG/?spm_id_from=333.1007.tianma.6-4-22.click'
  response = send_request(url)
  # Decode the page with the charset declared inside its own HTML; fall back
  # to the detected encoding when the page declares none.
  declared = requests.utils.get_encodings_from_content(response.text)
  response.encoding = declared[0] if declared else response.apparent_encoding
  html_data = response.text
  video_data = get_video_data(html_data)
  save_data(video_data[0], video_data[1], video_data[2])
  merge_data(video_data[0])

效果

小姐姐挺靓,只是视频左下角带有水印。下面想办法把水印去掉,参考文章:《python实战之去除视频水印&字幕》(CSDN博客)。

  import os
  import sys

  import cv2
  import numpy
  from moviepy import editor

  # Scratch file that receives the processed, silent video before the
  # original audio is merged back in.
  TEMP_VIDEO = 'temp.mp4'
  class WatermarkRemover():
      """Remove a watermark or subtitles from every video file in a folder.

      The region to clean is chosen interactively on the first video; a mask
      is built by thresholding that region and the masked pixels are repaired
      frame by frame with ``cv2.inpaint``. The original audio is then merged
      back into the processed video.
      """

      def __init__(self, video_path, output, threshold: int, kernel_size: int):
          """Store the processing parameters.

          :param video_path: folder containing the input videos
          :param output: folder receiving the processed videos
          :param threshold: threshold used for binary segmentation
          :param kernel_size: side length of the dilation kernel
          """
          self.threshold = threshold
          self.kernel_size = kernel_size
          self.video_path = video_path
          self.output = output

      def select_roi(self, img: numpy.ndarray, hint: str) -> list:
          """Let the user box the watermark/subtitle area (SPACE or ENTER confirms).

          The image is shown at 70% scale and the selection is scaled back to
          the coordinates of the full-size image.

          :param img: image to display
          :param hint: title of the selection window
          :return: the selected rectangle as returned by ``cv2.selectROI``,
              rescaled to ``img`` coordinates
          """
          COFF = 0.7
          w, h = int(COFF * img.shape[1]), int(COFF * img.shape[0])
          resize_img = cv2.resize(img, (w, h))
          roi = cv2.selectROI(hint, resize_img, False, False)
          cv2.destroyAllWindows()
          return [int(value / COFF) for value in roi[:4]]

      def dilate_mask(self, mask: numpy.ndarray) -> numpy.ndarray:
          """Dilate the mask to enlarge the area it covers.

          :param mask: mask image
          :return: dilated mask
          """
          kernel = numpy.ones((self.kernel_size, self.kernel_size), numpy.uint8)
          return cv2.dilate(mask, kernel)

      def generate_single_mask(self, img: numpy.ndarray, roi: list, threshold: int) -> numpy.ndarray:
          """Build the mask of one frame from the manually selected ROI.

          :param img: single BGR frame
          :param roi: selected rectangle; ``roi[0]``/``roi[2]`` index columns
              and ``roi[1]``/``roi[3]`` index rows
          :param threshold: binarisation threshold
          :return: mask that is 255 where ROI pixels are brighter than
              ``threshold`` and 0 elsewhere
          """
          # Invalid region: terminate the program.
          if len(roi) != 4:
              print('NULL ROI!')
              sys.exit()
          # Copy the grayscale pixels inside the ROI onto a black canvas.
          roi_img = numpy.zeros((img.shape[0], img.shape[1]), numpy.uint8)
          row_start, row_end = int(roi[1]), int(roi[1] + roi[3])
          col_start, col_end = int(roi[0]), int(roi[0] + roi[2])
          gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
          roi_img[row_start:row_end, col_start:col_end] = gray[row_start:row_end, col_start:col_end]
          # Threshold segmentation.
          _, mask = cv2.threshold(roi_img, threshold, 255, cv2.THRESH_BINARY)
          return mask

      @staticmethod
      def _sampling_step(frame_count, samples: int = 5) -> int:
          """Return the frame interval that yields about ``samples`` frames.

          Never returns less than 1, so it is always a valid modulus even for
          very short videos or an unknown (zero) frame count.
          """
          return max(1, int(frame_count) // samples)

      @staticmethod
      def _read_first_frame(path: str) -> numpy.ndarray:
          """Return the first frame of the video at ``path`` or raise IOError."""
          video = cv2.VideoCapture(path)
          try:
              success, frame = video.read()
          finally:
              video.release()
          if not success:
              raise IOError('cannot read video: ' + path)
          return frame

      def generate_watermark_mask(self, video_path: str) -> numpy.ndarray:
          """Build the final watermark mask from several frames of a video.

          Masks of frames sampled across the video are combined with a logical
          AND, so only pixels that are bright in every sampled frame remain.

          :param video_path: path of the video file
          :return: dilated watermark mask
          :raises IOError: if no frame can be read from the video
          """
          video = cv2.VideoCapture(video_path)
          try:
              success, frame = video.read()
              if not success:
                  raise IOError('cannot read video: ' + video_path)
              roi = self.select_roi(frame, 'select watermark ROI')
              mask = numpy.full((frame.shape[0], frame.shape[1]), 255, numpy.uint8)
              step = self._sampling_step(video.get(cv2.CAP_PROP_FRAME_COUNT))
              index = 0
              while success:
                  if index % step == 0:
                      mask = cv2.bitwise_and(mask, self.generate_single_mask(frame, roi, self.threshold))
                  success, frame = video.read()
                  index += 1
          finally:
              video.release()
          return self.dilate_mask(mask)

      def generate_subtitle_mask(self, frame: numpy.ndarray, roi: list) -> numpy.ndarray:
          """Build the subtitle mask of one frame from the selected ROI.

          :param frame: single frame
          :param roi: selected rectangle
          :return: dilated subtitle mask
          """
          # Keep only the ROI's vertical extent and span the full frame width.
          mask = self.generate_single_mask(frame, [0, roi[1], frame.shape[1], roi[3]], self.threshold)
          return self.dilate_mask(mask)

      def inpaint_image(self, img: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
          """Repair the masked pixels of a frame.

          :param img: single frame
          :param mask: mask marking the pixels to repair
          :return: repaired frame
          """
          return cv2.inpaint(img, mask, 1, cv2.INPAINT_TELEA)

      def merge_audio(self, input_path: str, output_path: str, temp_path: str):
          """Combine the original audio with the processed video.

          :param input_path: path of the original video (audio source)
          :param output_path: path of the muxed result
          :param temp_path: path of the silent, processed video
          """
          # The source clip must stay open while writing: its audio is read
          # lazily from the source file.
          with editor.VideoFileClip(input_path) as video:
              audio = video.audio
              with editor.VideoFileClip(temp_path) as opencv_video:
                  clip = opencv_video.set_audio(audio)
                  clip.write_videofile(output_path)

      def _input_files(self) -> list:
          """Return the paths of the regular files in the input folder, sorted."""
          candidates = (os.path.join(self.video_path, entry) for entry in sorted(os.listdir(self.video_path)))
          return [path for path in candidates if os.path.isfile(path)]

      def _process_file(self, name: str, suffix: str, mask_for_frame):
          """Inpaint every frame of ``name`` and write the result with audio.

          :param name: path of the input video
          :param suffix: text appended to the input's base name to form the
              output file name inside ``self.output``
          :param mask_for_frame: callable mapping a frame to its repair mask
          """
          video = cv2.VideoCapture(name)
          fps = video.get(cv2.CAP_PROP_FPS)
          size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
          video_writer = cv2.VideoWriter(TEMP_VIDEO, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
          try:
              # Process the video frame by frame.
              success, frame = video.read()
              while success:
                  video_writer.write(self.inpaint_image(frame, mask_for_frame(frame)))
                  success, frame = video.read()
          finally:
              video.release()
              video_writer.release()
          # Mux the processed picture with the original sound.
          stem = os.path.splitext(os.path.basename(name))[0]
          output_path = os.path.join(self.output, stem + suffix)
          try:
              self.merge_audio(name, output_path, TEMP_VIDEO)
          finally:
              if os.path.exists(TEMP_VIDEO):
                  os.remove(TEMP_VIDEO)

      def remove_video_watermark(self):
          """Remove the watermark from every video in the input folder.

          The mask is generated once, from the first video, and reused for all
          videos.
          NOTE(review): this presumes all videos share the first video's frame
          size - confirm for mixed inputs.
          """
          os.makedirs(self.output, exist_ok=True)
          filenames = self._input_files()
          if not filenames:
              return
          mask = self.generate_watermark_mask(filenames[0])
          for name in filenames:
              self._process_file(name, '_no_watermark.mp4', lambda frame: mask)

      def remove_video_subtitle(self):
          """Remove the subtitles from every video in the input folder.

          The subtitle area is selected once, on the first frame of the first
          video; a fresh mask is computed for every frame.
          """
          os.makedirs(self.output, exist_ok=True)
          filenames = self._input_files()
          if not filenames:
              return
          roi = self.select_roi(self._read_first_frame(filenames[0]), 'select subtitle ROI')
          for name in filenames:
              self._process_file(name, '_no_sub.mp4', lambda frame: self.generate_subtitle_mask(frame, roi))
  # Remove the watermark from every video in the 'video' folder.
  video_path = 'video'
  output_path = 'output'
  remover = WatermarkRemover(video_path, output_path, threshold=80, kernel_size=5)
  remover.remove_video_watermark()
  # To remove subtitles instead, use:
  # remover = WatermarkRemover(video_path, output_path, threshold=80, kernel_size=5)
  # remover.remove_video_subtitle()

效果一般吧:

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/359829
推荐阅读
相关标签
  

闽ICP备14008679号