
Scraping Bilibili and Other Sites with Selenium (and Setting a User-Agent)


Part 1

from selenium import webdriver
from bs4 import BeautifulSoup
import time

url = 'https://www.bilibili.com/video/BV1iN4y1a7KJ'

# 'detach' keeps the browser window open after the script exits
options = webdriver.ChromeOptions()
options.add_experimental_option('detach', True)
driver = webdriver.Chrome(options=options)

driver.get(url)
time.sleep(5)  # crude wait for the JavaScript-rendered page to finish loading

html = driver.page_source
soup = BeautifulSoup(html, 'lxml')

# Video metadata
title = soup.find('h1', class_="video-title")
count = soup.find('span', class_="view item")        # play count
dm = soup.find('span', class_="dm item")             # danmaku (bullet-comment) count
pub_date = soup.find('span', class_="pubdate-text")  # publication date

# Each comment block contains a username and the reply text
comments = soup.find_all('div', class_="content-warp")
comments_text = []
for comment in comments:
    name = comment.find('div', class_="user-info").text
    text = comment.find('span', class_="reply-content").text
    comments_text.append({
        'name': name,
        'text': text
    })

# Print the results
print(f"Title: {title.text}, plays: {count.text.strip()}, danmaku: {dm.text.strip()}")
for comment in comments_text:
    print(f"Comment:\nID: {comment['name']}, content: {comment['text']}")

driver.close()
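The fixed time.sleep(5) is brittle: it wastes time on fast loads and can still be too short on slow ones. Here is a minimal sketch of the same fetch using Selenium's explicit waits (the standard WebDriverWait / expected_conditions API); the 10-second timeout is an assumption, not a value from the original:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block until the title element exists (or raise TimeoutException after 10 s)
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.video-title'))
)
html = driver.page_source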

Part 2

from selenium import webdriver
from bs4 import BeautifulSoup
import time

url = 'https://www.bilibili.com/v/popular/all/'

# EdgeOptions matches the ChromiumEdge driver used below
options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)
driver = webdriver.ChromiumEdge(options=options)

driver.get(url)
time.sleep(5)  # the popular list is rendered by JavaScript; give it time to load
html = driver.page_source

soup = BeautifulSoup(html, 'lxml')

# Each card on the popular page holds a title, an uploader, and a play count
result = soup.find_all('div', class_='video-card')
for item in result:
    title = item.find('p', class_='video-name')
    up = item.find('span', class_='up-name__text')
    play = item.find('span', class_='play-text')
    print(f"Title: {title.text.strip()}, Up: {up.text.strip()}, plays: {play.text.strip()}")
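The popular page lazy-loads more cards as you scroll, so a single page_source only captures the first batch. A minimal sketch that scrolls before parsing; the five iterations and the 2-second pause are assumptions to tune per page:

# Scroll to the bottom a few times so more video cards are fetched
for _ in range(5):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # pause so the newly fetched cards can render
html = driver.page_source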


Part 3

import requests
from bs4 import BeautifulSoup

url = 'https://yjsy.hunnu.edu.cn/zsks/sszk1.htm'
# Spoof a desktop Chrome User-Agent so the server treats us as a normal browser
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
headers = {
    'User-Agent': ua
}

response = requests.get(url, headers=headers)
response.encoding = 'utf-8'  # set the encoding explicitly to avoid garbled Chinese text
print(response.text)  # dump the raw HTML for inspection

soup = BeautifulSoup(response.text, 'lxml')
result = soup.find_all('a', target="_blank")

# First print the text of every link that opens in a new tab ...
for item in result:
    print(item.text)

# ... then keep only the 2024 admission-brochure (招生简章) entries
for item in result:
    if '2024' in item.text and '招生简章' in item.text:
        print(item.text)
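Before parsing, it is worth confirming that the request actually succeeded; otherwise an error page gets parsed silently. A small sketch using the standard requests API (the 10-second timeout is an assumption):

response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()  # raise requests.HTTPError on any 4xx/5xx status
response.encoding = 'utf-8'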

Part 4

import requests
from bs4 import BeautifulSoup

url = 'https://yjsy.hunnu.edu.cn/zsks/sszk1.htm'
# The same page, fetched with a newer Edge-on-Windows User-Agent string
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
headers = {
    'User-Agent': ua
}

response = requests.get(url, headers=headers)
response.encoding = 'utf-8'

soup = BeautifulSoup(response.text, 'lxml')
result = soup.find_all('a', target='_blank')

# Keep only the links whose text mentions 2024
for item in result:
    if '2024' in item.text:
        print(item.text)
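Printing only the link text drops the URLs. A sketch that also resolves each link's href to an absolute URL; attribute access follows standard BeautifulSoup usage, and urljoin handles the site's relative paths:

from urllib.parse import urljoin

for item in result:
    if '2024' in item.text:
        href = item.get('href')  # may be None if the <a> tag has no href
        if href:
            print(item.text, urljoin(url, href))  # resolve relative paths against the page URL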

Part 5

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
import time

options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)
driver = webdriver.ChromiumEdge(options=options)

driver.get('https://yjsy.hunnu.edu.cn/')

# Hover over the fourth top-level menu item to open its drop-down
xpath1 = "//ul[@class='menu']/li[4]/a"
button1 = driver.find_element(By.XPATH, xpath1)
ActionChains(driver).move_to_element(button1).perform()

# Hover over the second drop-down entry, then click it
xpath2 = "//ul[@class='menu']/li[4]/ul/li[2]"
button2 = driver.find_element(By.XPATH, xpath2)
ActionChains(driver).move_to_element(button2).perform()
time.sleep(3)
ActionChains(driver).move_to_element(button2).click(button2).perform()
time.sleep(3)  # let the target page load before reading its source

html = driver.page_source
print(html)

soup = BeautifulSoup(html, 'lxml')
result = soup.find_all('a', target='_blank')
for item in result:
    if '2024' in item.text:
        print(item.text)
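Hover-then-click sequences are timing-sensitive, and the fixed sleeps can still fail on a slow connection. A sketch replacing them with an explicit wait for the drop-down entry (standard Selenium API, reusing the xpath2 defined above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

ActionChains(driver).move_to_element(button1).perform()
# Wait until the drop-down entry is visible and enabled before clicking it
button2 = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, xpath2))
)
button2.click()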


Part 6

# Fetch the page source
import requests
from lxml import etree

ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
response = requests.get('https://yjsy.hunnu.edu.cn/zsks/sszk1.htm', headers={'User-Agent': ua})
response.encoding = 'utf-8'
print(response.text)

# Filter the data with XPath
xpath = "//a[@target='_blank']"
page = etree.HTML(response.text)
result = page.xpath(xpath)
# print(result)

# Print every element matched by the XPath
for item in result:
    print(item.text)

# Narrow down to the entries of interest
print()
print("Matching results:")
for item in result:
    # item.text can be None for anchors that wrap other tags, so guard the check
    if item.text and '2024' in item.text and '招生简章' in item.text:
        print(item.text)
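The substring filtering can also be pushed into the XPath expression itself, so lxml returns only the matching anchors. A sketch in XPath 1.0 (which lxml implements); contains(., ...) tests the element's full string value:

# One query that matches only anchors mentioning both 2024 and 招生简章
xpath = "//a[@target='_blank'][contains(., '2024') and contains(., '招生简章')]"
for item in page.xpath(xpath):
    print(item.text)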

Part 7

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)
driver = webdriver.ChromiumEdge(options=options)

driver.get('https://www.baidu.com')
time.sleep(3)

# Type the query into the search box (id='kw'), then press the search button (id='su')
driver.find_element(By.ID, 'kw').send_keys("湘潭理工学院")
time.sleep(3)
driver.find_element(By.ID, 'su').click()
time.sleep(3)  # let the results render before taking the screenshot

driver.save_screenshot('baidu.png')
# html = driver.page_source
# print(html)
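For unattended runs (for example on a server), the same script works without a visible window. A sketch of a headless setup; the flag name follows current Chromium convention and may differ on older browser builds:

# Run Edge headless so no window opens; set a size so screenshots are not tiny
options = webdriver.EdgeOptions()
options.add_argument('--headless=new')  # older Chromium builds use plain '--headless'
options.add_argument('--window-size=1920,1080')
driver = webdriver.ChromiumEdge(options=options)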
