当前位置:   article > 正文

selenium以及phantomjs爬取b站蔡徐坤

selenium以及phantomjs爬取b站蔡徐坤

  1. # coding=utf-8
  2. # 最新版的selenium(4.x.x)已经不支持PhantomJS。如要用PhantomJS,可用旧版本selenium。如pip install selenium==3.8.0。
  3. from selenium import webdriver
  4. from selenium.common.exceptions import TimeoutException
  5. from selenium.webdriver.common.by import By
  6. from selenium.webdriver.support.ui import WebDriverWait
  7. from selenium.webdriver.support import expected_conditions as EC
  8. from bs4 import BeautifulSoup
  9. import xlwt
  # Shared Selenium session. Chrome is used here; on old Selenium releases
  # PhantomJS can be substituted (browser = webdriver.PhantomJS()).
  browser = webdriver.Chrome()
  WAIT = WebDriverWait(browser, 10)  # explicit waits time out after 10 s
  browser.set_window_size(1400, 900)

  # Output workbook with one sheet; row 0 carries the column headings.
  book = xlwt.Workbook(encoding='utf-8', style_compression=0)
  sheet = book.add_sheet('蔡徐坤篮球', cell_overwrite_ok=True)
  for _col, _heading in enumerate(
          ('名称', '地址', '描述', '观看次数', '弹幕数', '发布时间')):
      sheet.write(0, _col, _heading)

  # Index of the next free data row; save_to_excel() advances it.
  n = 1
  def search(max_retries=3):
      """Search bilibili for the keyword and scrape the first result page.

      Loads the home page, submits the query '蔡徐坤 篮球', switches to the
      window the search opens, stores the first page through get_source()
      and reads the text of the last pager button.

      Args:
          max_retries: How many times the whole sequence is attempted when a
              wait times out (values below 1 are treated as 1).

      Returns:
          The text of the last pager button converted to int.

      Raises:
          TimeoutException: If every attempt timed out.
      """
      global n
      attempts = max(1, max_retries)
      first_row = n  # row to rewind to if a failed attempt already saved rows

      search_box_css = "#nav_searchform > input"
      submit_xpath = ('/html/body/div[2]/div/div[1]/div[1]/div/div[2]'
                      '/div/form/div/button')
      last_page_css = ("#all-list > div.flow-loader > div.page-wrap > div > ul"
                       " > li.page-item.last > button")

      for attempt in range(1, attempts + 1):
          try:
              print('开始访问b站....')
              browser.get("https://www.bilibili.com/")
              search_box = WAIT.until(EC.presence_of_element_located(
                  (By.CSS_SELECTOR, search_box_css)))
              submit = WAIT.until(EC.element_to_be_clickable(
                  (By.XPATH, submit_xpath)))

              # Remember the open windows so the new one can be identified
              # reliably, even on a retry when more than one already exists.
              handles_before = browser.window_handles
              search_box.send_keys('蔡徐坤 篮球')
              submit.click()

              print('跳转到新窗口')
              # Wait for the result window instead of indexing immediately;
              # the handle may not exist yet right after the click.
              WAIT.until(EC.new_window_is_opened(handles_before))
              opened = [handle for handle in browser.window_handles
                        if handle not in handles_before]
              browser.switch_to.window(opened[-1])

              get_source()
              total = WAIT.until(EC.presence_of_element_located(
                  (By.CSS_SELECTOR, last_page_css)))
              return int(total.text)
          except TimeoutException:
              if attempt == attempts:
                  raise
              # The sheet allows overwriting cells, so rewinding the row
              # counter keeps a retry from duplicating first-page rows.
              n = first_row
  def next_page(page_num, max_retries=3):
      """Click the pager's "next" button and scrape the page that loads.

      After the click, waits until the active pager button's text contains
      ``str(page_num)`` and then stores the page through get_source().
      On a timeout the browser is refreshed and the sequence is retried.

      Args:
          page_num: Page number expected to become active after the click.
          max_retries: Maximum number of attempts (values below 1 are
              treated as 1).

      Raises:
          TimeoutException: If every attempt timed out.
      """
      attempts = max(1, max_retries)
      pager_css = '#all-list > div.flow-loader > div.page-wrap > div > ul'
      next_locator = (By.CSS_SELECTOR,
                      pager_css + ' > li.page-item.next > button')
      active_locator = (By.CSS_SELECTOR,
                        pager_css + ' > li.page-item.active > button')

      for attempt in range(1, attempts + 1):
          try:
              print('获取下一页数据')
              next_btn = WAIT.until(EC.element_to_be_clickable(next_locator))
              next_btn.click()
              WAIT.until(EC.text_to_be_present_in_element(
                  active_locator, str(page_num)))
              get_source()
              return
          except TimeoutException:
              # Bounded retry: give up on the last attempt instead of
              # recursing without limit.
              if attempt == attempts:
                  raise
              browser.refresh()
  def save_to_excel(soup):
      """Append one sheet row per video item found in a parsed result page.

      Columns written: title, link, description, view count, danmaku count
      and publish date. A field whose element is missing is written as an
      empty string instead of raising AttributeError.

      Args:
          soup: Parsed result page; searched for the element with class
              'video-list clearfix' and its 'video-item matrix' children.
      """
      global n

      def _text_of(node, css_class):
          # Text of the first descendant with the given class, or '' if absent.
          found = node.find(class_=css_class)
          return found.text if found is not None else ''

      container = soup.find(class_='video-list clearfix')
      if container is None:
          # Nothing to store, e.g. the page rendered without a result list.
          print('未找到视频列表，跳过本页')
          return

      for item in container.find_all(class_='video-item matrix'):
          anchor = item.find('a')
          item_title = (anchor.get('title') or '') if anchor is not None else ''
          item_link = (anchor.get('href') or '') if anchor is not None else ''
          row = (
              item_title,
              item_link,
              _text_of(item, 'des hide'),
              _text_of(item, 'so-icon watch-num'),
              _text_of(item, 'so-icon hide'),
              _text_of(item, 'so-icon time'),
          )
          print('爬取:' + item_title)
          for column, value in enumerate(row):
              sheet.write(n, column, value)
          n += 1
  def get_source():
      """Parse the current page and hand it to save_to_excel().

      Blocks until the filter bar element of the result page is present,
      then parses the browser's page source with the lxml parser.
      """
      filter_bar = (By.CSS_SELECTOR,
                    '#all-list > div.flow-loader > div.filter-wrap')
      WAIT.until(EC.presence_of_element_located(filter_bar))
      parsed_page = BeautifulSoup(browser.page_source, 'lxml')
      print('到这')
      save_to_excel(parsed_page)
  def main():
      """Run the search, walk every remaining result page, then shut down.

      search() stores page 1 and returns the page count; pages 2..total are
      fetched through next_page(). The browser is always shut down, even
      when scraping fails.
      """
      try:
          total = search()
          print(total)
          for page in range(2, total + 1):
              next_page(page)
      finally:
          # quit() rather than close(): search() opens a second window, and
          # close() would only close the current one, leaving the other
          # window and the driver process running.
          browser.quit()
  if __name__ == '__main__':
      try:
          main()
      finally:
          # Save in ``finally`` so rows scraped before a failure are kept.
          # xlwt produces the legacy binary .xls format, so the file must not
          # be given an .xlsx name.
          book.save('蔡徐坤篮球.xls')

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/696921
推荐阅读
相关标签
  

闽ICP备14008679号