
A Brief Introduction to Python's Best Crawler Tools and How to Use Them (requests, Beautiful Soup, selenium, etc.)


Contents

1. requests

        1.1 Introduction

        1.2 requests example

2. BeautifulSoup

        2.1 Introduction

        2.2 Installation

        2.3 BeautifulSoup example

3. selenium

        3.1 Introduction

        3.2 selenium example

        3.3 Opening a page with a mobile user-agent

4. Summary


1. requests

        1.1 Introduction

                requests is a library most of you will have heard of; it describes itself as "HTTP for Humans". Because its API is concise and convenient, it has become the favorite tool of crawler writers (it is this author's favorite, at least).

                Official documentation: Requests: HTTP for Humans — Requests 2.18.1 documentation
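
                To show how little boilerplate that means in practice, here is a minimal sketch before the full case study below (httpbin.org is just a public echo service, chosen here purely for illustration):

import requests

# One call fetches the page; no connection or decoding boilerplate needed
resp = requests.get('https://httpbin.org/get')
print(resp.status_code)              # 200 on success
print(resp.headers['Content-Type'])  # response headers behave like a dict
print(resp.json())                   # a JSON body parses in one call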

        1.2 requests example

import requests

'''
rqs.content and rqs.text carry the same response body: content is the raw
bytes, while text is the str decoded according to rqs.encoding
'''

def requests_url(url, headers):
    rqs = requests.get(url=url, headers=headers)
    if rqs.status_code == 200:
        print("Request succeeded")
        print("Page encoding:", rqs.encoding)
        print("content:", type(rqs.content))
        print("text:", type(rqs.text))
        print("Response headers:", rqs.headers)
        print("Cookies:", rqs.cookies)
        # A 200 status code means the request succeeded, so any further
        # processing can go here
    else:
        print("Request failed")

if __name__ == '__main__':
    # URL to crawl
    url = 'https://v.qq.com/'
    # headers can carry many fields; press F12 in the browser to see which
    # values to fill in ('cookie' is the actual request header name)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62',
        'cookie': '_ga=GA1.2.6371039.1615789955; eas_sid=e1e6s1o8y6P5n3q4m5u0e3t314; pgv_pvid=3404159900; RK=We4QlR07wA; ptcz=c41ecc5aa5596efb435144cc312f65129e0fe5c740d8030dff614a671f6d5f56; tvfe_boss_uuid=1e3ffcf4a9941f29; o_cookie=2289550339; pac_uid=1_2289550339; pgv_info=ssid=s7852532080; midas_openkey=@GUPuObMQG; midas_openid=2289550339; dgqqcomrouteLine=a20220221fxfd_a20220314download; ied_qq=o2289550339; rv2=805DE51BF9D936734F033F8A9F76E46E6CB77BD3B5C802DC08; property20=5764B79D9D63533983A5D15A47D0EEE1C2CB03478C5885C354BD1C873DBB23722F41156BF5228533; nyzxqqcomrouteLine=a20220317nyzx; tokenParams=%3FsOpenid%3D11853442815032565208%26sSign%3D0FD296824F479F17D3D0281056A6D648%26_wv%3D49952%26ADTAG%3Dtgi.qq.share.qq; ptag=cn_bing_com; video_platform=2; video_guid=910ab70e6bdcc45b; bucket_id=9231002; ts_refer=cn.bing.com/; ts_uid=9431210527; vversion_name=8.2.95; video_omgid=910ab70e6bdcc45b; acctype=pt; QQLivePCVer=50221312; qv_als=mobG4XYcttwicAiVA11649817068STaWZA==; ts_last=v.qq.com/x/cover/mzc002001uhk73a.html; ad_play_index=64'
    }
    # Call the crawl function
    requests_url(url, headers)

2. BeautifulSoup

        2.1 Introduction

                To me, BeautifulSoup's greatest strength is that it parses HTML into an object: every tag becomes a branch of that object, so you can reach the data inside a <li> tag with Object.div.p.ul.li. That makes it possible to pinpoint exactly the resource you want when scraping a page.

                Official documentation: Beautiful Soup 4.4.0 documentation — Beautiful Soup 4.2.0 documentation (Chinese translation)
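
                A minimal self-contained sketch of that dot-notation access, using an inline HTML string so it runs without any network:

from bs4 import BeautifulSoup

html = '<div><ul><li><a title="first" href="/a.html">A</a></li></ul></div>'
soup = BeautifulSoup(html, 'html.parser')

print(soup.div.ul.li)             # the first <li> under the first <div><ul>
print(soup.div.ul.li.a['title'])  # tag attributes allow dict-style access -> 'first'
print(soup.a.text)                # dot access always returns the first match -> 'A'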

  

        2.2 Installation

pip install beautifulsoup4    # "pip install bs4" also works; bs4 is a stub package that pulls in beautifulsoup4

        2.3 BeautifulSoup example

from bs4 import BeautifulSoup
import requests
import sys

def requests_url(url, headers):
    p4 = requests.get(url, headers=headers)
    soup = BeautifulSoup(p4.content, 'html.parser')
    print(type(soup.prettify()))  # <class 'str'>
    '''
    The HTML is parsed into a data object, so its tags can be reached
    with dot notation
    '''
    print(soup.div.ul)
    print(soup.div.ul.find(href="/vod-type-id-3-pg-1.html"))
    print(soup.div.ul.li.a['title'])
    print(type(soup.div))  # <class 'bs4.element.Tag'>
    # Find every <a> tag in one pass
    print(soup.find_all('a'))

if __name__ == '__main__':
    url_list = ['https://4480.tv/vod-play-id-63360-src-3-num-{}.html'.format(i) for i in range(9, 16)]
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62',
        'cookie': 'PHPSESSID=g65ipam1c0kgj8sctkcbfmkq35; __51vcke__JSfg3PPYsm2jAeAF=59365898-12ee-5c79-aee8-c07a733dec1f; __51vuft__JSfg3PPYsm2jAeAF=1649736012578; __51uvsct__JSfg3PPYsm2jAeAF=2; __vtins__JSfg3PPYsm2jAeAF=%7B%22sid%22%3A%20%22475e118d-15a5-55ef-8abd-75e4e54116f2%22%2C%20%22vd%22%3A%2020%2C%20%22stt%22%3A%202315702%2C%20%22dr%22%3A%20678768%2C%20%22expires%22%3A%201649765561094%2C%20%22ct%22%3A%201649763761094%7D'
    }
    for url in url_list:
        requests_url(url, headers)
        sys.exit()  # stops after the first URL; remove this line to process them all
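
As a follow-up to the find/find_all calls above: BeautifulSoup also supports CSS selectors through select(), which can read better than a long dot chain. A small sketch with made-up markup:

from bs4 import BeautifulSoup

html = '<div><ul><li class="on"><a href="/ep1.html">Episode 1</a></li></ul></div>'
soup = BeautifulSoup(html, 'html.parser')

# Tag, class, and attribute filters combined in one CSS expression
for a in soup.select('div ul li.on a[href]'):
    print(a['href'], a.text)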

3. selenium

        3.1 Introduction

                Selenium is a tool for testing web applications. Selenium tests run directly in the browser, exactly as a real user would operate it. Supported browsers include IE (7, 8, 9, 10, 11), Mozilla Firefox, Safari, Google Chrome, Opera, Edge, and others.

                Official documentation: Selenium with Python — Selenium Python Bindings 2 documentation
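
                Before the full example, a minimal sketch. It assumes Chrome and a matching chromedriver are installed; the --headless flag keeps a browser window from popping up:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')          # run without a visible window
driver = webdriver.Chrome(options=opts)
driver.get('https://www.python.org')
print(driver.title)                      # title of the fully rendered page
driver.quit()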

        3.2 selenium example

from selenium import webdriver
from selenium.webdriver.common.by import By
import sys
import time
import os  # used by the commented-out screenshot code below

def parser_url(url):
    driver.get(url)
    time.sleep(2)  # give the page time to render
    # pwd = os.path.split(os.path.realpath(__file__))[0]
    # # Save a screenshot of the page
    # driver.save_screenshot(pwd + '\\snapshot.png')
    try:
        # The find_elements_by_* helpers were removed in Selenium 4;
        # the By-based calls below work in both Selenium 3 and 4.
        # aa = driver.find_elements(By.CLASS_NAME, "dplayer-video")  # By.CLASS_NAME takes a single class name
        # aa = driver.find_element(By.ID, "viewport")
        aa = driver.find_elements(By.TAG_NAME, "div")  # find every element whose tag is <div>
        # aa = driver.find_element(By.LINK_TEXT, '电影').get_property("href")
        # aa = driver.title
        print(aa)  # a list of WebElement objects
        for i in aa:
            print(i.text)
    except Exception:
        print("Something went wrong")
    else:
        driver.close()
        sys.exit()  # stop after the first page that parses successfully

if __name__ == '__main__':
    driver = webdriver.Chrome()
    url_list = ['https://4480.tv/vod-play-id-63360-src-3-num-{}.html'.format(i) for i in range(9, 16)]
    print(url_list)
    for url in url_list:
        # selenium drives a real browser, so request headers are not passed in
        # per request this way; the dict below is a leftover from the requests version
        # headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62',
        #            'cookie': 'PHPSESSID=g65ipam1c0kgj8sctkcbfmkq35; __51vcke__JSfg3PPYsm2jAeAF=59365898-12ee-5c79-aee8-c07a733dec1f; __51vuft__JSfg3PPYsm2jAeAF=1649736012578; __51uvsct__JSfg3PPYsm2jAeAF=2; __vtins__JSfg3PPYsm2jAeAF=%7B%22sid%22%3A%20%22475e118d-15a5-55ef-8abd-75e4e54116f2%22%2C%20%22vd%22%3A%2020%2C%20%22stt%22%3A%202315702%2C%20%22dr%22%3A%20678768%2C%20%22expires%22%3A%201649765561094%2C%20%22ct%22%3A%201649763761094%7D'}
        parser_url(url)

'''
Example from the official documentation:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("http://www.python.org")
assert "Python" in driver.title
elem = driver.find_element_by_name("q")
elem.clear()
elem.send_keys("pycon")
elem.send_keys(Keys.RETURN)
assert "No results found." not in driver.page_source
driver.close()
'''

        3.3 Opening a page with a mobile user-agent

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

mobile_emulation = {
    # Device width/height and pixel ratio
    "deviceMetrics": {"width": 920, "height": 850, "pixelRatio": 3.0},
    # Emulate the phone through its user-agent string
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) "
                 "AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"
}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
driver = webdriver.Chrome(options=chrome_options)  # the older chrome_options= keyword is deprecated
driver.get("http://www.baidu.com")
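
A quick way to confirm the emulation took effect is to ask the page itself which user agent the browser reported:

# Should print the Nexus 5 UA string configured above
print(driver.execute_script("return navigator.userAgent"))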

4. Summary

        Besides the tools covered above, there are plenty of other libraries that work for crawling, such as urllib and urllib3. Since I am not yet very familiar with them, I will save them for a future update.
