赞
踩
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests
案例:百度
import requests
url = "http://www.baidu.com"
# 发送 get 请求
response = requests.get(url)
# 设置字符集 (根据爬取网页 charset=utf-8)
response.encoding = 'utf8'
# 获取网页源代码
print(response.text)
案例:百度翻译
import requests
url = "https://fanyi.baidu.com/sug"
# 搜索词
data = {
"kw":"love"
}
# 发送 get 请求
response = requests.post(url,data)
# 如果返回值是 json 数据,可直接调用 json() 方法
print(response.json())
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52"
}
response = requests.get(url, headers=headers)
proxy = {
"http": "121.233.212.34:3432"
}
response = requests.get(url, proxies=proxy)
案例:17k小说网
import requests
sessions = requests.session()
url = 'https://passport.17k.com/ck/user/login'
data = {
"loginName": "15675521581",
"password": "**"
}
res = sessions.post(url, data=data)
# print(res.cookies)
result = sessions.get(url = 'https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919')
print(result.json())
案例:梨视频
import requests # 抓包的视频地址 https://video.pearvideo.com/mp4/short/20161103/1683211212907-10041181-hd.mp4 # 实际播放的地址 https://video.pearvideo.com/mp4/short/20161103/cont-1007270-10041181-hd.mp4 # 比较两个地址需要把抓包地址后面的时间戳替换为 cont-id url = 'https://www.pearvideo.com/videoStatus.jsp?contId=1007270&mrd=0.2400885482825692' refererUrl = "https://www.pearvideo.com/video_1007270" contId = refererUrl.split('_')[1] headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", "Referer": refererUrl } res = requests.get(url, headers=headers) # 抓包返回的数据 dict = res.json() print(dict) # 抓包返回的时间戳 systemTime = dict['systemTime'] # 抓包返回的视频地址 srcUrl = dict['videoInfo']['videos']['srcUrl'] # 地址处理 newUrl = srcUrl.replace(systemTime, f"cont-{contId}") print(newUrl) # 下载视频到本地 with open('test.mp4', mode='wb') as f: f.write(requests.get(newUrl).content)
findall
import re
result = re.findall(r"\d+","今天写了2行代码,赚了200元。")
print(result) # ['2', '200'] 返回列表
search
import re
result = re.search(r"\d+","今天写了2行代码,赚了200元。")
print(result) # <re.Match object; span=(4, 5), match='2'> 只匹配第一个值,然后返回
print(result.group()) # 2 从 match 获取匹配的值
finditer
import re
result = re.finditer(r"\d+","今天写了2行代码,赚了200元。")
print(result) # <callable_iterator object at 0x000001B01E0DAE00> 把所有结果放在迭代器里
for item in result:
#<re.Match object; span=(4, 5), match='2'> 2
# <re.Match object; span=(11, 14), match='200'> 200
print(item,item.group())
提前写好正则表达式
import re
obj = re.compile(r"\d+")
result = obj.findall("今天写了2行代码,赚了200元。")
print(result) # ['2', '200']
匹配换行 re.S
obj = re.compile(r"\d+",re.S)
在 html 中使用
import re s = """ <div> <div><a href="http://www.baidu.com">Baidu</a></div> <div><a href="http://www.weixin.com">weixin</a></div> </div> """ # 输出如下格式 # http://www.baidu.com Baidu # http://www.weixin.com weixin obj = re.compile(r'<div><a href="(.*?)">(.*?)</a></div>') result = obj.finditer(s) for item in result: print(item.group(1),item.group(2))
原子组使用别名
obj = re.compile(r'<div><a href="(?P<url>.*?)">(?P<title>.*?)</a></div>')
result = obj.finditer(s)
for item in result:
print(item.group('url'),item.group('title'))
import requests import re url = 'https://movie.douban.com/chart' headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52" } response = requests.get(url, headers=headers,) response.encoding = 'utf-8' # re.S 可以让 re 匹配到换行符 obj = re.compile(r'<dl class="">.*?<dd>.*?<a .*?class="">(.*?)</a>',re.S) content = obj.finditer(response.text) for item in content: print(str(item.group(1)).strip()) # 哈利·波特与凤凰社 # 茶馆 # 喜剧之王 # 萤火虫之墓 # 少年派的奇幻漂流 # 上帝之城 # 疯狂原始人 # 机器人总动员 # 泰坦尼克号 # 谍影重重 # 红辣椒 # 神偷奶爸
错误的原因是解析的html文件中,开始的标签和结束的标签不匹配
根据报错提示,找到对应的位置添加结束标签即可
单标签在括号里添加 /
双标签写上对应的结束标签
下载
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple lxml
from lxml import etree
tree = etree.parse('index.html')
from lxml import etree
tree = etree.HTML()
html
<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8" /> <title>Title</title> </head> <body> <ul> <li class="zs">zs</li> <li >ls</li> <li class="xr">xr</li> <li id="xr" class="xr">xr2</li> </ul> <img src="icon.png" alt="" /> </body> </html>
基本语法示例
from lxml import etree tree = etree.parse('index.html') # 查找 ul 下的所有 li print(tree.xpath("//ul/li")) # [<Element li at 0x20b094f3dc0>, <Element li at 0x20b094f3e80>, <Element li at 0x20b094f3f00>] # 获取 ul 下 li class="xr" 值 print(tree.xpath("//ul/li[@class='xr']/text()")) # ['xr', 'xr2'] # 查找 ul 下第一个 li print(tree.xpath("//ul/li[1]/text()")) # ['zs'] # 获取 img 标签 src 属性值 print(tree.xpath("//img/@src")) # ['icon.png'] # 查找 ul 下 li id="xr" 并且 class="xr" print(tree.xpath("//ul/li[@class='xr' and @id='xr']/text()")) # ['xr2']
下载
pip install selenium
下载谷歌驱动
https://chromedriver.storage.googleapis.com/index.html
基本使用
from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from time import sleep url = 'https://www.lagou.com' # path = Service('./chromedriver.exe') # 取消自动关闭浏览器 option = webdriver.ChromeOptions() option.add_experimental_option("detach", True) # 创建浏览器对象 driver = webdriver.Chrome(options=option) # driver = webdriver.Chrome() # 最大化窗口 # driver.maximize_window() # 最小化窗口 # driver.minimize_window() # 打开地址 driver.get(url) # 获取网站的 title 标签内容 # print(driver.title) # 通过 xpath 查找元素 btn = driver.find_element(by=By.XPATH, value='//*[@id="cboxClose"]') # <button type="button" id="cboxClose">close</button> # 获取值 和 type 属性 ---- close button print(btn.text, btn.get_attribute('type')) # 点击按钮 btn.click() sleep(2) # 根据 id 获取 search_btn = driver.find_element(by=By.ID,value='search_input') # 向 input 输入内容 search_btn.send_keys('python') sleep(1) # 清空输入框 search_btn.clear() # 执行 js 代码,删除某元素 driver.execute_script(""" const ad = document.querySelector('.un-login-banner') ad.parentNode.removeChild(ad) """) # 切换窗口 从 0 开始 -1 为最后一个 # driver.switch_to.window(driver.window_handles[-1]) # driver.switch_to.window(driver.window_handles[0]) # 关闭窗口 # driver.close() # 关闭浏览器 # driver.quit()
无头浏览器
# 无头浏览器 即不打开浏览器
option.add_argument("--headless")
option.add_argument("--disable-gpu")
# 创建浏览器对象
driver = webdriver.Chrome(options=option)
iframe
# iframe 切换到 iframe
# iframe = driver.find_element(by=By.ID, value='search_button')
# driver.switch_to.frame(iframe)
# 跳出 iframe
# driver.switch_to.parent_frame()
# 获取页面代码 是 f12 里的代码 不是源代码
# driver.page_source
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。