
[Web Scraping] Using requests and etree to Scrape Specific Information

1. Packages to Import

  1. requests: sends HTTP requests to the target site
  2. pandas: handles the tabular data
  3. time: gets the current timestamp
  4. math: provides math functions (here, math.floor)
  5. etree: parses the fetched HTML (imported from lxml)
  6. re: regular-expression matching
import requests
import pandas as pd
import time
import math
from lxml import etree
import re

2. Initialization

  • Create (or load) the spreadsheets
  • Create a requests session
  • Build the target URL
def init():
    try:
        # Reload previously scraped results so an interrupted crawl can resume
        info = pd.read_excel("xxx1.xlsx")
        noInfo = pd.read_excel("xxx2.xlsx")
    except FileNotFoundError:
        print("-------------------- Creating new spreadsheets --------------------")
        # Placeholder column names; they must match the eight fields
        # appended in dealExcel() below.
        column = ['a', 'b', 'c']
        info = pd.DataFrame(columns=column)
        info.to_excel("xxx1.xlsx", index=False)
        noInfo = pd.DataFrame(columns=['d'])
        noInfo.to_excel("xxx2.xlsx", index=False)
    session = requests.Session()
    timeStamp = math.floor(time.time())  # current Unix timestamp, second precision
    url = 'abcdefg'  # placeholder for the real captcha URL
    return session, info, url, noInfo

3. Building the Request Headers

def get_header():
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
    }
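
As a side note, requests also lets you register headers on the session once instead of passing them to every call; a minimal sketch (the URL is a stand-in, as elsewhere in this post):

session = requests.Session()
# Headers registered here are merged into every subsequent session request.
session.headers.update(get_header())
response = session.get("abcdefg")  # placeholder URL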


4. Logging In

  • The login flow differs from site to site.
def login(session):
    # Hit the login page (URL redacted in the post) to obtain session cookies
    response = session.get(
        "abcdefg", headers=get_header()
    )
    if response.status_code != 200:
        raise Exception("HTTP request failed, status code: {}".format(response.status_code))
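
Note that requests ships a built-in helper for this check: response.raise_for_status() raises requests.HTTPError for any 4xx/5xx response, so the manual comparison above (and in the functions below) could be replaced with:

# Equivalent built-in check; raises requests.HTTPError on 4xx/5xx responses.
response.raise_for_status()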

5. Fetching the Captcha

def get_captcha(session, url):
    # Download the captcha image and save it for the user to read.
    # The original took only `url` and relied on a global session; the
    # session is passed explicitly here so the function is self-contained.
    response = session.get(url, headers=get_header())
    if response.status_code != 200:
        raise Exception("HTTP request failed, status code: {}".format(response.status_code))
    with open("captcha.jpeg", "wb") as f:
        f.write(response.content)
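
If Pillow is installed, the saved image can be popped up straight away so the user does not have to hunt for the file; this is an optional convenience, not part of the original post:

# Optional: show the captcha in the default image viewer (requires Pillow).
from PIL import Image

Image.open("captcha.jpeg").show()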
    

6. Writing to the Spreadsheet

  • Parse the fetched response with etree
  • Locate each piece of information with its XPath
  • Extract the text with the dealXpath() helper
def write(session, target_url, info, noinfo):
    # The original signature also took a `res` argument that was
    # immediately overwritten; it is dropped here, and the session is
    # passed in explicitly instead of being read from a global.
    res = session.get(target_url, headers=get_header())
    res.encoding = 'utf-8'
    if res.status_code != 200:
        raise Exception("HTTP request failed, status code: {}".format(res.status_code))
    tree = etree.HTML(res.text)
    # No name node means the page carries no usable record: log the URL
    # to the "no info" sheet and move on.
    if len(tree.xpath('//*[@id="sealInfo02"]/div[6]/table/tbody/tr[1]/td[4]/span/text()')) == 0:
        noinfo.loc[len(noinfo)] = target_url
        noinfo.to_excel("xxx2.xlsx", index=False)
        return
    name = dealXpath(tree, '//*[@id="sealInfo02"]/div[6]/table/tbody/tr[1]/td[4]/span/text()', '', '')
    addr = dealXpath(tree, '//*[@id="sealInfo02"]/div[6]/table/tbody/tr[2]/td[4]/span/span/text()', '', '')
    phone = dealXpath(tree, '//*[@id="contactMethod"]/text()', '', '')
    contract_number = dealXpath(tree, '//*[@id="sealInfo02"]/p[8]/span[2]/span/text()', '//*[@id="sealInfo02"]/p[9]/span[2]/text()', '')
    contract_name = dealXpath(tree, '//*[@id="sealInfo02"]/p[8]/span[4]/span/text()', '', '')
    # The amount node reads like "...(¥12345)": keep what sits between ¥ and )
    money = dealXpath(tree, '//*[@id="htjebq"]/text()', '', '')
    money = money.split('¥')[1]
    money = money[:money.index(")")]
    # The start date follows a full-width colon; keep only the numeric parts
    start_date = dealXpath(tree, '//*[@id="content"]/p[4]/span[3]/text()', '', '')
    start_date = re.findall(r"\d+\.?\d*", start_date.split(":")[1])
    start_date = ''.join(start_date)
    end_date = dealXpath(tree, '//*[@id="sealInfo02"]/p[35]/span[2]/span/span/text()', '//*[@id="sealInfo02"]/p[36]/span[2]/span/text()', '//*[@id="sealInfo02"]/p[37]/span[2]/span/span/text()')
    dealExcel(info, name, addr, phone, contract_number, contract_name, money, start_date, end_date)
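
For readers new to lxml, here is a tiny self-contained illustration of the parse-then-XPath pattern used above (the HTML snippet and XPath are made up for the demo):

from lxml import etree

# A stand-in page; the real pages are much deeper.
html = '<div id="sealInfo02"><span>ACME Co.</span></div>'
tree = etree.HTML(html)

# xpath() always returns a list; an empty list means the node is absent.
hits = tree.xpath('//*[@id="sealInfo02"]/span/text()')
print(hits)     # ['ACME Co.']
print(hits[0])  # ACME Co.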

7. Extracting Values from XPath Results

def dealXpath(tree, xpath1, xpath2, xpath3):
    # Try each non-empty XPath in order; return the first match's text,
    # or "" if nothing matches.
    for xpath in (xpath1, xpath2, xpath3):
        if xpath != '' and len(tree.xpath(xpath)) != 0:
            return str(tree.xpath(xpath)[0])
    return ""


8. Appending a Row to the Spreadsheet

def dealExcel(info, name, addr, phone, contract_number, contract_name, money, start_date, end_date):
    # The original read `info` from a global; it is passed in explicitly here.
    # NB: the columns defined in init() must match these eight fields.
    info_list = [name, addr, phone, contract_number, contract_name, money, start_date, end_date]
    print(info_list)
    # Append the record as a new row and persist immediately, so an
    # interrupted crawl loses at most the current row.
    info.loc[len(info)] = info_list
    info.to_excel("xxx1.xlsx", index=False)

9. The Main Scraping Routine

def deal(session, info, page, verifyCode, noinfo, url):
    print("-------------------- Scraping page " + str(page) + " --------------------")
    # 'abc', 'def' and 'ghi' stand in for the redacted parts of the paged
    # list URL, which embeds the page number and the captcha.
    response = session.get(
        'abc'
        + str(page) +
        'def'
        + verifyCode +
        'ghi', headers=get_header()
    )
    if response.status_code != 200:
        raise Exception("HTTP request failed, status code: {}".format(response.status_code))
    res = response.json()  # parse the JSON once, outside the loop
    # Each list page carries ten records; build each detail URL and scrape it
    for i in range(0, 10):
        target_url = "abc" + res['data'][i]['htmlpath']
        write(session, target_url, info, noinfo)
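
As a sketch of a cleaner alternative, requests can assemble the query string itself; the parameter names below are invented, since the real URL is redacted in the post:

# Hypothetical parameter names; the real ones depend on the target site.
response = session.get(
    'abc',  # redacted base list URL
    params={'page': page, 'verifyCode': verifyCode},
    headers=get_header(),
)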


10. Main Function

  • When the captcha expires, the exception is caught, a fresh captcha is requested, and the crawl resumes from the current page.
if __name__ == '__main__':
    session, info, url, noinfo = init()
    login(session)
    get_captcha(session, url)
    print("Please enter the captcha:")
    verifyCode = input()
    page = 1
    while True:
        try:
            deal(session, info, page, verifyCode, noinfo, url)
        except Exception:
            # Treat any failure as an expired or mistyped captcha:
            # fetch a fresh one and retry the same page.
            get_captcha(session, url)
            print("Captcha expired or incorrect, please re-enter:")
            verifyCode = input()
            continue
        page += 1
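
The post advertises breakpoint resumption; the captcha loop above retries the same page, and one simple way to also survive a full restart (a sketch, assuming exactly ten records per page as in deal()) is to derive the starting page from the rows already saved:

# Resume sketch: with 10 records per page, the saved row count tells
# us how many full pages were already scraped.
page = len(info) // 10 + 1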
       

11. Possible Improvements

  • Multithreaded scraping (a sketch follows below)
  • Automatic captcha recognition, e.g. with an OCR library
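
A minimal sketch of the first idea, assuming the detail URLs are already known. Sharing a requests.Session across threads usually works for simple GETs but is not officially guaranteed thread-safe, and the pandas/Excel writes in write() are not thread-safe at all, so a real version would serialize them:

from concurrent.futures import ThreadPoolExecutor

def scrape_all(session, info, noinfo, target_urls):
    # Hypothetical driver: fetch detail pages in parallel with a small
    # worker pool. write() is the function from section 6.
    with ThreadPoolExecutor(max_workers=4) as pool:
        for target_url in target_urls:
            pool.submit(write, session, target_url, info, noinfo)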