赞
踩
一、需要导入的包
import requests
import pandas as pd
import time
import math
from lxml import etree
import re
二、初始化
def _load_or_create_table(path, columns):
    """Return the workbook at *path* as a DataFrame, creating it if absent.

    When the file does not exist, an empty table with *columns* is written
    to *path* and returned.
    """
    try:
        return pd.read_excel(path)
    except FileNotFoundError:
        print("--------------------创建新表格--------------------")
        table = pd.DataFrame(columns=columns)
        table.to_excel(path, index=False)
        return table


def init():
    """Load (or create) the two result workbooks and open an HTTP session.

    ``xxx1.xlsx`` is the table of scraped rows and ``xxx2.xlsx`` is the
    table of URLs for which nothing could be extracted.  Each workbook is
    loaded independently: previously a single ``try`` covered both reads,
    so a missing ``xxx2.xlsx`` caused an existing ``xxx1.xlsx`` to be
    overwritten with an empty table.

    Returns:
        tuple: ``(session, info, url, noInfo)`` where ``session`` is a new
        ``requests.Session``, ``info`` and ``noInfo`` are the two
        DataFrames, and ``url`` is the captcha URL string.
    """
    info = _load_or_create_table("xxx1.xlsx", ['a', 'b', 'c'])
    no_info = _load_or_create_table("xxx2.xlsx", ['d'])
    session = requests.Session()
    url = 'abcdefg'
    return session, info, url, no_info
三、获得请求头
def get_header():
    """Return the browser-like HTTP headers sent with every request.

    Returns:
        dict: A new header mapping on each call, so callers may modify the
        result without affecting later requests.
    """
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
    }
四、登录
def login(session):
    """Request the entry page through *session* and check that it succeeded.

    The response body is not used.
    NOTE(review): presumably this request exists so the session picks up
    the site's cookies before later calls -- confirm.

    Args:
        session: Object with a ``get(url, headers=...)`` method, as
            returned by ``init()``.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    response = session.get(
        "abcdefg", headers=get_header()
    )
    if response.status_code != 200:
        # The status code used to be passed as a second Exception argument,
        # so the "{}" placeholder was never filled in.
        raise Exception("http请求失败,错误代码:{}".format(response.status_code))
五、获得验证码
def get_captcha(url):
    """Download the captcha image at *url* and save it as ``captcha.jpeg``.

    This function reads the module-level ``session`` that the ``__main__``
    block assigns, so it only works after that assignment has run.

    Args:
        url: Address of the captcha image.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    # Fetch the captcha image.
    response = session.get(url, headers=get_header())
    if response.status_code != 200:
        # The status code used to be passed as a second Exception argument,
        # so the "{}" placeholder was never filled in.
        raise Exception("http请求失败,错误代码:{}".format(response.status_code))
    with open("captcha.jpeg", "wb") as f:
        f.write(response.content)
六、写入表格
def _amount_after_yen(raw):
    """Return the text between the first '¥' in *raw* and the next ')'.

    Returns "" when *raw* contains no '¥'; when no ')' follows, everything
    after the '¥' is returned.
    """
    _, marker, tail = raw.partition('¥')
    if not marker:
        return ""
    return tail.partition(")")[0]


def _digits_after_colon(raw):
    """Return the numeric runs found after the first ':' in *raw*, joined.

    Only the segment between the first and second ':' is searched.  Returns
    "" when *raw* contains no ':'.
    """
    segments = raw.split(":")
    if len(segments) < 2:
        return ""
    return ''.join(re.findall(r"\d+\.?\d*", segments[1]))


def write(target_url, res, info, noinfo):
    """Fetch one detail page, extract its fields and hand them to dealExcel.

    The page is requested through the module-level ``session``.  When the
    page has no text at the "name" XPath, *target_url* is appended to
    *noinfo*, ``xxx2.xlsx`` is rewritten, and nothing else is extracted.

    Args:
        target_url: Address of the detail page.
        res: Unused; kept so existing callers keep working.  (The original
            code overwrote it with the HTTP response immediately.)
        info: Unused here; ``dealExcel`` appends to the module-level
            ``info`` table instead.
        noinfo: DataFrame collecting the URLs that yielded no data.

    Raises:
        Exception: If the HTTP status code is not 200.
    """
    page = session.get(target_url, headers=get_header())
    page.encoding = 'utf-8'
    if page.status_code != 200:
        # The status code used to be passed as a second Exception argument,
        # so the "{}" placeholder was never filled in.
        raise Exception("http请求失败,错误代码:{}".format(page.status_code))
    tree = etree.HTML(page.text)

    name_xpath = '//*[@id="sealInfo02"]/div[6]/table/tbody/tr[1]/td[4]/span/text()'
    if len(tree.xpath(name_xpath)) == 0:
        # Index by noinfo's own length; using len(info) here overwrote or
        # misplaced rows because the two tables grow independently.
        noinfo.loc[len(noinfo)] = target_url
        noinfo.to_excel("xxx2.xlsx", index=False)
        return

    name = dealXpath(tree, name_xpath, '', '')
    addr = dealXpath(tree, '//*[@id="sealInfo02"]/div[6]/table/tbody/tr[2]/td[4]/span/span/text()', '', '')
    phone = dealXpath(tree, '//*[@id="contactMethod"]/text()', '', '')
    contract_number = dealXpath(tree, '//*[@id="sealInfo02"]/p[8]/span[2]/span/text()', '//*[@id="sealInfo02"]/p[9]/span[2]/text()', '')
    contract_name = dealXpath(tree, '//*[@id="sealInfo02"]/p[8]/span[4]/span/text()', '', '')
    # A missing or unexpectedly formatted amount/date now yields "" instead
    # of an IndexError/ValueError escaping to the caller.
    money = _amount_after_yen(dealXpath(tree, '//*[@id="htjebq"]/text()', '', ''))
    start_date = _digits_after_colon(dealXpath(tree, '//*[@id="content"]/p[4]/span[3]/text()', '', ''))
    end_date = dealXpath(tree, '//*[@id="sealInfo02"]/p[35]/span[2]/span/span/text()', '//*[@id="sealInfo02"]/p[36]/span[2]/span/text()', '//*[@id="sealInfo02"]/p[37]/span[2]/span/span/text()')
    dealExcel(name, addr, phone, contract_number, contract_name, money, start_date, end_date)
七、处理xpath获得具体信息
def dealXpath(tree, xpath1, xpath2, xpath3):
    """Return the first result of the first XPath expression that matches.

    The expressions are tried in order; empty-string expressions are
    skipped without being evaluated.  Each expression is evaluated at most
    once (the original evaluated a matching expression twice).

    Args:
        tree: Object with an ``xpath(expression)`` method returning a
            sequence of results.
        xpath1: First expression to try, or "" to skip.
        xpath2: Fallback expression, or "" to skip.
        xpath3: Second fallback expression, or "" to skip.

    Returns:
        str: ``str()`` of the first result of the first expression that
        yields a non-empty sequence, or "" if none does.
    """
    for expression in (xpath1, xpath2, xpath3):
        if expression == '':
            continue
        matches = tree.xpath(expression)
        if len(matches) != 0:
            return str(matches[0])
    return ""
八、写入表格的具体操作
def dealExcel(name, addr, phone, contract_number, contract_name, money, start_date, end_date):
    """Append one scraped record to the results table and save it.

    The record is printed, appended as a new row to the module-level
    ``info`` DataFrame (assigned in the ``__main__`` block), and the whole
    table is rewritten to ``xxx1.xlsx``.

    NOTE(review): ``init()`` creates ``info`` with three columns while this
    row has eight values -- the column list there looks like a placeholder;
    confirm the two agree in the real file.
    """
    info_list = [
        name,
        addr,
        phone,
        contract_number,
        contract_name,
        money,
        start_date,
        end_date,
    ]
    print(info_list)
    info.loc[len(info)] = info_list
    info.to_excel("xxx1.xlsx", index=False)
九、处理的总操作
def deal(session, info, page, verifyCode, noinfo, url):
    """Fetch one listing page and process up to ten of its entries.

    The listing is requested through *session*, parsed as JSON, and each
    entry's ``htmlpath`` is turned into a detail URL that is passed to
    ``write``.

    Args:
        session: Object with a ``get(url, headers=...)`` method.
        info: Results table, forwarded to ``write``.
        page: Page number to request.
        verifyCode: Captcha text entered by the user.
        noinfo: Table of URLs without data, forwarded to ``write``.
        url: Unused; kept so existing callers keep working.

    Raises:
        Exception: If the HTTP status code is not 200.
        IndexError: If the listing contains no entries.
    """
    print("--------------------正在爬取第" + str(page) + "页--------------------")
    response = session.get(
        'abc'
        + str(page) +
        'def'
        + verifyCode +
        'ghi', headers=get_header()
    )
    if response.status_code != 200:
        # The status code used to be passed as a second Exception argument,
        # so the "{}" placeholder was never filled in.
        raise Exception("http请求失败,错误代码:{}".format(response.status_code))

    # Parse the body once instead of on every loop iteration.
    res = response.json()
    # Slicing tolerates a page with fewer than ten entries; the fixed
    # range(0, 10) wrote the available rows and then raised IndexError,
    # which made the caller retry the page and duplicate those rows.
    entries = res['data'][:10]
    if not entries:
        # An empty listing raised IndexError before; keep that so the
        # caller's retry path is still triggered.
        raise IndexError("no entries returned for page {}".format(page))
    for entry in entries:
        target_url = "abc" + entry['htmlpath']
        write(target_url, res, info, noinfo)
十、主函数
if __name__ == '__main__':
    # Script entry point.  ``session`` and ``info`` must stay module-level
    # names: get_captcha(), write() and dealExcel() read them as globals.
    session, info, url, noinfo = init()
    login(session)
    get_captcha(url)
    print("请输入验证码:")
    verifyCode = input()
    page = 1
    # Runs until interrupted (e.g. Ctrl-C); there is no end-of-data check.
    while True:
        try:
            deal(session, info, page, verifyCode, noinfo, url)
        except Exception as exc:
            # Every failure is treated as an expired/incorrect captcha, so
            # show the real cause before asking for a new code; otherwise a
            # parsing or network error is indistinguishable from a bad code.
            print("爬取第{}页时出错: {!r}".format(page, exc))
            get_captcha(url)
            print("验证码失效或错误,请重新输入:")
            verifyCode = input()
            continue
        page += 1
十一、可优化点
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。