赞
踩
接上篇:
Python爬虫练习
这里做了简单的优化,网址什么的老规矩隐藏掉。
目前不是完全体。
缺少部分:
import time
import logging
from io import BytesIO

import requests
import pytesseract
import xlwt
from bs4 import BeautifulSoup
from PIL import Image

from flyCodeList import flyCodes
from flyInfo import flyInfo


class flyspider:
    """Scrape flight-status pages, OCR the image-rendered fields, export to Excel.

    The target site's URLs are redacted in this file (shown as '...').
    """

    # Path the workbook is saved to by start().
    OUTPUT_PATH = 'D://FLYTMP.xls'

    # Column titles of the exported sheet, in column order.
    SHEET_HEADERS = (
        '序号',
        '航空公司',
        '航班号',
        '日期',
        '计划起飞',
        '实际起飞',
        '出发地',
        '计划到达',
        '实际到达',
        '到达地',
        '准点率',
        '是否为空',
    )

    # flyInfo attributes written to columns 1..11 (column 0 is the sequence number).
    ROW_FIELDS = (
        'fly_company',
        'fly_code',
        'fly_date',
        'plan_start_time',
        'real_start_time_ocr',
        'start_address',
        'plan_end_time',
        'real_end_time_ocr',
        'end_address',
        'time_performance',
        'is_empty',
    )

    def __init__(self):
        """Prepare the request headers sent with every request."""
        # NOTE(review): the Accept / Accept-Language values separate entries
        # with ';' where HTTP header lists normally use ','. Kept exactly as
        # published -- confirm against the target site before changing.
        self.headers = {
            'Accept': 'text/html;application/xhtml+xml;application/xml;q=0.9;image/avif;image/webp;image/apng;*/*;q=0.8;application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN;zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': '...',
            'Referer': '...',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        }
        # Replaced by the cookie jar of the most recent page request in
        # spiderRun(); None means "send no cookies".
        self.cookies = None

    def ocrWebImg(self, url):
        """Download the image at ``url`` and return the text OCR finds in it.

        This is best-effort: any download, decode or OCR failure is logged
        and an empty string is returned instead of raising.
        """
        try:
            # The context manager closes the response even if decoding fails,
            # and avoids touching an unbound name when the request itself fails.
            with requests.get(url + '/get', headers=self.headers,
                              cookies=self.cookies) as imgResp:
                image = Image.open(BytesIO(imgResp.content))
                return pytesseract.image_to_string(image)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still work.
            logging.warning('OCR failed for %s: %s', url, e)
            return ''

    def spiderRun(self, flyCode, flyDate):
        """Fetch the status page for one flight and return it as a flyInfo.

        ``fly_code`` and ``fly_date`` are always set on the result. The other
        fields are filled only when the request returns HTTP 200 and the page
        contains flight entries. When several entries are present, each one
        overwrites the previous, so the values of the last entry are returned.

        Raises whatever ``requests.get`` raises, and ``IndexError`` /
        ``KeyError`` if an entry lacks the expected <img> / <span> elements.
        """
        # NOTE(review): '/get' is appended after the query string; preserved
        # from the original since the real URL is redacted -- confirm.
        url = '...' + flyCode + '.html?AE71649A58c77&fdate=' + flyDate
        base_url = "..."

        flyInfoTmp = flyInfo()
        flyInfoTmp.fly_code = flyCode
        flyInfoTmp.fly_date = flyDate

        with requests.get(url + '/get', headers=self.headers) as r_get:
            # Remember the cookies so the image requests can reuse them.
            self.cookies = r_get.cookies
            if r_get.status_code != 200:
                print(r_get.status_code)
                return flyInfoTmp
            r_get.encoding = 'utf-8'
            text = r_get.text

        soup = BeautifulSoup(text, 'html.parser')
        # Pages containing <p class="t"> are skipped -- presumably a
        # "no result" notice; verify against the site.
        if soup.find('p', class_='t') is not None:
            return flyInfoTmp

        items = soup.find_all('div', class_="li_com")
        if items:
            flyInfoTmp.is_empty = 'N'
        for item in items:
            imgs = item.find_all('img')
            spans = item.find_all('span')
            # Airline (read from the first image's 'align' attribute).
            flyInfoTmp.fly_company = imgs[0].attrs['align']
            # Scheduled departure.
            flyInfoTmp.plan_start_time = spans[1].text.strip()
            # Actual departure (rendered as an image, hence OCR).
            flyInfoTmp.real_start_time_ocr = self.ocrWebImg(base_url + imgs[1]['src'])
            # Origin.
            flyInfoTmp.start_address = spans[3].text.strip()
            # Scheduled arrival.
            flyInfoTmp.plan_end_time = spans[4].text.strip()
            # Actual arrival (image, OCR).
            flyInfoTmp.real_end_time_ocr = self.ocrWebImg(base_url + imgs[2]['src'])
            # Destination.
            flyInfoTmp.end_address = spans[6].text.strip()
            # On-time rate (image, OCR).
            flyInfoTmp.time_performance = self.ocrWebImg(base_url + imgs[3]['src'])
        return flyInfoTmp

    def _write_row(self, sheet, seq, res):
        """Write one flyInfo result to row ``seq``; None fields become ''."""
        sheet.write(seq, 0, str(seq))
        for col, field in enumerate(self.ROW_FIELDS, start=1):
            value = getattr(res, field)
            sheet.write(seq, col, value if value is not None else '')

    def start(self, datastr, codes=None):
        """Scrape flights for one date and save the results to OUTPUT_PATH.

        Args:
            datastr: Date string to query. It is used only when it is not
                None and ``flyCodes.getflyList()`` is non-empty; otherwise
                today's date formatted as ``%Y-%m-%d`` is used.
            codes: Iterable of flight codes to query. Defaults to the single
                code ``'KN5977'``; pass ``flyCodes.getflyList()`` to scrape
                the whole list.

        A failure in any single query propagates and aborts the run; the
        header-only workbook saved beforehand is left on disk.
        """
        flyCodeList = flyCodes.getflyList()
        flyDate = time.strftime('%Y-%m-%d')
        if datastr is not None and flyCodeList is not None and len(flyCodeList) > 0:
            flyDate = datastr
        if codes is None:
            codes = ('KN5977',)

        wb = xlwt.Workbook()
        wbSheet = wb.add_sheet(flyDate)
        for col, title in enumerate(self.SHEET_HEADERS):
            wbSheet.write(0, col, title)
        # Save the header row first so a file exists even if scraping fails.
        wb.save(self.OUTPUT_PATH)

        for seq, code in enumerate(codes, start=1):
            res = self.spiderRun(code, flyDate)
            print(res)
            self._write_row(wbSheet, seq, res)
        wb.save(self.OUTPUT_PATH)


if __name__ == "__main__":
    flyutil = flyspider()
    flyutil.start('20201014')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。