当前位置:   article > 正文

Python 爬取航班信息_python查询一个月的航班

python查询一个月的航班

接上篇:
Python爬虫练习

这里做了简单的优化,网址什么的老规矩隐藏掉。
目前不是完全体。

缺少部分:

  • 异常场景处理
  • 该网站做了反爬处理,需要使用代理池,这个后续有时间再搞吧
import time
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import pytesseract
import logging
import xlwt

from flyCodeList import flyCodes
from flyInfo import flyInfo


class flyspider:
    """Scrape one flight's status page and export the result to an .xls file.

    The page's text fields are read with BeautifulSoup; the fields the site
    renders as images are fetched separately and run through pytesseract.
    The target URLs are '...' placeholders in this file and must be filled in
    before the spider can reach a real site.
    """

    # Seconds to wait on any single HTTP request before giving up, so a
    # stalled connection cannot hang the whole run.
    REQUEST_TIMEOUT = 10

    # Workbook written by start().
    OUTPUT_PATH = 'D://FLYTMP.xls'

    # (column title, flyInfo attribute) for sheet columns 1..11.
    # Column 0 holds the row sequence number.
    COLUMNS = (
        ('航空公司', 'fly_company'),
        ('航班号', 'fly_code'),
        ('日期', 'fly_date'),
        ('计划起飞', 'plan_start_time'),
        ('实际起飞', 'real_start_time_ocr'),
        ('出发地', 'start_address'),
        ('计划到达', 'plan_end_time'),
        ('实际到达', 'real_end_time_ocr'),
        ('到达地', 'end_address'),
        ('准点率', 'time_performance'),
        ('是否为空', 'is_empty'),
    )

    def __init__(self):
        """Build the browser-like request headers and an empty cookie store."""
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
        host = '...'
        connection = 'keep-alive'
        # NOTE(review): the Accept / Accept-Language values separate their
        # alternatives with ';' where HTTP header lists normally use ','.
        # Left unchanged here; confirm against what the site accepts.
        accept_Language = 'zh-CN;zh;q=0.9'
        accept = 'text/html;application/xhtml+xml;application/xml;q=0.9;image/avif;image/webp;image/apng;*/*;q=0.8;application/signed-exchange;v=b3;q=0.9'
        referer = '...'

        self.headers = {
            'Accept': accept,
            'Accept-Language': accept_Language,
            'Connection': connection,
            'Host': host,
            'Referer': referer,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent
        }
        # Replaced by the cookie jar of the latest page response in spiderRun().
        self.cookies = ''

    def ocrWebImg(self, url):
        """Download the image at ``url + '/get'`` and return its OCR text.

        This is best-effort: any failure (network, image decoding, OCR) is
        logged and an empty string is returned, so one unreadable image does
        not abort the whole scrape.

        Args:
            url: Image URL without the trailing '/get' suffix.

        Returns:
            The text recognised by pytesseract, or '' on failure.
        """
        ocrStr = ''
        # Stays None if the request itself fails, so the cleanup below never
        # touches an unbound name.
        imgResp = None
        try:
            imgResp = requests.get(url + '/get', headers=self.headers,
                                   cookies=self.cookies,
                                   timeout=self.REQUEST_TIMEOUT)
            image = Image.open(BytesIO(imgResp.content))
            ocrStr = pytesseract.image_to_string(image)
        except Exception:
            # Exception rather than BaseException: Ctrl-C must still stop the run.
            logging.warning('OCR failed for %s', url, exc_info=True)
        finally:
            if imgResp is not None:
                imgResp.close()
        return ocrStr

    def _emptyFlyInfo(self, flyCode, flyDate):
        """Return a flyInfo carrying only the flight code and date."""
        info = flyInfo()
        info.fly_code = flyCode
        info.fly_date = flyDate
        return info

    def spiderRun(self, flyCode, flyDate):
        """Fetch and parse the status page of one flight on one date.

        Args:
            flyCode: Flight number inserted into the page URL.
            flyDate: Date string passed as the ``fdate`` query parameter.

        Returns:
            A flyInfo object. When the request fails, the status is not 200,
            or the page lists no flights, only ``fly_code`` and ``fly_date``
            are filled in.
        """
        url = '...' + flyCode + '.html?AE71649A58c77&fdate=' + flyDate
        base_url = "..."
        flyInfoTmp = self._emptyFlyInfo(flyCode, flyDate)

        try:
            r_get = requests.get(url + '/get', headers=self.headers,
                                 timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # Treat a transport failure like a non-200 answer: placeholder row.
            logging.warning('Request for %s on %s failed', flyCode, flyDate,
                            exc_info=True)
            return flyInfoTmp

        try:
            # The image requests in ocrWebImg() reuse the page's cookies.
            self.cookies = r_get.cookies
            if r_get.status_code != 200:
                logging.warning('Unexpected status %s for %s on %s',
                                r_get.status_code, flyCode, flyDate)
                return flyInfoTmp
            r_get.encoding = 'utf-8'
            soup = BeautifulSoup(r_get.text, 'html.parser')
        finally:
            # The body is fully read by now; release the connection on every path.
            r_get.close()

        # presumably <p class="t"> is the site's "no result" notice — TODO confirm
        if soup.find('p', class_='t') is not None:
            return flyInfoTmp

        items = soup.find_all('div', class_="li_com")
        if items:
            flyInfoTmp.is_empty = 'N'
        # NOTE(review): every row overwrites the same record, so only the last
        # listed flight is returned; kept as in the original.
        for item in items:
            imgs = item.find_all('img')
            spans = item.find_all('span')
            # Airline name is read from the logo image's 'align' attribute.
            flyInfoTmp.fly_company = imgs[0].attrs['align']
            # Scheduled / actual departure and origin.
            flyInfoTmp.plan_start_time = spans[1].text.strip()
            flyInfoTmp.real_start_time_ocr = self.ocrWebImg(base_url + imgs[1]['src'])
            flyInfoTmp.start_address = spans[3].text.strip()
            # Scheduled / actual arrival and destination.
            flyInfoTmp.plan_end_time = spans[4].text.strip()
            flyInfoTmp.real_end_time_ocr = self.ocrWebImg(base_url + imgs[2]['src'])
            flyInfoTmp.end_address = spans[6].text.strip()
            # On-time rate.
            flyInfoTmp.time_performance = self.ocrWebImg(base_url + imgs[3]['src'])
        return flyInfoTmp

    def _writeHeader(self, sheet):
        """Write the title row (row 0) of the export sheet."""
        sheet.write(0, 0, '序号')
        for col, (title, _attr) in enumerate(self.COLUMNS, start=1):
            sheet.write(0, col, title)

    def _writeRow(self, sheet, seq, res):
        """Write one flyInfo record to row ``seq``; None fields become ''."""
        sheet.write(seq, 0, str(seq))
        for col, (_title, attr) in enumerate(self.COLUMNS, start=1):
            value = getattr(res, attr)
            sheet.write(seq, col, value if value is not None else '')

    def start(self, datastr, flyCode='KN5977'):
        """Scrape one flight for one date and save it to OUTPUT_PATH.

        Args:
            datastr: Date to query; also used as the sheet name. When None,
                today's local date formatted as '%Y-%m-%d' is used.
                NOTE(review): the script entry point passes 'YYYYMMDD';
                confirm which format the site expects.
            flyCode: Flight number to query. Defaults to the value that was
                previously hard-coded.

        Nothing is written when flyCodes.getflyList() is empty or None.
        """
        flyCodeList = flyCodes.getflyList()
        # Original guard kept: no configured flights means nothing to export.
        if not flyCodeList:
            return

        flyDate = datastr if datastr is not None else time.strftime('%Y-%m-%d')

        wb = xlwt.Workbook()
        wbSheet = wb.add_sheet(flyDate)
        self._writeHeader(wbSheet)
        # Save the header first so a file exists even if the scrape below fails.
        wb.save(self.OUTPUT_PATH)

        res = self.spiderRun(flyCode, flyDate)
        print(res)
        self._writeRow(wbSheet, 1, res)
        wb.save(self.OUTPUT_PATH)



if __name__ == "__main__":
    # Script entry point: run one scrape-and-export for the fixed date below.
    flyspider().start('20201014')


  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小蓝xlanll/article/detail/75801
推荐阅读
相关标签
  

闽ICP备14008679号