
A Python Multithreaded Crawler Using Queue

For work-confidentiality reasons, every URL in the code below has been replaced with ***.
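The crawler is built on the standard producer-consumer pattern: queue.Queue safely hands work items from one thread to many worker threads, task_done() acknowledges each finished item, and join() blocks until every item has been acknowledged. Here is a minimal, self-contained sketch of that pattern (the names worker and job_queue are illustrative only, not part of the crawler below):

import threading
from queue import Queue

job_queue = Queue()

def worker():
    while True:
        job = job_queue.get()             # blocks until an item is available
        try:
            print('processing job', job)  # the real work would happen here
        finally:
            job_queue.task_done()         # always acknowledge the item

for _ in range(3):
    t = threading.Thread(target=worker)
    t.daemon = True                       # daemon threads exit with the main thread
    t.start()

for job in range(10):
    job_queue.put(job)

job_queue.join()                          # returns once every item is acknowledged

The full crawler uses exactly this structure, with two queues chained together: urlQueue feeds the download threads, and resQueue feeds the parse threads.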

import logging
import random
import time
import requests
from lxml import etree
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from datetime import datetime
from queue import Queue
import threading

es = Elasticsearch(hosts='192.168.126.90', port=9200)  # connect to Elasticsearch

class Freedom(object):
    def __init__(self):
        self.log = self.get_log()
        self.headers, self.proxies_list, self.data = self.get_headers()
        self.urlQueue = Queue()
        self.resQueue = Queue()

    def get_log(self):
        logger = logging.getLogger(__name__)   # logger instance
        logger.setLevel(level=logging.INFO)    # log level
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')  # timestamp, logger name, level, message
        sh = logging.StreamHandler()  # output to the console
        sh.setFormatter(formatter)    # console output format
        today = datetime.now()
        log_file_path = "./log/form-{}-{}-{}.log".format(today.year, today.month, today.day)
        handler = logging.FileHandler(log_file_path, encoding='utf-8')  # output to a file
        handler.setFormatter(formatter)  # file output format
        logger.addHandler(handler)       # attach both handlers to the logger
        logger.addHandler(sh)
        return logger

    def get_headers(self):
        proxies_list = [
            {"http": "192.168.126.110:9008"},
            {"http": "192.168.126.107:9398"},
            {"http": "192.168.126.106:9398"},
            {"http": "192.168.126.105:9398"},
            {"http": "192.168.126.108:9398"},
        ]
        data = {
            'name': 'qwertyuiopl',
            'passwd': 'Qwertyuiopl123'
        }
        headers = {
            'Host': '**********************.onion',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1',
        }
        return headers, proxies_list, data

    def main(self):
        self.get_url()  # fill the URL queue

        thread_list = []  # holds every worker thread
        for i in range(5):
            Content = threading.Thread(target=self.getInfo)  # download threads
            thread_list.append(Content)

        for j in range(3):
            Parse = threading.Thread(target=self.getParse)  # parse threads
            thread_list.append(Parse)

        for th in thread_list:
            th.daemon = True  # daemon threads exit together with the main thread
            th.start()

        self.urlQueue.join()  # block until every URL has been processed
        self.resQueue.join()  # block until every response has been parsed

    # Fetch every detail-page URL and put it on the queue
    def get_url(self):
        url_login = 'http://*************************************************'
        proxies = random.choice(self.proxies_list)  # pick a random proxy
        self.session = requests.session()  # a session keeps the login cookies across requests
        r = self.session.post(url_login, headers=self.headers, proxies=proxies, data=self.data)  # log in
        first_page = etree.HTML(r.text)
        url_good = first_page.xpath('//div[@class="col-md-2"]/a/@href')[0]  # product catalogue
        res = self.session.get(url_good, headers=self.headers, proxies=proxies)
        second_page = etree.HTML(res.text)
        urls = second_page.xpath('//div[@class="post-item p-1"]/h4/a/@href')  # detail-page links
        for url in urls:
            print(url)
            self.urlQueue.put(url)  # enqueue every URL
        while True:
            time.sleep(0.01)
            try:
                next_page = second_page.xpath('//div[@class="d-flex mt-5 justify-content-center"]/ul/li[last()]/a/@href')[0]  # next-page link
                response = self.session.get(next_page, headers=self.headers, proxies=proxies)  # request the next page
                third_page = etree.HTML(response.text)
                second_page = third_page
                urls = third_page.xpath('//div[@class="post-item p-1"]/h4/a/@href')  # detail-page links
                for url in urls:
                    print('url:', url)
                    self.urlQueue.put(url)  # enqueue every URL
            except Exception:
                break  # no next-page link left (or the request failed): stop paginating

    # Download each URL and queue the response
    def getInfo(self):
        while True:
            time.sleep(0.01)
            try:
                proxies = random.choice(self.proxies_list)  # pick a random proxy
                url = self.urlQueue.get()
                response = self.session.get(url, headers=self.headers, proxies=proxies)
                body = response.text
                item = {
                    'body': body,
                    'url': url
                }
                self.resQueue.put(item)    # hand the response to the parse threads
                self.urlQueue.task_done()  # acknowledge the URL
            except Exception:
                break

    # Parse the data and persist it to Elasticsearch
    def getParse(self):
        while True:
            try:
                item = self.resQueue.get()
                url = item['url']    # the page URL
                body = item['body']  # the raw HTML
                index_name = 'deeps'
                index_type = 'test'
                actions = []
                action = {
                    "_index": index_name,
                    "_type": index_type,
                    # "_id": i,  # _id can be omitted and auto-generated
                    "_source": {
                        "url": url,
                        "html": body,
                        "domain_name": '****************.onion/',
                        "language": 'en',
                        "crawl_time": datetime.utcnow(),
                    }
                }
                actions.append(action)
                success, _ = bulk(es, actions, index=index_name, raise_on_error=True)
                self.resQueue.task_done()  # acknowledge the response
            except Exception:
                break


if __name__ == '__main__':
    crawler = Freedom()
    crawler.main()
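One caveat with the worker loops above: except Exception: break ends a thread on its first error, and if the failure happens after get() but before task_done(), the final join() calls will wait forever. A slightly more robust variant (a sketch, not part of the original code; process is a hypothetical placeholder for the per-item work) lets workers exit on a queue timeout and survive individual failures:

from queue import Empty

def robust_worker(q, process):
    while True:
        try:
            item = q.get(timeout=5)  # exit once the queue stays empty for 5s
        except Empty:
            break
        try:
            process(item)            # hypothetical per-item work
        except Exception as exc:
            print('failed on', item, '->', exc)  # log, but keep the thread alive
        finally:
            q.task_done()            # always acknowledge the item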
