The script below scrapes Kuaidaili's free high-anonymity proxy list page by page, tests every proxy it finds, and appends the working ones to proxy.txt:

```python
# -*- coding: utf-8 -*-
# @Time    : 2023/4/26 19:46
# @Author  : Weiri
# @File    : paqu_ip.py
# @Project : Web crawlers
import time
import random
import requests
from lxml import etree
from fake_useragent import UserAgent


class ProxyFreePool:
    """
    Scrape Kuaidaili's free high-anonymity proxies, test whether each one
    works, and build a pool of free proxy IPs.
    """

    def __init__(self):
        self.url = 'https://www.kuaidaili.com/free/inha/{}/'  # listing page URL template
        self.test_url = "http://baidu.com/"  # URL used to test proxies

    def get_proxy_pool(self, url):
        """
        function: extract the IP and port of every proxy on a listing page
        in: url: the listing page URL
        out: ip: proxy IP; port: port number
        return: None
        others: Get IP & Port Func
        """
        headers = {'User-Agent': UserAgent().random}  # build a random request header
        html = requests.get(url=url, headers=headers).text  # fetch the Kuaidaili page
        p = etree.HTML(html)  # create the parsing object
        # 1. base xpath: //table[@class='table table-bordered table-striped']/tbody/tr/td
        tr_list = p.xpath("//table[@class='table table-bordered table-striped']/tbody/tr")  # run the xpath on the parsed page
        for tr in tr_list[1:]:
            ip = tr.xpath("./td[1]/text()")[0].strip()
            port = tr.xpath("./td[2]/text()")[0].strip()
            # test whether this proxy IP works
            self.test_proxy(ip, port)

    def test_proxy(self, ip, port):
        """
        function: test whether a single proxy IP works
        in: ip: proxy IP; port: port number
        out: None
        return: None
        others: Test Proxy Func
        """
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port)
        }  # build the proxies dict from the ip and port
        # noinspection PyBroadException
        try:
            headers = {'User-Agent': UserAgent().random}  # build a random request header
            res = requests.get(url=self.test_url, headers=headers, timeout=2, proxies=proxies)
            if res.status_code == 200:  # check that the response code is correct
                print(ip, port, '\033[31musable\033[0m')
                with open("proxy.txt", "a") as f:
                    f.write(ip + ':' + port + '\n')  # append to proxy.txt
        except Exception:
            print(ip, port, 'unusable')

    def run(self):
        """
        function: program entry point
        in: None
        out: None
        return: None
        others: Program Entry Func
        """
        for i in range(1, 1001):
            url = self.url.format(i)  # build the listing page URL
            self.get_proxy_pool(url=url)
            time.sleep(random.randint(1, 2))  # sleep for a random 1-2 s


if __name__ == '__main__':
    # spider = ProxyPool()
    # spider.run()
    spider = ProxyFreePool()
    spider.run()
```
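Once the crawler has populated proxy.txt, the pool can be consumed by picking an entry at random for each request. Below is a minimal sketch, assuming proxy.txt holds one ip:port entry per line as written by test_proxy above; the helper name load_proxies and the target URL are only illustrative:

```python
import random
import requests


def load_proxies(path="proxy.txt"):
    """Read the ip:port entries collected by the crawler, one per line."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


proxy_list = load_proxies()
choice = random.choice(proxy_list)  # pick one proxy at random for this request
proxies = {
    'http': 'http://{}'.format(choice),
    'https': 'http://{}'.format(choice),  # plain HTTP proxies are usually declared with an http:// scheme
}
res = requests.get('http://baidu.com/', proxies=proxies, timeout=5)
print(res.status_code)
```

Loading the file once and rotating to another entry on failure would avoid re-reading it on every request.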
Alternatively, a single proxy and a fixed User-Agent can be hard-coded and passed straight to `requests.get`:

```python
import requests
# from fake_useragent import UserAgent
# headers = {'user-agent': UserAgent().random}  # or build a random request header instead

headers = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36 Edg/112.0.1722.48'
}
proxies = {
    'https': '117.29.228.43:64257',
    'http': '117.29.228.43:64257'
}
url = 'http://baidu.com/'  # target URL (the same test site used above)
requests.get(url, headers=headers, proxies=proxies, timeout=3)  # send the request through the proxy
```
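Whichever variant is used, it is worth confirming that traffic really leaves through the proxy rather than the local interface. A minimal sketch, assuming the public echo endpoint http://httpbin.org/ip (which returns the caller's origin IP) is reachable; the proxy address below is just the placeholder from the snippet above:

```python
import requests

proxies = {
    'http': 'http://117.29.228.43:64257',   # placeholder proxy from the snippet above
    'https': 'http://117.29.228.43:64257',
}

try:
    # httpbin.org/ip echoes the IP address the request arrived from;
    # if the proxy is in effect, this prints the proxy's IP, not yours.
    res = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(res.json()['origin'])
except requests.RequestException:
    print('proxy did not respond')
```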