When scraping with Python, you often run into sites with anti-scraping measures. Disguising the headers gets you past some of them, but the site can still see your IP address and ban it to stop the scraping.
In the requests library, the proxies parameter lets a request go out through another IP. Some sites publish free proxy IPs; by scraping those lists and validating the entries, we can build an IP proxy pool.
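For illustration, here is a minimal sketch of what the proxies parameter looks like; the address below is a made-up placeholder, not a working proxy:

import requests

# Placeholder proxy address, purely illustrative
proxies = {
    'http': 'http://1.2.3.4:8080',    # used for http:// URLs
    'https': 'https://1.2.3.4:8080',  # used for https:// URLs
}
# icanhazip.com echoes back the IP it sees, so this should print
# the proxy's exit IP rather than your own
response = requests.get('http://icanhazip.com', proxies=proxies, timeout=3)
print(response.text)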
Free proxy IP sites:
https://www.xicidaili.com/nt/
https://www.kuaidaili.com/free/intr/
First, fake_useragent is used to disguise the headers with a random User-Agent:

from fake_useragent import UserAgent
ua = UserAgent()
headers = {'User-Agent': ua.random}
Now, on to the main part. The script below (saved as IPPool.py, since the test scripts later import get_ip from that module) scrapes the first page of xicidaili and assembles each entry into a protocol://ip:port string:
import requests
from lxml import etree
from fake_useragent import UserAgent

# Disguise the headers with a random User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

def get_ip():
    ip_list = []
    # Target URL
    url = 'https://www.xicidaili.com/nt/'
    # Proxy IPs go stale quickly, so only the first page is scraped
    response = requests.get(url=url, headers=headers)
    # Set the encoding
    response.encoding = response.apparent_encoding
    response = response.text
    response = etree.HTML(response)
    tr_list = response.xpath('//tr[@class="odd"]')
    for i in tr_list:
        # IP address
        ip = i.xpath('./td[2]/text()')[0]
        # Port
        port = i.xpath('./td[3]/text()')[0]
        # Protocol (HTTP/HTTPS)
        agreement = i.xpath('./td[6]/text()')[0]
        agreement = agreement.lower()
        # Assemble the full proxy URL
        ip = agreement + '://' + ip + ':' + port
        ip_list.append(ip)
    return ip_list

if __name__ == '__main__':
    ip_list = get_ip()
    print(ip_list)
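Before moving on to validation, here is a hedged sketch of how such a pool might be consumed; the fetch helper and its retry policy are illustrative additions, not part of the original scripts:

import random
import requests
from fake_useragent import UserAgent
from IPPool import get_ip  # the scraper above, saved as IPPool.py

ua = UserAgent()
headers = {'User-Agent': ua.random}
ip_list = get_ip()

def fetch(url, retries=3):
    """Fetch url through a randomly chosen proxy, retrying with another on failure."""
    for _ in range(retries):
        proxy = random.choice(ip_list)
        # The proxies key must match the proxy's own protocol (see the pitfall below)
        proxies = {proxy.split(':')[0]: proxy}
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=3)
        except requests.RequestException:
            continue  # dead proxy; try another one
    return None

Picking a fresh random proxy on each attempt spreads the requests across the pool and quietly skips dead entries.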
The scraped proxies need to be vetted before use. The script below tests them concurrently; multiprocessing.dummy exposes the multiprocessing API but is backed by threads, which suits this I/O-bound check:

import requests
from multiprocessing.dummy import Pool
# Import the scraped IP list
from IPPool import get_ip

test_list = get_ip()

# Global list holding the proxies that pass the test
ip_list = []

# IP test site
url = 'http://icanhazip.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}

def ip_test(ip):
    try:
        # The proxies key must match the proxy's own protocol
        if ip.split(":")[0] == 'http':
            proxies = {'http': ip}
        else:
            proxies = {'https': ip}
        requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        ip_list.append(ip)
        print(ip + " is usable")
    except Exception:
        print(ip + " is unusable")

if __name__ == '__main__':
    pool = Pool(4)
    pool.map(ip_test, test_list)
    print(ip_list)
    print("Scraped %s IPs in total; usable: %s, unusable: %s" % (len(test_list), len(ip_list), len(test_list) - len(ip_list)))
Test results:
The same test can also be written with threading and a queue; each worker thread keeps pulling IPs off a shared queue until it is empty:

import threading
import requests
import queue
from fake_useragent import UserAgent
# Import the scraped IP list
from IPPool import get_ip

test_list = get_ip()

# Global list holding the proxies that pass the test
ip_pool = []

# Disguise the headers with a random User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://www.csdn.net/'
# url = 'http://icanhazip.com/'

def test_ip(queue_list):
    while True:
        if queue_list.empty():
            break
        else:
            ip = queue_list.get()
            # The proxies key must match the proxy's own protocol
            if ip.split(":")[0] == 'http':
                proxies = {'http': ip}
            else:
                proxies = {'https': ip}
            try:
                response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
                if response.status_code == 200:
                    print("[%s] tested %s, result: usable" % (threading.current_thread().name, proxies))
                    ip_pool.append(ip)
            except Exception:
                print("[%s] tested %s, result: unusable" % (threading.current_thread().name, proxies))

if __name__ == '__main__':
    # Create the queue and fill it with the scraped IPs
    queue_list = queue.Queue()
    for i in test_list:
        queue_list.put(i)
    # Create and start the worker threads
    out_thread = [threading.Thread(target=test_ip, args=(queue_list,), name="Thread-%s" % item) for item in range(5)]
    for thread in out_thread:
        thread.start()
    for thread in out_thread:
        thread.join()
    print('Testing finished')
    print(ip_pool)
    print("Scraped %s IPs in total; usable: %s, unusable: %s" % (len(test_list), len(ip_pool), len(test_list) - len(ip_pool)))
Results:
The test URL does not need to be anything special; www.baidu.com or the like works fine. One blogger recommends a dedicated test site, http://icanhazip.com/, which simply echoes back the IP address your request arrived from.
I hit a pitfall while testing: I did not pay attention to whether each proxy's protocol was http or https and used 'http' as the proxies key across the board. Suddenly every single IP "worked", which is of course impossible. The reason is that requests picks a proxy by matching the request URL's scheme against the proxies keys; on a mismatch it silently makes a direct connection, so the test always succeeds. After fixing the keys, roughly twenty-five IPs passed the test.
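A stricter check is to compare what icanhazip reports against the proxy's own address, which also rules out the silent direct-connection fallback. A minimal sketch, assuming pool entries of the form 'http://1.2.3.4:8080' (note that some proxies exit through a different IP than the one they are addressed by, so this check can reject proxies that actually work):

import requests

def really_proxied(proxy):
    """Return True only if icanhazip sees the proxy's IP instead of ours."""
    scheme = proxy.split(':')[0]                 # 'http' or 'https'
    host = proxy.split('://')[1].split(':')[0]   # the proxy's bare IP
    # Match the URL scheme to the proxies key so requests really uses the proxy
    url = '%s://icanhazip.com' % scheme
    try:
        response = requests.get(url, proxies={scheme: proxy}, timeout=3)
        return response.text.strip() == host
    except requests.RequestException:
        return False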
I also wrote a scraper for https://www.kuaidaili.com/free/intr/ (its entries are not yet assembled into full proxy URLs), but each page there lists only a handful of IPs, so I did not run the test against it:
import requests
from lxml import etree
from fake_useragent import UserAgent

# Disguise the headers with a random User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

def get_ip():
    ip_list = []
    # Target URL
    url = 'https://www.kuaidaili.com/free/intr/'
    response = requests.get(url=url, headers=headers)
    # Set the encoding
    response.encoding = response.apparent_encoding
    response = response.text
    response = etree.HTML(response)
    tr_list = response.xpath('//*[@id="list"]/table/tbody/tr')
    for i in tr_list:
        ip = i.xpath('./td[1]/text()')[0]
        ip_list.append(ip)
    return ip_list

if __name__ == '__main__':
    ip_list = get_ip()
    # print(ip_list)
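To finish the "not yet processed" part, the port and type columns would also have to be read so the entries can be assembled into full proxy URLs like the xicidaili version. A sketch, assuming the port sits in the second column and the type (HTTP/HTTPS) in the fourth; both indices are assumptions to be verified against the live page:

import requests
from lxml import etree
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}

def get_full_ip():
    """Like get_ip(), but assembles protocol://ip:port strings."""
    ip_list = []
    url = 'https://www.kuaidaili.com/free/intr/'
    response = requests.get(url=url, headers=headers)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    for i in html.xpath('//*[@id="list"]/table/tbody/tr'):
        ip = i.xpath('./td[1]/text()')[0]
        port = i.xpath('./td[2]/text()')[0]               # assumed: port column
        agreement = i.xpath('./td[4]/text()')[0].lower()  # assumed: type column
        ip_list.append(agreement + '://' + ip + ':' + port)
    return ip_list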