Directory structure:
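Roughly, the project follows the standard `scrapy startproject fang` layout plus a start.py helper at the project root. The tree below is reconstructed from the files shown in this post, so treat the exact placement as an assumption:

fang/                      # project root
├── scrapy.cfg
├── start.py               # helper script to launch the crawl
└── fang/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── sfw.py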
The full code follows below; a line-by-line walkthrough would get tedious.
Tested in practice: as long as you keep the crawl rate down, there is essentially no anti-bot trouble. Crawl too fast and requests get redirected to a captcha page; then you either need a captcha-recognition service or an ML model to solve the captcha (with a middleware intercepting the request URL), or proxies. The simplest fix is to slow down, since there is no need to finish the whole crawl in a day.
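If you do want to react to the captcha redirect instead of only slowing down, a downloader middleware can intercept it before the spider callback runs. The sketch below is an illustration built on assumptions, not part of the project: it supposes the verification page can be recognised by a marker such as 'verify' in the URL (check the real pattern in your own logs) and uses a placeholder proxy address.

# Hedged sketch: re-queue requests that were redirected to the captcha page.
class CaptchaRetryMiddleware(object):
    def process_response(self, request, response, spider):
        if 'verify' in response.url:                       # assumed marker of the captcha page
            spider.logger.warning('Captcha redirect for %s, re-queueing', request.url)
            retry_req = request.replace(dont_filter=True)  # let it pass the dupefilter again
            # Optionally push the retry through a proxy (placeholder address):
            # retry_req.meta['proxy'] = 'http://127.0.0.1:8888'
            return retry_req                               # returning a Request re-schedules it
        return response

To take effect it would also need an entry in DOWNLOADER_MIDDLEWARES, just like the two middlewares further down.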
# -*- coding: utf-8 -*-
import scrapy
import re
from fang.items import NewHouseItem, EsfHouseItem


class SfwSpider(scrapy.Spider):
    name = 'sfw'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    # Per-spider settings; custom_settings takes precedence over settings.py
    custom_settings = {
        'DOWNLOAD_DELAY': 1,                     # base download delay (seconds)
        'AUTOTHROTTLE_ENABLED': True,            # enable AutoThrottle
        'AUTOTHROTTLE_DEBUG': True,              # show AutoThrottle debug output
        'AUTOTHROTTLE_MAX_DELAY': 10,            # maximum download delay
        'DOWNLOAD_TIMEOUT': 15,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8      # limit concurrent requests to this site
    }

    # Collect every city's province, its new-house url and its second-hand-house url
    def parse(self, response):
        trs = response.xpath('//div[@id="c02"]//tr')
        province = None
        for tr in trs:
            # the first td is the province, the second holds the cities
            tds = tr.xpath('.//td[not(@class)]')
            # get the province text; it may be empty
            province_text = tds[0].xpath('.//text()').extract_first()
            # strip all whitespace characters
            province_text = re.sub(r'\s', '', province_text)
            # a non-empty value starts a new province;
            # an empty one means the row still belongs to the previous province
            if province_text:
                province = province_text
            # skip listings for overseas cities
            if province == '其它':
                continue
            city_list = tds[1].xpath('.//a')
            for city in city_list:
                city_name = city.xpath('./text()').extract_first()
                city_link = city.xpath('./@href').extract_first()
                # build the new-house url
                city_link_new = city_link.replace('fang.com', 'newhouse.fang.com/house/s')
                # build the second-hand-house url
                city_link_esf = city_link.replace('fang.com', 'esf.fang.com')
                # callback is the parsing callback; meta passes the province/city info on to it
                yield scrapy.Request(url=city_link_new, callback=self.parse_newhouse,
                                     meta={'info': [province, city_name]})
                yield scrapy.Request(url=city_link_esf, callback=self.parse_esfhouse,
                                     meta={'info': [province, city_name]})

    # Parse a new-house listing page
    def parse_newhouse(self, response):
        province, city_name = response.meta['info']
        li_list = response.xpath('//div[@id="newhouse_loupai_list"]//li[not(@style)]')
        for li in li_list:
            try:
                house_name = li.xpath('.//div[@class="nlcd_name"]/a/text()').extract_first().strip()
            except AttributeError:
                house_name = ''
            rooms_area_list = li.xpath('.//div[contains(@class,"house_type")]//text()').extract()
            # strip whitespace with the regex and join, giving e.g. 1居/2居/3居-35~179平米
            # this map/re.sub chain is reused a lot and could be factored into a helper
            # (see the sketch after this file)
            rooms_area = ''.join(list(map(lambda x: re.sub(r'\s', '', x), rooms_area_list)))
            # if this is not room information, blank it out
            if '居' not in rooms_area:
                rooms_area = []
            else:
                # make the format a bit nicer
                rooms_area = rooms_area.replace(r'-', '/总面积:')
            address = li.xpath('.//div[@class="address"]/a/@title').extract_first()
            try:
                district = li.xpath('.//div[@class="address"]/a//text()').extract()
                # the extracted strings contain the district in brackets, e.g. [怀来] [门头沟]
                district = list(map(lambda x: re.sub(r'\s', '', x), district))[1][1:-1]
            except IndexError:
                district = ''
            sale = li.xpath('.//div[@class="fangyuan"]/span/text()').extract_first()
            price = li.xpath('.//div[@class="nhouse_price"]//text()').extract()
            price = ''.join(list(map(lambda x: re.sub(r'\s', '', x), price)))
            # response.urljoin completes a partial url:
            # //feicuigongyuan.fang.com/ becomes https://feicuigongyuan.fang.com/;
            # already-complete urls are left untouched
            house_link_url = response.urljoin(li.xpath('.//div[@class="nlcd_name"]/a/@href').extract_first())
            phone = li.xpath('.//div[@class="tel"]/p/text()').extract_first()
            item = NewHouseItem(province=province, city_name=city_name, house_name=house_name,
                                price=price, rooms_area=rooms_area, address=address,
                                district=district, sale=sale, house_link_url=house_link_url,
                                phone=phone)
            yield item
        # get the url of the next page
        # on the last 5 pages the 'next' link turns into the previous page's url, could be improved!
        next_url = response.urljoin(response.xpath('.//div[@class="page"]//a[@class="next"]/@href').extract_first())
        # follow the pagination
        yield scrapy.Request(url=next_url, callback=self.parse_newhouse,
                             meta={'info': [province, city_name]})

    # Parse a second-hand-house listing page
    def parse_esfhouse(self, response):
        # print(response.url)
        province, city_name = response.meta['info']
        dl_list = response.xpath('//div[@class="shop_list shop_list_4"]/dl[not(@dataflag="bgcomare")]')
        for dl in dl_list:
            house_name = dl.xpath('.//p[@class="add_shop"]/a/@title').extract_first()
            address = dl.xpath('.//p[@class="add_shop"]/span/text()').extract_first()
            try:
                price = dl.xpath('.//dd[@class="price_right"]/span[1]//text()').extract()
                price = price[1] + price[2]
            except IndexError:
                price = ''
                # price = price[1] + price[2]
            try:
                unit = dl.xpath('.//dd[@class="price_right"]/span[2]/text()').extract_first().strip()
            except AttributeError:
                unit = ''
            house_link_url = response.urljoin(dl.xpath('.//h4[@class="clearfix"]/a/@href').extract_first())
            infos = dl.xpath('.//p[@class="tel_shop"]/text()').extract()
            try:
                infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
                # drop the few records that do not follow the usual layout
                if '厅' not in infos[0] or len(infos) != 7:
                    continue
                for info in infos:
                    if '厅' in info:
                        rooms = info
                    elif '层' in info:
                        floor = info
                    elif '向' in info:
                        orientation = info
                    elif '㎡' in info:
                        area = info
                    elif '建' in info:
                        year = info
                item = EsfHouseItem(province=province, city_name=city_name, house_name=house_name,
                                    address=address, price=price, unit=unit, rooms=rooms,
                                    floor=floor, area=area, year=year, orientation=orientation,
                                    house_link_url=house_link_url)
                yield item
            except (IndexError, UnboundLocalError):
                continue
        # follow the pagination
        next_url = response.urljoin(response.xpath('.//div[@class="page_al"]/p[1]/a/@href').extract_first())
        # print(next_url)
        yield scrapy.Request(url=next_url, callback=self.parse_esfhouse,
                             meta={'info': [province, city_name]})
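Two of the "could be improved" notes in the comments above are easy to act on: the map/re.sub chain that strips whitespace is repeated several times, and the next-page link is followed blindly. When extract_first() finds nothing, urljoin() falls back to the current page URL, and on the last pages the link points back to the previous page; in both cases the duplicate filter silently absorbs the request, but an explicit guard makes the intent clearer. A possible refactor, sketched on the assumption that the XPaths stay as they are:

import re

def clean_join(texts, sep=''):
    # Strip all whitespace from every extracted string and join the pieces.
    return sep.join(re.sub(r'\s', '', t) for t in texts)

# Inside parse_newhouse() the price line would then read:
#     price = clean_join(li.xpath('.//div[@class="nhouse_price"]//text()').extract())
# and the pagination block can guard against a missing link:
#     next_href = response.xpath('.//div[@class="page"]//a[@class="next"]/@href').extract_first()
#     if next_href:
#         yield scrapy.Request(url=response.urljoin(next_href),
#                              callback=self.parse_newhouse,
#                              meta={'info': [province, city_name]})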
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NewHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city name
    city_name = scrapy.Field()
    # name of the development
    house_name = scrapy.Field()
    # price
    price = scrapy.Field()
    # rooms and floor area
    rooms_area = scrapy.Field()
    # address
    address = scrapy.Field()
    # district
    district = scrapy.Field()
    # sale status (on sale or not)
    sale = scrapy.Field()
    # phone number
    phone = scrapy.Field()
    # url of the fang.com detail page
    house_link_url = scrapy.Field()


class EsfHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city name
    city_name = scrapy.Field()
    # name of the development
    house_name = scrapy.Field()
    # address
    address = scrapy.Field()
    # total price
    price = scrapy.Field()
    # price per square metre
    unit = scrapy.Field()
    # rooms
    rooms = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # year built
    year = scrapy.Field()
    # orientation
    orientation = scrapy.Field()
    # url of the fang.com detail page
    house_link_url = scrapy.Field()
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import json

from fang.items import NewHouseItem


class FangPipeline(object):
    def open_spider(self, spider):
        self.new_f = open('new_house.json', 'w', encoding='utf-8')
        self.esf_f = open('esf_house.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Items built from NewHouseItem go to new_house.json,
        # everything else goes to esf_house.json.
        # isinstance() checks whether an instance belongs to the given class;
        # item.__class__.__name__ would give the class name as a string.
        if isinstance(item, NewHouseItem):
            self.new_f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        else:
            self.esf_f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.esf_f.close()
        self.new_f.close()
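Because the pipeline writes one JSON object per line (JSON Lines, not a single JSON array), the output is easy to post-process line by line. A small sketch using the file name from the pipeline above:

import json

new_houses = []
with open('new_house.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:                                  # skip blank lines
            new_houses.append(json.loads(line))

print(len(new_houses), 'new-house records loaded')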
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from fake_useragent import UserAgent
from scrapy import signals
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
    ConnectionRefusedError, ConnectionDone, ConnectError, \
    ConnectionLost, TCPTimedOutError
from scrapy.http import HtmlResponse
from twisted.web.client import ResponseFailed
from scrapy.core.downloader.handlers.http11 import TunnelError


# User-agent spoofing middleware; a proxy could also be set here
class UserangentDemoDownloaderMiddleware(object):
    def process_request(self, request, spider):
        request.headers['User-Agent'] = UserAgent().random
        return None

    def process_response(self, request, response, spider):
        return response


# Exception-handling middleware
class ProcessAllExceptionMiddleware(object):
    ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                      ConnectionRefusedError, ConnectionDone, ConnectError,
                      ConnectionLost, TCPTimedOutError, ResponseFailed,
                      IOError, TunnelError)

    def process_response(self, request, response, spider):
        # Catch responses with 40x/50x status codes
        if str(response.status).startswith('4') or str(response.status).startswith('5'):
            # Minimal handling: just log and return the response as-is.
            # It could instead be replaced with a corrected response, with the spider
            # treating url == '' as the signal to skip it.
            print(response.status)
            print(response.url)
            pass
        # all other status codes pass through untouched
        return response
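ALL_EXCEPTIONS above is declared but never used; the natural place for it would be a process_exception() hook, which Scrapy calls when the download itself fails rather than when a bad status code comes back. Below is a self-contained sketch of that idea; the class name and the trimmed-down exception tuple are mine, and the original ALL_EXCEPTIONS tuple could be reused instead:

from twisted.internet.error import TimeoutError, DNSLookupError, TCPTimedOutError
from scrapy.http import HtmlResponse

class DownloadErrorMiddleware(object):
    # Turn common download errors into an empty placeholder response
    # so the failed request does not simply error out.
    HANDLED_EXCEPTIONS = (TimeoutError, DNSLookupError, TCPTimedOutError)  # extend as needed

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.HANDLED_EXCEPTIONS):
            spider.logger.warning('Download error %r for %s', exception, request.url)
            # Returning a Response ends exception handling for this request;
            # the spider callback receives the empty placeholder and yields nothing for it.
            return HtmlResponse(url=request.url, status=504, body=b'', request=request)
        return None  # leave everything else to other middlewares / the retry middleware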
# -*- coding: utf-8 -*-

# Scrapy settings for fang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'fang'

SPIDER_MODULES = ['fang.spiders']
NEWSPIDER_MODULE = 'fang.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.30 Safari/537.36 Edg/84.0.522.11'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'fang.middlewares.UserangentDemoDownloaderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserangentDemoDownloaderMiddleware': 100,
    'fang.middlewares.ProcessAllExceptionMiddleware': 80
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'fang.pipelines.FangPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# -*- coding: utf-8 -*-
from scrapy import cmdline
# Run this file directly instead of typing the crawl command on the command line every time
cmdline.execute('scrapy crawl sfw'.split())
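An equivalent way to launch the crawl from a script, without going through cmdline, is Scrapy's CrawlerProcess. A sketch, assuming the spider lives at fang/spiders/sfw.py so that the import path below is valid:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from fang.spiders.sfw import SfwSpider   # assumed module path of the spider file

process = CrawlerProcess(get_project_settings())   # picks up settings.py
process.crawl(SfwSpider)
process.start()                                    # blocks until the crawl finishes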
From here on it is assumed that you can already connect to the Redis database remotely.
(You can also spin up a few virtual machines to test that it runs. Note, however, that even when it works, those VMs still share one physical machine's bandwidth: the distributed crawl functions, but the crawl rate will not improve much.)
To turn a Scrapy project into a Scrapy-Redis project, only the following three changes are needed:
# Use the scrapy_redis item pipeline
ITEM_PIPELINES = {
    # 'fang.pipelines.FangPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300
}

# Scrapy-Redis configuration
# Dedupe class: stores request fingerprints in a Redis set, so request deduplication
# is persisted (this is the filter that ships with scrapy_redis; the scheduler is
# shared through scrapy_redis anyway)
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Use the scheduler provided by scrapy-redis
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Whether the scheduler state is persisted: when the crawl ends, should the request
# queue and the fingerprint set be kept in Redis? True keeps them (enabling incremental
# crawls: data that was already crawled is skipped next time and only new data is fetched).
SCHEDULER_PERSIST = True
# Redis connection details
REDIS_HOST = 'XXXXXX'                    # ip of the Redis server that stores the data
REDIS_PORT = XXXX                        # port
REDIS_PARAMS = {'password': 'XXXX'}      # omit this line if Redis has no password
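Since SCHEDULER_PERSIST = True keeps the request queue and the fingerprint set around between runs, it helps to know what scrapy_redis actually stores. With the default key names, a spider called sfw uses sfw:requests (the shared request queue, a sorted set under the default priority queue), sfw:dupefilter (the fingerprint set) and, via RedisPipeline, sfw:items. A small redis-py sketch for inspecting them, with placeholder connection details matching the XXXX values above:

import redis

# Placeholders: use the same host/port/password as REDIS_HOST / REDIS_PORT / REDIS_PARAMS.
r = redis.StrictRedis(host='127.0.0.1', port=6379, password='XXXX', decode_responses=True)

print('queued requests  :', r.zcard('sfw:requests'))    # sorted set (default PriorityQueue)
print('seen fingerprints:', r.scard('sfw:dupefilter'))  # set of request fingerprints
print('stored items     :', r.llen('sfw:items'))        # list appended to by RedisPipeline

# To force a full re-crawl despite SCHEDULER_PERSIST = True, delete the keys:
# r.delete('sfw:requests', 'sfw:dupefilter', 'sfw:items')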
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy_redis.spiders import RedisSpider
from fang.items import NewHouseItem, EsfHouseItem


class SfwSpider(RedisSpider):
    name = 'sfw'
    allowed_domains = ['fang.com']
    # start_urls = ['https://www.fang.com/SoufunFamily.htm']
    # the start url is now read from this Redis key instead of start_urls
    redis_key = 'sfw:start_url'

    # Per-spider settings; custom_settings takes precedence over settings.py
    custom_settings = {
        'DOWNLOAD_DELAY': 1,                     # base download delay (seconds)
        'AUTOTHROTTLE_ENABLED': True,            # enable AutoThrottle
        'AUTOTHROTTLE_DEBUG': True,              # show AutoThrottle debug output
        'AUTOTHROTTLE_MAX_DELAY': 10,            # maximum download delay
        'DOWNLOAD_TIMEOUT': 15,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8      # limit concurrent requests to this site
    }

    # parse(), parse_newhouse() and parse_esfhouse() are character-for-character
    # identical to the standalone spider above and are omitted here.
(items.py is identical to the standalone project's items.py shown above.)
(middlewares.py is likewise identical to the standalone project's middlewares.py shown above.)
(settings.py differs from the standalone project's settings.py only in the two places already shown above: ITEM_PIPELINES now points at 'scrapy_redis.pipelines.RedisPipeline' instead of FangPipeline, and the Scrapy-Redis configuration block with DUPEFILTER_CLASS, SCHEDULER, SCHEDULER_PERSIST and the REDIS_HOST / REDIS_PORT / REDIS_PARAMS connection details is appended at the end.)
# -*- coding: utf-8 -*-
from scrapy import cmdline
cmdline.execute('scrapy crawl sfw'.split())
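With RedisSpider the crawl does not start by itself: every node sits idle until a start URL appears under the redis_key defined in the spider ('sfw:start_url'). Once the spider processes are running, push it exactly once from any machine, for example with redis-py (connection details are placeholders, as in settings.py):

import redis

# Placeholders: match REDIS_HOST / REDIS_PORT / REDIS_PARAMS from settings.py.
r = redis.StrictRedis(host='127.0.0.1', port=6379, password='XXXX')

# Every idle spider instance watches this key; whichever pops the URL first starts crawling,
# and the shared scheduler then distributes the discovered requests across all nodes.
r.lpush('sfw:start_url', 'https://www.fang.com/SoufunFamily.htm')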
If this post helped, consider giving the project a star on GitHub (the repo also contains several other crawler projects and is still being updated): https://github.com/programday/crawler. Thanks!
Or just leave a like ( •̀ ω •́ )✧. Questions are welcome in the comments.