Over the past few days I finished studying distributed crawlers, and I found that reading the scrapy-redis source code really matters for understanding how distributed crawling works. Without further ado, straight to the good stuff:
First, we create a Scrapy project to study in. Run the following in the terminal:
scrapy startproject <project_name>
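For example, with a project name of distributed_demo (the name here is just a placeholder), the commands would be:

scrapy startproject distributed_demo
cd distributed_demo

Scrapy generates the usual project skeleton: scrapy.cfg, a settings.py module, a spiders/ package, and so on.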
Next, we need to copy the scrapy-redis source into our Scrapy project. The source can be downloaded here:
https://github.com/rmax/scrapy-redis
You can get it either by downloading the zip file or via git clone; personally I find the familiar zip download more convenient.
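For reference, the git route is a single command:

git clone https://github.com/rmax/scrapy-redis.git

Either way, what you want to copy into your project is the scrapy_redis package directory inside the repository; that is where the files discussed below live.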
The structure of a distributed crawler system is as follows: several Scrapy crawler processes, usually on different machines, all share a single Redis server. Redis holds the request queue, the fingerprint set used for deduplication, and the scraped items, so every node pulls new requests from, and pushes discovered requests back to, the same central store.
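To make that structure concrete, here is a minimal settings.py sketch wiring one spider to the shared Redis (the URL is a placeholder for your own server); these are the standard scrapy-redis settings that the source files below implement:

# settings.py
# Use the scrapy-redis scheduler and dupefilter instead of Scrapy's defaults.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Keep the request queue and fingerprint set in Redis between runs.
SCHEDULER_PERSIST = True
# Store scraped items in Redis as well.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
# The shared Redis server that every crawler node connects to.
REDIS_URL = 'redis://localhost:6379'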
The scrapy-redis source consists mainly of the following files:
connection.py
defaults.py
dupefilter.py
picklecompat.py
pipelines.py
queue.py
scheduler.py
spiders.py
utils.py
connection.py is the module that actually connects to Redis. Compared with the other files it is used the most and is the most important one: it produces the server connection instance that the pipelines, queue, and scheduler modules all depend on. A walkthrough of connection.py follows:
import six

from scrapy.utils.misc import load_object

from . import defaults


# Shortcut maps 'setting name' -> 'parameter name'.
# Mapping from Scrapy setting names to redis client parameter names.
SETTINGS_PARAMS_MAP = {
    'REDIS_URL': 'url',
    'REDIS_HOST': 'host',
    'REDIS_PORT': 'port',
    'REDIS_ENCODING': 'encoding',
}


def get_redis_from_settings(settings):
    # Builds the connection parameters and returns a redis client instance.
    """Returns a redis client instance from given Scrapy settings object.

    This function uses ``get_client`` to instantiate the client and uses
    ``defaults.REDIS_PARAMS`` global as defaults values for the parameters.
    You can override them using the ``REDIS_PARAMS`` setting.

    Parameters
    ----------
    settings : Settings
        A scrapy settings object. See the supported settings below.

    Returns
    -------
    server
        Redis client instance.

    Other Parameters
    ----------------
    REDIS_URL : str, optional
        Server connection URL.
    REDIS_HOST : str, optional
        Server host.
    REDIS_PORT : str, optional
        Server port.
    REDIS_ENCODING : str, optional
        Data encoding.
    REDIS_PARAMS : dict, optional
        Additional client parameters.

    """
    # Shallow copy, so that mutating params does not change the
    # default REDIS_PARAMS dict.
    params = defaults.REDIS_PARAMS.copy()
    # Merge the REDIS_PARAMS from settings into params.
    params.update(settings.getdict('REDIS_PARAMS'))
    # XXX: Deprecate REDIS_* settings.
    # Walk the mapping table and pick up the individual REDIS_* settings.
    for source, dest in SETTINGS_PARAMS_MAP.items():
        # Settings take priority; if a setting is absent,
        # params keeps its default value.
        val = settings.get(source)
        if val:
            params[dest] = val

    # Allow ``redis_cls`` to be a path to a class.
    if isinstance(params.get('redis_cls'), six.string_types):
        params['redis_cls'] = load_object(params['redis_cls'])

    return get_redis(**params)


# Backwards compatible alias.
from_settings = get_redis_from_settings


def get_redis(**kwargs):
    """Returns a redis client instance.

    Parameters
    ----------
    redis_cls : class, optional
        Defaults to ``redis.StrictRedis``.
    url : str, optional
        If given, ``redis_cls.from_url`` is used to instantiate the class.
    **kwargs
        Extra parameters to be passed to the ``redis_cls`` class.

    Returns
    -------
    server
        Redis client instance.

    """
    # Without an explicit redis_cls, fall back to the default client class.
    redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
    url = kwargs.pop('url', None)
    if url:
        # A URL takes priority when connecting to redis.
        return redis_cls.from_url(url, **kwargs)
    else:
        # Otherwise connect with keyword parameters (host, port, ...).
        return redis_cls(**kwargs)
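As a quick sanity check, here is a small sketch of calling get_redis() directly, outside of Scrapy (it assumes a Redis server running on localhost:6379):

from scrapy_redis.connection import get_redis

# URL form: internally dispatches to redis_cls.from_url(url).
server = get_redis(url='redis://localhost:6379/0')

# Keyword form: internally dispatches to redis_cls(**kwargs).
server = get_redis(host='localhost', port=6379)

server.ping()  # raises a ConnectionError if Redis is unreachable

This mirrors the branch at the bottom of get_redis(): a url parameter wins, otherwise the keyword parameters are passed straight to redis.StrictRedis.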
Next, defaults.py, which holds all of scrapy-redis's default configuration values:

import redis


# For standalone use.
# Key name used for deduplication.
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'

# Key where scraped items are stored; ``spider`` is the spider name.
PIPELINE_KEY = '%(spider)s:items'

# Redis connection class used to connect to redis.
REDIS_CLS = redis.StrictRedis

# Character-set encoding.
REDIS_ENCODING = 'utf-8'

# Sane connection defaults.
# Default connection parameters for the redis database.
REDIS_PARAMS = {
    'socket_timeout': 30,
    'socket_connect_timeout': 30,
    'retry_on_timeout': True,
    'encoding': REDIS_ENCODING,
}

# Key of the queue that stores the requests (URLs) to crawl.
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'
# Priority queue by default; determines how requests enter and leave the queue.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Key used for deduplication; where request fingerprints are stored.
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
# Class used to generate fingerprints.
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Key that holds the start URLs.
START_URLS_KEY = '%(name)s:start_urls'
# Type used for the start URLs (a list by default, a set if True).
START_URLS_AS_SET = False
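Because the key names above are %-style templates, it is worth seeing how they expand. A minimal sketch for a spider named myspider (a hypothetical name):

from scrapy_redis import defaults

print(defaults.PIPELINE_KEY % {'spider': 'myspider'})             # myspider:items
print(defaults.SCHEDULER_QUEUE_KEY % {'spider': 'myspider'})      # myspider:requests
print(defaults.SCHEDULER_DUPEFILTER_KEY % {'spider': 'myspider'}) # myspider:dupefilter
print(defaults.START_URLS_KEY % {'name': 'myspider'})             # myspider:start_urls

These are exactly the keys you will see in redis-cli while a crawl is running.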
Then dupefilter.py, which implements request deduplication on top of a Redis set:

import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from . import defaults
from .connection import get_redis_from_settings


logger = logging.getLogger(__name__)


# scrapy-redis deduplication is implemented with a (Redis) set.
# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with default Scrapy's scheduler.

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.

        """
        # ``server`` is the redis connection instance; through it we can
        # reach the queues and sets stored in redis. See connection.py
        # for how it is created.
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True

    # Class method: builds an instance of the current class from settings.
    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        # Get a redis connection instance.
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        # Key under which fingerprints are stored.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')  # defaults to False
        # Instantiate the current class, passing the parameters to __init__.
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        # Generate a fingerprint for the request.
        fp = self.request_fingerprint(request)
        # This returns the number of values added, zero if already exists.
        # sadd() adds the fingerprint to the redis set:
        #   self.server is the redis connection instance,
        #   self.key is the key of the fingerprint set,
        #   fp is the fingerprint itself.
        added = self.server.sadd(self.key, fp)
        return added == 0
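The whole deduplication trick rests on the semantics of Redis SADD: it returns the number of members actually added. A small sketch, assuming a local Redis on the default port:

import redis

server = redis.StrictRedis()
print(server.sadd('demo:dupefilter', 'fingerprint-1'))  # 1 -> first time, not a duplicate
print(server.sadd('demo:dupefilter', 'fingerprint-1'))  # 0 -> already present, duplicate

So request_seen() returns added == 0, i.e. True exactly when the fingerprint was already in the set; and because the set lives in Redis rather than in process memory, the check is shared by every crawler node.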