
Example code: a scrapy-redis distributed crawler with deduplication and asynchronous MySQL writes


First, create the spider file dgrds.py:

# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider


class DgrdsSpider(RedisSpider):
    name = 'dgrds'
    # the spider idles until a start URL is pushed to this Redis key
    redis_key = 'dgrds:start_urls'

    def parse(self, response):
        # schedule a range of recipe detail pages
        for i in range(2499930, 2499940):
            yield scrapy.Request('https://www.douguo.com/cookbook/' + str(i) + '.html',
                                 callback=self.parse2)

    def parse2(self, response):
        if response.status == 200:
            title = response.css('.rinfo h1.title::text').get('')
            view_nums = response.css('.vcnum span:first-of-type::text').get('')
            collection_nums = response.css('.vcnum .collectnum::text').get('')
            user_name = response.css('.author-info .nickname::text').get('')
            user_image = response.css('.author-img img::attr(src)').get('')
            tags = ''
            for tg in response.css('.fenlei span'):
                tags += ';' + tg.css('a::text').get('')

            basic_url = ''
            next_url = ''
            isvideo = response.css('#banner + a')
            if isvideo:  # a SelectorList is falsy when empty, so test truthiness rather than "is not None"
                next_url = response.css('#banner + a::attr(href)').get('')
                id = next_url.replace('/recipevideo/', '')
                basic_url = 'https://www.douguo.com/cookbook/' + id + '.html'

            item = {
                'cate': '',
                'title': title,
                'view_nums': view_nums,
                'collection_nums': collection_nums,
                'user_name': user_name,
                'user_image': user_image,
                'tags': tags,
                'basic_url': basic_url
            }
            yield scrapy.Request(response.urljoin(next_url), meta=item, callback=self.parse4)

    def parse4(self, response):
        url = response.css('embed::attr(src)').get('')
        item = {
            'cate': response.meta['cate'],
            'title': response.meta['title'],
            'view_nums': response.meta['view_nums'],
            'collection_nums': response.meta['collection_nums'],
            'user_name': response.meta['user_name'],
            'user_image': response.meta['user_image'],
            'tags': response.meta['tags'],
            'basic_url': response.meta['basic_url']
        }
        item['video_url'] = url
        yield item

Next, edit the project's settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for dgredis project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dgredis'

SPIDER_MODULES = ['dgredis.spiders']
NEWSPIDER_MODULE = 'dgredis.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dgredis (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dgredis.middlewares.DgredisSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'dgredis.middlewares.DgredisDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dgredis.pipelines.DgredisPipeline': 300,     # asynchronous MySQL writes
    'scrapy_redis.pipelines.RedisPipeline': 400,  # also keep a copy of each item in Redis
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL connection settings used by the pipeline
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = '3306'
MYSQL_USER = 'root'
MYSQL_PASS = ''
MYSQL_DB = 'test'

# Treat these status codes as normal responses instead of errors
HTTPERROR_ALLOWED_CODES = [404, 301]

# scrapy-redis: shared dupefilter and scheduler, persisted across runs
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

LOG_LEVEL = 'DEBUG'

# Introduce an artificial delay between requests to be polite to the target site
DOWNLOAD_DELAY = 1
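
One thing the listing above does not configure is where Redis lives: scrapy-redis falls back to a local Redis on the default port when nothing is set. If your Redis server runs on another machine (the usual case for a distributed crawl), the extra settings would look roughly like the sketch below; the host, port, and password values are placeholders, not values from the original project:

# Assumed example values - point these at the Redis instance shared by all crawler nodes
REDIS_HOST = '192.168.1.100'
REDIS_PORT = 6379
# or use a single connection string instead of the two settings above:
# REDIS_URL = 'redis://:yourpassword@192.168.1.100:6379/0'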

Then edit the pipeline file pipelines.py:

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from twisted.enterprise import adbapi


class DgredisPipeline:
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASS'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor
        )
        # Twisted connection pool wrapping pymysql
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        return cls(dbpool)

    def process_item(self, item, spider):
        # hand the insert off to Twisted's thread pool so the write is asynchronous
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # report failures instead of swallowing them
        return item

    def do_insert(self, cursor, item):
        # runs inside a transaction; adbapi commits automatically on success
        insert_sql = """
            INSERT INTO douguoaa(title, user_name, user_image, view_nums, collection_nums,
                                 basic_url, video_url, tags, cate_name)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (
            item['title'], item['user_name'], item['user_image'], item['view_nums'],
            item['collection_nums'], item['basic_url'], item['video_url'],
            item['tags'], item['cate']))

    def handle_error(self, failure):
        if failure:
            print(failure)
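
The INSERT statement assumes a douguoaa table already exists in the test database. The original post does not give the schema, so the one-off helper below is only a sketch: the column names come from the INSERT above, but the VARCHAR types and lengths are assumptions you should adjust to your data.

# create_table.py -- one-off helper; column types are assumptions, not from the original post
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='', db='test', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS douguoaa (
                id INT AUTO_INCREMENT PRIMARY KEY,
                title VARCHAR(255),
                user_name VARCHAR(255),
                user_image VARCHAR(512),
                view_nums VARCHAR(64),
                collection_nums VARCHAR(64),
                basic_url VARCHAR(512),
                video_url VARCHAR(512),
                tags VARCHAR(512),
                cate_name VARCHAR(255)
            ) DEFAULT CHARSET=utf8
        """)
    conn.commit()
finally:
    conn.close()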

With these three files in place, the project supports distributed crawling, Redis-based request deduplication, and asynchronous writes to the MySQL database.

cd into the project's spiders directory and run scrapy runspider dgrds.py. The spider starts up and then sits idle, waiting for a start URL to arrive in Redis.

Then issue the crawl command from redis-cli by pushing the start URL onto the key the spider listens on: lpush dgrds:start_urls http://www.douguo.com. As soon as the key is populated, the waiting spider picks it up and begins crawling automatically.
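
If you prefer to seed and inspect the queue from Python instead of redis-cli, the redis-py client does the same job. The snippet below is a small sketch assuming Redis on localhost and the default key names scrapy-redis derives from the spider name (dgrds:requests, dgrds:dupefilter, dgrds:items):

# seed_and_inspect.py -- assumes Redis on localhost:6379, same as the crawler settings
import redis

r = redis.Redis(host='127.0.0.1', port=6379)

# push the start URL; every idle dgrds spider instance picks work up from this key
r.lpush('dgrds:start_urls', 'http://www.douguo.com')

# once the crawl is running, scrapy-redis keeps its shared state under these keys
print(r.zcard('dgrds:requests'))    # pending requests (a sorted set with the default priority queue)
print(r.scard('dgrds:dupefilter'))  # request fingerprints used for deduplication
print(r.llen('dgrds:items'))        # items mirrored to Redis by RedisPipeline (a list)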

 

This is the plain spider version; a rule-based crawl with CrawlSpider (RedisCrawlSpider in scrapy-redis) works much the same way, you only need to define the Rule objects (see the sketch below).
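
For reference, a rule-based variant could look roughly like this; the LinkExtractor pattern and the parsed fields are assumptions for illustration, not part of the original project:

# -*- coding: utf-8 -*-
# Sketch only: a scrapy-redis CrawlSpider variant; adjust the allow pattern and parsing to your site
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class DgrdsCrawlSpider(RedisCrawlSpider):
    name = 'dgrds_crawl'
    redis_key = 'dgrds_crawl:start_urls'

    rules = (
        # follow recipe detail pages and hand them to parse_item; keep following links from them
        Rule(LinkExtractor(allow=r'/cookbook/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'url': response.url,
            'title': response.css('.rinfo h1.title::text').get(''),
        }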

That's All

 

Thanks!
