
Python Web Scraping in Practice: Distributed Crawling of JD.com with Scrapy + Charles + MongoDB + Redis (2020 Edition)

Contents

1. Background Knowledge

2. Development Environment and Project Structure

3. Results

4. Source Code

   4.1 Data models - items.py

   4.2 Storage (MongoDB) - pipelines.py

   4.3 Project settings - settings.py

   4.4 Middleware - middlewares.py

   4.5 Scraping category info - jd_category.py

   4.6 Scraping product details - jd_product.py

   4.7 Distributed crawling - add_category_to_redis.py

5. How to Run the Project

1. Background Knowledge and Related Packages

Below are some blog posts covering the prerequisites:

HTTP basics: Python Web Scraping - Principles and Fundamentals

Packet capture with Charles: Capturing Traffic with Charles on Windows

JS rendering: Rendering JavaScript with Splash

Building a proxy pool: Building a Free High-Anonymity Proxy Pool with Multithreading and Flask

MongoDB basics: MongoDB Tutorial

Redis basics: Redis Tutorial

Scrapy project basics: not written yet; a short section will be added later

2. Development Environment and Project Structure

Software environment: PyCharm 2019 + MongoDB + Redis 3.2 + Windows 10
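The post does not list the Python packages explicitly, but judging from the imports in the code below, an environment along these lines should work (package names are inferred from the source; versions are unpinned):

pip install scrapy scrapy-redis pymongo redis jsonpath requests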

3. Results

After a run, the scraped data can be inspected in MongoDB: categories go into the jd.category collection and products into jd.product.

4. Source Code

4.1 Data models - items.py

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class MallSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class Category(scrapy.Item):
    '''
    Category data model: defines the fields to scrape.
    '''
    # Names and URLs of the big / medium / small categories
    b_category_name = scrapy.Field()
    b_category_url = scrapy.Field()
    m_category_name = scrapy.Field()
    m_category_url = scrapy.Field()
    s_category_name = scrapy.Field()
    s_category_url = scrapy.Field()


class Product(scrapy.Item):
    '''
    Product data model.
    '''
    product_category = scrapy.Field()     # product category
    product_category_id = scrapy.Field()  # category ID
    product_sku_id = scrapy.Field()       # product SKU id
    product_name = scrapy.Field()         # product name
    product_img_url = scrapy.Field()      # product image URL
    product_book_info = scrapy.Field()    # book info: author, publisher
    product_option = scrapy.Field()       # product options
    product_shop = scrapy.Field()         # shop
    product_comments = scrapy.Field()     # number of comments
    product_ad = scrapy.Field()           # promotion info
    product_price = scrapy.Field()        # price
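Scrapy Item classes behave like dicts restricted to their declared fields, which is what the pipelines below rely on when they call dict(item). A minimal illustration (the sample values are taken from the hard-coded category in section 4.6):

from mall_spider.items import Category

item = Category()
item['b_category_name'] = '家用电器'
item['b_category_url'] = 'https://jiadian.jd.com'
print(dict(item))  # {'b_category_name': '家用电器', 'b_category_url': 'https://jiadian.jd.com'}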

4.2 Storage (MongoDB) - pipelines.py

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from mall_spider.settings import MONGODB_URL
from mall_spider.spiders.jd_category import JdCategorySpider
from mall_spider.spiders.jd_product import JdProductSpider


class CategoryPipeline(object):
    '''Stores Category items in MongoDB.'''

    def open_spider(self, spider):
        # Runs once, when the spider starts
        if isinstance(spider, JdCategorySpider):
            # Connect to MongoDB and get the target collection
            self.client = MongoClient(MONGODB_URL)
            self.collection = self.client['jd']['category']

    def process_item(self, item, spider):
        if isinstance(spider, JdCategorySpider):
            # Convert the item to a dict and insert it
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Close the connection
        if isinstance(spider, JdCategorySpider):
            self.client.close()


class ProductPipeline(object):
    '''Stores Product items in MongoDB.'''

    def open_spider(self, spider):
        # Runs once, when the spider starts
        if isinstance(spider, JdProductSpider):
            # Connect to MongoDB and get the target collection
            self.client = MongoClient(MONGODB_URL)
            self.collection = self.client['jd']['product']

    def process_item(self, item, spider):
        if isinstance(spider, JdProductSpider):
            # Convert the item to a dict and insert it
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Close the connection
        if isinstance(spider, JdProductSpider):
            self.client.close()
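Once a crawl has run, a quick pymongo session (using the same local MongoDB URL as settings.py) confirms the pipelines are writing documents. This is just a sanity check, not part of the project itself:

from pymongo import MongoClient

client = MongoClient('mongodb://127.0.0.1:27017')
print(client['jd']['category'].count_documents({}))  # number of stored categories
print(client['jd']['product'].find_one())            # one stored product, or None
client.close()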

4.3 Project settings - settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for mall_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'mall_spider'

SPIDER_MODULES = ['mall_spider.spiders']
NEWSPIDER_MODULE = 'mall_spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mall_spider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'mall_spider.middlewares.MallSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# Downloader middlewares used against anti-scraping: random User-Agent and
# random proxy IP; a paid proxy pool works best
# DOWNLOADER_MIDDLEWARES = {
#     'mall_spider.middlewares.MallSpiderDownloaderMiddleware': 543,
#     'mall_spider.middlewares.RandomUserAgent': 301,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # lower numbers run earlier
    'mall_spider.pipelines.CategoryPipeline': 300,
    'mall_spider.pipelines.ProductPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection URL
MONGODB_URL = 'mongodb://127.0.0.1:27017'

# scrapy_redis configuration
# Redis connection URL
REDIS_URL = 'redis://127.0.0.1:6379/0'
# Dedup filter class: stores fingerprints of already-crawled requests in a Redis set
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler: stores pending requests in a Redis-backed queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to persist the scheduler state:
#   True  - keep the fingerprints and pending requests in Redis when the crawl ends
#   False - clear them when the crawl ends
SCHEDULER_PERSIST = True
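With SCHEDULER_PERSIST = True the scrapy_redis state survives between runs. If you want to peek at it, the key names below are scrapy_redis's defaults, derived from the spider name (they are an assumption about the library's default configuration, not something set in this project):

from redis import StrictRedis

redis = StrictRedis.from_url('redis://127.0.0.1:6379/0')
print(redis.zcard('jd_product:requests'))    # pending requests in the scheduler's priority queue
print(redis.scard('jd_product:dupefilter'))  # fingerprints of requests already seen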

4.4 Middleware - middlewares.py

Note: the middlewares below build each request with a random User-Agent and a random proxy IP returned by a proxy pool. Since JD blocks most free proxy IPs, these middlewares are not enabled in settings.py and are included for reference only; if you need to scrape data on the order of tens of thousands of items, consider a paid proxy pool. (A snippet showing how to enable them follows the code.)

# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import requests
import re
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
    ConnectionRefusedError, ConnectionDone, ConnectError, \
    ConnectionLost, TCPTimedOutError
from twisted.web.client import ResponseFailed
from scrapy.core.downloader.handlers.http11 import TunnelError


# Proxy-IP downloader middleware
class MallSpiderDownloaderMiddleware(object):
    EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError, TunnelError)

    def process_request(self, request, spider):
        # Get a random proxy IP from the proxy pool, filtered by protocol and domain
        response = requests.get('http://localhost:6888/random?protocal=https&domain=jd.com')
        request.meta['proxy'] = response.content.decode()

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
            # On a request exception, tell the proxy pool that this proxy IP
            # does not work for this domain
            url = 'http://localhost:6868/disable_domain'
            proxy = request.meta['proxy']
            ip = re.findall(r'https?://(.+?):\d+', proxy)[0]
            params = {
                'ip': ip,
                'domain': 'jd.com'
            }
            # Report the bad proxy/domain pair to the proxy pool
            requests.get(url, params=params)


# 1. Prepare a User-Agent list
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]


# Random User-Agent middleware
class RandomUserAgent(object):
    def process_request(self, request, spider):
        # Requests to https://cdnware.m.jd.com need the JD iPhone app User-Agent
        if request.url.startswith('https://cdnware.m.jd.com'):
            request.headers['user-agent'] = 'JD4iPhone/164880 (iPhone; iOS 12.1.2; Scale/2.00)'
        else:
            request.headers['user-agent'] = random.choice(USER_AGENTS)
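If you do have a dependable (for example, paid) proxy pool, enabling these middlewares only requires uncommenting the DOWNLOADER_MIDDLEWARES block in settings.py; the class paths and priorities below are the ones already shown there:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'mall_spider.middlewares.MallSpiderDownloaderMiddleware': 543,
    'mall_spider.middlewares.RandomUserAgent': 301,
}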

4.5 Scraping category info - jd_category.py

Note: this spider scrapes every category from JD's home-page category API. (A short usage illustration of the URL helper follows the code.)

# -*- coding: utf-8 -*-
import scrapy
import json
from mall_spider.items import Category


class JdCategorySpider(scrapy.Spider):
    name = 'jd_category'
    allowed_domains = ['3.cn']
    start_urls = ['https://dc.3.cn/category/get']  # category API as the start URL

    def parse(self, response):
        # The API responds in GBK; decode explicitly to avoid garbled text
        result = json.loads(response.body.decode('GBK'))
        datas = result['data']
        # Iterate over the category data
        for data in datas:
            # Store the data in the Category item
            item = Category()
            b_category = data['s'][0]
            # Big-category info
            b_category_info = b_category['n']
            item['b_category_name'], item['b_category_url'] = self.get_category_name_url(b_category_info)
            # List of medium categories
            m_category_s = b_category['s']
            for m_category in m_category_s:
                # Medium-category info
                m_category_info = m_category['n']
                item['m_category_name'], item['m_category_url'] = self.get_category_name_url(m_category_info)
                # List of small categories
                s_category_s = m_category['s']
                for s_category in s_category_s:
                    s_category_info = s_category['n']
                    item['s_category_name'], item['s_category_url'] = self.get_category_name_url(s_category_info)
                    # Hand the item to the engine
                    yield item

    def get_category_name_url(self, category_info):
        '''
        Extract the category name and URL from the category info string.
        :param category_info: category info in "url|name|..." form
        :return: (category name, category URL)
        '''
        # Category URLs come in three formats and some need to be completed,
        # e.g. mice.jd.com, 1713-9278, 4938-12420-12423
        category = category_info.split('|')
        category_url = category[0]   # category URL
        category_name = category[1]  # category name
        # Format 1: a jd.com subdomain, just prepend the scheme
        if category_url.count('jd.com') == 1:
            category_url = 'https://' + category_url
        # Format 2: a single id pair, point it at channel.jd.com
        elif category_url.count('-') == 1:
            category_url = 'https://channel.jd.com/{}.html'.format(category_url)
        # Format 3: a cat id triple, replace '-' with ',' and point it at list.jd.com
        else:
            category_url = category_url.replace('-', ',')
            category_url = 'https://list.jd.com/list.html?cat={}'.format(category_url)
        return category_name, category_url
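As a quick illustration of the three URL formats mentioned in the comments, here is how get_category_name_url normalizes them. The input strings are hypothetical examples in the "url|name" form the spider receives, built from the sample codes in the comment above ('示例分类' is just a placeholder name):

from mall_spider.spiders.jd_category import JdCategorySpider

spider = JdCategorySpider()
print(spider.get_category_name_url('mice.jd.com|示例分类|0'))
# ('示例分类', 'https://mice.jd.com')
print(spider.get_category_name_url('1713-9278|示例分类|0'))
# ('示例分类', 'https://channel.jd.com/1713-9278.html')
print(spider.get_category_name_url('4938-12420-12423|示例分类|0'))
# ('示例分类', 'https://list.jd.com/list.html?cat=4938,12420,12423')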

4.6 Scraping product details - jd_product.py

Note: the requests below were worked out by analysing the JSON interfaces that show up in the browser's Network panel (and in Charles); Selenium is not used to drive the page. (A standalone check of one of these endpoints is sketched right below.)
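Before wiring an endpoint into the spider, it is easy to probe it outside Scrapy. A minimal sketch for the price interface used below (the SKU id is a placeholder, and the endpoint's behaviour may have changed since 2020):

import requests

sku_id = '100000000000'  # hypothetical SKU id; take a real one from a JD list page
price_url = 'https://p.3.cn/prices/mgets?skuIds=J_{}'.format(sku_id)
print(requests.get(price_url).json())  # e.g. [{"p": "...", "id": "J_..."}]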

# -*- coding: utf-8 -*-
import time
import scrapy
import json
import pickle
from mall_spider.items import Product
from jsonpath import jsonpath
from scrapy_redis.spiders import RedisSpider

'''
Distributed spider:
1. Make the spider class inherit from RedisSpider.
'''
class JdProductSpider(RedisSpider):
    name = 'jd_product'
    allowed_domains = ['jd.com', '3.cn']
    # start_urls = ['http://jd.com/']
    # 2. redis_key names the Redis list that holds the start data for this spider
    redis_key = 'jd_product:category'

    # def start_requests(self):
    #     '''Non-distributed version: build the list-page request from a hard-coded category'''
    #     category = {
    #         "b_category_name": "家用电器",
    #         "b_category_url": "https://jiadian.jd.com",
    #         "m_category_name": "电视",
    #         "m_category_url": "https://list.jd.com/list.html?cat=737,794,798",
    #         "s_category_name": "超薄电视",
    #         "s_category_url": "https://list.jd.com/list.html?cat=737,794,798&ev=4155_76344&sort=sort_rank_asc&trans=1&JL=2_1_0#J_crumbsBar"
    #     }
    #     # Build the list-page request from the small-category URL
    #     yield scrapy.Request(category['s_category_url'], callback=self.parse, meta={'category': category})

    # 3. Override make_request_from_data
    def make_request_from_data(self, data):
        '''
        Build a request from the pickled category data read from Redis.
        :param data: pickled category data (bytes)
        :return: a request for the small-category list page
        '''
        # Unpickle the bytes back into a category dict
        category = pickle.loads(data)
        # Build the list-page request from the small-category URL.
        # This method must return the request; yield would turn it into a generator.
        return scrapy.Request(category['s_category_url'], callback=self.parse, meta={'category': category})

    def parse(self, response):
        category = response.meta['category']
        # Parse the list page and extract the product SKU ids
        sku_ids = response.xpath('//div[contains(@class,"gl-i-wrap")]//@data-sku').extract()
        for sku_id in sku_ids:
            # Create a Product item for this product
            item = Product()
            item['product_category'] = category
            item['product_sku_id'] = sku_id
            # Basic-info request (endpoint identified with Charles);
            # the item does not need to be deep-copied when passed via meta
            product_base_url = 'https://cdnware.m.jd.com/c1/skuDetail/apple/7.3.0/{}.json'.format(sku_id)
            yield scrapy.Request(product_base_url, callback=self.parse_product_base, meta={'item': item})

        # Next page. The page-count extraction below stopped working reliably,
        # so only the first pages of each category are crawled here (i capped at 20).
        max_len = int(response.xpath('//*[@id="J_topPage"]/span/i/text()').extract_first())
        i = 1
        s = 0
        # for i in range(1, max_len + 1):
        if i < 20:
            # JD list URLs are paged with a page number plus an s offset:
            # only odd page values start a new page, and s advances by 30 per half page
            if i % 2 != 0:
                s += 30
            next_url = category['s_category_url'].split('#')[0] + "&page={}&s={}".format(i, s)
            yield scrapy.Request(next_url, callback=self.parse, meta={'category': category})
            i += 1

    def parse_product_base(self, response):
        # Retrieve the item passed along via meta
        item = response.meta['item']
        # Parse the JSON response into a dict
        result = json.loads(response.text)
        # Extract the basic product fields
        item['product_name'] = result['wareInfo']['basicInfo']['name']
        item['product_img_url'] = result['wareInfo']['basicInfo']['wareImage'][0]['small']
        item['product_book_info'] = result['wareInfo']['basicInfo']['bookInfo']
        color_size = jsonpath(result, '$..colorSize')
        if color_size:
            # jsonpath returns a list of matches; colorSize itself is also a list
            color_size = color_size[0]
            product_option = {}
            for option in color_size:
                title = option['title']
                value = jsonpath(option, '$..text')  # jsonpath path expression
                product_option[title] = value
            item['product_option'] = product_option
        shop = jsonpath(result, '$..shop')
        if shop:
            shop = shop[0]
            if shop:
                item['product_shop'] = {
                    'shop_id': shop['shopId'],
                    'shop_name': shop['name'],
                    'shop_score': shop['score']
                }
            else:
                # No shop info means the product is sold by JD itself
                item['product_shop'] = {
                    'shop_name': '京东自营'
                }
        item['product_category_id'] = result['wareInfo']['basicInfo']['category'].replace(';', ',')
        # Promotion request (API found by searching for "promotion" in Charles)
        ad_url = 'https://cd.jd.com/promotion/v2?skuId={}&area=12_919_922_0&cat={}'.format(
            item['product_sku_id'], item['product_category_id'])
        yield scrapy.Request(ad_url, callback=self.parse_product_ad, meta={'item': item})

    def parse_product_ad(self, response):
        item = response.meta['item']
        # Parse the JSON response into a dict
        result = json.loads(response.body)
        item['product_ad'] = jsonpath(result, '$..ad')[0] if jsonpath(result, '$..ad') else ''
        # Comment-summary request (visible in the Network panel when the product page is refreshed)
        comments_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'.format(
            item['product_sku_id'])
        yield scrapy.Request(comments_url, callback=self.parse_product_comments, meta={'item': item})

    def parse_product_comments(self, response):
        item = response.meta['item']
        result = json.loads(response.text)
        item['product_comments'] = {
            'CommentCount': jsonpath(result, '$..CommentCount')[0],
            'GoodCount': jsonpath(result, '$..GoodCount')[0],
            'PoorCount': jsonpath(result, '$..PoorCount')[0],
            'GoodRate': jsonpath(result, '$..GoodRate')[0]
        }
        # Price request; p.3.cn is not under jd.com, which is why '3.cn'
        # is included in allowed_domains above
        price_url = 'https://p.3.cn/prices/mgets?skuIds=J_{}'.format(item['product_sku_id'])
        yield scrapy.Request(price_url, callback=self.parse_product_price, meta={'item': item})

    def parse_product_price(self, response):
        item = response.meta['item']
        item['product_price'] = json.loads(response.text)[0]['p']
        # Hand the finished item to the engine
        yield item
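The spider above leans on the jsonpath package (pip install jsonpath) in several places. A tiny illustration of its return convention, which is why the code first checks the result and then takes element [0]:

from jsonpath import jsonpath

data = {'wareInfo': {'basicInfo': {'name': 'demo product'}}}
print(jsonpath(data, '$..name'))  # ['demo product'] - a list of matches
print(jsonpath(data, '$..shop'))  # False - no match found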

4.7 Distributed crawling - add_category_to_redis.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author  : Jack Zhao
@Time    : 2020/5/14 12:26
@Contact : zc_dlmu@163.com
@Desc    : Connect to MongoDB and Redis, push the category documents from
           MongoDB onto the spider's redis_key, then close the MongoDB connection.
'''
from pymongo import MongoClient
from redis import StrictRedis
from mall_spider.settings import MONGODB_URL, REDIS_URL
from mall_spider.spiders.jd_product import JdProductSpider
import pickle


def add_category_to_redis():
    mongo = MongoClient(MONGODB_URL)
    redis = StrictRedis.from_url(REDIS_URL)
    # Read the categories from MongoDB
    collection = mongo['jd']['category']
    cursor = collection.find()
    for category in cursor:
        # Pickle the category dict
        data = pickle.dumps(category)
        # Push it onto the Redis list the RedisSpider reads from
        redis.lpush(JdProductSpider.redis_key, data)
    mongo.close()


if __name__ == '__main__':
    # Run the function as a quick test
    add_category_to_redis()
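After running the script, a quick way to confirm the categories actually landed in the list the RedisSpider reads from (connection URL and key name taken from settings.py and the spider above):

from redis import StrictRedis

redis = StrictRedis.from_url('redis://127.0.0.1:6379/0')
print(redis.llen('jd_product:category'))  # should match the number of category documents in MongoDB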

5. How to Run the Project

# Win+R, then run services.msc
# make sure the MongoDB and Redis services are running first
cd ../mall_spider
scrapy crawl jd_category   # crawl the category tree first, then continue with the steps below
scrapy crawl jd_product    # this currently crawls from your own IP; don't overdo it, or JD may block your home IP
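One step the list above leaves implicit: jd_product is a RedisSpider and only starts once data appears under the jd_product:category key, so the categories stored by jd_category have to be pushed into Redis first using the script from section 4.7 (the exact path of the script is not given in the post; adjust it to your layout):

python add_category_to_redis.py   # push the stored categories into Redis
scrapy crawl jd_product           # can then be started on several machines sharing the same Redis and MongoDB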

Feel free to follow the WeChat official account DataDesigner, where we talk about machine learning in plain language.

 
