1. Basic usage
(1) settings.py:

    ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 300}
    IMAGES_STORE = 'D:\\cnblogs'       # where downloaded images are saved
    IMAGES_URLS_FIELD = 'image_urls'   # item field holding the image URLs
    IMAGES_RESULT_FIELD = 'images'     # item field that receives the download results
    IMAGES_EXPIRES = 30                # files newer than 30 days are not re-downloaded

(2) Add two fields in items.py:

    image_urls = scrapy.Field()
    images = scrapy.Field()
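With just this configuration, no pipeline code is needed: any item carrying a populated image_urls field is downloaded automatically. A minimal spider sketch, assuming a placeholder page and selector that are not from the original post:

    import scrapy

    class DemoImageSpider(scrapy.Spider):
        name = 'demo_images'
        start_urls = ['https://example.com/gallery']

        def parse(self, response):
            # ImagesPipeline downloads every URL in 'image_urls' and
            # writes url/path/checksum dicts back into 'images'
            yield {'image_urls': response.css('img::attr(src)').getall()}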
2. Using the image pipeline by subclassing it
The item_completed(self, results, item, info) method runs after the images have been downloaded. results contains, for each image, its url, its storage path, and a checksum for integrity verification. Inside this method you can replace the default path with a new path (new_path) so that images are saved into categorized folders.
    from .settings import IMAGES_STORE
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.http import Request
    import os

    class DouyuproImagePipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            image_url = item['vertical_src']
            yield Request(url=image_url)

        def item_completed(self, results, item, info):
            # old path: the default hash-based location chosen by ImagesPipeline
            old_path = os.path.join(IMAGES_STORE, [x['path'] for ok, x in results if ok][0])
            # new path: name the image after the item's nickname
            new_path = os.path.join(IMAGES_STORE, item['nickname'] + '.jpg')
            # guard against duplicate names or failed downloads
            try:
                os.renames(old_path, new_path)
                print('Rename complete!')
            except Exception as e:
                print(e)
            return item

    '''
    results = [
        (True, {'url': 'https://rpic.douyucdn.cn/asrpic/190725/6587811_2142.png/dy1',
                'path': 'full/ab811811c57efac2ef5a354265e692eb44e0adb6.jpg',
                'checksum': 'cb171aeba651caab1b7827da664ef7c0'})
    ]
    '''
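For the subclass to run instead of the stock pipeline, it has to be registered in settings.py. A sketch, with myproject standing in for the real project name:

    ITEM_PIPELINES = {
        'myproject.pipelines.DouyuproImagePipeline': 300,
    }
    IMAGES_STORE = 'D:\\cnblogs'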
The requests that fetch the images are built in get_media_requests. When images are not named from fields on the item, the downloaded files end up in no recognizable order:
    import hashlib
    from scrapy.utils.python import to_bytes

    # (inside an ImagesPipeline subclass)
    def file_path(self, request, response=None, info=None):
        # default behaviour: name the file after the SHA1 hash of its URL
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return 'full/%s.jpg' % image_guid

By default a file is named with this SHA1 hash, so overriding file_path is another way to control the image names.
The next example shows how to handle a single item that contains several image links:
    class ManhuaPipeline(ImagesPipeline):
        name = 0

        def get_media_requests(self, item, info):
            image_url_list = item['url_list']
            for url in image_url_list:
                yield Request(url=url, meta={'big_name': item['name']})
            # reset the page counter once all requests for this item are queued
            self.name = 0

        def file_path(self, request, response=None, info=None):
            self.name += 1
            big_name = request.meta['big_name']
            file_path = big_name + '/' + str(self.name) + '.jpg'
            return file_path
    '''
    results = [
        (True, {'url': 'https://rpic.douyucdn.cn/asrpic/190725/6587811_2142.png/dy1',
                'path': 'full/ab811811c57efac2ef5a354265e692eb44e0adb6.jpg',
                'checksum': 'cb171aeba651caab1b7827da664ef7c0'}),
        ...  # one (success, info) tuple per image in the item
    ]
    '''

Because the counter above lives on the pipeline instance and file_path runs as responses arrive, the numbering can interleave when several items download at once. The version below binds each image's sequence number to its request via meta, so the file name no longer depends on the order in which responses come back:
    class ManhuaPipeline(ImagesPipeline):
        name = 0

        def get_media_requests(self, item, info):
            image_url_list = item['url_list']
            for url in image_url_list:
                self.name += 1
                # fix the sequence number at request-creation time
                yield Request(url=url, meta={'small_name': item['name'],
                                             'big_name': item['big_name'],
                                             'name': self.name})

        def file_path(self, request, response=None, info=None):
            small_name = request.meta['small_name']
            big_name = request.meta['big_name']
            name = request.meta['name']
            file_path = big_name + '/' + small_name + '/' + str(name) + '.jpg'
            print(file_path + ' downloaded!')
            return file_path
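For reference, the item these pipelines expect would look roughly like this; the class name is hypothetical and the fields are reconstructed from what the code accesses:

    import scrapy

    class ManhuaItem(scrapy.Item):
        name = scrapy.Field()       # chapter name, used as 'small_name'
        big_name = scrapy.Field()   # comic title, used as the top-level folder
        url_list = scrapy.Field()   # URLs of the chapter's page images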
3. Using the file pipeline

(1) settings.py:

    ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 300}

(2) Add two fields in items.py:

    file_urls = scrapy.Field()
    files = scrapy.Field()

(3) settings.py:

    FILES_STORE = 'D:\\cnblogs'       # where downloaded files are saved
    FILES_URLS_FIELD = 'file_urls'    # item field holding the file URLs
    FILES_RESULT_FIELD = 'files'      # item field that receives the download results
    FILES_EXPIRES = 30                # files newer than 30 days are not re-downloaded
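As with images, the stock FilesPipeline needs no subclass: yielding an item whose file_urls field is filled is enough. A minimal sketch with a placeholder URL, inside a spider:

    def parse(self, response):
        # FilesPipeline downloads every URL in 'file_urls' and
        # records url/path/checksum dicts in 'files'
        yield {'file_urls': ['https://example.com/report.pdf']}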
When the file name should come from the item itself, subclass FilesPipeline just as with images:

    from scrapy.pipelines.files import FilesPipeline
    from scrapy.http import Request

    class LolPipeline(FilesPipeline):
        # endpoint that resolves a video id to a downloadable URL
        base_url = 'https://qt.qq.com/php_cgi/cod_video/php/get_video_url.php?game_id=2103041&vid='

        def get_media_requests(self, item, info):
            url = self.base_url + item['vid']
            # a single Request (rather than a list) is also accepted here
            return Request(url=url, meta={'view': item['view'], 'name': item['game_name']})

        def file_path(self, request, response=None, info=None):
            view = request.meta['view']
            name = request.meta['name']
            path = str(view) + name + '.mp4'
            return path
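A sketch of the matching settings (module path and store directory are placeholder names). Note that Scrapy's media pipelines treat redirected downloads as failures unless MEDIA_ALLOW_REDIRECTS is enabled, which matters here because the stats below include 302 responses:

    ITEM_PIPELINES = {
        'myproject.pipelines.LolPipeline': 300,
    }
    FILES_STORE = 'D:\\lol_videos'     # placeholder download directory
    MEDIA_ALLOW_REDIRECTS = True       # follow the 302 from get_video_url.php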
A run of this spider produces downloader stats like the following; the 302 entries show that the video endpoint answers with redirects rather than serving the file directly:

    'downloader/response_status_count/200': 42,
    'downloader/response_status_count/302': 41,