With scrapy's ImagesPipeline you only need to parse out each img tag's src attribute and submit it to the pipeline; the pipeline then sends a request to that src itself, fetches the image's binary data, and saves it locally.
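In other words, a subclass of scrapy's built-in ImagesPipeline only has to override a couple of hooks (a minimal sketch; the class name is made up here, and 'src' is the item field name this project uses):

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class SketchImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # The pipeline downloads whatever requests are yielded here.
        yield scrapy.Request(item['src'])

    def item_completed(self, results, item, info):
        return item  # hand the item on to any later pipeline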
The site scraped this time: https://www.tuiimg.com/meinv/
Who doesn't love that, haha.
Original idea: follow each gallery link into its detail page and scrape the full-size images there. This turned out not to be workable, because the detail page only shows all the images after you click "expand full gallery", which calls for selenium-style browser automation; I tried combining selenium with scrapy and failed, so I switched to a different method.
I then noticed that on the listing page a preview image's src is https://i.tuiimg.net/007/3007/c.jpg
Clicking into its detail page, each photo's url turns out to be the same address with only the file name swapped out: https://i.tuiimg.net/007/3007/1.jpg, .../2.jpg, and so on.
So all I need to do is replace the preview image's c.jpg with 1.jpg, 2.jpg, 3.jpg ... to get every image's src.
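The trick boils down to a few lines (a quick sketch using the example src above; the 75 stands in for the per-gallery image count read later from the page):

preview = "https://i.tuiimg.net/007/3007/c.jpg"
template = preview[:preview.rfind('/')]               # https://i.tuiimg.net/007/3007
urls = [f"{template}/{i}.jpg" for i in range(1, 76)]  # e.g. a 75-image gallery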
hotgirls.py
parse() collects the image-src template for every gallery, i.e. the preview src with the trailing c.jpg stripped off.
parse_length() reads how many images each gallery contains, builds the src of every image, and yields each one to the pipeline for downloading.
import scrapy

from hotgilrsPro.items import HotgilrsproItem


class HotgirlsSpider(scrapy.Spider):
    name = "hotgirls"
    # allowed_domains = ["www.xxx.com"]
    start_urls = ["https://www.tuiimg.com/meinv/"]
    name_src_list = []  # debugging record of every gallery found

    def parse_length(self, response):
        item = response.meta['item']
        # The counter on the page reads "展开全图(1/75)"; keep the 75 as an int.
        length = response.xpath('.//div[@id="page"]/span[2]/i//text()').extract_first()
        length = int(length.split('/')[-1][:-1])
        item['length'] = length
        for i in range(1, length + 1):
            # Yield one copy per image so every item keeps its own src.
            img_item = item.copy()
            img_item['src'] = item['img_template'] + f"/{i}.jpg"
            yield img_item  # 2. submit the item holding the image address to the pipeline

    def parse(self, response):
        li_list = response.xpath('/html/body/div[3]/ul/li')
        for li in li_list:
            name = li.xpath('.//a[2]/text()').extract_first()
            img_template = li.xpath('.//a[1]/img/@src').extract_first()
            # Cut off the trailing /c.jpg to keep only the template part.
            img_template = img_template[:img_template.rfind('/')]
            page_src = li.xpath('./a[1]/@href').extract_first()
            print("gallery:", name, "| template:", img_template)
            # One item per gallery, so each detail request carries its own
            # name and template instead of sharing one mutable item.
            item = HotgilrsproItem()
            item['name'] = name
            item['img_template'] = img_template
            item['page_url'] = page_src
            self.name_src_list.append({'name': name, 'img_template': img_template})
            # The request is asynchronous; parse_length runs once the
            # detail-page response arrives, with the item riding in meta.
            yield scrapy.Request(url=page_src, callback=self.parse_length, meta={'item': item})
        print(self.name_src_list)
path = 'images/' + str(self.imgName) + '.jpg'
When building the image path here, a directory name has to be prepended (the images/ part), otherwise the images are not saved locally at all!!
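For context, the ImagesPipeline joins whatever file_path() returns onto the IMAGES_STORE setting, so the final locations come out like this (a quick check using the values from this project):

import os

IMAGES_STORE = './imgs_hotgirls'             # value set in settings.py below
relative = 'images/1.jpg'                    # what file_path() returns
print(os.path.join(IMAGES_STORE, relative))  # ./imgs_hotgirls/images/1.jpg

pipelines.py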
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import scrapy

from scrapy.pipelines.images import ImagesPipeline

# from hotgilrsPro.spiders.hotgirls import name_src_list
# Remember to update settings.py when enabling a pipeline here.

# class HotgilrsproPipeline:
#     def process_item(self, item, spider):
#         return item


class imgsPileLine(ImagesPipeline):
    imgName = 1

    # Request the image data from the address stored on the item.
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'])

    # Decide the storage path of each image. I originally wanted to scrape
    # the gallery names too and sort the images into one folder per gallery,
    # but could not pull that off, so the files are simply numbered
    # 1, 2, 3, ... in a single folder.
    def file_path(self, request, response=None, info=None):
        # Build the name first so the printed path matches the saved file.
        path = 'images/' + str(self.imgName) + '.jpg'
        print("Saving image", request.url, "as", path)
        self.imgName += 1
        return path

    def item_completed(self, results, item, info):
        return item  # pass the item on to the next pipeline class to run
settings.py

# Directory where the downloaded images are stored
IMAGES_STORE = './imgs_hotgirls'

# Enable the custom pipeline (lower number = earlier in the pipeline order)
ITEM_PIPELINES = {
    # "hotgilrsPro.pipelines.HotgilrsproPipeline": 300,
    "hotgilrsPro.pipelines.imgsPileLine": 250,
}
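With the pipeline registered and IMAGES_STORE set, the spider can be run from the project root with scrapy crawl hotgirls (the name declared on the spider class), and the images should end up under ./imgs_hotgirls/images/.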
items.py

import scrapy


class HotgilrsproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    length = scrapy.Field()        # number of images in the gallery
    name = scrapy.Field()          # gallery name
    src = scrapy.Field()           # full-size image url, read by the pipeline
    page_url = scrapy.Field()      # detail-page url of the gallery
    img_template = scrapy.Field()  # preview src with the trailing /c.jpg removed