This throwaway post is for people who are just starting to write scrapers; old hands who have been living in the scraping trenches for years can skip it (go analyze it yourself, you freeloaders!).
Without further ado (that was already quite a lot of ado), open Baidu Images (https://image.baidu.com).
Type the name of whatever you want to download into the search box, for example 白云 (clouds) or 狗 (dog). I went with "狗".
Next, open the developer tools (F12), tick "Preserve log", and filter the Network panel by XHR. As you scroll the results, you'll see the page firing requests at a `search/acjson` endpoint that returns the image list as JSON; that's the request we are going to reproduce.
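Before wiring anything into Scrapy, it's worth confirming that this XHR endpoint really hands back JSON. Here is a minimal sketch using the `requests` library; the trimmed-down parameter set and the browser-style User-Agent are my assumptions, so if the server is picky, paste in the full URL captured in DevTools instead:

```python
import json
from urllib.parse import quote

import requests

# Trimmed version of the acjson URL seen in the XHR tab: queryWord/word
# carry the (percent-encoded) keyword, pn is the result offset, rn the
# page size.
url = (
    "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
    "&fp=result&queryWord={kw}&cl=2&lm=-1&ie=utf-8&oe=utf-8"
    "&word={kw}&nc=1&pn=30&rn=30"
).format(kw=quote("狗"))

# Assumed browser-like User-Agent; Baidu tends to reject obvious bots.
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)

# The body is JSON; if json.loads chokes on stray backslash escapes,
# pre-clean it with resp.text.replace(r"\'", "'") first.
data = json.loads(resp.text)
print(data.get("displayNum"))        # total number of results
for img in data.get("data", []):
    print(img.get("middleURL"))      # thumbnail URL; may be absent
```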
Time to write some code. Create a Scrapy project first (if you don't know how, search for it). The spider looks like this:

```python
# -*- coding: utf-8 -*-
import re
import json
from urllib.parse import quote

import scrapy

from spiderframe.items import ImgsItem

# Paging endpoint behind Baidu image search; {category} is the URL-encoded
# keyword and {page} is the result offset (30 results per request).
BASE_URL = (
    "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
    "&ct=201326592&is=&fp=result&queryWord={category}&cl=2&lm=-1"
    "&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright="
    "&word={category}&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1"
    "&fr=&expermode=&force=&pn={page}&rn=30&gsm=1e&1560505489300="
)


class ImageBaiduSpider(scrapy.Spider):
    name = 'image_baidu'

    def __init__(self, category=None, *args, **kwargs):
        super(ImageBaiduSpider, self).__init__(*args, **kwargs)
        self.category = category

    def start_requests(self):
        url = BASE_URL.format(category=quote(self.category), page=30)
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        resp = json.loads(response.text)
        data = resp.get("data", [])

        img_urls = []
        for img in data:
            hover_url = img.get("middleURL")
            if hover_url:
                img_urls.append(hover_url)

        item = ImgsItem()
        item["category"] = self.category
        item["image_urls"] = img_urls
        yield item

        # Keep paging until the offset reaches the reported total.
        total_num = resp.get("displayNum")
        current_num = re.findall('&pn=(.*?)&rn=30', response.url)[0]
        if int(current_num) < int(total_num):
            url = BASE_URL.format(category=quote(self.category),
                                  page=int(current_num) + 30)
            yield scrapy.Request(url=url, callback=self.parse,
                                 dont_filter=True)
```
The items are defined like this:

```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SpiderframeItem(scrapy.Item):
    # Unrelated to the image spider; used elsewhere in the project.
    url = scrapy.Field()
    content = scrapy.Field()


class ImgsItem(scrapy.Item):
    # This is the one the image spider fills in.
    category = scrapy.Field()
    image_urls = scrapy.Field()  # list of image URLs for ImagesPipeline to fetch
    images = scrapy.Field()      # ImagesPipeline writes its download results here
```
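For orientation: `image_urls` is the field you fill in, and `images` is where ImagesPipeline records each download once it has run. A yielded item ends up looking roughly like this (all values illustrative):

```python
{
    "category": "狗",
    "image_urls": ["https://img1.baidu.com/it/u=1234,5678.jpg"],
    "images": [{
        "url": "https://img1.baidu.com/it/u=1234,5678.jpg",
        "path": "狗/0a1b2c3d4e5f.jpg",  # as produced by the custom file_path() below
        "checksum": "0a1b2c3d...",      # MD5 of the downloaded bytes
    }],
}
```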
The pipeline is written like this:

```python
import os

from scrapy.pipelines.images import ImagesPipeline

from spiderframe import settings


class ImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Called before the download requests go out; in fact this method is
        # what builds them. Attach the item to each request so that
        # file_path() can read the category later.
        request_objs = super(ImagePipeline, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None):
        # Called when an image is about to be stored, to decide its path.
        path = super(ImagePipeline, self).file_path(request, response, info)
        category = request.item.get('category')
        image_store = settings.IMAGES_STORE
        category_path = os.path.join(image_store, category)
        if not os.path.exists(category_path):
            os.makedirs(category_path)
        # Drop the default "full/" prefix and group the files by category.
        image_name = path.replace("full/", "")
        image_path = os.path.join(category_path, image_name)
        return image_path
```
The settings look like this (`os` is imported at the top of settings.py):

```python
import os

# image info
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'spiderframe/files/image')

ITEM_PIPELINES = {
    'spiderframe.pipelines.SpiderframePipeline': 300,
    'spiderframe.pipelines.RedisPipeline': 350,
    'spiderframe.pipelines.MySQLPipeline': 400,
    'spiderframe.pipelines.ImagePipeline': 450,  # this one handles the images; ignore the rest
}
```
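One dependency note: Scrapy's ImagesPipeline needs Pillow installed to process images (this is a documented requirement), so if you haven't got it yet:

```
pip install Pillow
```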
Get the spider running and that's it (if you don't know how to run a Scrapy spider, search for it).
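Assuming the project scaffolding is in place, a run looks something like this; the `-a category=...` argument is what lands in the spider's `__init__`:

```
scrapy crawl image_baidu -a category=狗
```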
Two small gotchas: first, the keyword passed into the spider ("狗") has to be URL-encoded; second, the links must use https, not http. Miss either one and the image URLs that come back won't download.
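For the encoding gotcha, `urllib.parse.quote` (already used in the spider above) percent-encodes the keyword's UTF-8 bytes:

```python
from urllib.parse import quote

print(quote("狗"))  # %E7%8B%97, the UTF-8 bytes of 狗, percent-encoded
```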