赞
踩
实现爬取任意数量的图片。百度图片是动态加载的,只用 requests 抓取静态页面很难拿到后续加载的图片,因此这里结合 selenium 和 requests 来爬取;其实也可以只用 selenium 来完成。
需要下载chromedriver和chrome浏览器。(chromedriver和这个程序放在同一个文件夹中)
需要为 Python 安装 selenium、requests 和 lxml 库。
"""Download a requested number of Baidu image-search results.

Selenium drives a Chrome window and scrolls the result page so that more
results are loaded; lxml extracts the thumbnail URLs from the page source and
requests downloads the image bytes into ``./<keyword>/<n>.jpg``.
"""
import os
import time
from urllib.parse import quote

# Third-party packages (selenium, lxml, requests) are imported inside the
# functions that use them so the pure helpers below stay importable without
# the browser stack installed.

SEARCH_URL = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111110&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word="

HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
    "referer": "https://image.baidu.com/",
}

# Seconds to wait after scrolling so the page has a chance to add results.
SCROLL_PAUSE_SECONDS = 1.0
# Give up after this many consecutive scrolls that reveal nothing new;
# without a limit the loop would spin forever once the results run out.
MAX_STALLED_SCROLLS = 10
# Per-request timeout so one unresponsive image host cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 10


def mkdir(path):
    """Create directory *path* (with parents) unless it already exists.

    Prints a short status line either way and returns ``None``.
    """
    if os.path.exists(path):
        print(f"folder{path} already exists")
    else:
        os.makedirs(path)
        print(f"folder{path} success")


def _pending_urls(urls, seen, remaining):
    """Return the not-yet-processed tail of *urls*, capped at *remaining*.

    *seen* is how many leading entries were handled on earlier passes; the
    slice starts exactly there so nothing is repeated and nothing is skipped.
    """
    return urls[seen:seen + max(remaining, 0)]


def _normalize_url(url):
    """Return a fetchable http(s) URL, or ``None`` when *url* is unusable.

    Protocol-relative URLs get an ``https:`` prefix; anything else that is
    not http(s) (for example an inline ``data:`` URI) is rejected.
    """
    if not url:
        return None
    if url.startswith("//"):
        return "https:" + url
    if url.startswith(("http://", "https://")):
        return url
    return None


def _image_urls(page_source):
    """Return one entry per ``div.imgbox`` in *page_source*, in page order.

    An entry is the ``src`` of the box's ``a/img`` element, or ``None`` when
    the box has none, so that list positions stay aligned with the boxes.
    """
    from lxml import etree

    tree = etree.HTML(page_source)
    if tree is None:
        return []
    urls = []
    for box in tree.xpath('//div[@class="imgbox"]'):
        sources = box.xpath('./a/img/@src')
        urls.append(sources[0] if sources else None)
    return urls


def _download(url, destination):
    """Fetch *url* into *destination*; return ``True`` only if a file was written."""
    import requests

    target = _normalize_url(url)
    if target is None:
        return False
    try:
        response = requests.get(
            url=target, headers=HEADERS, timeout=REQUEST_TIMEOUT_SECONDS
        )
        # Do not save an HTML error page under a .jpg name.
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"skip {target}: {err}")
        return False
    with open(file=destination, mode="wb") as fp:
        fp.write(response.content)
    return True


def main():
    """Prompt for a keyword and a count, then save that many images."""
    from selenium import webdriver

    word = input("关键词")
    # int() instead of eval(): never execute text typed by the user.
    num = int(input("数量"))
    path = "./" + word
    mkdir(path)

    # Start the browser driver.
    # NOTE(review): newer Selenium 4 releases appear to drop `executable_path`
    # in favour of a `Service` object - confirm against the installed version.
    bro = webdriver.Chrome(executable_path='./chromedriver.exe')
    try:
        # Percent-encode the keyword so spaces, '&' and similar characters
        # cannot corrupt the query string.
        bro.get(SEARCH_URL + quote(word))

        saved = 0   # files written so far; the next file is <saved + 1>.jpg
        seen = 0    # result boxes already processed, by page position
        stalls = 0  # consecutive scrolls that produced no new boxes
        while saved < num and stalls < MAX_STALLED_SCROLLS:
            batch = _pending_urls(_image_urls(bro.page_source), seen, num - saved)
            if not batch:
                # Nothing new on the page: scroll to the bottom to trigger
                # loading of further results, then look again.
                bro.execute_script("window.scrollTo(0,document.body.scrollHeight)")
                time.sleep(SCROLL_PAUSE_SECONDS)
                stalls += 1
                continue
            stalls = 0
            for url in batch:
                seen += 1
                destination = os.path.join(path, f"{saved + 1}.jpg")
                if _download(url, destination):
                    saved += 1
                    print(saved, url)
        if saved < num:
            print(f"only {saved} of {num} images could be downloaded")
    finally:
        # Always shut the browser down, even if scraping failed part-way.
        bro.quit()


if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。