赞
踩
具体分析过程就不写了,给出我学习的链接:https://blog.csdn.net/qq_35371031/article/details/81207966
上代码
import hashlib
import os
import re
import threading
import time
import urllib.parse

import requests


class picture:
    """Crawl Baidu Images: download thumbnails for a search keyword.

    Constructing an instance immediately runs the whole crawl: result pages
    are requested from the ``image.baidu.com/search/acjson`` endpoint, the
    ``thumbURL`` entries ending in ``.jpg`` are extracted from each response,
    and every thumbnail is saved as ``<md5 of its URL>.jpg`` inside a
    directory named after the keyword.

    Args:
        picture_name: Search keyword; also used as the output directory name.
        picture_number: Maximum number of thumbnails to download. Anything
            accepted by ``int()`` works (the script passes the raw string
            returned by ``input()``).
        path: Accepted for backward compatibility but currently unused; the
            output directory is always ``picture_name``.

    Raises:
        ValueError: If ``picture_number`` cannot be converted to ``int``.
        SystemExit: If the search page or any thumbnail does not answer
            with HTTP 200.
        requests.exceptions.RequestException: On network failures, including
            a request exceeding ``REQUEST_TIMEOUT`` seconds.
    """

    # Step between successive ``pn`` values; must stay in sync with the
    # hard-coded ``rn=60`` in the search URL (presumably results per page).
    PAGE_SIZE = 60

    # Seconds to wait on each HTTP request before giving up, so a stalled
    # connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    # Thumbnail URLs inside the response text. ``[^"]`` keeps a match inside
    # one quoted string and ``\.`` requires a literal dot before "jpg".
    THUMB_URL_PATTERN = re.compile(r'(?<=thumbURL":")[^"]*?\.jpg')

    def __init__(self, picture_name, picture_number=100, path='picture'):
        # Record the start time first so __del__ can always report a duration,
        # even when the int() conversion below raises.
        self.start_time = time.time()
        # NOTE(review): ``path`` is ignored; the keyword doubles as the
        # directory name. Kept that way so existing output locations do not
        # change.
        self.save_path = picture_name
        self.picture_number = int(picture_number)
        self.picture_name = picture_name
        self.header = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        }
        # exist_ok avoids the listdir('.') membership check, which is
        # race-prone and wrong for nested or absolute paths.
        os.makedirs(self.save_path, exist_ok=True)
        self.start()

    def start(self):
        """Request result pages at offsets 0, 60, 120, ... below the limit."""
        for offset in range(0, self.picture_number, self.PAGE_SIZE):
            self.get_picture_content(offset)

    def get_picture_content(self, count):
        """Fetch the result page at offset ``count`` and save its thumbnails.

        At most ``PAGE_SIZE`` thumbnails are saved per page, and never more
        than are still needed to reach ``picture_number`` in total.

        Args:
            count: Value sent as the ``pn`` query parameter; also the number
                of thumbnails attributed to earlier pages.

        Raises:
            SystemExit: If the page or a thumbnail request is not HTTP 200.
        """
        url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&rn=60&word={0}&pn={1}'.format(urllib.parse.quote(self.picture_name), str(count))
        print(url)
        page = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        if page.status_code != 200:
            # Same effect as the site-provided exit(): message on stderr and
            # exit status 1, without depending on the ``site`` module.
            raise SystemExit("访问百度图库错误")

        links = self.THUMB_URL_PATTERN.findall(page.text)
        # Thumbnails still owed after the previous pages, capped at one page.
        wanted = max(0, min(self.PAGE_SIZE, self.picture_number - count))
        # Slicing (instead of indexing) tolerates pages that return fewer
        # links than requested.
        for link in links[:wanted]:
            res = requests.get(link, headers=self.header, timeout=self.REQUEST_TIMEOUT)
            if res.status_code != 200:
                raise SystemExit('访问图片链接错误')
            self.save_picture(res.content, link)

    def save_picture(self, content, picture_name):
        """Write ``content`` to ``<save_path>/<md5(picture_name)>.jpg``.

        Args:
            content: Raw image bytes.
            picture_name: Text hashed to build the file name; the crawler
                passes the thumbnail URL here.
        """
        digest = hashlib.md5(picture_name.encode()).hexdigest()
        with open("{0}/{1}.jpg".format(self.save_path, digest), 'wb') as f:
            f.write(content)

    def __del__(self):
        # Report elapsed time when the object is collected. The guard covers
        # instances whose __init__ never ran.
        started = getattr(self, 'start_time', None)
        if started is not None:
            print("花费了{}s时间".format(str(time.time() - started)))


if __name__ == "__main__":
    picture_name = input("输入你要爬取的图片类型 ")
    number = input('输入你想爬取的数量 ')
    pic = picture(picture_name, number)
我没有写多线程,在我本地测试中爬取了1000张图片。
PS:
就这(狗头)??
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。