赞
踩
代码用于测试爬取通过Ajax异步请求加载的百度图片;注释掉测试部分后可以当作模块导入。
# Batch-download images from Baidu image search (Ajax "acjson" endpoint).
import os
import re
import sys
import threading
import time
import urllib.parse
import urllib.request
import urllib.response
import uuid

# Multi-character tokens Baidu uses for URL punctuation in an encoded objURL.
_TOKEN_TABLE = {'_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
_TOKEN_RE = re.compile(r'_z2C\$q|_z&e3B|AzdH3F')

# Single-character substitution cipher applied to the rest of the URL.
_CHAR_TABLE = str.maketrans({
    'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f', '2': 'g',
    'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l', '4': 'm', 'g': 'n',
    '5': 'o', 'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't', '7': 'u',
    'e': 'v', 'o': 'w', '8': '1', 'd': '2', 'n': '3', '9': '4', 'c': '5',
    'm': '6', '0': '7', 'b': '8', 'l': '9', 'a': '0',
})

# Search endpoint; {0} is the result offset (pn), {1} is the gsm parameter.
_SEARCH_URL = (
    'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%B0%8F%E5%A7%90%E5%A7%90&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word=%E5%B0%8F%E5%A7%90%E5%A7%90&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&'
    'force=&pn={0}&rn=30&gsm={1}&1565847642409='
)

_PAGE_SIZE = 30
_MAX_CONSECUTIVE_FAILURES = 5


class Downloader(threading.Thread):
    """Thread that downloads a single URL to a local file.

    :param url: address of the resource to fetch
    :param fileName: path of the local file to write
    """

    def __init__(self, url, fileName):
        threading.Thread.__init__(self)
        self.__url = url
        self.__fileName = fileName

    @staticmethod
    def download(bockNum, blockSzie, contentLength):
        """Progress hook for ``urllib.request.urlretrieve``.

        :param bockNum: number of blocks transferred so far
        :param blockSzie: size of one block in bytes
        :param contentLength: total size in bytes; may be 0 or -1 when the
            server does not report a usable length
        """
        # Without a positive total there is no meaningful percentage, and
        # dividing would raise ZeroDivisionError or print a negative value.
        if contentLength <= 0:
            return
        p = 100 * bockNum * blockSzie / contentLength
        if p > 100:
            p = 100
        sys.stdout.write('下载进度{0}%'.format(p))
        sys.stdout.flush()

    def run(self):
        """Download the URL, printing start/finish markers to stdout."""
        print('开始下载')
        try:
            urllib.request.urlretrieve(self.__url, self.__fileName,
                                       self.download)
        except (OSError, ValueError) as e:
            # URLError/HTTPError derive from OSError; ValueError covers
            # malformed URLs. Report a single line instead of letting the
            # thread die with a traceback.
            print(e)
            return
        print('下载完成')


def decode_url(url):
    """Decode an object URL obfuscated by Baidu image search.

    The three punctuation tokens are replaced first, then every remaining
    character is mapped through the substitution table. Characters that
    have no mapping are kept unchanged.

    :param url: the Baidu-encoded URL
    :return: the decoded URL
    """
    url = _TOKEN_RE.sub(lambda matched: _TOKEN_TABLE[matched.group()], url)
    return url.translate(_CHAR_TABLE)


def _file_extension(url, default='jpg'):
    """Return the file extension of the URL's path, or *default*.

    Only the path component is inspected, so query strings and host names
    never leak into the result, and the result never contains ``/``.
    """
    path = urllib.parse.urlsplit(url).path
    ext = os.path.splitext(path)[1].lstrip('.')
    return ext if ext.isalnum() else default


def main():
    """Crawl the search result pages and start one download per image.

    Stops when a page yields no image URLs (end of results) or after
    several consecutive pages fail to load or parse.
    """
    # Third-party dependency; imported here so that importing this module
    # for decode_url/Downloader does not require it.
    import requests

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    i = 0
    failures = 0
    while failures < _MAX_CONSECUTIVE_FAILURES:
        time.sleep(10)  # throttle requests to the search endpoint
        url = _SEARCH_URL.format(i, hex(i))
        started = 0
        try:
            response = requests.get(url, headers=header, timeout=30)
            obj = response.json()
            for temp in obj['data']:
                imgUrl = temp.get('objURL')
                if not imgUrl:
                    continue
                if imgUrl.startswith('ippr_z2C'):
                    imgUrl = decode_url(imgUrl)
                fileName = '{0}.{1}'.format(uuid.uuid4(),
                                            _file_extension(imgUrl))
                Downloader(imgUrl, fileName).start()
                started += 1
        except (requests.RequestException, ValueError, KeyError,
                TypeError, AttributeError) as e:
            # Bad response for this page: report it and try the next page.
            print(e)
            failures += 1
        else:
            failures = 0
            if not started:
                # A well-formed page without images means no more results.
                break
        i += _PAGE_SIZE


if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。