赞
踩
因为需要一些图片素材,又不想一个个手动下载,遂通过爬虫来解放双手。在百度图片中搜索“汉服美女”,然后以浏览器地址栏上的地址作为初始 URL。通过对 URL 分析知道 URL 分为 3 部分:域名 + 固定参数 + 关键字参数。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author: Nancy
# @contact: fweiren@163.com
# @software: PyCharm
# @file: getHanfu.py
# @time: 2019/2/23 14:34
import re
import time

import requests


class BaiduPictures(object):
    """Download the images referenced by a Baidu image-search result page."""

    # Captures the URL stored in each "objURL" entry of the page text.
    _OBJ_URL_PATTERN = re.compile(r'"objURL":"(http.*?)",')
    # Extensions that are kept as-is; anything else is saved as ".jpg".
    _KNOWN_SUFFIXES = ("jpg", "jpeg", "png")

    def __init__(self, keyboard, timeout=10):
        """
        :param keyboard: stored as a string on ``self.keyboard``
            (presumably the search keyword -- confirm against the caller)
        :param timeout: seconds passed to ``requests.get`` as its timeout, so
            a stalled server cannot block the crawl forever
        """
        self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3355.4 Safari/537.36"}
        self.base_url = "https://image.baidu.com"
        self.keyboard = str(keyboard)
        self.timeout = timeout

    def _get(self, url):
        """Fetch ``url``; return the response, or ``None`` after printing the error."""
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Treat 4xx/5xx as failures so error pages are never saved as images.
            response.raise_for_status()
        except Exception as e:
            # Best-effort: report the problem and let the caller skip this URL.
            print(e)
            return None
        return response

    def send_request(self, url):
        """
        :param url: address to fetch
        :return: decoded response text, or ``None`` if the request failed
        """
        response = self._get(url)
        return None if response is None else response.text

    def make_request(self, url):
        """
        :param url: address to fetch
        :return: raw response body as bytes, or ``None`` if the request failed
        """
        response = self._get(url)
        return None if response is None else response.content

    def _image_suffix(self, img_url):
        """Return the extension to save ``img_url`` under (``"jpg"`` if unknown)."""
        # Drop any query string / fragment so "a.png?w=1" is still seen as png.
        path = img_url.split("?", 1)[0].split("#", 1)[0]
        # [-1] rather than [1]: a URL without any "." must not raise IndexError.
        suffix = path.rsplit(".", 1)[-1].lower()
        return suffix if suffix in self._KNOWN_SUFFIXES else "jpg"

    def load_page(self, html):
        """
        Download every image listed in ``html`` and hand it to ``write_pic``.

        :param html: page text to scan for "objURL" entries; ``None`` or an
            empty string (e.g. a failed ``send_request``) is a no-op
        :return: None
        """
        if not html:
            return
        for img_url in self._OBJ_URL_PATTERN.findall(html):
            # Millisecond timestamp used as the file name; int() keeps the
            # name free of a trailing ".0" where round() returns a float.
            now_time = str(int(round(time.time() * 1000)))
            data = self.make_request(img_url)
            if not data:
                print(img_url + u"地址无效")
                continue
            # NOTE(review): write_pic is not defined in the visible part of
            # this class -- confirm it exists before running.
            self.write_pic(data, now_time + "." + self._image_suffix(img_url))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。