赞
踩
爬取网站:http://www.pes-stars.co.ua/?page1/
import os

import requests
import time
from lxml import etree

# First attempt: download every <img> on one listing page.
url = 'http://www.pes-stars.co.ua/?page1/'
# Minimal headers; an (almost) empty Referer for the initial request.
headers = {"Referer": " "}

resq = requests.get(url, headers=headers)
print(resq)

# Parse the page and collect every <img> src attribute.
html = etree.HTML(resq.text)
srcs = html.xpath(".//img/@src")

# Make sure the output directory exists before writing into it.
os.makedirs('imgs1', exist_ok=True)

for i in srcs:
    # Use the last path segment of the src as the local file name.
    imgname = i.split('/')[-1]
    # NOTE(review): this fails for srcs that are site-relative (missing the
    # "http:" scheme/host) — that is exactly the error described below and
    # fixed in the next version of the script.
    img = requests.get(i, headers=headers)
    with open('imgs1/' + imgname, 'wb') as file:
        file.write(img.content)
    print(i, imgname)
发现下载了网页前几个标题,到了真正要下载的正文部分的人脸图像时反而报错,查看每张图片的【src】,发现:
成功下载的图片的【src】:是带有 http: 前缀的完整绝对链接(原文示例图片已丢失)
报错的【src】:是缺少 http: 信息的站内相对链接(原文示例图片已丢失)
发现报错的【src】连接是有问题的,少了http:的信息,手动加上去,代码变成下面这样:
import os

import requests
import time
from lxml import etree

url = 'http://www.pes-stars.co.ua/?page1/'
# Send a real Referer and User-Agent so the site serves the images.
# The header VALUE is just the URL — the original code embedded the literal
# prefix "Referer: " inside the value, which is malformed.
headers = {
    "Referer": "http://www.pes-stars.co.ua/?page1/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70",
}

resq = requests.get(url, headers=headers)
print(resq)

html = etree.HTML(resq.text)
srcs = html.xpath(".//img/@src")

# Create the output directory once, up front.
os.makedirs('imgs1', exist_ok=True)

for i in srcs:
    print('1---i: ', i)
    imgname = i.split('/')[-1]
    # Only relative srcs need the host prepended; prepending it to an
    # already-absolute URL (as the original did) would corrupt it.
    j = i if i.startswith('http') else 'http://www.pes-stars.co.ua' + i
    print('2---j: ', j)
    try:
        img = requests.get(j, headers=headers)
        with open('imgs1/' + imgname, 'wb') as file:
            file.write(img.content)
        print(i, imgname)
    except requests.exceptions.ConnectionError:
        # Log and continue. The original assigned a string to
        # requests.status_code, which merely mutated a module attribute
        # and recorded nothing useful.
        print('Connection refused:', j)
成功下载。
翻看每一页的url,可以发现只有最后的数字不一样:
http://www.pes-stars.co.ua/?page1
http://www.pes-stars.co.ua/?page2
http://www.pes-stars.co.ua/?page3
...
直接在url里加一个for循环,选择要下载的页数:
# -*- coding:utf-8 -*-
# Author: 自学小白菜
'''
Having chosen to move forward, take every step seriously.
'''
import os

import requests
import time
from lxml import etree

# Header value is just the URL — the original "Referer: '<url>" value was
# malformed (duplicated header name plus a stray quote).
headers = {
    "Referer": "http://www.pes-stars.co.ua/?page",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70",
}

# Create the output directory once, before the download loop.
os.makedirs('imgs1', exist_ok=True)

for m in range(1, 301):  # scrape pages 1-300
    # NOTE: the original body did `m += 1`, which made this fetch pages
    # 2-301 and silently skip page 1; range() already yields 1..300.
    url = 'http://www.pes-stars.co.ua/?page' + str(m)
    print('url: ', url)
    resq = requests.get(url, headers=headers)
    print('resq: ', resq)

    html = etree.HTML(resq.text)
    srcs = html.xpath(".//img/@src")

    for i in srcs:
        # Only the .jpg face images are wanted; skip everything else.
        if not str(i).endswith('jpg'):
            continue
        imgname = i.split('/')[-1]
        # Prepend the host only for site-relative srcs; absolute URLs
        # must be left untouched.
        j = i if i.startswith('http') else 'http://www.pes-stars.co.ua' + i
        try:
            img = requests.get(j, headers=headers)
            with open('imgs1/' + imgname, 'wb') as file:
                file.write(img.content)
            print(i, imgname)
        except requests.exceptions.ConnectionError:
            # Skip unreachable images instead of mutating module state
            # (the original wrote a string into requests.status_code).
            print('Connection refused:', j)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。