Target site: Jandan 随手拍 (jandan.net/ooxx)
Libraries: requests, bs4, lxml, tqdm (you'll need to pip install these)
Disclaimer: the scraped images are for learning purposes only; do not use them for anything else!!
Open the site and you'll see there are 80 pages of images. Inspect the page source and, with a quick search, you can easily locate where the image links sit.
The code below collects the image links into a list.
def get_content_page(html):
    try:
        soup = BeautifulSoup(html, "lxml")
        # the image links live inside div#comments > ol > li > div.text > p > a
        div = soup.find('div', attrs={'id': 'comments'})
        ol_list = div.find(name='ol')
        for ol in ol_list.find_all(name='li'):
            try:
                text_list = ol.find('div', attrs={'class': 'text'})
                # the href is protocol-relative, so prepend 'http:'
                a_list = 'http:' + text_list.p.a['href']
                if a_list not in totle_list:  # skip duplicates
                    print(a_list)
                    totle_list.append(a_list)
            except AttributeError:
                pass  # some <li> entries are not image comments
    except Exception:
        print('No such page')
    return totle_list
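If you want to see that selector chain on its own, here is a minimal, self-contained sketch run against a hand-written HTML fragment; the fragment only mimics the structure implied by the selectors above and is not copied from jandan.net:

from bs4 import BeautifulSoup

# Hand-written fragment shaped like the comment list (assumed structure).
sample = '''
<div id="comments"><ol>
  <li><div class="text"><p><a href="//img.example.com/pic/001.jpg">[img]</a></p></div></li>
</ol></div>
'''
soup = BeautifulSoup(sample, 'lxml')
li = soup.find('div', attrs={'id': 'comments'}).find('ol').find('li')
link = 'http:' + li.find('div', attrs={'class': 'text'}).p.a['href']
print(link)  # http://img.example.com/pic/001.jpg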
The site has 80 pages of images. Clicking through the previous/next-page links reveals a quirk in the URLs: only the two letters just before the '=' sign differ; everything else is identical. The letters change from page to page with no obvious rule, and I couldn't be bothered to dig for one, so I simply built a list of the 26 letters in both lower and upper case and looped over every two-letter combination: if the URL exists, scrape it; if not, skip it. A bit time-consuming, but simple and brute-force.
ur = ['a','s','d','f','g','h','j','k','l','z','x','c','v','b','n','m','q','w','e','r','t','y','u','i','o','p',
      'A','S','D','F','G','H','J','K','L','Z','X','C','V','B','N','M','Q','W','E','R','T','Y','U','I','O','P']

def get_totle_page(url):
    global headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except requests.ConnectionError:
        print('Request failed!!!')
    return None

for i in tqdm(ur):
    for n in tqdm(ur):
        url = "http://jandan.net/ooxx/MjAyMDEwMTUtN{l1}{l2}=#comments".format(l1=i, l2=n)
        html = get_totle_page(url)
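As an aside, that "random"-looking slug is actually just Base64: MjAyMDEwMTUtNzg= decodes to 20201015-78, i.e. the date plus the page number. So the page URLs could be generated directly instead of brute-forced; here's a small sketch of that alternative (not how the original script works):

import base64

# 'MjAyMDEwMTUtNzg=' decodes to '20201015-78': date, then page number.
print(base64.b64decode('MjAyMDEwMTUtNzg=').decode())  # 20201015-78

# Build the URL for a given page directly instead of guessing letters.
page = 78
slug = base64.b64encode('20201015-{}'.format(page).encode()).decode()
print('http://jandan.net/ooxx/{}#comments'.format(slug))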
Nothing much to explain here; straight to the code.
def save_img_list(message):
    if not os.path.exists('./picture_Libs'):
        os.mkdir('./picture_Libs')
    for totle in message:
        try:
            response = requests.get(url=totle, headers=headers)
            if response.status_code == 200:
                # write the raw bytes only when the download succeeded
                img_path = './picture_Libs/' + totle.split('/')[-1]
                with open(img_path, 'wb') as fp:
                    fp.write(response.content)
        except requests.RequestException:
            print('Request failed!!!')
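Note that save_img_list relies on the global headers that get_totle_page sets, so it only works after at least one page request; a quick standalone test needs headers defined first (the image URL below is a made-up placeholder):

headers = {'User-Agent': 'Mozilla/5.0'}  # normally set by get_totle_page
save_img_list(['http://img.example.com/pic/001.jpg'])  # hypothetical URL; a failed download just prints the error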
Full source:
import requests
from bs4 import BeautifulSoup
import os
from tqdm import tqdm

totle_list = []
ur = ['a','s','d','f','g','h','j','k','l','z','x','c','v','b','n','m','q','w','e','r','t','y','u','i','o','p',
      'A','S','D','F','G','H','J','K','L','Z','X','C','V','B','N','M','Q','W','E','R','T','Y','U','I','O','P']

def get_totle_page(url):
    global headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except requests.ConnectionError:
        print('Request failed!!!')
    return None

def get_content_page(html):
    try:
        soup = BeautifulSoup(html, "lxml")
        div = soup.find('div', attrs={'id': 'comments'})
        ol_list = div.find(name='ol')
        for ol in ol_list.find_all(name='li'):
            try:
                text_list = ol.find('div', attrs={'class': 'text'})
                a_list = 'http:' + text_list.p.a['href']
                if a_list not in totle_list:
                    print(a_list)
                    totle_list.append(a_list)
            except AttributeError:
                pass
    except Exception:
        print('No such page')
    return totle_list

def save_img_list(message):
    if not os.path.exists('./picture_Libs'):
        os.mkdir('./picture_Libs')
    for totle in message:
        try:
            response = requests.get(url=totle, headers=headers)
            if response.status_code == 200:
                img_path = './picture_Libs/' + totle.split('/')[-1]
                with open(img_path, 'wb') as fp:
                    fp.write(response.content)
        except requests.RequestException:
            print('Request failed!!!')

def main():
    for i in tqdm(ur):
        for n in tqdm(ur):
            url = "http://jandan.net/ooxx/MjAyMDEwMTUtN{l1}{l2}=#comments".format(l1=i, l2=n)
            print(url)
            html = get_totle_page(url)
            if html:
                get_content_page(html)

main()
save_img_list(totle_list)
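One caveat: with 52 letters the loop tries 52 × 52 = 2704 URLs, and requests.get without a timeout can hang indefinitely on a bad connection. Passing a timeout, e.g. requests.get(url, headers=headers, timeout=10), keeps the crawl moving; the timeout keyword is part of the standard requests API.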