赞
踩
1.分析网站页面以及请求
获取请求网址以及参数,可知查询参数为 q=id:xxx(壁纸类型 id)和 page(页码)
所以后期我们只需要拼接路径就可以自动获取每页的数据
"https://wallhaven.cc/search?q=id:"+类型+"&page="+页码;
分析列表页面找到 当前页面是否隐藏下载url,没有的话还需跳转到详情页面再获取下载链接
没有直接下载地址只有缩略图url https://th.wallhaven.cc/small/g7/g7qyzd.jpg
打开详情页面
找到图片原图地址 : https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg
比较发现,缩略图与原图都是这种规律
https://th.wallhaven.cc/small/g7/g7qyzd.jpg
https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg
所以我们只需要根据缩略图地址就可以拼接出原图地址
所以:
先获取每页图片列表的缩略图地址
-
- /**
- * 获取页面html,
- */
- public static String getHtml(String url) throws IOException {
- String name = Thread.currentThread().getName();
-
- BufferedReader in = null;
- String result="";
- try (InputStream ins = getInputStream(url)) {
-
- InputStreamReader read = new InputStreamReader( ins, "gb2312");// 考虑到编码格
- BufferedReader bufferedReader = new BufferedReader(read);
- String lineTxt = null;
- //读取一行
- while ((lineTxt = bufferedReader.readLine()) != null) {
- //正则表达式
- result += lineTxt;
- }
- read.close();
- bufferedReader.close();
- } catch (IOException e) {
- System.out.println("线程"+name+",页html文件不存在: " + url);
- // e.printStackTrace();
- }
-
- return result;
- }
-
-
- /**
- * 获取html所有某个相同标签上属性值
- * @param html
- * @param element 标签元素
- * @param attr 属性
- * @return
- */
- public static List match(String html, String element, String attr) {
-
- List result = new ArrayList();
-
- String reg = "]*?\\s" + attr + "=['\"]?(.*?)['\"]?(\\s.*?)?>";
-
- Matcher m = Pattern.compile(reg).matcher(html);
-
- while (m.find()) {
-
- String r = m.group(1);
-
- result.add(r);
-
- }
-
- return result;
-
- }
-
-
-
-
-
- public static String download(List<String> links,String type , int pageNum){
- /**
- https://th.wallhaven.cc/small/4l/4ly2o2.jpg
- https://w.wallhaven.cc/full/4l/wallhaven-4ly2o2.jpg
- */
- for (String url : links ) {
- String realUrl = "https://w.wallhaven.cc/full/".concat(url.split("/")[4]) .concat("/wallhaven-".concat(url.split("/")[5]));
- //System.out.println("当前路径: "+ realUrl);
- String dirSec = url.split("/")[4];
- String fileName = "-wallhaven-"+dirSec+"-"+url.split("/")[5];
- downFiles(realUrl, type , fileName,pageNum);
- }
-
-
- return "分任务下载完成";
- }
-
- /**
- * 下载文件
- * @return
- */
- public static void downFiles(String url,String type,String fileName,int pageNum){
-
- String name = Thread.currentThread().getName();
- try (InputStream ins = getInputStream(url)) {
- if(ins.available() ==0){
- return;
- }
- sumNum++;
- String saveDir = "I:\\picture\\"+type;
- Path target = Paths.get(saveDir, sumNum+fileName);
- Files.createDirectories(target.getParent());
- Files.copy(ins, target, StandardCopyOption.REPLACE_EXISTING);
- System.out.println("第"+pageNum+"页,"+"第"+sumNum+"张图片下载完成!");
- } catch (IOException e) {
- System.out.println("线程"+name+",第"+pageNum+"页,"+"下载文件时文件不存在: "+url);
- // e.printStackTrace();
- }
- }
-
- public static InputStream getInputStream(String url) throws IOException {
- URL realUrl = new URL(url);
-
- // 打开和URL之间的连接
-
- URLConnection connection = realUrl.openConnection();
-
- //模拟浏览器通用的请求属性
-
- connection.setRequestProperty("accept", "*/*");
-
- connection.setRequestProperty("connection", "Keep-Alive");
-
- connection.setRequestProperty("user-agent",
-
- "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
-
- // 建立实际的连接
-
- connection.connect();
-
- InputStream ins = connection.getInputStream();
- return ins;
- }
以上就是基本思路
下面完整代码,采用线程池异步任务,由于网站有连接数限制,最大3个线程同时下载访问
- 异步任务配置
- /**
- * 线程池的配置:
- * @EnableAsync 开启异步的支持
- *
- */
- @Slf4j
- @Configuration
- @ComponentScan("com.zhyqin.service")
- @EnableAsync
- public class TaskExecutorConfig implements AsyncConfigurer {
- @Override
- public Executor getAsyncExecutor() {
- //初始化线程池
- ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
- taskExecutor.setMaxPoolSize(3);
- taskExecutor.setCorePoolSize(3);
- taskExecutor.setQueueCapacity(3);
- taskExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
- taskExecutor.initialize();
- log.info("线程池初始化成功!!");
- return taskExecutor;
- }
异步任务:
- @Slf4j
- @Service
- public class AsyncTaskService {
-
- @Async
- public void executeAsyncTask(String type, int pageNum){
- String name = Thread.currentThread().getName();
-
- log.info("当前线程"+name+"执行任务:{},当前页码{} ",type, pageNum);
- String url = "https://wallhaven.cc/search?q=id:"+type+"&page="+pageNum;
- String html = null;
- String downloadStatus="";
- try {
- html = getHtml(url);
- List links = match(html,"img","data-src");
- if (links==null || links.size()==0){
- Thread.sleep(5000);
- StaticMethod.failTimes = StaticMethod.failTimes+1;
- log.info("******失败次数:{}*****", StaticMethod.failTimes);
- return;
- }
- downloadStatus = StaticMethod.download(links,type, pageNum);
-
- } catch (Exception e) {
- log.info("################IO执行异常#########################");
-
- }
-
-
- log.info("当前线程"+name+"; 执行第{}页,执行状态:{}", pageNum, downloadStatus);
- }
-
-
-
-
- }
由于不确定规则是否通用,粗略设定为失败40次后不再下载
- @Slf4j
- @Service
- public class PictureService {
- @Resource
- AsyncTaskService asyncTaskService;
-
- public String startDown( String type,int startPage) {
-
-
- while (StaticMethod.failTimes <40){
- if(startPage > 0) StaticMethod.currPageNum = startPage;
-
- asyncTaskService.executeAsyncTask(type,startPage);
- StaticMethod.currPageNum= StaticMethod.currPageNum + 1;
- startPage++;
- StaticMethod.failTimes=0;
- try {
- Thread.sleep(15000);
- } catch (InterruptedException e) {
- throw new RuntimeException(e);
- }
- }
-
- return "下载成功";
-
- }
工具静态方法类
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
- public class StaticMethod {
-
- //下载总数
- public static int sumNum = 0;
- //进行的页数
- public static int currPageNum = 1;
- //失败次数
- public static int failTimes = 0;
- public static String download(List<String> links,String type , int pageNum){
- /**
- https://th.wallhaven.cc/small/4l/4ly2o2.jpg
-
- https://w.wallhaven.cc/full/4l/wallhaven-4ly2o2.jpg
- */
- for (String url : links ) {
- String realUrl = "https://w.wallhaven.cc/full/".concat(url.split("/")[4]) .concat("/wallhaven-".concat(url.split("/")[5]));
- //System.out.println("当前路径: "+ realUrl);
-
-
- String dirSec = url.split("/")[4];
- String fileName = "-wallhaven-"+dirSec+"-"+url.split("/")[5];
- downFiles(realUrl, type , fileName,pageNum);
- }
-
-
- return "分任务下载完成";
- }
- /**
- * 下载文件
- * @return
- */
- public static void downFiles(String url,String type,String fileName,int pageNum){
-
- String name = Thread.currentThread().getName();
- try (InputStream ins = getInputStream(url)) {
- if(ins.available() ==0){
- return;
- }
- sumNum++;
- String saveDir = "I:\\picture\\"+type;
- Path target = Paths.get(saveDir, sumNum+fileName);
- Files.createDirectories(target.getParent());
- Files.copy(ins, target, StandardCopyOption.REPLACE_EXISTING);
- System.out.println("第"+pageNum+"页,"+"第"+sumNum+"张图片下载完成!");
- } catch (IOException e) {
- System.out.println("线程"+name+",第"+pageNum+"页,"+"下载文件时文件不存在: "+url);
- // e.printStackTrace();
- }
- }
-
- public static InputStream getInputStream(String url) throws IOException {
- URL realUrl = new URL(url);
-
- // 打开和URL之间的连接
-
- URLConnection connection = realUrl.openConnection();
-
- //模拟浏览器通用的请求属性
-
- connection.setRequestProperty("accept", "*/*");
-
- connection.setRequestProperty("connection", "Keep-Alive");
-
- connection.setRequestProperty("user-agent",
-
- "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
-
- // 建立实际的连接
-
- connection.connect();
-
- InputStream ins = connection.getInputStream();
- return ins;
- }
-
- /**
- * 获取页面html,
- */
- public static String getHtml(String url) throws IOException {
- String name = Thread.currentThread().getName();
-
- BufferedReader in = null;
- String result="";
- try (InputStream ins = getInputStream(url)) {
-
- InputStreamReader read = new InputStreamReader( ins, "gb2312");// 考虑到编码格
- BufferedReader bufferedReader = new BufferedReader(read);
- String lineTxt = null;
- //读取一行
- while ((lineTxt = bufferedReader.readLine()) != null) {
- //正则表达式
- result += lineTxt;
- }
- read.close();
- bufferedReader.close();
- } catch (IOException e) {
- System.out.println("线程"+name+",页html文件不存在: " + url);
- // e.printStackTrace();
- }
-
- return result;
- }
-
-
-
- /**
- * 获取html所有某个相同标签上属性值
- * @param html
- * @param element 标签元素
- * @param attr 属性
- * @return
- */
- public static List match(String html, String element, String attr) {
-
- List result = new ArrayList();
-
- String reg = "]*?\\s" + attr + "=['\"]?(.*?)['\"]?(\\s.*?)?>";
- Matcher m = Pattern.compile(reg).matcher(html);
- while (m.find()) {
- String r = m.group(1);
- result.add(r);
- }
- return result;
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。