
Crawling images from the HD wallpaper site wallhaven with Java multithreading

I personally crawled about 100 GB of HD images this way; good fun to play around with, haha.

 

Preparation:

1. Analyze the site's pages and requests

From the request URL and its parameters we can see that the search keyword is passed as q=id:xxx and the page number as page.

So later on we only need to build this URL to fetch each page's data automatically:

"https://wallhaven.cc/search?q=id:" + type + "&page=" + pageNum;

Next, inspect the list page to see whether it already exposes a download URL; if not, we would have to open each detail page to get the download link.

The list page has no direct download URL, only the thumbnail URL: https://th.wallhaven.cc/small/g7/g7qyzd.jpg

Opening the detail page, we find the full-size image URL: https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg

Comparing the two, every thumbnail and its full-size image follow the same pattern:

https://th.wallhaven.cc/small/g7/g7qyzd.jpg

https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg

So we can build the full-size URL directly from the thumbnail URL.
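For example, a minimal sketch of that mapping (variable names are just illustrative; the download code below does the same split on "/"):

// Thumbnail:  https://th.wallhaven.cc/small/g7/g7qyzd.jpg
// Full size:  https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg
String thumb = "https://th.wallhaven.cc/small/g7/g7qyzd.jpg";
String[] parts = thumb.split("/");  // [https:, , th.wallhaven.cc, small, g7, g7qyzd.jpg]
String full = "https://w.wallhaven.cc/full/" + parts[4] + "/wallhaven-" + parts[5];
// full -> https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg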

The plan, then:

First, get the thumbnail URLs from each page's image list:

/**
 * Fetch the HTML of a page.
 */
public static String getHtml(String url) throws IOException {
    String name = Thread.currentThread().getName();
    StringBuilder result = new StringBuilder();
    try (InputStream ins = getInputStream(url);
         BufferedReader bufferedReader = new BufferedReader(
                 new InputStreamReader(ins, "UTF-8"))) { // decode the response; only ASCII URLs are extracted, so the exact charset is not critical
        String lineTxt;
        // Read the page line by line
        while ((lineTxt = bufferedReader.readLine()) != null) {
            result.append(lineTxt);
        }
    } catch (IOException e) {
        System.out.println("Thread " + name + ": page HTML not found: " + url);
        // e.printStackTrace();
    }
    return result.toString();
}

/**
 * Extract the value of a given attribute from every occurrence of a tag in the HTML.
 * @param html    page source
 * @param element tag name, e.g. "img"
 * @param attr    attribute name, e.g. "data-src"
 * @return the attribute values in document order
 */
public static List<String> match(String html, String element, String attr) {
    List<String> result = new ArrayList<>();
    String reg = "<" + element + "[^<>]*?\\s" + attr + "=['\"]?(.*?)['\"]?(\\s.*?)?>";
    Matcher m = Pattern.compile(reg).matcher(html);
    while (m.find()) {
        result.add(m.group(1));
    }
    return result;
}

public static String download(List<String> links, String type, int pageNum) {
    /*
     * https://th.wallhaven.cc/small/4l/4ly2o2.jpg
     * https://w.wallhaven.cc/full/4l/wallhaven-4ly2o2.jpg
     */
    for (String url : links) {
        // Build the full-size URL from the thumbnail URL
        String realUrl = "https://w.wallhaven.cc/full/".concat(url.split("/")[4])
                .concat("/wallhaven-".concat(url.split("/")[5]));
        // System.out.println("Current URL: " + realUrl);
        String dirSec = url.split("/")[4];
        String fileName = "-wallhaven-" + dirSec + "-" + url.split("/")[5];
        downFiles(realUrl, type, fileName, pageNum);
    }
    return "Sub-task download finished";
}

/**
 * Download a single file.
 */
public static void downFiles(String url, String type, String fileName, int pageNum) {
    String name = Thread.currentThread().getName();
    try (InputStream ins = getInputStream(url)) {
        if (ins.available() == 0) {
            return;
        }
        sumNum++; // running total of downloaded images (static counter in StaticMethod, shown below)
        String saveDir = "I:\\picture\\" + type;
        Path target = Paths.get(saveDir, sumNum + fileName);
        Files.createDirectories(target.getParent());
        Files.copy(ins, target, StandardCopyOption.REPLACE_EXISTING);
        System.out.println("Page " + pageNum + ": image " + sumNum + " downloaded.");
    } catch (IOException e) {
        System.out.println("Thread " + name + ", page " + pageNum + ": file not found while downloading: " + url);
        // e.printStackTrace();
    }
}

public static InputStream getInputStream(String url) throws IOException {
    URL realUrl = new URL(url);
    // Open a connection to the URL
    URLConnection connection = realUrl.openConnection();
    // Set common browser-like request headers
    connection.setRequestProperty("accept", "*/*");
    connection.setRequestProperty("connection", "Keep-Alive");
    connection.setRequestProperty("user-agent",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
    // Establish the actual connection
    connection.connect();
    return connection.getInputStream();
}
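To make the flow concrete, here is a minimal, hypothetical driver (not part of the original code) that chains these helpers for a single search page; type stands for a wallhaven tag id and pageNum for the page number:

// Hypothetical helper: fetch one list page, extract the thumbnails, download the full-size images
public static void crawlOnePage(String type, int pageNum) throws IOException {
    String url = "https://wallhaven.cc/search?q=id:" + type + "&page=" + pageNum;
    String html = getHtml(url);                              // page source
    List<String> thumbs = match(html, "img", "data-src");    // thumbnail URLs on the list page
    download(thumbs, type, pageNum);                         // maps to full-size URLs and saves the files
}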

That's the basic approach.

The complete code follows. It runs the downloads as asynchronous tasks on a thread pool; because the site limits the number of concurrent connections, at most 3 threads download at the same time.

Async task configuration:

/**
 * Thread pool configuration.
 * @EnableAsync turns on Spring's async support.
 */
@Slf4j
@Configuration
@ComponentScan("com.zhyqin.service")
@EnableAsync
public class TaskExecutorConfig implements AsyncConfigurer {

    @Override
    public Executor getAsyncExecutor() {
        // Initialize the thread pool
        ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
        taskExecutor.setMaxPoolSize(3);
        taskExecutor.setCorePoolSize(3);
        taskExecutor.setQueueCapacity(3);
        // When the queue is full, run the task on the submitting thread instead of rejecting it
        taskExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        taskExecutor.initialize();
        log.info("Thread pool initialized.");
        return taskExecutor;
    }
}

The async task:

@Slf4j
@Service
public class AsyncTaskService {

    @Async
    public void executeAsyncTask(String type, int pageNum) {
        String name = Thread.currentThread().getName();
        log.info("Thread " + name + " executing task: {}, current page {}", type, pageNum);
        String url = "https://wallhaven.cc/search?q=id:" + type + "&page=" + pageNum;
        String html = null;
        String downloadStatus = "";
        try {
            html = StaticMethod.getHtml(url);
            // Thumbnail URLs sit in the data-src attribute of the <img> tags on the list page
            List<String> links = StaticMethod.match(html, "img", "data-src");
            if (links == null || links.isEmpty()) {
                Thread.sleep(5000);
                StaticMethod.failTimes = StaticMethod.failTimes + 1;
                log.info("****** failure count: {} ******", StaticMethod.failTimes);
                return;
            }
            downloadStatus = StaticMethod.download(links, type, pageNum);
        } catch (Exception e) {
            log.info("################ IO error while processing the page ################");
        }
        log.info("Thread " + name + ": finished page {}, status: {}", pageNum, downloadStatus);
    }
}

Since I am not sure the URL pattern holds for every page, the crawl stops once roughly 40 page fetches have failed:

@Slf4j
@Service
public class PictureService {

    @Resource
    AsyncTaskService asyncTaskService;

    public String startDown(String type, int startPage) {
        if (startPage > 0) StaticMethod.currPageNum = startPage;
        StaticMethod.failTimes = 0;
        // Keep submitting pages until roughly 40 page fetches have failed
        while (StaticMethod.failTimes < 40) {
            asyncTaskService.executeAsyncTask(type, startPage);
            StaticMethod.currPageNum = StaticMethod.currPageNum + 1;
            startPage++;
            try {
                // Pause between submissions; the site limits concurrent connections
                Thread.sleep(15000);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
        return "Download finished";
    }
}
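To actually start a crawl, something has to call startDown. A minimal REST controller sketch (hypothetical; the original article does not show this part) could look like this:

// Hypothetical entry point: GET /down?type=xxx&startPage=1 kicks off the crawl
@RestController
public class PictureController {

    @Resource
    PictureService pictureService;

    @GetMapping("/down")
    public String down(@RequestParam String type, @RequestParam(defaultValue = "1") int startPage) {
        return pictureService.startDown(type, startPage);
    }
}

Note that startDown blocks until the failure threshold is reached, so in practice you may want to trigger it in the background as well.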

The utility class with the static helper methods:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class StaticMethod {

    // Total number of images downloaded
    public static int sumNum = 0;
    // Current page number
    public static int currPageNum = 1;
    // Number of failed page fetches
    public static int failTimes = 0;

    public static String download(List<String> links, String type, int pageNum) {
        /*
         * https://th.wallhaven.cc/small/4l/4ly2o2.jpg
         * https://w.wallhaven.cc/full/4l/wallhaven-4ly2o2.jpg
         */
        for (String url : links) {
            // Build the full-size URL from the thumbnail URL
            String realUrl = "https://w.wallhaven.cc/full/".concat(url.split("/")[4])
                    .concat("/wallhaven-".concat(url.split("/")[5]));
            // System.out.println("Current URL: " + realUrl);
            String dirSec = url.split("/")[4];
            String fileName = "-wallhaven-" + dirSec + "-" + url.split("/")[5];
            downFiles(realUrl, type, fileName, pageNum);
        }
        return "Sub-task download finished";
    }

    /**
     * Download a single file.
     */
    public static void downFiles(String url, String type, String fileName, int pageNum) {
        String name = Thread.currentThread().getName();
        try (InputStream ins = getInputStream(url)) {
            if (ins.available() == 0) {
                return;
            }
            sumNum++;
            String saveDir = "I:\\picture\\" + type;
            Path target = Paths.get(saveDir, sumNum + fileName);
            Files.createDirectories(target.getParent());
            Files.copy(ins, target, StandardCopyOption.REPLACE_EXISTING);
            System.out.println("Page " + pageNum + ": image " + sumNum + " downloaded.");
        } catch (IOException e) {
            System.out.println("Thread " + name + ", page " + pageNum + ": file not found while downloading: " + url);
            // e.printStackTrace();
        }
    }

    public static InputStream getInputStream(String url) throws IOException {
        URL realUrl = new URL(url);
        // Open a connection to the URL
        URLConnection connection = realUrl.openConnection();
        // Set common browser-like request headers
        connection.setRequestProperty("accept", "*/*");
        connection.setRequestProperty("connection", "Keep-Alive");
        connection.setRequestProperty("user-agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        // Establish the actual connection
        connection.connect();
        return connection.getInputStream();
    }

    /**
     * Fetch the HTML of a page.
     */
    public static String getHtml(String url) throws IOException {
        String name = Thread.currentThread().getName();
        StringBuilder result = new StringBuilder();
        try (InputStream ins = getInputStream(url);
             BufferedReader bufferedReader = new BufferedReader(
                     new InputStreamReader(ins, "UTF-8"))) { // decode the response; only ASCII URLs are extracted, so the exact charset is not critical
            String lineTxt;
            // Read the page line by line
            while ((lineTxt = bufferedReader.readLine()) != null) {
                result.append(lineTxt);
            }
        } catch (IOException e) {
            System.out.println("Thread " + name + ": page HTML not found: " + url);
            // e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Extract the value of a given attribute from every occurrence of a tag in the HTML.
     * @param html    page source
     * @param element tag name, e.g. "img"
     * @param attr    attribute name, e.g. "data-src"
     * @return the attribute values in document order
     */
    public static List<String> match(String html, String element, String attr) {
        List<String> result = new ArrayList<>();
        String reg = "<" + element + "[^<>]*?\\s" + attr + "=['\"]?(.*?)['\"]?(\\s.*?)?>";
        Matcher m = Pattern.compile(reg).matcher(html);
        while (m.find()) {
            result.add(m.group(1));
        }
        return result;
    }
}
