
Crawling images from the HD wallpaper site wallhaven with Java multithreading

I personally crawled about 100 GB of HD images this way; good fun to play around with, haha.

 

Preparation:

1. Analyze the site's pages and requests

From the request URL and its parameters we can see that the search keyword is passed as q=id:xxx and the page number as page.

So later on we only need to build this URL to fetch each page's data automatically:

"https://wallhaven.cc/search?q=id:" + type + "&page=" + pageNum;

Next, inspect the list page to see whether it already exposes a download URL; if not, we would have to open each detail page to get the download link.

The list page has no direct download URL, only the thumbnail URL: https://th.wallhaven.cc/small/g7/g7qyzd.jpg

Opening the detail page, we find the full-size image URL: https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg

Comparing the two, every thumbnail and its full-size image follow the same pattern:

https://th.wallhaven.cc/small/g7/g7qyzd.jpg

https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg

So we can build the full-size URL directly from the thumbnail URL.
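For example, a minimal sketch of that mapping (variable names are just illustrative; the download code below does the same split on "/"):

// Thumbnail:  https://th.wallhaven.cc/small/g7/g7qyzd.jpg
// Full size:  https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg
String thumb = "https://th.wallhaven.cc/small/g7/g7qyzd.jpg";
String[] parts = thumb.split("/");  // [https:, , th.wallhaven.cc, small, g7, g7qyzd.jpg]
String full = "https://w.wallhaven.cc/full/" + parts[4] + "/wallhaven-" + parts[5];
// full -> https://w.wallhaven.cc/full/g7/wallhaven-g7qyzd.jpg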

The plan, then:

First, get the thumbnail URLs from each page's image list:

/**
 * Fetch the HTML of a page.
 */
public static String getHtml(String url) throws IOException {
    String name = Thread.currentThread().getName();
    StringBuilder result = new StringBuilder();
    try (InputStream ins = getInputStream(url);
         BufferedReader bufferedReader = new BufferedReader(
                 new InputStreamReader(ins, "UTF-8"))) { // decode the response; only ASCII URLs are extracted, so the exact charset is not critical
        String lineTxt;
        // Read the page line by line
        while ((lineTxt = bufferedReader.readLine()) != null) {
            result.append(lineTxt);
        }
    } catch (IOException e) {
        System.out.println("Thread " + name + ": page HTML not found: " + url);
        // e.printStackTrace();
    }
    return result.toString();
}

/**
 * Extract the value of a given attribute from every occurrence of a tag in the HTML.
 * @param html    page source
 * @param element tag name, e.g. "img"
 * @param attr    attribute name, e.g. "data-src"
 * @return the attribute values in document order
 */
public static List<String> match(String html, String element, String attr) {
    List<String> result = new ArrayList<>();
    String reg = "<" + element + "[^<>]*?\\s" + attr + "=['\"]?(.*?)['\"]?(\\s.*?)?>";
    Matcher m = Pattern.compile(reg).matcher(html);
    while (m.find()) {
        result.add(m.group(1));
    }
    return result;
}

public static String download(List<String> links, String type, int pageNum) {
    /*
     * https://th.wallhaven.cc/small/4l/4ly2o2.jpg
     * https://w.wallhaven.cc/full/4l/wallhaven-4ly2o2.jpg
     */
    for (String url : links) {
        // Build the full-size URL from the thumbnail URL
        String realUrl = "https://w.wallhaven.cc/full/".concat(url.split("/")[4])
                .concat("/wallhaven-".concat(url.split("/")[5]));
        // System.out.println("Current URL: " + realUrl);
        String dirSec = url.split("/")[4];
        String fileName = "-wallhaven-" + dirSec + "-" + url.split("/")[5];
        downFiles(realUrl, type, fileName, pageNum);
    }
    return "Sub-task download finished";
}

/**
 * Download a single file.
 */
public static void downFiles(String url, String type, String fileName, int pageNum) {
    String name = Thread.currentThread().getName();
    try (InputStream ins = getInputStream(url)) {
        if (ins.available() == 0) {
            return;
        }
        sumNum++; // running total of downloaded images (static counter in StaticMethod, shown below)
        String saveDir = "I:\\picture\\" + type;
        Path target = Paths.get(saveDir, sumNum + fileName);
        Files.createDirectories(target.getParent());
        Files.copy(ins, target, StandardCopyOption.REPLACE_EXISTING);
        System.out.println("Page " + pageNum + ": image " + sumNum + " downloaded.");
    } catch (IOException e) {
        System.out.println("Thread " + name + ", page " + pageNum + ": file not found while downloading: " + url);
        // e.printStackTrace();
    }
}

public static InputStream getInputStream(String url) throws IOException {
    URL realUrl = new URL(url);
    // Open a connection to the URL
    URLConnection connection = realUrl.openConnection();
    // Set common browser-like request headers
    connection.setRequestProperty("accept", "*/*");
    connection.setRequestProperty("connection", "Keep-Alive");
    connection.setRequestProperty("user-agent",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
    // Establish the actual connection
    connection.connect();
    return connection.getInputStream();
}
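To make the flow concrete, here is a minimal, hypothetical driver (not part of the original code) that chains these helpers for a single search page; type stands for a wallhaven tag id and pageNum for the page number:

// Hypothetical helper: fetch one list page, extract the thumbnails, download the full-size images
public static void crawlOnePage(String type, int pageNum) throws IOException {
    String url = "https://wallhaven.cc/search?q=id:" + type + "&page=" + pageNum;
    String html = getHtml(url);                              // page source
    List<String> thumbs = match(html, "img", "data-src");    // thumbnail URLs on the list page
    download(thumbs, type, pageNum);                         // maps to full-size URLs and saves the files
}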

That's the basic approach.

The complete code follows. It runs the downloads as asynchronous tasks on a thread pool; because the site limits the number of concurrent connections, at most 3 threads download at the same time.

Async task configuration:

/**
 * Thread pool configuration.
 * @EnableAsync turns on Spring's async support.
 */
@Slf4j
@Configuration
@ComponentScan("com.zhyqin.service")
@EnableAsync
public class TaskExecutorConfig implements AsyncConfigurer {

    @Override
    public Executor getAsyncExecutor() {
        // Initialize the thread pool
        ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
        taskExecutor.setMaxPoolSize(3);
        taskExecutor.setCorePoolSize(3);
        taskExecutor.setQueueCapacity(3);
        // When the queue is full, run the task on the submitting thread instead of rejecting it
        taskExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        taskExecutor.initialize();
        log.info("Thread pool initialized.");
        return taskExecutor;
    }
}

The async task:

@Slf4j
@Service
public class AsyncTaskService {

    @Async
    public void executeAsyncTask(String type, int pageNum) {
        String name = Thread.currentThread().getName();
        log.info("Thread " + name + " executing task: {}, current page {}", type, pageNum);
        String url = "https://wallhaven.cc/search?q=id:" + type + "&page=" + pageNum;
        String html = null;
        String downloadStatus = "";
        try {
            html = StaticMethod.getHtml(url);
            // Thumbnail URLs sit in the data-src attribute of the <img> tags on the list page
            List<String> links = StaticMethod.match(html, "img", "data-src");
            if (links == null || links.isEmpty()) {
                Thread.sleep(5000);
                StaticMethod.failTimes = StaticMethod.failTimes + 1;
                log.info("****** failure count: {} ******", StaticMethod.failTimes);
                return;
            }
            downloadStatus = StaticMethod.download(links, type, pageNum);
        } catch (Exception e) {
            log.info("################ IO error while processing the page ################");
        }
        log.info("Thread " + name + ": finished page {}, status: {}", pageNum, downloadStatus);
    }
}

Since I am not sure the URL pattern holds for every page, the crawl stops once roughly 40 page fetches have failed:

@Slf4j
@Service
public class PictureService {

    @Resource
    AsyncTaskService asyncTaskService;

    public String startDown(String type, int startPage) {
        if (startPage > 0) StaticMethod.currPageNum = startPage;
        StaticMethod.failTimes = 0;
        // Keep submitting pages until roughly 40 page fetches have failed
        while (StaticMethod.failTimes < 40) {
            asyncTaskService.executeAsyncTask(type, startPage);
            StaticMethod.currPageNum = StaticMethod.currPageNum + 1;
            startPage++;
            try {
                // Pause between submissions; the site limits concurrent connections
                Thread.sleep(15000);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
        return "Download finished";
    }
}
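To actually start a crawl, something has to call startDown. A minimal REST controller sketch (hypothetical; the original article does not show this part) could look like this:

// Hypothetical entry point: GET /down?type=xxx&startPage=1 kicks off the crawl
@RestController
public class PictureController {

    @Resource
    PictureService pictureService;

    @GetMapping("/down")
    public String down(@RequestParam String type, @RequestParam(defaultValue = "1") int startPage) {
        return pictureService.startDown(type, startPage);
    }
}

Note that startDown blocks until the failure threshold is reached, so in practice you may want to trigger it in the background as well.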

The utility class with the static helper methods:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class StaticMethod {

    // Total number of images downloaded
    public static int sumNum = 0;
    // Current page number
    public static int currPageNum = 1;
    // Number of failed page fetches
    public static int failTimes = 0;

    public static String download(List<String> links, String type, int pageNum) {
        /*
         * https://th.wallhaven.cc/small/4l/4ly2o2.jpg
         * https://w.wallhaven.cc/full/4l/wallhaven-4ly2o2.jpg
         */
        for (String url : links) {
            // Build the full-size URL from the thumbnail URL
            String realUrl = "https://w.wallhaven.cc/full/".concat(url.split("/")[4])
                    .concat("/wallhaven-".concat(url.split("/")[5]));
            // System.out.println("Current URL: " + realUrl);
            String dirSec = url.split("/")[4];
            String fileName = "-wallhaven-" + dirSec + "-" + url.split("/")[5];
            downFiles(realUrl, type, fileName, pageNum);
        }
        return "Sub-task download finished";
    }

    /**
     * Download a single file.
     */
    public static void downFiles(String url, String type, String fileName, int pageNum) {
        String name = Thread.currentThread().getName();
        try (InputStream ins = getInputStream(url)) {
            if (ins.available() == 0) {
                return;
            }
            sumNum++;
            String saveDir = "I:\\picture\\" + type;
            Path target = Paths.get(saveDir, sumNum + fileName);
            Files.createDirectories(target.getParent());
            Files.copy(ins, target, StandardCopyOption.REPLACE_EXISTING);
            System.out.println("Page " + pageNum + ": image " + sumNum + " downloaded.");
        } catch (IOException e) {
            System.out.println("Thread " + name + ", page " + pageNum + ": file not found while downloading: " + url);
            // e.printStackTrace();
        }
    }

    public static InputStream getInputStream(String url) throws IOException {
        URL realUrl = new URL(url);
        // Open a connection to the URL
        URLConnection connection = realUrl.openConnection();
        // Set common browser-like request headers
        connection.setRequestProperty("accept", "*/*");
        connection.setRequestProperty("connection", "Keep-Alive");
        connection.setRequestProperty("user-agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        // Establish the actual connection
        connection.connect();
        return connection.getInputStream();
    }

    /**
     * Fetch the HTML of a page.
     */
    public static String getHtml(String url) throws IOException {
        String name = Thread.currentThread().getName();
        StringBuilder result = new StringBuilder();
        try (InputStream ins = getInputStream(url);
             BufferedReader bufferedReader = new BufferedReader(
                     new InputStreamReader(ins, "UTF-8"))) { // decode the response; only ASCII URLs are extracted, so the exact charset is not critical
            String lineTxt;
            // Read the page line by line
            while ((lineTxt = bufferedReader.readLine()) != null) {
                result.append(lineTxt);
            }
        } catch (IOException e) {
            System.out.println("Thread " + name + ": page HTML not found: " + url);
            // e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Extract the value of a given attribute from every occurrence of a tag in the HTML.
     * @param html    page source
     * @param element tag name, e.g. "img"
     * @param attr    attribute name, e.g. "data-src"
     * @return the attribute values in document order
     */
    public static List<String> match(String html, String element, String attr) {
        List<String> result = new ArrayList<>();
        String reg = "<" + element + "[^<>]*?\\s" + attr + "=['\"]?(.*?)['\"]?(\\s.*?)?>";
        Matcher m = Pattern.compile(reg).matcher(html);
        while (m.find()) {
            result.add(m.group(1));
        }
        return result;
    }
}
