当前位置:   article > 正文

使用org.apache.pdfbox 2.x PDF 转Doc 转图片 转字符串

org.apache.pdfbox

场景:

本来打算做一个PDF简历解析功能,但搜索后发现,这类需求一般要借助Python的语义分析或人工智能技术才能实现,所以最后只做出了PDF转doc这个副产品。

注意:本文代码基于pdfbox 2.x版本,在新出的3.x上应该无法直接使用(例如3.x中已用Loader.loadPDF取代了PDDocument.load)。pdfbox的API一直在变动更新,网上搜到的代码很多时候并不能直接生效,最好找到与所用版本对应的示例。

代码

依赖

  1. <dependency>
  2. <groupId>org.apache.pdfbox</groupId>
  3. <artifactId>pdfbox</artifactId>
  4. <version>2.0.22</version>
  5. </dependency>
  6. <!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
  7. <dependency>
  8. <groupId>cn.hutool</groupId>
  9. <artifactId>hutool-all</artifactId>
  10. <version>5.3.5</version>
  11. </dependency>

 

工具类

  1. package com.pdftoword.demo.utils;
  2. import cn.hutool.core.io.IoUtil;
  3. import org.apache.pdfbox.pdmodel.PDDocument;
  4. import org.apache.pdfbox.rendering.PDFRenderer;
  5. import org.apache.pdfbox.text.PDFTextStripper;
  6. import org.springframework.stereotype.Service;
  7. import org.springframework.web.multipart.MultipartFile;
  8. import javax.imageio.ImageIO;
  9. import javax.servlet.http.HttpServletResponse;
  10. import java.awt.image.BufferedImage;
  11. import java.io.*;
  12. import java.net.URLEncoder;
  13. @Service
  14. public class PdfUtils {
  15. private String DOC_FILEURL = "D://text.doc"; //doc文件路径
  16. /**
  17. * pdf 转 doc 文件
  18. * @param file
  19. */
  20. public void convertWord(MultipartFile file,HttpServletResponse response){
  21. PDDocument doc = null;
  22. Writer writer = null;
  23. OutputStream os = null;
  24. PDFTextStripper pdfTextStripper = null;
  25. try{
  26. response.setContentType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;charset=utf-8");
  27. String fileName = URLEncoder.encode("文件", "UTF-8");
  28. response.setHeader("Access-Control-Expose-Headers", "Content-Disposition");
  29. response.setHeader("Content-Disposition", "attachment;filename="+fileName+".doc");
  30. doc = PDDocument.load(file.getInputStream());
  31. os = new FileOutputStream(DOC_FILEURL);
  32. writer = new OutputStreamWriter(os,"UTF-8");
  33. pdfTextStripper = new PDFTextStripper();
  34. int pageNumber = doc.getNumberOfPages();
  35. pdfTextStripper = new PDFTextStripper();
  36. pdfTextStripper.setSortByPosition(true);
  37. pdfTextStripper.setStartPage(1);
  38. pdfTextStripper.setEndPage(pageNumber);
  39. pdfTextStripper.writeText(doc,writer);
  40. //必须先关掉文件流再重新读取,否则会产生冲突
  41. writer.close();
  42. doc.close();
  43. File docFile = new File(DOC_FILEURL);
  44. cn.hutool.core.io.file.FileReader reader = new cn.hutool.core.io.file.FileReader(docFile);
  45. OutputStream ps = response.getOutputStream();
  46. byte [] bytes = reader.readBytes();
  47. IoUtil.write(ps,true,bytes);
  48. }catch (IOException e){
  49. e.printStackTrace();
  50. }
  51. }
  52. /**
  53. * pdf 转 String
  54. * @param file
  55. * @return
  56. */
  57. public String convertText(MultipartFile file){
  58. PDDocument doc = null;
  59. OutputStream os = null;
  60. Writer writer = null;
  61. PDFTextStripper pdfTextStripper = null;
  62. StringBuilder builder = new StringBuilder();
  63. String res = null;
  64. try{
  65. doc = PDDocument.load(file.getInputStream());
  66. pdfTextStripper = new PDFTextStripper();
  67. int pageNumber = doc.getNumberOfPages();
  68. pdfTextStripper = new PDFTextStripper();
  69. pdfTextStripper.setSortByPosition(true);
  70. pdfTextStripper.setStartPage(1);
  71. pdfTextStripper.setLineSeparator(",");
  72. pdfTextStripper.setEndPage(pageNumber);
  73. res = pdfTextStripper.getText(doc);
  74. doc.close();
  75. }catch (IOException e){
  76. e.printStackTrace();
  77. }
  78. return res;
  79. }
  80. /**
  81. *pdf 转 图片
  82. * @param file
  83. * @param response
  84. */
  85. public void toImage(MultipartFile file, HttpServletResponse response){
  86. PDDocument doc = null;
  87. try{
  88. response.setContentType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;charset=utf-8");
  89. String fileName = URLEncoder.encode("图片", "UTF-8");
  90. response.setHeader("Access-Control-Expose-Headers", "Content-Disposition");
  91. response.setHeader("Content-Disposition", "attachment;filename="+fileName+".png");
  92. doc = PDDocument.load(file.getInputStream());
  93. PDFRenderer renderer = new PDFRenderer(doc);
  94. int pageCount = doc.getNumberOfPages();
  95. for(int i = 0;i<pageCount;i++){
  96. BufferedImage bufferedImage = renderer.renderImage(i);
  97. // ImageIO.write(bufferedImage,"JPEG",new File(filePath+"-"+i+".jpg"));
  98. OutputStream os = response.getOutputStream();
  99. ImageIO.write(bufferedImage,"PNG",os);
  100. }
  101. }catch (IOException e){
  102. e.printStackTrace();
  103. }
  104. }
  105. }

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/菜鸟追梦旅行/article/detail/445779
推荐阅读
相关标签
  

闽ICP备14008679号