赞
踩
本来打算做一个 PDF 简历解析功能,但用百度搜索后发现,这类功能一般都需要借助 Python 的语义分析或人工智能一类的技术,所以最后只做了一个 PDF 转 doc 的副产品。
注意:本代码基于 pdfbox 2.x 版本(下方依赖为 2.0.22),对于新出的 3.x 应该是无效的,例如 3.x 已移除 PDDocument.load,改用 Loader.loadPDF。pdfbox 的 API 一直在变动更新,网上搜罗的代码很多时候都不生效,最好找到与所用版本对应的代码。
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- <version>2.0.22</version>
- </dependency>
-
- <!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
- <dependency>
- <groupId>cn.hutool</groupId>
- <artifactId>hutool-all</artifactId>
- <version>5.3.5</version>
- </dependency>
- package com.pdftoword.demo.utils;
-
- import cn.hutool.core.io.IoUtil;
- import org.apache.pdfbox.pdmodel.PDDocument;
- import org.apache.pdfbox.rendering.PDFRenderer;
- import org.apache.pdfbox.text.PDFTextStripper;
- import org.springframework.stereotype.Service;
- import org.springframework.web.multipart.MultipartFile;
-
- import javax.imageio.ImageIO;
- import javax.servlet.http.HttpServletResponse;
- import java.awt.image.BufferedImage;
- import java.io.*;
- import java.net.URLEncoder;
-
-
- @Service
- public class PdfUtils {
-
- private String DOC_FILEURL = "D://text.doc"; //doc文件路径
-
- /**
- * pdf 转 doc 文件
- * @param file
- */
- public void convertWord(MultipartFile file,HttpServletResponse response){
- PDDocument doc = null;
- Writer writer = null;
- OutputStream os = null;
- PDFTextStripper pdfTextStripper = null;
-
- try{
- response.setContentType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;charset=utf-8");
- String fileName = URLEncoder.encode("文件", "UTF-8");
- response.setHeader("Access-Control-Expose-Headers", "Content-Disposition");
- response.setHeader("Content-Disposition", "attachment;filename="+fileName+".doc");
- doc = PDDocument.load(file.getInputStream());
- os = new FileOutputStream(DOC_FILEURL);
- writer = new OutputStreamWriter(os,"UTF-8");
- pdfTextStripper = new PDFTextStripper();
- int pageNumber = doc.getNumberOfPages();
- pdfTextStripper = new PDFTextStripper();
- pdfTextStripper.setSortByPosition(true);
- pdfTextStripper.setStartPage(1);
- pdfTextStripper.setEndPage(pageNumber);
- pdfTextStripper.writeText(doc,writer);
- //必须先关掉文件流再重新读取,否则会产生冲突
- writer.close();
- doc.close();
- File docFile = new File(DOC_FILEURL);
- cn.hutool.core.io.file.FileReader reader = new cn.hutool.core.io.file.FileReader(docFile);
- OutputStream ps = response.getOutputStream();
- byte [] bytes = reader.readBytes();
- IoUtil.write(ps,true,bytes);
-
- }catch (IOException e){
- e.printStackTrace();
- }
- }
-
- /**
- * pdf 转 String
- * @param file
- * @return
- */
- public String convertText(MultipartFile file){
- PDDocument doc = null;
- OutputStream os = null;
- Writer writer = null;
- PDFTextStripper pdfTextStripper = null;
- StringBuilder builder = new StringBuilder();
- String res = null;
- try{
- doc = PDDocument.load(file.getInputStream());
- pdfTextStripper = new PDFTextStripper();
- int pageNumber = doc.getNumberOfPages();
- pdfTextStripper = new PDFTextStripper();
- pdfTextStripper.setSortByPosition(true);
- pdfTextStripper.setStartPage(1);
- pdfTextStripper.setLineSeparator(",");
- pdfTextStripper.setEndPage(pageNumber);
- res = pdfTextStripper.getText(doc);
-
- doc.close();
- }catch (IOException e){
- e.printStackTrace();
- }
- return res;
- }
-
- /**
- *pdf 转 图片
- * @param file
- * @param response
- */
- public void toImage(MultipartFile file, HttpServletResponse response){
- PDDocument doc = null;
- try{
- response.setContentType("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;charset=utf-8");
- String fileName = URLEncoder.encode("图片", "UTF-8");
- response.setHeader("Access-Control-Expose-Headers", "Content-Disposition");
- response.setHeader("Content-Disposition", "attachment;filename="+fileName+".png");
- doc = PDDocument.load(file.getInputStream());
- PDFRenderer renderer = new PDFRenderer(doc);
- int pageCount = doc.getNumberOfPages();
- for(int i = 0;i<pageCount;i++){
- BufferedImage bufferedImage = renderer.renderImage(i);
- // ImageIO.write(bufferedImage,"JPEG",new File(filePath+"-"+i+".jpg"));
- OutputStream os = response.getOutputStream();
- ImageIO.write(bufferedImage,"PNG",os);
- }
- }catch (IOException e){
- e.printStackTrace();
- }
- }
-
-
-
-
- }

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。