赞
踩
请各位网友尊重本人的原创知识分享,谨记本人博客:南国以南i。
提示:以下是本篇文章正文内容,下面案例可供参考
在日常开发中,我们经常需要解析各类文件(如 .doc、.docx、.pdf、.txt),
读取文件内容后再进行下一步逻辑处理。本文举例说明如何解析上述几类文件。
友情链接:创建 Spring Boot 项目请移步 点我!点我!点我!
<dependency> <groupId>cn.hutool</groupId> <artifactId>hutool-all</artifactId> <version>5.8.5</version> </dependency> <!-- iText: extracts text from PDF files --> <dependency> <groupId>com.itextpdf</groupId> <artifactId>itextpdf</artifactId> <version>5.5.6</version> </dependency> <!-- PDFBox: reads the total page count of a PDF file --> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>1.8.11</version> </dependency> <!-- Apache POI --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>4.0.0</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>4.0.0</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml-schemas</artifactId> <version>4.0.0</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>4.0.0</version> </dependency> <!-- commons-io: provides FileUtils --> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.4</version> </dependency>
// 定义静态的文件后缀 private static final String SUFFIX_DOC = ".doc"; private static final String SUFFIX_DOCX = ".docx"; private static final String SUFFIX_PDF = ".pdf"; private static final String SUFFIX_TXT = ".txt"; /** * . * 根据文件类型解析文件内容 * * @param file 文件 * @return 解析内容 */ private static String readFileToString(File file) { StringBuilder readStr = new StringBuilder(); String fileType = file.getName().substring(file.getName().lastIndexOf(".")); log.debug("解析文件类型为[{}]文件", fileType); switch (fileType) { case SUFFIX_DOC: { readStr.append(WordUtil.readWord(SUFFIX_DOC, file.getAbsolutePath())); break; } case SUFFIX_DOCX: { readStr.append(WordUtil.readWord(SUFFIX_DOCX, file.getAbsolutePath())); break; } case SUFFIX_PDF: { readStr.append(PDFToWordUtil.getTextFromPdf(file.getAbsolutePath())); break; } case SUFFIX_TXT: { readStr.append(TxtUtil.readTxtFile(file.getAbsolutePath())); break; } default: { log.error("文件类型不正确,请上传.doc、.docx、.pdf、.txt后缀文件"); throw new RuntimeException("文件类型不正确,请上传.doc、.docx、.pdf、.txt后缀文件"); } } return readStr.toString(); }
import lombok.extern.slf4j.Slf4j; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import java.io.FileInputStream; import java.io.InputStream; /** * Word 操作工具类 */ @Slf4j public class WordUtil { // 定义静态的文件后缀 public static final String SUFFIX_DOC = ".doc"; public static final String SUFFIX_DOCX = ".docx"; /** * 读取 Word 入口方法,根据后缀,调用方法 * * @param suffix 文件后缀 * @param filePath 文件路径 * @return */ public static String readWord(String suffix, String filePath) { String wordStr = ""; try (InputStream input = new FileInputStream(filePath)) { // docx 类型 if (SUFFIX_DOCX.equals(suffix)) { wordStr = readDocx(input); // doc 类型 } else if (SUFFIX_DOC.equals(suffix)) { wordStr = readDoc(input); } } catch (Exception e) { log.error("readWord [{}] is error", filePath); } wordStr = wordStr.replace("\n", ""); wordStr = wordStr.replace("\\\\r", ""); wordStr = wordStr.replace("\\\\t", ""); return wordStr; } /** * 读取 doc 类型,使用 WordExtractor 对象,传递输入流 * * @param inputStream * @return */ private static String readDoc(InputStream inputStream) { try { String content = ""; WordExtractor ex = new WordExtractor(inputStream); content = ex.getText(); ex.close(); return content; } catch (Exception e) { return null; } } /** * 读取 docx 类型,使用 XWPFDocument 对象,传递输入流 * * @param inputStream * @return */ private static String readDocx(InputStream inputStream) { try { String content = ""; XWPFDocument xdoc = new XWPFDocument(inputStream); XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc); content = extractor.getText(); extractor.close(); return content; } catch (Exception e) { return null; } } }
@Slf4j public class PDFToWordUtil { /** * @Description: 提取pdf中的文字 第一种方法 * @Param: fileUrlList:地址 * @Param: pages:页码 * @return: content:提取的文字 */ public static String PDFToWord(String fileUrlList) throws IOException { //linux---start--- fileUrlList = fileUrlList.replaceAll("\\\\", File.separator); //linux--end---- Integer pages = PDFToPage(fileUrlList); String fileName = fileUrlList;//源文件的位置 PdfReader reader = null;//PDF读取器 reader = new PdfReader(fileName); String content = ""; for (int i = 1; i <= pages; i++) { content += PdfTextExtractor.getTextFromPage(reader, i); // 读取PDF中第i页(用哪一页就写几)的文档内容,并转成String } content = content.replace("\n", ""); content = content.replace("\\\\r", ""); content = content.replace("\\\\t", ""); log.debug(content);//控制台打印PDF第一页的内容 return content; } /** * @Description: 提取pdf中的页码 * @Param: fileUrlList:地址 * @return: pages:页码数 */ public static int PDFToPage(String fileUrlList) { //linux---start--- fileUrlList = fileUrlList.replaceAll("\\\\", File.separator); //linux--end---- File file = new File(fileUrlList); PdfReader pdfReader = null; try { pdfReader = new PdfReader(new FileInputStream(file)); } catch (IOException e) { e.printStackTrace(); } int pages = pdfReader.getNumberOfPages(); log.debug("pdf文件的总页数为:" + pages); return pages; } /** * @Description: 提取pdf中的文字 第二种方法 * @Param: pdfPath:地址 * @return: content:提取的文字 */ public static String getTextFromPdf(String pdfPath) { String content = null; try { // 是否排序 boolean sort = false; // 开始提取页数 int startPage = 1; // 结束提取页数 int endPage = Integer.MAX_VALUE; //InputStream input = null; //linux---start--- // pdfPath = pdfPath.replaceAll("\\\\", File.separator); //linux--end---- File pdfFile = new File(pdfPath); PDDocument document = null; try (InputStream input = new FileInputStream(pdfFile)) { // 加载 pdf 文档 PDFParser parser = new PDFParser(input); parser.parse(); document = parser.getPDDocument(); // 获取内容信息 PDFTextStripper pts = new PDFTextStripper(); pts.setSortByPosition(sort); endPage = 
document.getNumberOfPages(); log.debug("Total Page: " + endPage); pts.setStartPage(startPage); pts.setEndPage(endPage); try { content = pts.getText(document); } catch (Exception e) { throw e; } log.debug("Get PDF Content ..."); } catch (Exception e) { throw e; } finally { if (null != document) document.close(); } content = content.replace("\n", ""); content = content.replace("\\\\r", ""); content = content.replace("\\\\t", ""); } catch (Exception e) { log.error("getTextFromPdf [{}] is error", pdfPath); } return content; } }
@Slf4j public class TxtUtil { /** * . * 获取文本内容 * * @return 文件内容 */ public static String readTxtFile(String filePath) { String txtStr = ""; try { File file = new File(filePath); if (!file.exists()) { log.error("可读文件不存在[{}]", file.getAbsolutePath()); } txtStr = FileUtils.readFileToString(file, CharsetUtil.UTF_8); } catch (IOException e) { log.error("readTxtFile [{}] is error", filePath); } txtStr = txtStr.replace("\n", ""); txtStr = txtStr.replace("\\\\r", ""); txtStr = txtStr.replace("\\\\t", ""); return txtStr; } /** * . * 写入数据 * * @param outPath 输出路径 * @param context 内容 */ public static void writeFile(String outPath, String context) { try { File file = new File(outPath); FileUtils.write(file, context, CharsetUtil.UTF_8, false); } catch (IOException e) { log.error("writeFile is error {}", e); } } }
我是南国以南i,记录点滴,每天成长一点点,学习是永无止境的!转载请附原文链接!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。