当前位置:   article > 正文

Java对比PDF文件内容_java 对比pdf

java 对比pdf

使用 java-diff-utils、pdfbox 完成PDF内容的比较以及差异处的高亮显示

文章仅对PDF文件中的文本内容进行对比,并在差异处高亮显示差异内容。新增文本为绿色、删除为红色。

在 DiffUtil 的 getDiffPdf 方法传入需要对比的文件路径,以及输出的文件路径。

(此工具类仅进行文本内容的对比,无法进行图片的比较,请注意甄别。)

Spring Boot 版本 2.7.1

jdk 1.8

  1. <dependencies>
  2. <dependency>
  3. <groupId>org.springframework.boot</groupId>
  4. <artifactId>spring-boot-starter-web</artifactId>
  5. </dependency>
  6. <dependency>
  7. <groupId>io.github.java-diff-utils</groupId>
  8. <artifactId>java-diff-utils</artifactId>
  9. <version>4.11</version>
  10. </dependency>
  11. <dependency>
  12. <groupId>org.apache.pdfbox</groupId>
  13. <artifactId>pdfbox</artifactId>
  14. <version>2.0.29</version>
  15. </dependency>
  16. <dependency>
  17. <groupId>org.apache.pdfbox</groupId>
  18. <artifactId>pdfbox-tools</artifactId>
  19. <version>2.0.29</version>
  20. </dependency>
  21. <dependency>
  22. <groupId>org.apache.pdfbox</groupId>
  23. <artifactId>fontbox</artifactId>
  24. <version>2.0.29</version>
  25. </dependency>
  26. <dependency>
  27. <groupId>org.apache.pdfbox</groupId>
  28. <artifactId>xmpbox</artifactId>
  29. <version>2.0.29</version>
  30. </dependency>
  31. <dependency>
  32. <groupId>org.apache.pdfbox</groupId>
  33. <artifactId>preflight</artifactId>
  34. <version>2.0.29</version>
  35. </dependency>
  36. <dependency>
  37. <groupId>cn.hutool</groupId>
  38. <artifactId>hutool-all</artifactId>
  39. <version>5.6.0</version>
  40. </dependency>
  41. </dependencies>

引入以下三个类即可

工具类DiffUtil

  1. package com.pdf.pdfcomparer.util;
  2. import cn.hutool.core.collection.CollUtil;
  3. import com.github.difflib.UnifiedDiffUtils;
  4. import com.github.difflib.patch.Patch;
  5. import com.pdf.pdfcomparer.overide.GetCharLocationAndSize;
  6. import com.pdf.pdfcomparer.overide.LineTextPosition;
  7. import org.apache.pdfbox.pdmodel.PDDocument;
  8. import org.apache.pdfbox.pdmodel.PDPage;
  9. import org.apache.pdfbox.pdmodel.common.PDRectangle;
  10. import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
  11. import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
  12. import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
  13. import java.io.File;
  14. import java.io.IOException;
  15. import java.io.StringWriter;
  16. import java.util.ArrayList;
  17. import java.util.Arrays;
  18. import java.util.List;
  19. import java.util.stream.Collectors;
  20. public class DiffUtil {
  21. /**
  22. * 获取文本对比后的pdf
  23. * @param oldPdfPath 原pdf文件路径
  24. * @param newPdfPath 新pdf文件路径
  25. * @param outputOldPdfPath 原diff文件输出路径
  26. * @param outputNewPdfPath 新diff文件输出路径
  27. */
  28. public static int getDiffPdf(String oldPdfPath, String newPdfPath, String outputOldPdfPath, String outputNewPdfPath) {
  29. try {
  30. File file1 = new File(oldPdfPath);
  31. File file2 = new File(newPdfPath);
  32. String originalFileName = file1.getName();
  33. String revisedFileName = file2.getName();
  34. // 构建文本页信息
  35. List<GetCharLocationAndSize> charLAS1 = buildCharLocationAndSize(file1);
  36. List<GetCharLocationAndSize> charLAS2 = buildCharLocationAndSize(file2);
  37. // 获取所有文本详细信息
  38. List<LineTextPosition> lineTextPositions1 = new ArrayList<>();
  39. charLAS1.forEach(getCharLocationAndSize -> lineTextPositions1.addAll(getCharLocationAndSize.getLineTextPositions()));
  40. List<LineTextPosition> lineTextPositions2 = new ArrayList<>();
  41. charLAS2.forEach(getCharLocationAndSize -> lineTextPositions2.addAll(getCharLocationAndSize.getLineTextPositions()));
  42. List<String> original = CollUtil.emptyIfNull(lineTextPositions1).stream().map(LineTextPosition::getLineText).collect(Collectors.toList());
  43. List<String> revised = CollUtil.emptyIfNull(lineTextPositions2).stream().map(LineTextPosition::getLineText).collect(Collectors.toList());
  44. // 获取差异文本上下文信息
  45. List<String> unifiedDiff = getUnifiedDiff(originalFileName, original, revisedFileName, revised);
  46. // 设置文本差异标志
  47. List<String> signs = CollUtil.emptyIfNull(unifiedDiff).stream().filter(sign -> sign.startsWith("@@")).collect(Collectors.toList());
  48. if (signs.size() > 0) {
  49. signs.forEach(sign -> {
  50. if (!sign.equals("@@ -0,0 +0,0 @@")) {
  51. List<String> split = Arrays.asList(sign.split(" "));
  52. // 源文件标识
  53. String var1 = split.get(1);
  54. String[] var2 = var1.split(",");
  55. int start = Integer.parseInt(var2[0].substring(1)) - 1;
  56. int end = start + Integer.parseInt(var2[1]);
  57. for (int i = start; i < end; i++) {
  58. lineTextPositions1.get(i).setType("1");
  59. }
  60. // 新文件标识
  61. String var3 = split.get(2);
  62. String[] var4 = var3.split(",");
  63. int start1 = Integer.parseInt(var4[0].substring(1)) - 1;
  64. int end1 = start1 + Integer.parseInt(var4[1]);
  65. for (int i = start1; i < end1; i++) {
  66. lineTextPositions2.get(i).setType("2");
  67. }
  68. }
  69. });
  70. }
  71. // 输出差异文件
  72. PDDocument pdDocument1 = PDDocument.load(file1);
  73. // 突出显示删除
  74. for (LineTextPosition lineTextPosition : lineTextPositions1) {
  75. if (lineTextPosition.getType().equals("1")) {
  76. PDPage page = pdDocument1.getPage(lineTextPosition.getPageNum() - 1);
  77. PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
  78. markup.setColor(new PDColor(new float[] { 1, 0, 0}, PDDeviceRGB.INSTANCE));
  79. float x = lineTextPosition.getX();
  80. float y = lineTextPosition.getY();
  81. float width = lineTextPosition.getWidth();
  82. float height = lineTextPosition.getHeight();
  83. PDRectangle bounds = new PDRectangle(x, y - (height / 2), width, height * 2);
  84. markup.setRectangle(bounds);
  85. // 从左下角计算 依次为左下、右下、左上、右上
  86. float []p=pDRectangle2QuadPoints(bounds);
  87. markup.setQuadPoints(p);
  88. page.getAnnotations().add(markup);
  89. }
  90. }
  91. pdDocument1.save(outputOldPdfPath);
  92. pdDocument1.close();
  93. PDDocument pdDocument2 = PDDocument.load(file2);
  94. // 突出显示删除
  95. for (LineTextPosition lineTextPosition : lineTextPositions2) {
  96. if (lineTextPosition.getType().equals("2")) {
  97. PDPage page = pdDocument2.getPage(lineTextPosition.getPageNum() - 1);
  98. PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
  99. markup.setColor(new PDColor(new float[] { 0, 255, 0}, PDDeviceRGB.INSTANCE));
  100. float x = lineTextPosition.getX();
  101. float y = lineTextPosition.getY();
  102. float width = lineTextPosition.getWidth();
  103. float height = lineTextPosition.getHeight();
  104. PDRectangle bounds = new PDRectangle(x, y - (height / 2), width, height * 2);
  105. markup.setRectangle(bounds);
  106. // 从左下角计算 依次为左下、右下、左上、右上
  107. float []p=pDRectangle2QuadPoints(bounds);
  108. markup.setQuadPoints(p);
  109. page.getAnnotations().add(markup);
  110. }
  111. }
  112. pdDocument2.save(outputNewPdfPath);
  113. pdDocument2.close();
  114. } catch (IOException e) {
  115. return 1;
  116. }
  117. return 0;
  118. }
  119. /**
  120. * 获取上下文差异描述信息
  121. * @param originalFileName 源文件名称
  122. * @param original 源文件文本信息
  123. * @param revisedFileName 新文件名称
  124. * @param revised 新文件文本信息
  125. * @return
  126. */
  127. public static List<String> getUnifiedDiff(String originalFileName, List<String> original, String revisedFileName, List<String> revised) {
  128. Patch<String> patch = com.github.difflib.DiffUtils.diff(original, revised);
  129. List<String> unifiedDiff = UnifiedDiffUtils.generateUnifiedDiff(originalFileName, revisedFileName, original, patch, 0);
  130. int diffCount = unifiedDiff.size();
  131. if (diffCount == 0) {
  132. //如果两文件没差异则插入如下
  133. unifiedDiff.add("--- " + originalFileName);
  134. unifiedDiff.add("+++ " + revisedFileName);
  135. unifiedDiff.add("@@ -0,0 +0,0 @@");
  136. } else if (diffCount >= 3 && !unifiedDiff.get(2).contains("@@ -1,")) {
  137. // 如果至少有一处变化,并且变化不在第一行
  138. unifiedDiff.set(1, unifiedDiff.get(1));
  139. //如果第一行没变化则插入@@ -0,0 +0,0 @@
  140. unifiedDiff.add(2, "@@ -0,0 +0,0 @@");
  141. }
  142. return unifiedDiff;
  143. }
  144. // 矩阵置换坐标
  145. private static float[] pDRectangle2QuadPoints(PDRectangle bounds) {
  146. float []p=new float[8];
  147. p[0]=bounds.getLowerLeftX();
  148. p[1]=bounds.getLowerLeftY();
  149. p[2]=bounds.getUpperRightX();
  150. p[3]=bounds.getLowerLeftY();
  151. p[4]=bounds.getLowerLeftX();
  152. p[5]=bounds.getUpperRightY();
  153. p[6]=bounds.getUpperRightX();
  154. p[7]=bounds.getUpperRightY();
  155. return p;
  156. }
  157. /**
  158. * 构建PDFTextStripper
  159. * @param file
  160. * @return
  161. */
  162. private static List<GetCharLocationAndSize> buildCharLocationAndSize(File file) {
  163. List<GetCharLocationAndSize> pdfTextStripperList;
  164. PDDocument pdDocument = null;
  165. try {
  166. pdDocument = PDDocument.load(file);
  167. int pageCount = pdDocument.getNumberOfPages();
  168. pdfTextStripperList = new ArrayList<>(pageCount);
  169. for (int i = 1; i <= pageCount; i++) {
  170. GetCharLocationAndSize pdfTextStripper = new GetCharLocationAndSize();
  171. pdfTextStripper.setPageNum(i);
  172. pdfTextStripper.setSortByPosition(true);
  173. pdfTextStripper.setStartPage(i);
  174. pdfTextStripper.setEndPage(i);
  175. StringWriter writer = new StringWriter();
  176. pdfTextStripper.writeText(pdDocument, writer);
  177. pdfTextStripperList.add(pdfTextStripper);
  178. }
  179. } catch (Exception e) {
  180. return null;
  181. } finally {
  182. if (pdDocument != null) {
  183. try {
  184. pdDocument.close();
  185. } catch (IOException e) {
  186. e.printStackTrace();
  187. }
  188. }
  189. }
  190. return pdfTextStripperList;
  191. }
  192. }

工具类中用到的其他自定义的类

GetCharLocationAndSize.class   、  LineTextPosition.class

  1. package com.pdf.pdfcomparer.overide;
  2. import org.apache.pdfbox.text.PDFTextStripper;
  3. import org.apache.pdfbox.text.TextPosition;
  4. import java.io.IOException;
  5. import java.util.ArrayList;
  6. import java.util.List;
  7. /**
  8. * 每页pdf对应一个实例
  9. */
  10. public class GetCharLocationAndSize extends PDFTextStripper {
  11. private final List<LineTextPosition> lineTextPositions = new ArrayList<>();
  12. private int pageNum = 0;
  13. public GetCharLocationAndSize() throws IOException {
  14. }
  15. @Override
  16. protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
  17. // 记录行文本和每个字符位置信息
  18. LineTextPosition lineTextPosition = new LineTextPosition();
  19. lineTextPosition.setLineText(text);
  20. // 计算本段文字矩阵参数,距离页左上角的坐标
  21. TextPosition textPosition = textPositions.get(0);
  22. float pageHeight = textPosition.getPageHeight();
  23. float x = textPosition.getXDirAdj();
  24. float y = pageHeight - textPosition.getYDirAdj();
  25. float width = 0.0f;
  26. float height = 0.0f;
  27. for (TextPosition textPosition1 : textPositions) {
  28. width += textPosition1.getWidthDirAdj();
  29. if (textPosition1.getHeightDir() > height) {
  30. height = textPosition1.getHeightDir();
  31. }
  32. }
  33. lineTextPosition.setX(x);
  34. lineTextPosition.setY(y);
  35. lineTextPosition.setWidth(width);
  36. lineTextPosition.setHeight(height);
  37. lineTextPosition.setPageNum(pageNum);
  38. lineTextPositions.add(lineTextPosition);
  39. writeString(text);
  40. }
  41. public List<LineTextPosition> getLineTextPositions(){
  42. return lineTextPositions;
  43. }
  44. public int getPageNum() {
  45. return pageNum;
  46. }
  47. public void setPageNum(int pageNum) {
  48. this.pageNum = pageNum;
  49. }
  50. }
  1. package com.pdf.pdfcomparer.overide;
  /**
   * Mutable holder for one extracted text chunk: its text, its rectangle on the page,
   * the page it belongs to, and a diff type marker.
   */
  public class LineTextPosition {

      // ---- state -----------------------------------------------------------

      /** Text of the chunk. */
      private String lineText;

      /** X coordinate of the lower-left corner of the text rectangle. */
      private float x;

      /** Y coordinate of the lower-left corner of the text rectangle. */
      private float y;

      /** Width of the text rectangle. */
      private float width;

      /** Height of the text rectangle. */
      private float height;

      /** Number of the page the chunk is on. */
      private int pageNum;

      /** Diff marker: "0" = unchanged, "1" = deleted, "2" = added. Defaults to "0". */
      private String type = "0";

      // ---- text ------------------------------------------------------------

      /** @return the text of the chunk */
      public String getLineText() {
          return this.lineText;
      }

      /** @param lineText the text of the chunk */
      public void setLineText(String lineText) {
          this.lineText = lineText;
      }

      // ---- geometry --------------------------------------------------------

      /** @return the lower-left x coordinate */
      public float getX() {
          return this.x;
      }

      /** @param x the lower-left x coordinate */
      public void setX(float x) {
          this.x = x;
      }

      /** @return the lower-left y coordinate */
      public float getY() {
          return this.y;
      }

      /** @param y the lower-left y coordinate */
      public void setY(float y) {
          this.y = y;
      }

      /** @return the rectangle width */
      public float getWidth() {
          return this.width;
      }

      /** @param width the rectangle width */
      public void setWidth(float width) {
          this.width = width;
      }

      /** @return the rectangle height */
      public float getHeight() {
          return this.height;
      }

      /** @param height the rectangle height */
      public void setHeight(float height) {
          this.height = height;
      }

      // ---- page and diff marker ---------------------------------------------

      /** @return the page number */
      public int getPageNum() {
          return this.pageNum;
      }

      /** @param pageNum the page number */
      public void setPageNum(int pageNum) {
          this.pageNum = pageNum;
      }

      /** @return the diff marker ("0", "1" or "2") */
      public String getType() {
          return this.type;
      }

      /** @param type the diff marker ("0", "1" or "2") */
      public void setType(String type) {
          this.type = type;
      }
  }

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/我家自动化/article/detail/871663
推荐阅读
相关标签
  

闽ICP备14008679号