赞
踩
使用 java-diff-utils、pdfbox 完成PDF内容的比较以及差异处的高亮显示
文章仅对PDF文件中的文本内容进行对比,并在差异处高亮显示差异内容。新增文本为绿色、删除为红色。
在 DiffUtil 的 getDiffPdf 方法传入需要对比的文件路径,以及输出的文件路径。
(此工具类仅进行文本内容的对比,无法进行图片的比较,请注意甄别。)
Spring Boot 版本 2.7.1
jdk 1.8
- <dependencies>
- <dependency>
- <groupId>org.springframework.boot</groupId>
- <artifactId>spring-boot-starter-web</artifactId>
- </dependency>
-
- <dependency>
- <groupId>io.github.java-diff-utils</groupId>
- <artifactId>java-diff-utils</artifactId>
- <version>4.11</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- <version>2.0.29</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox-tools</artifactId>
- <version>2.0.29</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>fontbox</artifactId>
- <version>2.0.29</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>xmpbox</artifactId>
- <version>2.0.29</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>preflight</artifactId>
- <version>2.0.29</version>
- </dependency>
-
- <dependency>
- <groupId>cn.hutool</groupId>
- <artifactId>hutool-all</artifactId>
- <version>5.6.0</version>
- </dependency>
-
- </dependencies>
引入以下三个类即可
工具类DiffUtil
- package com.pdf.pdfcomparer.util;
-
- import cn.hutool.core.collection.CollUtil;
- import com.github.difflib.UnifiedDiffUtils;
- import com.github.difflib.patch.Patch;
- import com.pdf.pdfcomparer.overide.GetCharLocationAndSize;
- import com.pdf.pdfcomparer.overide.LineTextPosition;
- import org.apache.pdfbox.pdmodel.PDDocument;
- import org.apache.pdfbox.pdmodel.PDPage;
- import org.apache.pdfbox.pdmodel.common.PDRectangle;
- import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
- import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
- import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
-
- import java.io.File;
- import java.io.IOException;
- import java.io.StringWriter;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.List;
- import java.util.stream.Collectors;
-
-
- public class DiffUtil {
-
- /**
- * 获取文本对比后的pdf
- * @param oldPdfPath 原pdf文件路径
- * @param newPdfPath 新pdf文件路径
- * @param outputOldPdfPath 原diff文件输出路径
- * @param outputNewPdfPath 新diff文件输出路径
- */
- public static int getDiffPdf(String oldPdfPath, String newPdfPath, String outputOldPdfPath, String outputNewPdfPath) {
- try {
- File file1 = new File(oldPdfPath);
- File file2 = new File(newPdfPath);
- String originalFileName = file1.getName();
- String revisedFileName = file2.getName();
-
- // 构建文本页信息
- List<GetCharLocationAndSize> charLAS1 = buildCharLocationAndSize(file1);
- List<GetCharLocationAndSize> charLAS2 = buildCharLocationAndSize(file2);
- // 获取所有文本详细信息
- List<LineTextPosition> lineTextPositions1 = new ArrayList<>();
- charLAS1.forEach(getCharLocationAndSize -> lineTextPositions1.addAll(getCharLocationAndSize.getLineTextPositions()));
- List<LineTextPosition> lineTextPositions2 = new ArrayList<>();
- charLAS2.forEach(getCharLocationAndSize -> lineTextPositions2.addAll(getCharLocationAndSize.getLineTextPositions()));
- List<String> original = CollUtil.emptyIfNull(lineTextPositions1).stream().map(LineTextPosition::getLineText).collect(Collectors.toList());
- List<String> revised = CollUtil.emptyIfNull(lineTextPositions2).stream().map(LineTextPosition::getLineText).collect(Collectors.toList());
- // 获取差异文本上下文信息
- List<String> unifiedDiff = getUnifiedDiff(originalFileName, original, revisedFileName, revised);
- // 设置文本差异标志
- List<String> signs = CollUtil.emptyIfNull(unifiedDiff).stream().filter(sign -> sign.startsWith("@@")).collect(Collectors.toList());
- if (signs.size() > 0) {
- signs.forEach(sign -> {
- if (!sign.equals("@@ -0,0 +0,0 @@")) {
-
- List<String> split = Arrays.asList(sign.split(" "));
- // 源文件标识
- String var1 = split.get(1);
- String[] var2 = var1.split(",");
- int start = Integer.parseInt(var2[0].substring(1)) - 1;
- int end = start + Integer.parseInt(var2[1]);
- for (int i = start; i < end; i++) {
- lineTextPositions1.get(i).setType("1");
- }
-
- // 新文件标识
- String var3 = split.get(2);
- String[] var4 = var3.split(",");
- int start1 = Integer.parseInt(var4[0].substring(1)) - 1;
- int end1 = start1 + Integer.parseInt(var4[1]);
- for (int i = start1; i < end1; i++) {
- lineTextPositions2.get(i).setType("2");
- }
- }
- });
- }
-
- // 输出差异文件
- PDDocument pdDocument1 = PDDocument.load(file1);
- // 突出显示删除
- for (LineTextPosition lineTextPosition : lineTextPositions1) {
- if (lineTextPosition.getType().equals("1")) {
- PDPage page = pdDocument1.getPage(lineTextPosition.getPageNum() - 1);
- PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
- markup.setColor(new PDColor(new float[] { 1, 0, 0}, PDDeviceRGB.INSTANCE));
- float x = lineTextPosition.getX();
- float y = lineTextPosition.getY();
- float width = lineTextPosition.getWidth();
- float height = lineTextPosition.getHeight();
- PDRectangle bounds = new PDRectangle(x, y - (height / 2), width, height * 2);
- markup.setRectangle(bounds);
- // 从左下角计算 依次为左下、右下、左上、右上
- float []p=pDRectangle2QuadPoints(bounds);
- markup.setQuadPoints(p);
- page.getAnnotations().add(markup);
- }
- }
- pdDocument1.save(outputOldPdfPath);
- pdDocument1.close();
-
- PDDocument pdDocument2 = PDDocument.load(file2);
- // 突出显示删除
- for (LineTextPosition lineTextPosition : lineTextPositions2) {
- if (lineTextPosition.getType().equals("2")) {
- PDPage page = pdDocument2.getPage(lineTextPosition.getPageNum() - 1);
- PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
- markup.setColor(new PDColor(new float[] { 0, 255, 0}, PDDeviceRGB.INSTANCE));
- float x = lineTextPosition.getX();
- float y = lineTextPosition.getY();
- float width = lineTextPosition.getWidth();
- float height = lineTextPosition.getHeight();
- PDRectangle bounds = new PDRectangle(x, y - (height / 2), width, height * 2);
- markup.setRectangle(bounds);
- // 从左下角计算 依次为左下、右下、左上、右上
- float []p=pDRectangle2QuadPoints(bounds);
- markup.setQuadPoints(p);
- page.getAnnotations().add(markup);
- }
- }
- pdDocument2.save(outputNewPdfPath);
- pdDocument2.close();
- } catch (IOException e) {
- return 1;
- }
- return 0;
- }
-
- /**
- * 获取上下文差异描述信息
- * @param originalFileName 源文件名称
- * @param original 源文件文本信息
- * @param revisedFileName 新文件名称
- * @param revised 新文件文本信息
- * @return
- */
- public static List<String> getUnifiedDiff(String originalFileName, List<String> original, String revisedFileName, List<String> revised) {
- Patch<String> patch = com.github.difflib.DiffUtils.diff(original, revised);
- List<String> unifiedDiff = UnifiedDiffUtils.generateUnifiedDiff(originalFileName, revisedFileName, original, patch, 0);
- int diffCount = unifiedDiff.size();
- if (diffCount == 0) {
- //如果两文件没差异则插入如下
- unifiedDiff.add("--- " + originalFileName);
- unifiedDiff.add("+++ " + revisedFileName);
- unifiedDiff.add("@@ -0,0 +0,0 @@");
- } else if (diffCount >= 3 && !unifiedDiff.get(2).contains("@@ -1,")) {
- // 如果至少有一处变化,并且变化不在第一行
- unifiedDiff.set(1, unifiedDiff.get(1));
- //如果第一行没变化则插入@@ -0,0 +0,0 @@
- unifiedDiff.add(2, "@@ -0,0 +0,0 @@");
- }
- return unifiedDiff;
- }
-
- // 矩阵置换坐标
- private static float[] pDRectangle2QuadPoints(PDRectangle bounds) {
- float []p=new float[8];
- p[0]=bounds.getLowerLeftX();
- p[1]=bounds.getLowerLeftY();
- p[2]=bounds.getUpperRightX();
- p[3]=bounds.getLowerLeftY();
- p[4]=bounds.getLowerLeftX();
- p[5]=bounds.getUpperRightY();
- p[6]=bounds.getUpperRightX();
- p[7]=bounds.getUpperRightY();
- return p;
- }
-
- /**
- * 构建PDFTextStripper
- * @param file
- * @return
- */
- private static List<GetCharLocationAndSize> buildCharLocationAndSize(File file) {
- List<GetCharLocationAndSize> pdfTextStripperList;
- PDDocument pdDocument = null;
- try {
- pdDocument = PDDocument.load(file);
- int pageCount = pdDocument.getNumberOfPages();
- pdfTextStripperList = new ArrayList<>(pageCount);
- for (int i = 1; i <= pageCount; i++) {
- GetCharLocationAndSize pdfTextStripper = new GetCharLocationAndSize();
- pdfTextStripper.setPageNum(i);
- pdfTextStripper.setSortByPosition(true);
- pdfTextStripper.setStartPage(i);
- pdfTextStripper.setEndPage(i);
- StringWriter writer = new StringWriter();
- pdfTextStripper.writeText(pdDocument, writer);
- pdfTextStripperList.add(pdfTextStripper);
- }
- } catch (Exception e) {
- return null;
- } finally {
- if (pdDocument != null) {
- try {
- pdDocument.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- return pdfTextStripperList;
- }
- }
工具类中用到的其他自定义的类
GetCharLocationAndSize.class 、 LineTextPosition.class
- package com.pdf.pdfcomparer.overide;
-
- import org.apache.pdfbox.text.PDFTextStripper;
- import org.apache.pdfbox.text.TextPosition;
-
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
-
- /**
- * 每页pdf对应一个实例
- */
- public class GetCharLocationAndSize extends PDFTextStripper {
-
- private final List<LineTextPosition> lineTextPositions = new ArrayList<>();
-
- private int pageNum = 0;
-
- public GetCharLocationAndSize() throws IOException {
- }
-
- @Override
- protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
- // 记录行文本和每个字符位置信息
- LineTextPosition lineTextPosition = new LineTextPosition();
- lineTextPosition.setLineText(text);
- // 计算本段文字矩阵参数,距离页左上角的坐标
- TextPosition textPosition = textPositions.get(0);
- float pageHeight = textPosition.getPageHeight();
- float x = textPosition.getXDirAdj();
- float y = pageHeight - textPosition.getYDirAdj();
- float width = 0.0f;
- float height = 0.0f;
- for (TextPosition textPosition1 : textPositions) {
- width += textPosition1.getWidthDirAdj();
- if (textPosition1.getHeightDir() > height) {
- height = textPosition1.getHeightDir();
- }
- }
- lineTextPosition.setX(x);
- lineTextPosition.setY(y);
- lineTextPosition.setWidth(width);
- lineTextPosition.setHeight(height);
- lineTextPosition.setPageNum(pageNum);
- lineTextPositions.add(lineTextPosition);
- writeString(text);
- }
-
- public List<LineTextPosition> getLineTextPositions(){
- return lineTextPositions;
- }
-
- public int getPageNum() {
- return pageNum;
- }
-
- public void setPageNum(int pageNum) {
- this.pageNum = pageNum;
- }
- }
- package com.pdf.pdfcomparer.overide;
-
-
/**
 * Mutable record of one extracted run of PDF text: its content, where it sits on the page,
 * and how the diff classified it.
 */
public class LineTextPosition {

    /** Text of the run. */
    private String lineText;

    /** X coordinate of the lower-left corner of the text box. */
    private float x;

    /** Y coordinate of the lower-left corner of the text box. */
    private float y;

    /** Width of the text box. */
    private float width;

    /** Height of the text box. */
    private float height;

    /** Number of the page the run is on. */
    private int pageNum;

    /** Diff classification: "0" = unchanged, "1" = deleted, "2" = added. */
    private String type = "0";

    // --- content -----------------------------------------------------------

    public String getLineText() {
        return this.lineText;
    }

    public void setLineText(String lineText) {
        this.lineText = lineText;
    }

    // --- geometry ----------------------------------------------------------

    public float getX() {
        return this.x;
    }

    public void setX(float x) {
        this.x = x;
    }

    public float getY() {
        return this.y;
    }

    public void setY(float y) {
        this.y = y;
    }

    public float getWidth() {
        return this.width;
    }

    public void setWidth(float width) {
        this.width = width;
    }

    public float getHeight() {
        return this.height;
    }

    public void setHeight(float height) {
        this.height = height;
    }

    // --- page and classification -------------------------------------------

    public int getPageNum() {
        return this.pageNum;
    }

    public void setPageNum(int pageNum) {
        this.pageNum = pageNum;
    }

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }
}
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。