赞
踩
Java 中常用的 PDF 操作库有 PDFBox 和 iText,下面分别介绍如何使用 iText 和 PDFBox 来提取 PDF 中文字的坐标。
1、(iText)定义一个实现 RenderListener 接口的类,通过接口中的几个回调方法来处理 PDF 中的文字和图片
- import java.awt.Color;
- import java.awt.image.BufferedImage;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.List;
- import java.util.Map;
-
- import javax.imageio.ImageIO;
-
- import com.itextpdf.awt.geom.Rectangle2D;
- import com.itextpdf.awt.geom.RectangularShape;
- import com.itextpdf.text.BaseColor;
- import com.itextpdf.text.Rectangle;
- import com.itextpdf.text.pdf.PdfContentByte;
- import com.itextpdf.text.pdf.parser.ImageRenderInfo;
- import com.itextpdf.text.pdf.parser.RenderListener;
- import com.itextpdf.text.pdf.parser.TextRenderInfo;
- /**
-  * iText {@link RenderListener} that records where text is drawn on a page.
-  *
-  * <p>Every non-empty text chunk passed to {@link #renderText(TextRenderInfo)} is stored together
-  * with its bounding rectangle. Chunks whose rectangles share exactly the same bottom y coordinate
-  * are additionally merged into one row (concatenated text plus the union of their rectangles).
-  */
- public class TestRenderListener implements RenderListener {
-     // Bounding rectangle of each merged row; index-aligned with textList and listY.
-     List<Rectangle2D.Float> rectText = new ArrayList<Rectangle2D.Float>();
-     // Concatenated text of each merged row.
-     List<String> textList = new ArrayList<String>();
-     // Bottom y coordinate of each merged row; used as the row key.
-     List<Float> listY = new ArrayList<Float>();
-     // One single-entry map (text -> rectangle) per chunk, in the order the chunks were rendered.
-     List<Map<String,Rectangle2D.Float>> rows_text_rect = new ArrayList<>();
-     // Path of the PDF file; never assigned or read inside this class.
-     protected String filepath = null;
-
-     public TestRenderListener() {
-     }
-
-     /** Invoked when a text block starts (the "BT" operator); nothing to do here. */
-     @Override
-     public void beginTextBlock() {
-         // Intentionally empty.
-     }
-
-     /**
-      * Records the text and bounding rectangle of one rendered chunk.
-      *
-      * <p>The rectangle spans from the left end of the baseline (lowered by 1 unit) to the right
-      * end of the ascent line (raised by 1 unit). Empty chunks are ignored.
-      *
-      * @param renderInfo the chunk currently being rendered
-      */
-     @Override
-     public void renderText(TextRenderInfo renderInfo) {
-         String text = renderInfo.getText();
-         if (text == null || text.isEmpty()) {
-             return;
-         }
-
-         // Baseline gives the bottom/left edge, ascent line gives the top/right edge.
-         RectangularShape rectBase = renderInfo.getBaseline().getBoundingRectange();
-         Rectangle2D.Float rectAscen = renderInfo.getAscentLine().getBoundingRectange();
-
-         // The 1-unit margin below and above is kept from the original implementation.
-         float leftX = (float) rectBase.getMinX();
-         float leftY = (float) rectBase.getMinY() - 1;
-         float rightX = (float) rectAscen.getMaxX();
-         float rightY = (float) rectAscen.getMaxY() + 1;
-
-         Rectangle2D.Float rect = new Rectangle2D.Float(leftX, leftY, rightX - leftX, rightY - leftY);
-
-         System.out.println("text:"+text+"--x:"+rect.x + "--y:"+rect.y + "--width:"+rect.width + "--height:"+rect.height);
-
-         // Rows are matched on the exact float value of y, so chunks whose y differs even
-         // slightly (e.g. another font size on the same visual line) start a new row.
-         int index = listY.indexOf(rect.y);
-         if (index >= 0) {
-             Rectangle2D.Float row = rectText.get(index);
-             // Union of both rectangles. Summing the widths (as done previously) gives a wrong
-             // result whenever the chunks are separated by a gap or overlap each other.
-             float minX = Math.min(row.x, rect.x);
-             float maxX = Math.max(row.x + row.width, rect.x + rect.width);
-             float height = Math.max(row.height, rect.height);
-             rectText.set(index, new Rectangle2D.Float(minX, rect.y, maxX - minX, height));
-             textList.set(index, textList.get(index) + text);
-         } else {
-             rectText.add(rect);
-             textList.add(text);
-             listY.add(rect.y);
-         }
-
-         // Also keep the chunk on its own, independent of row merging.
-         Map<String,Rectangle2D.Float> map = new HashMap<>();
-         map.put(text, rect);
-         rows_text_rect.add(map);
-     }
-
-     /** Invoked when a text block ends (the "ET" operator); nothing to do here. */
-     @Override
-     public void endTextBlock() {
-         // Intentionally empty.
-     }
-
-     /** Invoked for every image on the page; images are ignored by this listener. */
-     @Override
-     public void renderImage(ImageRenderInfo renderInfo) {
-         // Intentionally empty.
-     }
- }
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
2、使用自定义的类来实现获取PDF的文字坐标
- // Open the source PDF; it is closed in the outer finally block below.
- PdfReader reader = new PdfReader(pdfPath);
- try {
-     // Parser that feeds the content of a page to a RenderListener.
-     PdfReaderContentParser parser = new PdfReaderContentParser(reader);
-     // Stamper that writes a copy of the document to d:/test.pdf. It is not modified in this
-     // snippet, but it must still be closed so the output file is completed and released.
-     PdfStamper stamper = new PdfStamper(reader, new FileOutputStream("d:/test.pdf"));
-     try {
-         // iText page numbers start at 1.
-         for (int i = 1; i <= reader.getNumberOfPages(); i++) {
-             // A fresh TestRenderListener per page, so results of different pages do not mix.
-             TestRenderListener listener = new TestRenderListener();
-             // Parse the page; the listener collects the text chunks and their rectangles.
-             parser.processContent(i, listener);
-             // Print every chunk together with its bounding rectangle.
-             for (Map<String,Rectangle2D.Float> chunk : listener.rows_text_rect) {
-                 for (Map.Entry<String, Rectangle2D.Float> entry : chunk.entrySet()) {
-                     System.out.println(entry.getKey()+"---"+entry.getValue());
-                 }
-             }
-         }
-     } finally {
-         stamper.close();
-     }
- } finally {
-     reader.close();
- }
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
3、PDFBox 与 iText 的不同之处在于:PDFBox 只能一个字一个字地提取 PDF 的文字坐标,而 iText 是一段一段地提取的。
本文使用的 PDFBox 版本为 1.8.13,其他版本中部分代码的写法可能不同。
- import java.io.*;
- import org.apache.pdfbox.exceptions.InvalidPasswordException;
-
- import org.apache.pdfbox.pdmodel.PDDocument;
- import org.apache.pdfbox.pdmodel.PDPage;
- import org.apache.pdfbox.pdmodel.common.PDStream;
- import org.apache.pdfbox.util.PDFTextStripper;
- import org.apache.pdfbox.util.TextPosition;
-
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
-
- public class PrintTextLocations extends PDFTextStripper {
-
- static List<Float> list_postion = new ArrayList<Float>();
- static List<String> list_text = new ArrayList<String>();
-
-
- public PrintTextLocations() throws IOException {
- super.setSortByPosition(true);
- }
-
- public static void main(String[] args) throws Exception {
-
- PDDocument document = null;
- try {
- File input = new File("D://result.pdf");
- document = PDDocument.load(input);
- if (document.isEncrypted()) {
- document.decrypt("");
- }
- PrintTextLocations printer = new PrintTextLocations();
- List allPages = document.getDocumentCatalog().getAllPages();
- for (int i = 0; i < allPages.size(); i++) {
- PDPage page = (PDPage) allPages.get(i);
- System.out.println("Processing page: " + i);
- PDStream contents = page.getContents();
- if (contents != null) {
- printer.processStream(page, page.findResources(), page.getContents().getStream());
- }
- }
- } finally {
- if (document != null) {
- document.close();
- }
- }
-
- System.out.println(list_text.size());
- for(int i = 0;i < list_text.size();i++){
- System.out.println(list_text.get(i) );
- }
-
- }
-
- /**
- * @param text The text to be processed
- */
- @Override /* this is questionable, not sure if needed... */
- protected void processTextPosition(TextPosition text) {
- System.out.println("String[" + text.getXDirAdj() + ","
- + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
- + text.getXScale()+ " yscale="
- + text.getYScale() + " height=" + text.getHeightDir() + " space="
- + text.getWidthOfSpace() + " width="
- + text.getWidthDirAdj() + " x="
- + text.getX() + " y="
- + text.getY() + " y1="
- + text.getTextPos().getYPosition() + " x1="
- + text.getTextPos().getXPosition() + " x1="
- + text.getTextPos().getXScale() + " x1="
- + text.getTextPos().getYScale() + "]" + text.getCharacter());
- }
- }
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
赞
踩
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。