当前位置:   article > 正文

获取 pdf 中某个关键字的位置坐标_pdf获取坐标

pdf获取坐标

前言

  在制作电子印章时,印章的位置坐标由后端计算:只需提供 PDF 页面中的某个关键字,即可得到该关键字在 PDF 中所在的页码以及 x、y 坐标。


一、两个基础配置类

1.KeyWordPositionListener   pdf签名帮助类

代码如下(示例):

  1. package com.z.boot.core.util.pdf;
  2. import com.itextpdf.awt.geom.Rectangle2D;
  3. import com.itextpdf.text.Rectangle;
  4. import com.itextpdf.text.pdf.parser.ImageRenderInfo;
  5. import com.itextpdf.text.pdf.parser.RenderListener;
  6. import com.itextpdf.text.pdf.parser.TextRenderInfo;
  7. import java.util.ArrayList;
  8. import java.util.List;
  9. /**
  10. * @ClassName KeyWordPositionListener
  11. * @Description pdf签名帮助类
  12. * @Date: 2021/3/17 16:57
  13. * @Version 1.0
  14. **/
  15. public class KeyWordPositionListener implements RenderListener{
  16. private List matches = new ArrayList();
  17. private List allItems = new ArrayList();
  18. private Rectangle curPageSize;
  19. /**
  20. * 匹配的关键字
  21. */
  22. private String keyword;
  23. /**
  24. * 匹配的当前页
  25. */
  26. private Integer pageNumber;
  27. public void beginTextBlock() {
  28. //do nothing
  29. }
  30. public void renderText(TextRenderInfo renderInfo) {
  31. String content = renderInfo.getText();
  32. content = content.replace("<", "").replace("《", "").replace("(", "").replace("(", "").replace("\"", "").replace("'", "")
  33. .replace(">", "").replace("》", "").replace(")", "").replace(")", "").replace("、", "").replace(".", "")
  34. .replace(":", "").replace(":", "").replace(" ", "");
  35. Rectangle2D.Float boundingRectange = renderInfo.getBaseline().getBoundingRectange();
  36. MatchItem item = new MatchItem();
  37. item.setContent(content);
  38. item.setPageNum(pageNumber);
  39. item.setPageWidth(curPageSize.getWidth()); //页面宽度
  40. item.setPageHeight(curPageSize.getHeight()); //页面高度
  41. item.setX(boundingRectange.x);
  42. item.setY(boundingRectange.y);
  43. item.setRectangeWidth(boundingRectange.getWidth()); //当前块级元素占的宽度
  44. item.setRectangeHeight(boundingRectange.getHeight()); //当前块级元素占的高度
  45. if(content!=null && content!=""){
  46. if(content.equalsIgnoreCase(keyword)) {
  47. matches.add(item);
  48. }
  49. }
  50. allItems.add(item);//先保存所有的项
  51. }
  52. public void endTextBlock() {
  53. //do nothing
  54. }
  55. public void renderImage(ImageRenderInfo renderInfo) {
  56. //do nothing
  57. }
  58. /**
  59. * 设置需要匹配的当前页
  60. * @param pageNumber
  61. */
  62. public void setPageNumber(Integer pageNumber) {
  63. this.pageNumber = pageNumber;
  64. }
  65. /**
  66. * 设置需要匹配的关键字,忽略大小写
  67. * @param keyword
  68. */
  69. public void setKeyword(String keyword) {
  70. this.keyword = keyword;
  71. }
  72. /**
  73. * 返回匹配的结果列表
  74. * @return
  75. */
  76. public List getMatches() {
  77. return matches;
  78. }
  79. void setCurPageSize(Rectangle rect) {
  80. this.curPageSize = rect;
  81. }
  82. public List getAllItems() {
  83. return allItems;
  84. }
  85. public void setAllItems(List allItems) {
  86. this.allItems = allItems;
  87. }
  88. }


2.MatchItem 

代码如下(示例):

  1. package com.z.boot.core.util.pdf;
  /**
   * Position of one text chunk inside a PDF page: the page it is on, the chunk's
   * x/y coordinates, the page dimensions, the chunk text and the size of the area
   * the chunk occupies.
   *
   * <p>The boxed fields stay {@code null} until their setter is called.
   */
  public class MatchItem {

      /** Page the chunk is on. */
      private Integer pageNum;
      /** X coordinate of the chunk. */
      private Float x;
      /** Y coordinate of the chunk. */
      private Float y;
      /** Width of the page. */
      private Float pageWidth;
      /** Height of the page. */
      private Float pageHeight;
      /** Text of the chunk. */
      private String content;
      /** Width of the area occupied by the chunk. */
      private double rectangeWidth;
      /** Height of the area occupied by the chunk. */
      private double rectangeHeight;

      public Integer getPageNum() {
          return pageNum;
      }

      public void setPageNum(Integer pageNum) {
          this.pageNum = pageNum;
      }

      public Float getX() {
          return x;
      }

      public void setX(Float x) {
          this.x = x;
      }

      public Float getY() {
          return y;
      }

      public void setY(Float y) {
          this.y = y;
      }

      public Float getPageWidth() {
          return pageWidth;
      }

      public void setPageWidth(Float pageWidth) {
          this.pageWidth = pageWidth;
      }

      public Float getPageHeight() {
          return pageHeight;
      }

      public void setPageHeight(Float pageHeight) {
          this.pageHeight = pageHeight;
      }

      public String getContent() {
          return content;
      }

      public void setContent(String content) {
          this.content = content;
      }

      public double getRectangeWidth() {
          return rectangeWidth;
      }

      public void setRectangeWidth(double rectangeWidth) {
          this.rectangeWidth = rectangeWidth;
      }

      public double getRectangeHeight() {
          return rectangeHeight;
      }

      public void setRectangeHeight(double rectangeHeight) {
          this.rectangeHeight = rectangeHeight;
      }

      /**
       * Returns a description listing every field. The two rectangle dimensions
       * are appended after the fields the original output already contained.
       */
      @Override
      public String toString() {
          return "MatchItem [pageNum=" + pageNum + ", x=" + x + ", y=" + y
                  + ", pageWidth=" + pageWidth + ", pageHeight=" + pageHeight
                  + ", content=" + content
                  + ", rectangeWidth=" + rectangeWidth
                  + ", rectangeHeight=" + rectangeHeight + "]";
      }
  }

二、核心计算类

  1. package com.z.boot.core.util.pdf;
  2. import com.itextpdf.text.Rectangle;
  3. import com.itextpdf.text.pdf.PdfReader;
  4. import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
  5. import lombok.extern.slf4j.Slf4j;
  6. import java.io.IOException;
  7. import java.util.List;
  8. /**
  9. * @ClassName QuotationPdf
  10. * @Description
  11. * @Date: 2021/3/16 10:44
  12. * @Version 1.0
  13. **/
  14. @Slf4j
  15. public class QuotationPdf {
  16. /**
  17. * @param filepath
  18. * @param keyWords
  19. * @return float[]
  20. * @Date 2021/3/17 16:57
  21. * @Description 用于供外部类调用获取关键字所在PDF文件坐标
  22. * @Version 1.0
  23. */
  24. public static MatchItem getKeyWordsByPath(String filepath, String keyWords) throws Exception {
  25. try {
  26. PdfReader pdfReader = new PdfReader(filepath);
  27. int pageCount= pdfReader.getNumberOfPages();
  28. return matchPage(pdfReader,pageCount, keyWords);
  29. } catch (IOException e) {
  30. e.printStackTrace();
  31. return null;
  32. }
  33. }
  34. /**
  35. * 在文件中寻找指定的文字内容
  36. *
  37. * @param reader
  38. * @param pageNumber
  39. * @param keyword
  40. * @return
  41. * @throws Exception
  42. */
  43. public static MatchItem matchPage(PdfReader reader,
  44. Integer pageNumber, String keyword) throws Exception {
  45. MatchItem matchItem = null;
  46. PdfReaderContentParser parse = new PdfReaderContentParser(reader);
  47. KeyWordPositionListener renderListener = new KeyWordPositionListener();
  48. renderListener.setKeyword(keyword);
  49. int page = 0;
  50. for (page = 1; page <= pageNumber; page++) {
  51. renderListener.setPageNumber(page);
  52. Rectangle rectangle = reader.getPageSize(page);
  53. renderListener.setCurPageSize(rectangle);
  54. parse.processContent(page, renderListener);
  55. matchItem = findKeywordItems(renderListener, keyword);
  56. if(null != matchItem) {
  57. matchItem.setPageNum(page);
  58. break;
  59. }
  60. }
  61. reader.close();
  62. return matchItem;
  63. }
  64. /**
  65. * 找到匹配的关键词块
  66. *
  67. * @param renderListener
  68. * @param keyword
  69. * @return
  70. */
  71. public static MatchItem findKeywordItems(KeyWordPositionListener renderListener,
  72. String keyword) {
  73. // 先判断本页中是否存在关键词
  74. List allItems = renderListener.getAllItems();// 所有块LIST
  75. StringBuilder sbtemp = new StringBuilder("");
  76. for (int i = 0; i < allItems.size(); i++) {// 将一页中所有的块内容连接起来组成一个字符串。(空格用“正”代替)
  77. sbtemp.append(((MatchItem) allItems.get(i)).getContent());
  78. }
  79. int index = sbtemp.toString().indexOf(keyword);
  80. System.out.println(sbtemp.toString());
  81. if (index == -1) {// 一页组成的字符串没有关键词,直接return
  82. return null;
  83. }
  84. //获取关键词与块内容完全匹配的项
  85. List matches = renderListener.getMatches();
  86. if(null == matches || matches.size()<=0){
  87. //不存在,取本页中连续 拼接块内容 第一个关键字的位置
  88. //当拼接的长度超过或等于关键字在此页内容中出现的位置,即为找到
  89. sbtemp = new StringBuilder("");
  90. for (int i = 0; i < allItems.size(); i++) {
  91. sbtemp.append(((MatchItem) allItems.get(i)).getContent());
  92. if(sbtemp.toString().length() >= index+1){
  93. matches.add(allItems.get(i));
  94. break;
  95. }
  96. }
  97. }
  98. //第二种方式,跟上面是一样的,可以自行研究
  99. // if(null == matches || matches.size()<=0){
  100. // //若还为空,第二种情况:多个块内容拼成一个关键词 取连续的几个块内容拼接起来
  101. // //1.若关键字包含context继续拼接;2.若关键字等于context即找到
  102. // sbtemp = new StringBuffer("");
  103. // int ItmeIndex = 0; //块级元素计数器
  104. // for (int i = 0; i < allItems.size(); i++) {
  105. // String itemText = ((MatchItem) allItems.get(i)).getContent();
  106. // sbtemp.append(itemText);
  107. // if(keyword.contains(sbtemp.toString()) ){
  108. //
  109. // }else if(keyword.equals(sbtemp.toString())){
  110. // matches.add(allItems.get(i));
  111. // break;
  112. // }else if(sbtemp.toString().contains(keyword)){
  113. // //被块元素包含
  114. // int index1 = itemText.indexOf(keyword);
  115. // double rectangeWidth = ((MatchItem) allItems.get(i)).getRectangeWidth(); //块级元素的宽度
  116. // double oneWidth = rectangeWidth/itemText.length(); //单个字体的宽度
  117. // double finalWidth = (index1+Math.rint(keyword.length()/2) ) * oneWidth + ((MatchItem) allItems.get(i)).getX();
  118. // ((MatchItem) allItems.get(i)).setX((float) finalWidth);
  119. // matches.add(allItems.get(i));
  120. // break;
  121. // }else{
  122. // sbtemp = new StringBuffer("");
  123. // }
  124. // }
  125. // }
  126. if(null != matches && matches.size()>0){
  127. return (MatchItem) matches.get(0);
  128. }else{
  129. return null;
  130. }
  131. }
  132. /**
  133. * @param filepath
  134. * @return float[]
  135. * @Date 18:24 2020/3/7
  136. * @Description 用于供外部类获取PDF高度和宽度
  137. */
  138. public static float[] getWidthAndHeightByPath(String filepath, int page) {
  139. float[] coordinate = null;
  140. try {
  141. PdfReader pdfReader = new PdfReader(filepath);
  142. Rectangle pageSize = pdfReader.getPageSize(page);
  143. float height = pageSize.getHeight();
  144. float width = pageSize.getWidth();
  145. System.out.println("width = " + width + ", height = " + height);
  146. float[] coordinate1 = {width, height};
  147. pdfReader.close();
  148. return coordinate1;
  149. // Document document = new Document(pdfReader.getPageSize(1));
  150. Document document = new Document(PageSize.A4.rotate(), 0, 0, 30, 0);
  151. // // 获取页面宽度
  152. // float widths = document.getPageSize().getWidth();
  153. // // 获取页面高度
  154. // float heights = document.getPageSize().getHeight();
  155. // System.out.println("widths = "+widths+", heights = "+heights);
  156. } catch (IOException e) {
  157. e.printStackTrace();
  158. return coordinate;
  159. }
  160. }
  161. public static void main(String[] args) throws Exception {
  162. String filePath = "E:\\work\\bin\\wenlinworkspace\\repository\\laboratoryFile\\message\\wordtemp\\2020-12-14\\8ae123961e5d47faa4acee6d066efcf7.pdf";
  163. MatchItem a = getKeyWordsByPath(filePath, "bm_g_12443_1#_ResultData_32");
  164. System.out.println("width = " + a.getX()
  165. + ", height = " + a.getY()
  166. + ",page = " + a.getPageNum());
  167. //
  168. // float[] b = getWidthAndHeightByPath(filePath, Integer.parseInt(a.get("page").toString()));
  169. System.out.println(a.getPageWidth()+"-----"+a.getPageHeight());
  170. float x = (a.getX() - 35F) / a.getPageWidth();
  171. float y = (a.getY() - 52.5F) / a.getPageHeight();
  172. System.out.println("x = " + x + ", y = " + y);
  173. }
  174. }

三、使用 

  1. public static void main(String[] args) throws Exception {
  2. String filePath = "E:\\work\\bin\\wenlinworkspace\\repository\\laboratoryFile\\message\\wordtemp\\2020-12-14\\8ae123961e5d47faa4acee6d066efcf7.pdf";
  3. MatchItem a = getKeyWordsByPath(filePath, "bm_g_12443_1#_ResultData_32");
  4. System.out.println("width = " + a.getX()
  5. + ", height = " + a.getY()
  6. + ",page = " + a.getPageNum());
  7. //
  8. // float[] b = getWidthAndHeightByPath(filePath, Integer.parseInt(a.get("page").toString()));
  9. System.out.println(a.getPageWidth()+"-----"+a.getPageHeight());
  10. float x = (a.getX() - 35F) / a.getPageWidth();
  11. float y = (a.getY() - 52.5F) / a.getPageHeight();
  12. System.out.println("x = " + x + ", y = " + y);
  13. }
  1. MatchItem position = QuotationPdf.getKeyWordsByPath(pdfPath,"合同章_01");
  2. // position carries every value needed: the x/y coordinates plus the page width and height (it is null when the keyword is not found)
  3. // the position relative to the page (the ratio) has to be computed by the caller

声明:本文内容由网友自发贡献,转载请注明出处:【wpsshop】
推荐阅读
相关标签
  

闽ICP备14008679号