当前位置:   article > 正文

python使用paddleocr实现识别pdf和图片_pycharm paddleocr pdf

pycharm paddleocr pdf

这个周末又整理了一下代码,新增了对 PDF 识别的支持。

直接上代码

方便自己 方便大家

  1. from flask import Flask, request
  2. from paddleocr import PaddleOCR, draw_ocr
  3. from pdf2image import convert_from_path
  4. import json
  5. import pandas as pd
  6. import sys
  7. import datetime
  8. import numpy as np
  9. import numpy_financial as npf
  10. from dateutil.relativedelta import relativedelta
  11. import math
  12. import warnings
  13. import uuid
  14. import os
  # Suppress library warnings and make pandas / numpy print complete,
  # compact output when debugging.
  warnings.filterwarnings("ignore")
  pd.set_option("display.max_rows", None)
  pd.set_option("display.max_columns", None)
  np.set_printoptions(precision=3, suppress=True, edgeitems=5)

  # PaddleOCR switches recognition language through the `lang` argument,
  # e.g. `ch`, `en`, `fr`, `german`, `korean`, `japan`.
  # Built once at import time so the model is downloaded and loaded into
  # memory a single time.
  ocr = PaddleOCR(lang="ch", use_angle_cls=True)

  app = Flask(__name__)

  # File types accepted by the service.
  supportFileType = [
      "pdf",
      "img",
      "image",
      "jpg",
      "jpeg",
      "png",
  ]
  # Validate the request parameters.
  def checkParam(file, fileType):
      """Validate the uploaded file and its declared type.

      Args:
          file: The uploaded file object; ``None`` means no file was sent.
          fileType: The declared file type; it must be one of the entries
              in the module-level ``supportFileType`` list.

      Returns:
          ``None`` when both parameters are acceptable. Otherwise a JSON
          string with the keys ``code``, ``message`` and ``data`` that
          describes the first failed check.
      """
      # NOTE: the previous trailing check `request.get_data() is None` was
      # dropped: get_data() returns bytes, so the branch could never run,
      # and a missing upload is already reported by the `file is None` case.
      if file is None:
          code, message = 'E100001', '文件参数为空'
      elif fileType is None:
          code, message = 'E100002', '文件类型为空'
      elif fileType not in supportFileType:
          code, message = 'E100003', '不支持的文件类型'
      else:
          return None

      print(message)
      return_result = {'code': code, 'message': message, 'data': False}
      return json.dumps(return_result, ensure_ascii=False)
  # Run OCR on a single image.
  def doImgOcr(tmpFileAbsoultePath, ocr_content):
      """Recognise the text in one image and collect it into ``ocr_content``.

      Args:
          tmpFileAbsoultePath: Path of the image file handed to the
              module-level ``ocr`` engine.
          ocr_content: List that receives one string per recognised text
              line; it is modified in place.

      Returns:
          The same ``ocr_content`` list that was passed in.
      """
      result = ocr.ocr(tmpFileAbsoultePath, cls=True)

      # The text is read straight from the result structure instead of
      # parsing str(item): the string round-trip cut text at its first
      # comma and discarded anything containing quotes or '['.
      for page in result or []:
          if not page:
              # An entry may be None/empty when nothing was detected.
              continue
          # NOTE(review): assumes each page is a list of
          # [box, (text, score)] detections. A bare [box, (text, score)]
          # entry (presumably the layout of older PaddleOCR releases --
          # confirm against the installed version) is accepted as well.
          is_single_detection = (
              len(page) == 2
              and isinstance(page[1], (list, tuple))
              and len(page[1]) > 0
              and isinstance(page[1][0], str)
          )
          detections = [page] if is_single_detection else page
          for detection in detections:
              ocr_content.append(detection[1][0])

      print("ocr_content:")
      print(ocr_content)
      return ocr_content
  # Run OCR on a PDF.
  def doPdfOcr(tmpFileAbsoultePath, ocr_content, tmpDir='/opt/tmp/'):
      """Render every page of a PDF to PNG and run image OCR on each page.

      Args:
          tmpFileAbsoultePath: Path of the PDF file to process.
          ocr_content: List that receives the recognised text; it is
              modified in place (through ``doImgOcr``).
          tmpDir: Directory for the intermediate page images. It is
              created when missing.

      Returns:
          The same ``ocr_content`` list that was passed in.
      """
      pages = convert_from_path(tmpFileAbsoultePath)
      os.makedirs(tmpDir, exist_ok=True)

      for pageNo, page in enumerate(pages, start=1):
          # A random suffix keeps concurrent requests from overwriting
          # each other's page images.
          imgName = os.path.join(tmpDir, f'page_{pageNo}_{uuid.uuid4().hex}.png')
          try:
              page.save(imgName, 'PNG')
              doImgOcr(imgName, ocr_content)
          finally:
              # The page image is only an intermediate artefact; remove it
              # so the temp directory does not grow with every request.
              if os.path.exists(imgName):
                  os.remove(imgName)

      return ocr_content
  # OCR request endpoint.
  @app.route('/ocr', methods=['POST'])
  def do_ocr():
      """Handle ``POST /ocr``: store the upload, run OCR and return the text.

      Reads the multipart field ``file`` and the form field ``fileType``
      from the current request.

      Returns:
          A JSON string. On success ``data`` holds the list of recognised
          text lines; when validation fails, the error document produced
          by ``checkParam`` is returned unchanged.
      """
      # .get() lets a missing upload reach checkParam instead of aborting
      # the request before validation runs.
      file = request.files.get('file')
      fileType = request.form.get('fileType')
      print("文件类型:" + str(fileType))

      # Validation must finish before fileType is used to build a path:
      # it is client-controlled and only safe once checked against the
      # supported-type list.
      validationError = checkParam(file, fileType)
      if validationError is not None:
          return validationError

      tmpFilenamePrefix = uuid.uuid4().hex
      tmpFileAbsoultePath = "/opt/" + tmpFilenamePrefix + "." + fileType
      print("tmpFileAbsoultePath:" + tmpFileAbsoultePath)

      ocr_content = []
      try:
          file.save(tmpFileAbsoultePath)
          if fileType == 'pdf':
              doPdfOcr(tmpFileAbsoultePath, ocr_content)
          else:
              doImgOcr(tmpFileAbsoultePath, ocr_content)
      finally:
          # The stored upload is only needed while OCR runs.
          if os.path.exists(tmpFileAbsoultePath):
              os.remove(tmpFileAbsoultePath)

      return_result = {'code': '200', 'message': '处理成功', 'data': ocr_content}
      return json.dumps(return_result, ensure_ascii=False)
  if __name__ == "__main__":
      # The server listens on all interfaces, so the interactive debugger
      # must not be enabled by default: it allows code execution from the
      # browser. Set FLASK_DEBUG=1 to turn it on for local development.
      debugEnabled = os.environ.get("FLASK_DEBUG") == "1"
      app.run(host="0.0.0.0", port=5000, debug=debugEnabled)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小小林熬夜学编程/article/detail/678654
推荐阅读
相关标签
  

闽ICP备14008679号