赞
踩
这个周末又整理了一下代码,新增了对 PDF 识别的支持。
下面直接上代码,
方便自己,也方便大家。
- from flask import Flask, request
- from paddleocr import PaddleOCR, draw_ocr
- from pdf2image import convert_from_path
- import json
- import pandas as pd
- import sys
- import datetime
- import numpy as np
- import numpy_financial as npf
- from dateutil.relativedelta import relativedelta
- import math
- import warnings
-
- import uuid
-
- import os
-
-
# File types accepted by the OCR endpoint.
supportFileType = ['pdf', 'img', 'image', 'jpg', 'jpeg', 'png']

# Suppress every warning raised while the service runs.
warnings.filterwarnings('ignore')

# Console display settings for pandas and NumPy output.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
np.set_printoptions(edgeitems=5, precision=3, suppress=True)

# The recognition language is selected through the `lang` argument,
# e.g. `ch`, `en`, `fr`, `german`, `korean`, `japan`.
# The engine is built once at import time so the model is downloaded and
# loaded into memory a single time.
ocr = PaddleOCR(lang="ch", use_angle_cls=True)

# Flask application that exposes the OCR endpoint.
app = Flask(__name__)
-
def checkParam(file, fileType):
    """Validate the uploaded file and its declared type.

    Args:
        file: The uploaded file object, or None when no file was sent.
        fileType: The declared type of the upload; it must be one of the
            entries of the module-level ``supportFileType`` list.

    Returns:
        A JSON string with ``code``, ``message`` and ``data`` keys that
        describes the first failed check, or None when every check passes.
    """

    def _failure(code, message):
        # Log the problem and build the error payload in the same shape
        # the endpoint uses for its regular responses.
        print(message)
        payload = {'code': code, 'message': message, 'data': False}
        return json.dumps(payload, ensure_ascii=False)

    if file is None:
        return _failure('E100001', '文件参数为空')

    if fileType is None:
        return _failure('E100002', '文件类型为空')

    if fileType not in supportFileType:
        return _failure('E100003', '不支持的文件类型')

    # NOTE(review): Werkzeug documents get_data() as returning bytes/str,
    # so this guard is not expected to fire; it is kept as a defensive
    # check and now reports through the same `code`/`message` keys as the
    # other failures.
    if request.get_data() is None:
        return _failure('5004', '请求参数为空')

    return None
-
def _iter_ocr_texts(node):
    """Yield the recognized text of every detection found inside *node*.

    A detection is assumed to look like ``[box, (text, score)]`` — confirm
    against the installed PaddleOCR version. Anything that is not a list or
    tuple (for example a ``None`` page) is skipped; other containers are
    walked recursively.
    """
    if not isinstance(node, (list, tuple)):
        return
    is_detection = (
        len(node) == 2
        and isinstance(node[1], (list, tuple))
        and len(node[1]) == 2
        and isinstance(node[1][0], str)
    )
    if is_detection:
        yield node[1][0]
        return
    for child in node:
        yield from _iter_ocr_texts(child)


def doImgOcr(tmpFileAbsoultePath, ocr_content):
    """Run OCR on one image file and collect the recognized text.

    Args:
        tmpFileAbsoultePath: Path of the image file to recognize.
        ocr_content: List that receives one string per recognized text
            line; it is extended in place.

    Returns:
        The same ``ocr_content`` list that was passed in.
    """
    result = ocr.ocr(tmpFileAbsoultePath, cls=True)

    # Read the text field of each detection directly instead of parsing
    # the printed representation of the result: text that contains commas,
    # quotes, parentheses or brackets is kept intact, and an empty or None
    # result simply contributes nothing.
    ocr_content.extend(_iter_ocr_texts(result))

    print("ocr_content:")
    print(ocr_content)

    return ocr_content
-
def doPdfOcr(tmpFileAbsoultePath, ocr_content, tmpDir='/opt/tmp/'):
    """Run OCR on every page of a PDF file.

    Each page is rendered to a temporary PNG, passed to ``doImgOcr`` and
    then deleted again so rendered pages do not pile up on disk.

    Args:
        tmpFileAbsoultePath: Path of the PDF file to recognize.
        ocr_content: List that receives the recognized text; it is
            extended in place, page by page.
        tmpDir: Directory used for the temporary page images. It is
            created when missing.

    Returns:
        The same ``ocr_content`` list that was passed in.
    """
    os.makedirs(tmpDir, exist_ok=True)

    pages = convert_from_path(tmpFileAbsoultePath)
    print(pages)

    imgPathArr = []

    for i, page in enumerate(pages):
        # A random suffix keeps concurrent requests from overwriting each
        # other's page images.
        imgName = os.path.join(tmpDir, f'page_{i+1}_{uuid.uuid4().hex}.png')
        imgPathArr.append(imgName)
        try:
            page.save(imgName, 'PNG')
            doImgOcr(imgName, ocr_content)
        finally:
            # Best-effort cleanup: a failed delete must not hide the OCR
            # result or the original error.
            try:
                if os.path.exists(imgName):
                    os.remove(imgName)
            except OSError as exc:
                print(f"failed to remove temporary page image {imgName}: {exc}")

    print("imgPathArr:")
    print(imgPathArr)

    return ocr_content
-
@app.route('/ocr', methods=['POST'])
def do_ocr():
    """Handle an OCR request for an uploaded image or PDF.

    The request is expected to carry the upload in the ``file`` field and
    its type in the ``fileType`` form field.

    Returns:
        A JSON string with ``code``, ``message`` and ``data`` keys. On
        success ``data`` is the list of recognized text lines; when
        validation fails the error payload built by ``checkParam`` is
        returned instead.
    """
    # .get() yields None for a missing upload so that checkParam can
    # report it, instead of the request being aborted before validation.
    file = request.files.get('file')
    fileType = request.form.get('fileType')
    print(f"文件类型:{fileType}")

    # Stop at the first validation failure rather than processing input
    # that was already rejected.
    error = checkParam(file, fileType)
    if error is not None:
        return error

    # Recognized text lines, filled in by the OCR helpers.
    ocr_content = []

    tmpFilenamePrefix = uuid.uuid4().hex
    print("tmpFilenamePrefix:" + tmpFilenamePrefix)
    tmpFileAbsoultePath = "/opt/" + tmpFilenamePrefix + "." + fileType
    print("tmpFileAbsoultePath:" + tmpFileAbsoultePath)

    try:
        file.save(tmpFileAbsoultePath)
        if fileType == 'pdf':
            doPdfOcr(tmpFileAbsoultePath, ocr_content)
        else:
            doImgOcr(tmpFileAbsoultePath, ocr_content)
    finally:
        # Best-effort removal of the uploaded copy so that temporary files
        # do not accumulate across requests.
        try:
            if os.path.exists(tmpFileAbsoultePath):
                os.remove(tmpFileAbsoultePath)
        except OSError as exc:
            print(f"failed to remove temporary file {tmpFileAbsoultePath}: {exc}")

    return_result = {'code': '200', 'message': '处理成功', 'data': ocr_content}
    return json.dumps(return_result, ensure_ascii=False)
-
-
-
if __name__ == "__main__":
    # The server listens on every interface, so the interactive debugger
    # must not be enabled unconditionally: it lets anyone who can reach
    # the port execute code. Debug mode is opt-in through FLASK_DEBUG.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)
-
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。