赞
踩
"""Extract the embedded page images of a PDF and OCR one of them.

Step 1 walks every page of ``pdf_name`` and writes each embedded image to
``image/p<page>-<index>.png``.  Step 2 resizes one of those images, sends it
to the Baidu OCR service and writes the recognised lines to
``txt/<page>.txt``.
"""
import os
import re
import threading

import fitz
import numpy as np
import pandas as pd
import pdfplumber
import xlwt
from aip import AipOcr
from PIL import Image

# PDF whose embedded page images are extracted and OCR'd.
# NOTE: an earlier revision of this script used pdfplumber
# (``page.extract_text()`` for each page in ``pdf.pages``) to dump the text
# layer of this PDF into a UTF-8 text file instead of going through OCR.
pdf_name = r'C:\Users\Pert\Desktop\公司文档\食物\中国食物成分表 标准版 第6版 第1册_14452917_.pdf'


class Acqire:
    """Image extraction and OCR helpers built on PyMuPDF, Pillow and AipOcr."""

    # Directory that extracted page images are written to (and OCR reads from).
    IMAGE_DIR = 'image'
    # Directory that OCR text results are written to.
    TEXT_DIR = 'txt'

    def __init__(self):
        # SECURITY(review): these credentials are committed in source; they
        # should be rotated and supplied only through the environment.  The
        # literals are kept as fallbacks so existing setups keep working.
        self.APP_ID = os.environ.get('BAIDU_OCR_APP_ID', '24414511')
        self.API_KEY = os.environ.get(
            'BAIDU_OCR_API_KEY', 'OUogI3CydVfG54yeK4NwnYQt')
        self.SECRET_KEY = os.environ.get(
            'BAIDU_OCR_SECRET_KEY', 'swhZn760fvuTAvppUHbCC3CAkRK7Xngw')

    def ResizeImage(self, path2, width=1600, height=2000):
        """Resize the image at ``path2`` in place.

        Args:
            path2: Path of the image file; it is overwritten with the result.
            width: Target width in pixels.
            height: Target height in pixels.
        """
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        with Image.open(path2) as img:
            resized = img.resize((width, height), Image.LANCZOS)
        # Save only after the source handle is closed, since the output path
        # is the same file that was just read.
        resized.save(path2)

    def get_file_content(self, filepath, num):
        """Build a zero-argument callable that OCRs one image.

        The returned callable resizes ``filepath`` in place, sends it to the
        OCR service and writes one recognised line per row to
        ``txt/<num>.txt``.  It is suitable as a ``threading.Thread`` target.

        Args:
            filepath: Path of the image to recognise.
            num: Identifier used for the output file name and for the
                failure message.

        Returns:
            The callable described above; it returns ``None``.
        """
        def printI():
            self.ResizeImage(filepath)
            client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
            with open(filepath, 'rb') as fp:
                image = fp.read()
            options = {
                # Let the service detect the image orientation.
                'detect-direction': 'true',
                'language-type': 'CHN_ENG',
            }
            result = client.general(image, options)

            try:
                lines = [word['words'] for word in result['words_result']]
            except (KeyError, TypeError):
                # No usable 'words_result' in the response: report which
                # image failed, plus whatever the service returned.
                print(num, result)
                lines = []

            os.makedirs(self.TEXT_DIR, exist_ok=True)
            out_path = '{}/{}.txt'.format(self.TEXT_DIR, num)
            # Explicit UTF-8 so the output does not depend on the platform's
            # default encoding.
            with open(out_path, 'w', encoding='utf-8') as out:
                out.writelines('{}\n'.format(line) for line in lines)

        return printI

    @staticmethod
    def _save_png(pix, path):
        """Write ``pix`` to ``path`` using whichever save API this fitz has."""
        # Newer PyMuPDF exposes Pixmap.save(); older releases only writePNG().
        save = getattr(pix, 'save', None) or pix.writePNG
        save(path)

    def func(self, doc, i):
        """Save every image embedded in page ``i`` of ``doc`` as a PNG.

        Files are written to ``image/p<i+1>-<j+1>.png`` where ``j`` is the
        zero-based position of the image in the page's image list.

        Args:
            doc: An open ``fitz`` document.
            i: Zero-based page index.
        """
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        # Newer PyMuPDF names this get_page_images(); fall back to the old
        # camelCase name on releases that predate the rename.
        list_images = (getattr(doc, 'get_page_images', None)
                       or doc.getPageImageList)
        for j, img in enumerate(list_images(i)):
            xref = img[0]
            out_path = '{}/p{}-{}.png'.format(self.IMAGE_DIR, i + 1, j + 1)
            pix = fitz.Pixmap(doc, xref)  # make a pixmap from the image xref
            if pix.n - pix.alpha < 4:
                # Fewer than four colour components: can be saved as PNG.
                self._save_png(pix, out_path)
            else:
                # Otherwise convert to RGB first.
                rgb = fitz.Pixmap(fitz.csRGB, pix)
                self._save_png(rgb, out_path)
                rgb = None  # free Pixmap resources
            pix = None  # free Pixmap resources


# ---- Step 1: extract the images embedded in the PDF -------------------------
doc = fitz.open(pdf_name)
try:
    print(len(doc))
    extractor = Acqire()
    # The previous code passed the *result* of func() as the Thread target, so
    # extraction already ran here in the main thread and the threads were
    # no-ops.  Keep it sequential: PyMuPDF documents that it is not
    # thread-safe.
    for i in range(len(doc)):
        extractor.func(doc, i)
finally:
    doc.close()

# ---- Step 2: recognise the text in one extracted image ----------------------
x = 6
# Read from the directory func() writes to (the script previously looked in
# 'img/', which nothing here populates).
filepath = '{}/p{}-1.png'.format(Acqire.IMAGE_DIR, x)
ocr_thread = threading.Thread(target=Acqire().get_file_content(filepath, x))
ocr_thread.start()
ocr_thread.join()
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
赞
踩
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。