I recently picked up a requirement to extract information from the annual reports of listed companies: given a PDF file and the name of a field or table, pull out the corresponding content. The work mainly involves table extraction, for example extracting the value of a field such as 客户贷款平均收益率 (average yield on customer loans).
Another case is whole-table extraction: given a table name, locate the page the table sits on and extract the table itself.
After a period of research, I found two main technical routes: open-source PDF parsing libraries such as pdfplumber, and commercial table-recognition OCR such as Baidu's.
After comparing the results of the two routes, and taking our current situation into account, I decided to combine them to fit our requirement: use pdfplumber's ability to extract PDF text losslessly to locate the page containing the target information, and use Baidu table recognition's high accuracy to extract the field or the table.
With the main route settled, the key techniques in the actual implementation are as follows:
# Detect which pages of the report the target field appears on
import base64
import json

import jieba
import pdfplumber
import requests

# annualreport (the PDF path), key_words (the target field name),
# request_url (the Baidu endpoint) and getToken() are defined further below
pdf_file_nums = []

# Read and base64-encode the PDF once, up front; re-reading the same open
# file handle returns empty bytes (see the f.read() bug at the end of this post)
with open(annualreport, 'rb') as f:
    pdf_b64 = base64.b64encode(f.read())

access_token = getToken()
url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}

with pdfplumber.open(annualreport) as pdf:
    pages = pdf.pages
    seg_words = [j for j in jieba.cut(key_words, cut_all=False)]
    for index, page in enumerate(pages):
        texts = page.extract_text() or ''  # extract_text() may return None
        # A page qualifies only if it contains every segment of the target
        flag = True
        for seg_word in seg_words:
            if seg_word not in texts:
                flag = False
                break
        if flag:
            params = {
                "pdf_file": pdf_b64,
                'pdf_file_num': index + 1,  # Baidu's page numbers are 1-based
            }
            response = requests.post(url, data=params, headers=headers)
            if response:
                result = response.json()
                with open('result.json', encoding='utf-8', mode='w') as f_result:
                    f_result.write(json.dumps(result, ensure_ascii=False, indent=4))
                # Keep the page only if the API actually found a table on it
                if result.get('table_num', 0) >= 1:
                    pdf_file_nums.append(index + 1)
print(pdf_file_nums)
The recognized cells then have to be matched against the target field name. My first attempt scored each candidate's similarity to the target with sentence embeddings:
from sentence_transformers import SentenceTransformer, util

def CalSimilarity(text1, text2):
    # Compute the embeddings (loading the model once, outside the
    # function, would be faster; kept here to stay self-contained)
    # model = SentenceTransformer('all-mpnet-base-v2')
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    embedding1 = model.encode(text1, convert_to_tensor=True)
    embedding2 = model.encode(text2, convert_to_tensor=True)
    # Cosine similarity between the two texts
    cosine_score = util.cos_sim(embedding1, embedding2)
    return cosine_score.item()
The ideal is rosy; reality is harsh. This embedding-based similarity turned out to be far too inaccurate (it is really better suited to measuring semantic similarity between differently-worded texts). Considering the actual situation: if a field assembled from the table cells is the target field, its segments should cover the target field's segments well (annual reports are worded precisely), and since everything is Chinese, I segmented both the assembled field and the target field with jieba and computed the candidate's coverage of the target, i.e. |segments(target) ∩ segments(candidate)| / |segments(target)|, then chose the candidate with the highest coverage. The coverage computation:
def CalSimilarityByJieba(fieldname, forcompare):
    # Full-mode segmentation (cut_all=True) emits overlapping segments,
    # so near-identical fields share most of them
    seg1 = [i for i in jieba.cut(fieldname, cut_all=True)]
    seg2 = [j for j in jieba.cut(forcompare, cut_all=True)]
    # Fraction of fieldname's segments that forcompare covers
    ratio = len(set(seg1) & set(seg2)) / len(seg1)
    return ratio
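For example, to pick the best match among headers assembled from recognized cells, score each candidate by its coverage of the target field and take the maximum. A minimal sketch (the candidate strings below are hypothetical stand-ins, not real pipeline output):
# Pick the candidate header that best covers the target field's segments.
# The candidate strings here are made-up examples for illustration.
target = '客户贷款平均收益率'
candidates = ['客户贷款平均收益率(%)', '公司贷款平均余额', '个人贷款平均收益率']
best = max(candidates, key=lambda c: CalSimilarityByJieba(target, c))
print(best)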
import re

def is_numeric(character):
    # Search for any character other than the digits 0-9 and the decimal
    # point; if none is found, the string is numeric
    pattern = r'[^0-9.]'
    search = re.search(pattern, character)
    return search is None
def is_contain_chinese(check_str):
    """
    Check whether the string contains any Chinese character.
    :param check_str: {str} the string to check
    :return: {bool} True if it contains Chinese, otherwise False
    """
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False
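The post does not show these two helpers being called, but the intended role (an assumption on my part) is post-processing a recognized row: cells containing Chinese belong to the field name, while a purely numeric cell is the value. A sketch under that assumption:
# Hypothetical post-processing of one recognized row (usage assumed,
# not shown in the original pipeline)
row = ['客户贷款平均收益率', '4.56']
name_parts = [cell for cell in row if is_contain_chinese(cell)]
values = [cell for cell in row if is_numeric(cell)]
print(name_parts, values)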
import PyPDF2

def extract_page(input_path, output_path, page_number):
    # Copy a single 0-based page of input_path into a new one-page PDF
    with open(input_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        writer = PyPDF2.PdfWriter()
        if page_number < 0 or page_number >= len(reader.pages):
            print('Invalid page number.')
            return
        page = reader.pages[page_number]
        writer.add_page(page)
        with open(output_path, 'wb') as output_file:
            writer.write(output_file)
        print('Page extracted successfully.')
filename="兴业银行2022年度报告.pdf"
target_table = '贷款五级分类'
target_page = -1
# 检索到目标表在哪一页
with pdfplumber.open(filename) as pdf:
pages= pdf.pages
max_score_page = 0
for index, page in enumerate(pages):
texts = page.extract_text()
flag = True
seg_words = [word for word in jieba.cut(target_table,cut_all=False)]
# print(seg_words)
for seg_word in seg_words:
if seg_word not in texts:
flag = False
break
if flag:
target_page=index
break
print(target_page)
if target_page!=-1:
extract_page(filename,'temp.pdf',target_page)
# 将对应页存为图片
# resolution默认为72
with(Image(filename="temp.pdf",resolution=200)) as source:
images = source.sequence
pages = len(images)
for i in range(pages):
n = i + 1
newfilename = 'temp.jpeg'
Image(images[i]).save(filename=newfilename)
# Find where the table sits on the image: (min_x, min_y, max_x, max_y)
import sys

def getToken():
    # Exchange the API key and secret key for an access token
    # (client_id and client_secret are redacted)
    url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=xxx&client_secret=xxx"
    payload = ""
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    res = json.loads(response.text)
    return res['access_token']

request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/table"
# Read the rendered page image once and base64-encode it for the API
with open('temp.jpeg', 'rb') as f:
    image = base64.b64encode(f.read())
print(sys.getsizeof(image))
params = {
    "image": image
}
access_token = getToken()
request_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}
response = requests.post(request_url, data=params, headers=headers)
areas = []
if response:
    with open('result.json', encoding='utf-8', mode='w') as f_result:
        f_result.write(json.dumps(response.json(), ensure_ascii=False, indent=4))
    for table in response.json()['tables_result']:
        # The table's bounding box is the union of all cell boxes
        min_x, min_y, max_x, max_y = 10000, 10000, 0, 0
        cells = table['body']
        for cell in cells:
            for point in cell['cell_location']:
                min_x = min(min_x, point['x'])
                max_x = max(max_x, point['x'])
                min_y = min(min_y, point['y'])
                max_y = max(max_y, point['y'])
        # Pad the box slightly so the crop does not clip the table borders
        areas.append((min_x - 5, min_y - 5, max_x + 10, max_y + 10))
print(areas)
from PIL import Image

img = Image.open("temp.jpeg")
print(img.size)
# Crop the first detected table; the box is (left, upper, right, lower)
cropped = img.crop(areas[0])
cropped.save("table.jpeg")
The extracted table came out looking decent.
During implementation I hit a bug that took a long while to track down: Baidu's table recognition kept responding that the file format was wrong. The cause turned out to be opening one file and calling read() on it multiple times. The first f.read() returns the content and leaves the cursor at the end of the file, so a second f.read() returns nothing; calling f.seek(0) moves the cursor back to the start, after which f.read() returns the content again (see: "f.read() reads the file as empty").
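A minimal reproduction of the pitfall:
# Reading the same open file handle twice without rewinding
with open('temp.pdf', 'rb') as f:
    first = f.read()    # returns the whole file; the cursor is now at EOF
    second = f.read()   # returns b'' - nothing left past the cursor
    f.seek(0)           # rewind the cursor to the beginning
    third = f.read()    # returns the whole file again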
Overall, the current progress and results are acceptable, but two key problems remain:
The revolution is not yet won; keep at it!