
Loading PDF Documents into a Local Large Language Model on Windows, from Scratch (ollama + PDF)

1. Environment Setup

1.1 Installing ollama

Download the ollama installer from the official ollama website and install it; during installation, make sure the installation path is added to the PATH environment variable.

1.2 Loading a model with ollama

The ollama website lists the available models; running "ollama run <model name>" downloads one, for example ollama run qwen:7b. Once the download finishes you can chat with the model directly. Pick a model size that matches your machine; the example in this article runs on 16 GB of RAM. The embedding model used in section 3 (shaw/dmeta-embedding-zh) also has to be downloaded through ollama (ollama pull shaw/dmeta-embedding-zh) before the indexing script is run.

1.3 Setting up the Python environment, installing langchain, and downloading VS Code

2. PDF Document Processing

2.1 OCR of scanned PDFs, saving the text to Word

The document loaders used later fail on scanned PDFs, because those files contain only page images and no extractable text layer. The scans therefore have to be converted into recognisable text first; here the pages are OCR'd into a Word document, which section 2.2 converts back to PDF.

The code is below; before running it, install the required packages with pip install (paddleocr together with paddlepaddle, python-docx, PyMuPDF and Pillow):

from paddleocr import PaddleOCR
import os
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Cm, Pt
from PIL import Image, ImageEnhance, ImageFilter

# initialise the OCR model ('ch' also covers English; set use_gpu=False if no GPU is available)
ocr = PaddleOCR(use_gpu=True, lang='ch')

def enhance_and_process_image(image_path):
    """Enhance the image in place so the OCR step gets cleaner input."""
    try:
        with Image.open(image_path) as im:
            # image enhancement (tune the parameters as needed)
            enhancer = ImageEnhance.Contrast(im)
            im = enhancer.enhance(1.5)  # boost contrast
            #enhancer = ImageEnhance.Brightness(im)
            #im = enhancer.enhance(1.2)  # brighten
            im = im.filter(ImageFilter.MedianFilter(size=3))  # reduce noise
            im.save(image_path)
    except FileNotFoundError:
        print(f"文件未找到: {image_path}")
    except Exception as e:
        print(f"处理{image_path}时发生未知错误:{e}")

def img_deal(image_path):
    """Run OCR on one image and return a list of (bounding box, text) tuples."""
    try:
        result = ocr.ocr(image_path)
        if result:  # check result is not None or empty before iterating
            all_data = []
            for page_items in result:
                for region in page_items:
                    raw_bbox, (text, _) = region
                    all_data.append((raw_bbox, text))
            return all_data
        else:
            print(f"No text detected in image: {image_path}")
            return []  # return an empty list to avoid iterating over None
    except Exception as e:
        print(f"An error occurred processing image {image_path}: {e}")
        return []  # also return an empty list on error

def coord_to_indent(coord, scale_factor=1000):
    """Convert an OCR coordinate into a Word indent; 1 coordinate unit maps to 1/scale_factor cm."""
    x, _ = coord
    return Cm(x / scale_factor)  # only the x coordinate is used, for the left indent

def process_and_save_images_to_word(image_paths, output_word_path):
    doc = Document()
    temp_output_path = f"{output_word_path}_temp.docx"
    for image_path in image_paths:
        print(image_path)
        #enhance_and_process_image(image_path)  # optionally enhance the image first
        recognized_data = img_deal(image_path)
        if recognized_data:  # only write when something was recognised
            # write this image's OCR result to the Word document
            for bbox, text in recognized_data:
                left_indent = coord_to_indent(bbox[0])
                paragraph = doc.add_paragraph(style='Normal')
                paragraph.paragraph_format.left_indent = left_indent
                paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
                run = paragraph.add_run(text)
                run.font.size = Pt(12)
            doc.add_page_break()  # page break after every image
        # save after every image so progress is never lost
        doc.save(temp_output_path)

    # final save and clean-up of the temporary file
    doc.save(output_word_path)
    if os.path.exists(temp_output_path):
        os.remove(temp_output_path)
    print(f"Text with position saved to Word document at: {output_word_path}")

if __name__ == "__main__":
    pdf_file_path = 'F:/ai/output'
    image_folder_path = "F:/ai/output/images/"
    image_paths = sorted([os.path.join(image_folder_path, img)
                          for img in os.listdir(image_folder_path)
                          if img.endswith(('.png', '.jpg', '.jpeg'))])

    output_word_path = os.path.splitext(pdf_file_path)[0] + '.docx'
    process_and_save_images_to_word(image_paths, output_word_path)
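The script above assumes the pages of the scanned PDF have already been exported as image files under image_folder_path; that export step is not shown in the original. A minimal sketch of it using PyMuPDF (the same fitz package used in section 2.3) could look like the following; the file name scan.pdf is only a placeholder:

import os
import fitz  # PyMuPDF

def pdf_to_images(pdf_path, image_dir, zoom=3):
    """Render every page of a (scanned) PDF to a PNG file so it can be fed to the OCR step."""
    os.makedirs(image_dir, exist_ok=True)
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))  # zoom=3 is roughly 216 dpi
        pix.save(os.path.join(image_dir, f"page_{page_num + 1:04d}.png"))
    doc.close()

pdf_to_images("F:/ai/output/scan.pdf", "F:/ai/output/images/")  # scan.pdf is a placeholder name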

2.2 Converting the Word document to PDF

You can simply open the .docx file and save it as PDF, or convert it directly with code, as in the sketch below.
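One option for converting in code is the docx2pdf package, which automates Word over COM and therefore assumes Microsoft Word is installed on the Windows machine. A minimal sketch (the output PDF path is just an example):

from docx2pdf import convert  # pip install docx2pdf; requires Word on Windows

# convert the Word file produced in section 2.1 back into a searchable PDF
convert("F:/ai/output.docx", "F:/ai/output.pdf")
# convert() also accepts a folder and will convert every .docx inside it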

2.3 Splitting PDFs for parallel loading and indexing

Breaking a large file into several smaller ones lets the files be processed in parallel, which speeds up the indexing step in section 3. The code is as follows:

import os
import fitz  # PyMuPDF

def split_pdf(input_path, output_dir, max_pages=20):
    """
    Split every PDF under input_path into files of at most max_pages pages,
    keeping the original document name in the names of the split files.
    :param input_path: directory containing the input PDF files.
    :param output_dir: directory for the output PDF files.
    :param max_pages: maximum number of pages per output file, default 20.
    """
    for input_path_file in os.listdir(input_path):
        if not input_path_file.lower().endswith('.pdf'):
            continue  # skip non-PDF files
        file = os.path.join(input_path, input_path_file)
        base_name = os.path.splitext(input_path_file)[0]  # file name without extension
        # open the PDF
        doc = fitz.open(file)
        total_pages = doc.page_count
        # split the PDF
        for i in range(0, total_pages, max_pages):
            # last page of this batch, capped so it never exceeds the total page count
            end_page = min(i + max_pages, total_pages)
            # new PDF document holding the pages of this batch
            new_doc = fitz.Document()
            # copy the page range into the new document (to_page is inclusive)
            new_doc.insert_pdf(doc, from_page=i, to_page=end_page - 1)
            # output file name that keeps the original document name
            output_filename = f"{base_name}_split_{i+1}-{end_page}.pdf"
            output_path = os.path.join(output_dir, output_filename)
            new_doc.save(output_path)
            new_doc.close()
        doc.close()

# example usage
input_pdf_path = r'F:\ai\DB\文件\原文件2'
output_directory = r'F:\ai\DB\文件\文件2'
# make sure the output directory exists
os.makedirs(output_directory, exist_ok=True)
split_pdf(input_pdf_path, output_directory)

3. Embed and index the PDF files and save the vector store. The code is below; its advantage is that it can not only index and save new documents, it can also load a previously saved index and merge the two before saving:

import os
import time
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor

from langchain_community.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

def process_pdf(file_path):
    """Load one PDF file and return its split chunks."""
    loader = PDFPlumberLoader(file_path)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=128)
    return text_splitter.split_documents(data)

def batch_process_pdfs(paths, embeddings):
    """Process the PDF files in batches (at most 2 in parallel) and build a FAISS database."""
    all_docs = []
    failed_files = []   # paths of files that failed to process
    batch_size = 2      # number of files processed per batch
    print(len(paths))
    # process the files batch by batch
    for i in range(0, len(paths), batch_size):
        # file paths of the current batch
        current_batch_paths = paths[i:i + batch_size]
        try:
            with ProcessPoolExecutor(max_workers=batch_size) as executor:
                # process each PDF in parallel and capture per-file errors
                futures = {executor.submit(process_pdf, path): path for path in current_batch_paths}
                for future in futures:
                    path = futures[future]
                    try:
                        docs = future.result()
                        all_docs.extend(docs)
                    except Exception as e:
                        print(f"处理文件 {path} 时发生错误: {e}")
                        failed_files.append(path)
                        continue
                # make sure the whole batch has finished before moving on
                concurrent.futures.wait(futures)
        except Exception as e:
            print(f"处理过程中发生未知错误: {e}")
            return None
    # build the index only after every file has been handled
    if all_docs:
        print(f"处理完成,共处理 {len(all_docs)} 个文档。")
        db = FAISS.from_documents(all_docs, embeddings)
        print("成功构建数据库")
        db.save_local("faiss_index")  # intermediate save; save_local writes a folder, not a pickle
    else:
        print("没有成功处理的文档,无法继续构建数据库。")
        return None
    print(f"处理失败的文件有: {failed_files}")
    return db

def save_db_async(db, path):
    # despite the name, this saves synchronously
    try:
        if db is None:
            print("db is not initialized properly.")
        else:
            db.save_local(path)
    except Exception as e:
        print(f"异步保存数据库时发生错误: {e}")

def load_existing_db(file_path, embeddings):
    try:
        db = FAISS.load_local(file_path, embeddings=embeddings, allow_dangerous_deserialization=True)
        return db
    except FileNotFoundError:
        print("未找到现有数据库,将创建新的数据库。")
        return None
    except Exception as e:
        print(f"加载现有数据库时发生其他错误: {e}")
        return None

def merge_dbs(old_db_data, new_db):
    """Merge the existing database with the newly built one."""
    if old_db_data:
        # merge the new vectors into the old store
        old_db_data.merge_from(new_db)
        db = old_db_data
        print(db.docstore._dict)  # debug: show the merged document store
    else:
        db = new_db
    return db

def main():
    start_time = time.time()
    path = r"F:\ai\DB\文件\文件2"
    pdf_files = [os.path.join(path, filename) for filename in os.listdir(path) if filename.endswith(".pdf")]
    embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
    # load the existing database, if there is one
    path1 = r"F:\ai\DB\IMAGE"
    existing_db = load_existing_db(path1, embeddings)
    # batch-process all PDF files
    vector_db = batch_process_pdfs(pdf_files, embeddings)
    if vector_db is None:
        print("警告:处理文档时出现问题,无法继续。")
        return
    # merge the old and new data
    final_db = merge_dbs(existing_db, vector_db)
    save_db_async(final_db, r"F:\ai\DB\IMAGE")
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"处理完成,总耗时: {elapsed_time:.2f} 秒")
    os._exit(0)  # hard exit so lingering worker processes do not keep the script alive

if __name__ == '__main__':
    main()
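After the script finishes, a quick way to check that the index was written correctly is to load it back and run a similarity search against it. A minimal sketch (the query string is only an example):

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
db = FAISS.load_local(r"F:\ai\DB\IMAGE", embeddings=embeddings,
                      allow_dangerous_deserialization=True)
print(db.index.ntotal)  # number of vectors stored in the index
for doc in db.similarity_search("医疗器械可用性工程", k=3):  # example query
    print(doc.metadata.get("source", "未知"), doc.page_content[:80])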

4. Loading the Saved Index and Chatting

4.1 Single-session, multi-turn chat with conversation memory; the code is as follows:
 

# -*- coding: utf-8 -*-
import logging
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.callbacks.base import BaseCallbackManager

# logging configuration
logging.basicConfig(level=logging.INFO)

def load_file(path):
    embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
    db = FAISS.load_local(path, embeddings=embeddings,
                          allow_dangerous_deserialization=True)
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 24})
    callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    ollama_llm = Ollama(model="qwen:7b", temperature=0.2, top_p=0.9,
                        callback_manager=callback_manager)
    memory = ConversationBufferWindowMemory(memory_key="history", k=5,
                                            return_messages=True)
    # pass the memory object directly to the chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=ollama_llm,
        chain_type="stuff",
        retriever=retriever,
        memory=memory,
        verbose=True,
    )
    return qa_chain, retriever

def handle_user_query(qa, retriever, user_query: str):
    try:
        # the chain does its own retrieval; these docs are only used to display sources
        docs = retriever.invoke(user_query)
        answer = qa.invoke(user_query)
        # check the type of the answer before printing
        if isinstance(answer, dict) and 'result' in answer:
            #print(f"\n问题: {user_query}\n回答: {answer['result']}")
            # check whether the answer visibly quotes the retrieved documents
            referenced = any(doc.page_content in answer['result'] for doc in docs)
            if not referenced:
                print("回答未直接基于现有文件。")
            # show up to four unique sources of the retrieved documents
            print("\n参考文献或相关文档:")
            unique_sources = {doc.metadata.get('source', '未知') for doc in docs}
            for i, source in enumerate(unique_sources):
                if i == 4:
                    break
                print(f"来源: {source}")
        else:
            print("未预期的响应类型,请检查qa.invoke的返回值。")
        return answer
    except Exception as e:
        logging.error(f"Error handling user query: {e}")

if __name__ == "__main__":
    path = r"F:\ai\DB\IMAGE"
    qa_chain, retriever = load_file(path)
    if qa_chain and retriever:
        while True:
            user_query = input("请输入您的问题(输入'退出'以结束): ")
            if user_query.lower() == '退出':
                break
            handle_user_query(qa_chain, retriever, user_query)
    else:
        logging.warning("Initialization failed. QA Chain or Retriever not properly loaded.")
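PromptTemplate is imported in the script above but never used. If you want the model to answer strictly from the retrieved context, one possible way to wire a custom prompt into the chain is sketched below. This is only a sketch under assumptions: the qa_prompt name and its Chinese wording are illustrative, and it relies on the chain_type_kwargs argument of RetrievalQA.from_chain_type; it would replace the qa_chain construction inside load_file.

# a "stuff" chain prompt must expose the {context} and {question} variables
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "请仅根据以下资料回答问题,资料中没有的内容请直接说明不知道。\n"
        "资料:\n{context}\n\n问题:{question}\n回答:"
    ),
)
qa_chain = RetrievalQA.from_chain_type(
    llm=ollama_llm,
    chain_type="stuff",
    retriever=retriever,
    memory=memory,   # if the chain complains about the 'history' key, add {history} to the template as well
    verbose=True,
    chain_type_kwargs={"prompt": qa_prompt},
)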

4.2 Read the questions from a Word document, append auxiliary context to each one, run the Q&A, and write the questions and answers back into a new Word document. The code is as follows:

# -*- coding: utf-8 -*-
import logging
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain.callbacks.base import BaseCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from docx import Document

# logging configuration
logging.basicConfig(level=logging.INFO)

def read_docx_lines(file_path):
    """Read every non-empty line of text from a .docx file."""
    doc = Document(file_path)
    lines = []
    for para in doc.paragraphs:
        lines.extend(para.text.split('\n'))
    return [line.strip() for line in lines if line.strip()]

def save_answers_to_word(answers, output_path):
    """Save the questions and answers to a new Word document."""
    doc = Document()
    for question, answer in answers.items():
        print(answer)
        doc.add_paragraph(question)
        doc.add_paragraph(answer)
    doc.save(output_path)
    print(f"问答已保存至: {output_path}")

def process_doc_and_query_ai(file_path, qa_chain, retriever):
    """Read the .docx file line by line, append the extra context, and query the model."""
    lines = read_docx_lines(file_path)
    answers = {}
    # define the extra context once instead of modifying the list items inside the loop
    additional_info = (
        "请结合以下信息进行回答:结合医疗器械可用性工程注册审查指导原则;"
    )
    for line in lines:
        if line:  # skip empty lines
            # build a new query string from the original question plus the extra context
            full_query = f"{line} {additional_info}"
            answer = handle_user_query(qa_chain, retriever, full_query)
            if isinstance(answer, dict) and 'result' in answer:
                answers[line] = answer['result']
            #print(f"问题: {line}\n回答: {answer['result']}\n")
    return answers

def load_file(path):
    embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
    db = FAISS.load_local(path, embeddings=embeddings,
                          allow_dangerous_deserialization=True)
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 24})
    callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    ollama_llm = Ollama(model="qwen:7b", temperature=0.2, top_p=0.9,
                        callback_manager=callback_manager)
    memory = ConversationBufferWindowMemory(memory_key="history", k=5,
                                            return_messages=True)
    # pass the memory object directly to the chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=ollama_llm,
        chain_type="stuff",
        retriever=retriever,
        memory=memory,
        verbose=True,
    )
    return qa_chain, retriever

def handle_user_query(qa, retriever, user_query):
    """Answer one query and report the sources of the retrieved documents."""
    # retrieve the most relevant documents (only used to display sources)
    docs = retriever.invoke(user_query)
    # build the answer; the chain performs its own retrieval internally
    answer = qa.invoke(user_query)
    # check the type of the answer before printing
    if isinstance(answer, dict) and 'result' in answer:
        # check whether the answer visibly quotes the retrieved documents
        referenced = any(doc.page_content in answer['result'] for doc in docs)
        if not referenced:
            print("回答未直接基于现有文件。")
        # show up to four unique sources of the retrieved documents
        print("\n参考文献或相关文档:")
        unique_sources = {doc.metadata.get('source', '未知') for doc in docs}
        for i, source in enumerate(unique_sources):
            if i == 4:
                break
            print(f"来源: {source}")
    else:
        print("未预期的响应类型,请检查qa.invoke的返回值。")
    return answer

if __name__ == "__main__":
    path = r"F:\ai\DB\inventor\db_faiss"
    # initialise the chain and the retriever
    qa_chain, retriever = load_file(path)
    # path of the .docx file containing the questions
    docx_file_path = r"F:\ai\path\可用性模板文件.docx"
    processed_answers = process_doc_and_query_ai(docx_file_path, qa_chain, retriever)
    # save the results to a new Word document
    output_docx_path = r"F:\ai\path\output.docx"
    save_answers_to_word(processed_answers, output_docx_path)
