1. Environment setup
1.1 Ollama setup
Download the Ollama installer from the official Ollama website and install it. During installation, add the installation directory to the PATH environment variable.
1.2 Loading a model with Ollama
The Ollama site lists the available models. Running ollama run <model name> downloads the model, for example ollama run qwen:7b; once the download finishes you can chat with it. Choose a model size that matches your machine; the example in this article runs on 16 GB of RAM.
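Before moving on, it is worth checking from Python that the local Ollama service is reachable. The snippet below is only a minimal sanity-check sketch: it assumes Ollama is running with its default local endpoint and that, in addition to the chat model, the embedding model used later in this article has also been pulled (ollama pull shaw/dmeta-embedding-zh).

# Quick connectivity check; assumes the Ollama service is running locally
# and that "qwen:7b" and "shaw/dmeta-embedding-zh" have already been pulled.
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

llm = Ollama(model="qwen:7b")
print(llm.invoke("你好,请用一句话介绍你自己。"))

embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
vec = embeddings.embed_query("测试文本")
print(len(vec))  # dimension of the embedding vector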
1.3 Set up the Python environment, install LangChain, and download VSCode
2. PDF document processing
2.1 OCR of scanned PDFs, with the results saved to Word
Large-model pipelines cannot ingest scanned PDFs directly (loading one raises errors), so a scan must first be converted into text that can actually be indexed.
The code is below. Before running it, install the required packages with pip install (for this script: paddleocr, paddlepaddle, python-docx and Pillow). The script reads page images that have already been exported to a folder; a sketch for rendering a scanned PDF's pages to images follows the code block.
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Cm, Pt
import os
from paddleocr import PaddleOCR
from PIL import Image, ImageEnhance, ImageFilter

# Initialize the OCR model (Chinese model; use lang='en' for English documents)
ocr = PaddleOCR(use_gpu=True, lang='ch')

def enhance_and_process_image(image_path):
    """Enhance the image in place before OCR (contrast boost and noise reduction)."""
    try:
        with Image.open(image_path) as im:
            # Image enhancement (tune the parameters as needed)
            enhancer = ImageEnhance.Contrast(im)
            im = enhancer.enhance(1.5)  # increase contrast
            #enhancer = ImageEnhance.Brightness(im)
            #im = enhancer.enhance(1.2)  # brighten
            im = im.filter(ImageFilter.MedianFilter(size=3))  # reduce noise
            im.save(image_path)
    except FileNotFoundError:
        print(f"文件未找到: {image_path}")
    except Exception as e:
        print(f"处理{image_path}时发生未知错误:{e}")

def img_deal(image_path):
    """Run OCR on one image and return a list of (bounding box, text) tuples."""
    try:
        result = ocr.ocr(image_path)
        if result:  # check that result is not None or empty before iterating
            all_data = []
            for page_items in result:
                for region in page_items:
                    raw_bbox, (text, _) = region
                    all_data.append((raw_bbox, text))
            return all_data
        else:
            print(f"No text detected in image: {image_path}")
            return []  # return an empty list to avoid iterating over None
    except Exception as e:
        print(f"An error occurred processing image {image_path}: {e}")
        return []  # in case of an exception, also return an empty list

def coord_to_indent(coord, scale_factor=1000):
    """Convert an OCR x coordinate to a Word left indent; 1 coordinate unit = 1/scale_factor cm."""
    x, _ = coord
    return Cm(x / scale_factor)  # only the x coordinate is used, for the left indent

def process_and_save_images_to_word(image_paths, output_word_path):
    doc = Document()
    for image_path in image_paths:
        print(image_path)
        #enhance_and_process_image(image_path)  # optionally enhance the image first
        recognized_data = img_deal(image_path)
        if recognized_data:  # only write when something was recognized
            for bbox, text in recognized_data:
                left_indent = coord_to_indent(bbox[0])
                paragraph = doc.add_paragraph(style='Normal')
                paragraph.paragraph_format.left_indent = left_indent
                paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
                run = paragraph.add_run(text)
                run.font.size = Pt(12)
            doc.add_page_break()  # page break after each image
        # Save after every image so progress is not lost
        temp_output_path = f"{output_word_path}_temp.docx"
        doc.save(temp_output_path)

    # Final save: replace the temporary file with the target file
    os.replace(temp_output_path, output_word_path)
    print(f"Text with position saved to Word document at: {output_word_path}")

if __name__ == "__main__":
    pdf_file_path = 'F:/ai/output'
    image_folder_path = "F:/ai/output/images/"
    image_paths = sorted([os.path.join(image_folder_path, img)
                          for img in os.listdir(image_folder_path)
                          if img.endswith(('.png', '.jpg', '.jpeg'))])

    output_word_path = os.path.splitext(pdf_file_path)[0] + '.docx'
    process_and_save_images_to_word(image_paths, output_word_path)
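The script above assumes the scanned PDF has already been rendered page by page into images. A minimal sketch of that rendering step with PyMuPDF (fitz) might look like the following; the file names and paths are examples only.

import os
import fitz  # PyMuPDF

def pdf_to_images(pdf_path, image_folder, zoom=2.0):
    """Render every page of a (scanned) PDF to a PNG image; zoom > 1 raises the DPI for better OCR."""
    os.makedirs(image_folder, exist_ok=True)
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        pix.save(os.path.join(image_folder, f"page_{page_num + 1:04d}.png"))
    doc.close()

if __name__ == "__main__":
    # Example paths; adjust them to your own layout
    pdf_to_images("F:/ai/output/scanned.pdf", "F:/ai/output/images/")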

2.2 Converting a Word document to PDF
A .docx file can simply be opened and saved as PDF from Word, or the conversion can be done in code, as in the sketch below.
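A minimal sketch of the code route, assuming the docx2pdf package is installed (it drives Microsoft Word itself, so Word must be available on the machine); the paths are placeholders:

from docx2pdf import convert

# Convert a single file; convert() also accepts a folder to batch-convert every .docx in it
convert("F:/ai/output.docx", "F:/ai/output.pdf")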
2.3 Splitting PDFs so multiple files can be indexed in parallel
Breaking large files into smaller ones lets the indexing script in section 3 process several files at once (via a process pool), which speeds things up. The code is as follows:
import os
import fitz  # PyMuPDF

def split_pdf(input_path, output_dir, max_pages=20):
    """
    Split every PDF in a directory into parts of at most max_pages pages,
    keeping the original document name in the names of the parts.

    :param input_path: directory containing the input PDF files.
    :param output_dir: directory for the output PDF files.
    :param max_pages: maximum number of pages per output file, default 20.
    """
    for input_path_file in os.listdir(input_path):
        if not input_path_file.lower().endswith('.pdf'):
            continue  # skip anything that is not a PDF
        file = os.path.join(input_path, input_path_file)
        base_name = os.path.splitext(input_path_file)[0]  # file name without extension
        # Open the PDF file
        doc = fitz.open(file)
        total_pages = doc.page_count

        # Split the PDF
        for i in range(0, total_pages, max_pages):
            # Last page of the current batch; never beyond the total page count
            end_page = min(i + max_pages, total_pages)

            # New PDF document holding the pages of the current batch
            new_doc = fitz.Document()
            new_doc.insert_pdf(doc, from_page=i, to_page=end_page - 1)

            # Output file name that keeps the original document name
            output_filename = f"{base_name}_split_{i+1}-{end_page}.pdf"
            output_path = os.path.join(output_dir, output_filename)
            new_doc.save(output_path)
            new_doc.close()

        doc.close()

# Example usage
input_pdf_path = r'F:\ai\DB\文件\原文件2'
output_directory = r'F:\ai\DB\文件\文件2'

# Make sure the output directory exists
os.makedirs(output_directory, exist_ok=True)

split_pdf(input_pdf_path, output_directory)
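As an optional check (a small sketch that reuses the output folder from above), you can reopen each generated part and confirm it stays within the max_pages limit:

import os
import fitz  # PyMuPDF

output_directory = r'F:\ai\DB\文件\文件2'  # same folder as above
for name in sorted(os.listdir(output_directory)):
    if name.lower().endswith(".pdf"):
        part = fitz.open(os.path.join(output_directory, name))
        print(name, part.page_count)  # each part should have at most max_pages (20) pages
        part.close()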

3. Index the PDF files and save the result. "Training" here means embedding the documents into a FAISS vector store, not fine-tuning the model. The code is below; its advantage is that it can not only build and save an index for new documents, but also load a previously saved index and merge the two into one saved store:
import os
import time
from concurrent.futures import ProcessPoolExecutor

from langchain_community.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS


def process_pdf(file_path):
    """Load one PDF file and return its split document chunks."""
    loader = PDFPlumberLoader(file_path)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=128)
    return text_splitter.split_documents(data)


def batch_process_pdfs(paths, embeddings):
    """Process the PDF files in batches (at most 2 in parallel) and build a FAISS database."""
    all_docs = []
    failed_files = []  # paths of files that failed to process

    # Number of files handled per batch
    batch_size = 2
    print(len(paths))
    # Process the files batch by batch
    for i in range(0, len(paths), batch_size):
        # Paths of the current batch
        current_batch_paths = paths[i:i + batch_size]

        try:
            with ProcessPoolExecutor(max_workers=batch_size) as executor:
                # Process the PDFs of this batch in parallel and catch per-file errors
                futures = {executor.submit(process_pdf, path): path for path in current_batch_paths}

                for future in futures:
                    path = futures[future]
                    try:
                        docs = future.result()
                        all_docs.extend(docs)
                    except Exception as e:
                        print(f"处理文件 {path} 时发生错误: {e}")
                        failed_files.append(path)
                        continue

        except Exception as e:
            print(f"处理过程中发生未知错误: {e}")
            return None

    # Only build the database once every file has been handled
    if all_docs:
        print(f"处理完成,共处理 {len(all_docs)} 个文档。")
        db = FAISS.from_documents(all_docs, embeddings)
        print("成功构建数据库")
        db.save_local("faiss_index.pkl")  # intermediate save of the newly built index
    else:
        print("没有成功处理的文档,无法继续构建数据库。")
        return None
    print(f"处理失败的文件有: {failed_files}")
    return db


def save_db_async(db, path):
    try:
        if db is None:
            print("db is not initialized properly.")
        else:
            db.save_local(path)
    except Exception as e:
        print(f"保存数据库时发生错误: {e}")


def load_existing_db(file_path, embeddings):
    try:
        db = FAISS.load_local(file_path, embeddings=embeddings, allow_dangerous_deserialization=True)
        return db
    except FileNotFoundError:
        print("未找到现有数据库,将创建新的数据库。")
        return None
    except Exception as e:
        print(f"加载现有数据库时发生其他错误: {e}")
        return None


def merge_dbs(old_db_data, new_db):
    """Merge the old database with the newly built one."""
    if old_db_data:
        old_db_data.merge_from(new_db)
        db = old_db_data
        print(db.docstore._dict)
    else:
        db = new_db
    return db


def main():
    start_time = time.time()
    path = r"F:\ai\DB\文件\文件2"
    pdf_files = [os.path.join(path, filename) for filename in os.listdir(path) if filename.endswith(".pdf")]

    embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")

    # Load an existing database if there is one
    path1 = r"F:\ai\DB\IMAGE"
    existing_db = load_existing_db(path1, embeddings)

    # Process all PDF files in batches
    vector_db = batch_process_pdfs(pdf_files, embeddings)
    if vector_db is None:
        print("警告:处理文档时出现问题,无法继续。")
        return

    # Merge old and new data
    final_db = merge_dbs(existing_db, vector_db)
    save_db_async(final_db, r"F:\ai\DB\IMAGE")

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"处理完成,总耗时: {elapsed_time:.2f} 秒")
    os._exit(0)  # exit immediately without waiting for any remaining background threads


if __name__ == '__main__':
    main()
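After the script finishes, a quick way to confirm that the merged index was written correctly is to reload it and look at its size. This is only a sketch; it reuses the save path and embedding model from the script above.

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
db = FAISS.load_local(r"F:\ai\DB\IMAGE", embeddings=embeddings,
                      allow_dangerous_deserialization=True)
print("Vectors in the index:", db.index.ntotal)   # raw FAISS vector count
print(db.similarity_search("测试查询", k=2))        # quick retrieval smoke test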

4. Load the saved vector index and chat with it
4.1 Interactive multi-turn dialogue in a single session, with a windowed conversation memory; the code is as follows:
# -*- coding: utf-8 -*-
import logging
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackManager
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# Configure logging
logging.basicConfig(level=logging.INFO)


def load_file(path):
    embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
    db = FAISS.load_local(path, embeddings=embeddings,
                          allow_dangerous_deserialization=True)
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 24})

    # Stream the model output to stdout as it is generated
    callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    ollama_llm = Ollama(model="qwen:7b", temperature=0.2, top_p=0.9,
                        callback_manager=callback_manager)

    # Keep the last 5 exchanges as conversation memory
    memory = ConversationBufferWindowMemory(memory_key="history", k=5,
                                            return_messages=True)

    # Pass the memory directly into the chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=ollama_llm,
        chain_type="stuff",
        retriever=retriever,
        memory=memory,
        verbose=True,
    )

    return qa_chain, retriever


def handle_user_query(qa, retriever, user_query):
    try:
        # The chain performs its own retrieval; docs are fetched separately only to show the sources
        docs = retriever.invoke(user_query)
        answer = qa.invoke(user_query)

        # Check the answer type before printing
        if isinstance(answer, dict) and 'result' in answer:
            # Does the answer visibly quote the retrieved documents?
            referenced = any(doc.page_content in answer['result'] for doc in docs)
            if not referenced:
                print("回答未直接基于现有文件。")

            # Show up to four distinct sources of the retrieved documents
            print("\n参考文献或相关文档:")
            unique_sources = set()
            for doc in docs:
                unique_sources.add(doc.metadata.get('source', '未知'))
            for i, source in enumerate(unique_sources):
                if i == 4:
                    break
                print(f"来源: {source}")
        else:
            print("未预期的响应类型,请检查qa.invoke的返回值。")
        return answer

    except Exception as e:
        logging.error(f"Error handling user query: {e}")


if __name__ == "__main__":
    path = r"F:\ai\DB\IMAGE"
    qa_chain, retriever = load_file(path)
    if qa_chain and retriever:
        while True:
            user_query = input("请输入您的问题(输入'退出'以结束): ")
            if user_query.lower() == '退出':
                break
            handle_user_query(qa_chain, retriever, user_query)
    else:
        logging.warning("Initialization failed. QA Chain or Retriever not properly loaded.")

4.2 Read questions from a Word document, append auxiliary context to each, run the Q&A question by question, and write both questions and answers back into a Word document; the code is as follows:
# -*- coding: utf-8 -*-
import logging
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain.callbacks.base import BaseCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS

from docx import Document

# Configure logging
logging.basicConfig(level=logging.INFO)


def read_docx_lines(file_path):
    """Read every non-empty line of text from a .docx file."""
    doc = Document(file_path)
    lines = []
    for para in doc.paragraphs:
        lines.extend(para.text.split('\n'))
    return [line.strip() for line in lines if line.strip()]


def save_answers_to_word(answers, output_path):
    """Save the questions and answers to a new Word document."""
    doc = Document()
    for question, answer in answers.items():
        print(answer)
        doc.add_paragraph(question)
        doc.add_paragraph(answer)
    doc.save(output_path)
    print(f"问答已保存至: {output_path}")


def process_doc_and_query_ai(file_path, qa_chain, retriever):
    """Read the .docx file and query the model line by line, with extra context appended."""
    lines = read_docx_lines(file_path)
    answers = {}
    # Define the auxiliary context once instead of modifying the list items inside the loop
    additional_info = (
        "请结合以下信息进行回答:结合医疗器械可用性工程注册审查指导原则;"
    )

    for line in lines:
        if line:  # skip empty lines
            # Build a new query string from the original question plus the auxiliary context
            full_query = f"{line} {additional_info}"
            answer = handle_user_query(qa_chain, retriever, full_query)
            if isinstance(answer, dict) and 'result' in answer:
                answers[line] = answer['result']
    return answers


def load_file(path):
    embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
    db = FAISS.load_local(path, embeddings=embeddings,
                          allow_dangerous_deserialization=True)
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 24})

    callback_manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    ollama_llm = Ollama(model="qwen:7b", temperature=0.2, top_p=0.9,
                        callback_manager=callback_manager)

    memory = ConversationBufferWindowMemory(memory_key="history", k=5,
                                            return_messages=True)

    # Pass the memory directly into the chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=ollama_llm,
        chain_type="stuff",
        retriever=retriever,
        memory=memory,
        verbose=True,
    )

    return qa_chain, retriever


def handle_user_query(qa, retriever, user_query):
    """Answer one query and print the sources of the retrieved documents."""
    # The chain performs its own retrieval; docs are fetched separately only to show the sources
    docs = retriever.invoke(user_query)

    # Build the answer
    answer = qa.invoke(user_query)

    # Check the answer type before printing
    if isinstance(answer, dict) and 'result' in answer:
        # Does the answer visibly quote the retrieved documents?
        referenced = any(doc.page_content in answer['result'] for doc in docs)
        if not referenced:
            print("回答未直接基于现有文件。")

        # Show up to four distinct sources of the retrieved documents
        print("\n参考文献或相关文档:")
        unique_sources = set()
        for doc in docs:
            unique_sources.add(doc.metadata.get('source', '未知'))
        for i, source in enumerate(unique_sources):
            if i == 4:
                break
            print(f"来源: {source}")
    else:
        print("未预期的响应类型,请检查qa.invoke的返回值。")
    return answer


if __name__ == "__main__":
    path = r"F:\ai\DB\inventor\db_faiss"
    # Initialize resources
    qa_chain, retriever = load_file(path)
    # Path of the .docx file containing the questions
    docx_file_path = r"F:\ai\path\可用性模板文件.docx"
    processed_answers = process_doc_and_query_ai(docx_file_path, qa_chain, retriever)

    # Save the results to a new Word document
    output_docx_path = r"F:\ai\path\output.docx"
    save_answers_to_word(processed_answers, output_docx_path)
