Original approach: subclass the LLM class from langchain.llms.base and implement a CustomLLM.
I am using a Qwen2 model, and the custom class I originally wrote is as follows:
- class QwenLM(LLM):
-     # Custom LLM class backed by a local Qwen model
-     tokenizer: AutoTokenizer = None
-     model: AutoModelForCausalLM = None
-
-     def __init__(self, model_path: str):
-         # model_path: path to the Qwen model
-         # initialize the model from local files
-         super().__init__()
-         print("Loading the model from local files...")
-         model_dir = model_path  # local path to the Qwen2 model
-         self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
-         self.model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True).eval()
-         # Specify hyperparameters for generation (generation length, top_p, etc. can be customized here)
-         self.model.generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True)
-         print("Finished loading the local model")
-
-     def history_to_messages(self, history):
-         messages = [{'role': 'system', 'content': 'xxx'}]  # fill in your own system prompt
-         for h in history:
-             messages.append({'role': 'user', 'content': h[0]})
-             messages.append({'role': 'assistant', 'content': h[1]})
-         return messages
-
-     def model_chat(self, model, tokenizer, prompt, history=None):
-         model_name_or_path = 'Qwen2'
-
-         # assume non-Qwen2 models expose a .chat() method
-         if "Qwen2" not in model_name_or_path:
-             return model.chat(tokenizer, prompt, history=history)
-         else:
-             if history is None:
-                 history = []
-             else:
-                 history = copy.deepcopy(history)
-
-             messages = self.history_to_messages(history)
-             messages.append({'role': 'user', 'content': prompt})
-             text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-             model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
-             generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
-             generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
-             response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-             history.append((prompt, response))
-             return response, history
-
-     def _call(self, prompt: str, stop: Optional[List[str]] = None,
-               run_manager: Optional[CallbackManagerForLLMRun] = None,
-               **kwargs: Any):
-         # override the call method used by LangChain
-         # response, history = self.model.chat(self.tokenizer, prompt, history=[])
-         response, history = self.model_chat(self.model, self.tokenizer, prompt)
-         return response
-
-     @property
-     def _llm_type(self) -> str:
-         return "QwenLM"
However, with this setup the model and tokenizer have to be loaded every time inference is run, which is very time-consuming. So the idea is to first deploy the model with vLLM and then run inference against the served model, which greatly improves inference efficiency. I therefore rewrote the class and defined a VLLMChatModel as follows:
- class VLLMChatModel(LLM):
-     # Custom LLM class backed by a vLLM server (OpenAI-compatible API)
-     api_client: OpenAI = None
-
-     def __init__(self, api_base_url: str):
-         super().__init__()
-         self.api_client = OpenAI(api_key="EMPTY", base_url=api_base_url)
-
-     def history_to_messages(self, history):
-         messages = [{'role': 'system', 'content': ''}]  # fill in your own system prompt
-         for h in history:
-             messages.append({'role': 'user', 'content': h[0]})
-             messages.append({'role': 'assistant', 'content': h[1]})
-         return messages
-
-     def model_chat(self, prompt, history=None):
-         if history is None:
-             history = []
-
-         messages = self.history_to_messages(history)
-         messages.append({'role': 'user', 'content': prompt})
-
-         # call the vLLM OpenAI-compatible API
-         response = self.api_client.chat.completions.create(
-             model="Qwen2-7B-Instruct_2",
-             messages=messages,
-         )
-
-         generated_text = response.choices[0].message.content
-         history.append((prompt, generated_text))
-         return generated_text, history
-
-     def _call(self, prompt: str, stop: Optional[List[str]] = None,
-               run_manager: Optional[CallbackManagerForLLMRun] = None,
-               **kwargs: Any):
-         # override the call method used by LangChain
-         response, history = self.model_chat(prompt)
-         return response
-
-     @property
-     def _llm_type(self) -> str:
-         return "VLLMChatModel"
The vLLM launch script is as follows:
- export CUDA_VISIBLE_DEVICES=4,5,6,7
-
- # --served-model-name must match the model name passed in VLLMChatModel
- # --model is the local path to the Qwen2 model
- # --tensor-parallel-size is the number of GPUs used for tensor parallelism
- python -m vllm.entrypoints.openai.api_server \
-     --served-model-name Qwen2-7B-Instruct_2 \
-     --model '' \
-     --gpu-memory-utilization 0.9 \
-     --tensor-parallel-size 4 \
-     --port 8002 \
-     --dtype float
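Once the server is up, it is worth confirming that the OpenAI-compatible endpoint responds before wiring it into LangChain. A minimal sketch using the same OpenAI client as in VLLMChatModel (the test question is just a placeholder):
- from openai import OpenAI
-
- client = OpenAI(api_key="EMPTY", base_url="http://localhost:8002/v1")
- resp = client.chat.completions.create(
-     model="Qwen2-7B-Instruct_2",  # must match --served-model-name
-     messages=[{"role": "user", "content": "你好"}],  # placeholder test question
- )
- print(resp.choices[0].message.content)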
Calling the defined VLLMChatModel:
- llm = VLLMChatModel(api_base_url="http://localhost:8002/v1")
-
- # llm = QwenLM(model_path = "")  # the QwenLM defined earlier would also work here
With this in place, you can use a custom vLLM-backed chat model inside the LangChain framework and get much more efficient inference.
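As a quick sanity check, the wrapped model can also be called directly, outside of any chain (a minimal sketch; the question is a placeholder):
- llm = VLLMChatModel(api_base_url="http://localhost:8002/v1")
- print(llm("请做一个自我介绍"))  # goes through _call -> model_chat and hits the vLLM server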
Full project: what I built is a project that combines the LangChain framework with a knowledge-base RAG setup and uses the Qwen2-7B-Instruct model for inference.
Reference code for the LangChain + knowledge-base RAG pipeline:
- import os
- os.environ['CUDA_VISIBLE_DEVICES'] = '4,5,6,7'
-
- from langchain.document_loaders import UnstructuredFileLoader
- from langchain.document_loaders import UnstructuredMarkdownLoader
- from langchain_community.document_loaders.csv_loader import CSVLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.vectorstores import Chroma
- from langchain.embeddings.huggingface import HuggingFaceEmbeddings
- from langchain.prompts import PromptTemplate
- from langchain.chains import RetrievalQA
- from tqdm import tqdm
- from langchain.llms.base import LLM
- from typing import Any, List, Optional
- from langchain.callbacks.manager import CallbackManagerForLLMRun
- from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
-
- from openai import OpenAI
-
- import copy
-
-
- class VLLMChatModel(LLM):
-     # Custom LLM class backed by a vLLM server (OpenAI-compatible API)
-     api_client: OpenAI = None
-
-     def __init__(self, api_base_url: str):
-         super().__init__()
-         self.api_client = OpenAI(api_key="EMPTY", base_url=api_base_url)
-
-     def history_to_messages(self, history):
-         messages = [{'role': 'system', 'content': ''}]  # fill in your own system prompt
-         for h in history:
-             messages.append({'role': 'user', 'content': h[0]})
-             messages.append({'role': 'assistant', 'content': h[1]})
-         return messages
-
-     def model_chat(self, prompt, history=None):
-         if history is None:
-             history = []
-
-         messages = self.history_to_messages(history)
-         messages.append({'role': 'user', 'content': prompt})
-
-         # call the vLLM OpenAI-compatible API
-         response = self.api_client.chat.completions.create(
-             model="Qwen2-7B-Instruct_2",
-             messages=messages,
-         )
-
-         generated_text = response.choices[0].message.content
-         history.append((prompt, generated_text))
-         return generated_text, history
-
-     def _call(self, prompt: str, stop: Optional[List[str]] = None,
-               run_manager: Optional[CallbackManagerForLLMRun] = None,
-               **kwargs: Any):
-         # override the call method used by LangChain
-         response, history = self.model_chat(prompt)
-         return response
-
-     @property
-     def _llm_type(self) -> str:
-         return "VLLMChatModel"
-
-
- class QwenLM(LLM):
-     # Custom LLM class backed by a local Qwen model
-     tokenizer: AutoTokenizer = None
-     model: AutoModelForCausalLM = None
-
-     def __init__(self, model_path: str):
-         # model_path: path to the Qwen model
-         # initialize the model from local files
-         super().__init__()
-         print("Loading the model from local files...")
-         model_dir = model_path  # local path to the Qwen2 model
-         self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
-         self.model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True).eval()
-         # Specify hyperparameters for generation (generation length, top_p, etc. can be customized here)
-         self.model.generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True)
-         print("Finished loading the local model")
-
-     def history_to_messages(self, history):
-         messages = [{'role': 'system', 'content': ''}]  # fill in your own system prompt
-         for h in history:
-             messages.append({'role': 'user', 'content': h[0]})
-             messages.append({'role': 'assistant', 'content': h[1]})
-         return messages
-
-     def model_chat(self, model, tokenizer, prompt, history=None):
-         model_name_or_path = 'Qwen2'
-
-         # assume non-Qwen2 models expose a .chat() method
-         if "Qwen2" not in model_name_or_path:
-             return model.chat(tokenizer, prompt, history=history)
-         else:
-             if history is None:
-                 history = []
-             else:
-                 history = copy.deepcopy(history)
-
-             messages = self.history_to_messages(history)
-             messages.append({'role': 'user', 'content': prompt})
-             text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-             model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
-             generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
-             generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
-             response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-             history.append((prompt, response))
-             return response, history
-
-     def _call(self, prompt: str, stop: Optional[List[str]] = None,
-               run_manager: Optional[CallbackManagerForLLMRun] = None,
-               **kwargs: Any):
-         # override the call method used by LangChain
-         # response, history = self.model.chat(self.tokenizer, prompt, history=[])
-         response, history = self.model_chat(self.model, self.tokenizer, prompt)
-         return response
-
-     @property
-     def _llm_type(self) -> str:
-         return "QwenLM"
-
-
- # helper: collect file paths
- def get_files(dir_path):
-     # args: dir_path, path of the target folder
-     file_list = []
-     for filepath, dirnames, filenames in os.walk(dir_path):
-         # os.walk recursively traverses the target folder
-         for filename in filenames:
-             # check the extension to decide whether the file type is wanted
-             if filename.endswith(".md"):
-                 # if it matches, add its absolute path to the result list
-                 file_list.append(os.path.join(filepath, filename))
-             elif filename.endswith(".txt"):
-                 file_list.append(os.path.join(filepath, filename))
-             elif filename.endswith(".csv"):
-                 file_list.append(os.path.join(filepath, filename))
-     return file_list
-
- # helper: load files into documents
- def get_text(dir_path):
-     # args: dir_path, path of the target folder
-     # first call the function defined above to get the list of target file paths
-     file_lst = get_files(dir_path)
-     # docs holds the loaded plain-text document objects
-     docs = []
-     # iterate over all target files
-     for one_file in tqdm(file_lst):
-         file_type = one_file.split('.')[-1]
-         if file_type == 'md':
-             loader = UnstructuredMarkdownLoader(one_file)
-         elif file_type == 'txt':
-             loader = UnstructuredFileLoader(one_file)
-         elif file_type == 'csv':
-             loader = CSVLoader(one_file)
-         else:
-             # skip files that do not match any supported type
-             continue
-         docs.extend(loader.load())
-     return docs
-
- # target folders
- # tar_dir = [
- #     "/root/autodl-tmp/qwen",
- #     "/root/autodl-tmp/Qwen",
- # ]
-
-
- # build the vector database and persist it to disk
- def generate_vectordb():
-     tar_dir = ["/raid/home/specter/enhance_llm/qa_bot/hbv_data"]
-
-     # load the target files
-     docs = []
-     for dir_path in tar_dir:
-         docs.extend(get_text(dir_path))
-
-     # split the text into chunks
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=500, chunk_overlap=150)
-     split_docs = text_splitter.split_documents(docs)
-
-     # load the open-source embedding model
-     embeddings = HuggingFaceEmbeddings(model_name="/raid/home/specter/enhance_llm/qa_bot/embedding_model")
-
-     # build the vector database
-     # define the persistence path
-     persist_directory = '/raid/home/specter/enhance_llm/qa_bot/data_base/vector_db/chroma'
-     # build the database
-     vectordb = Chroma.from_documents(
-         documents=split_docs,
-         embedding=embeddings,
-         persist_directory=persist_directory  # allows saving the database under persist_directory on disk
-     )
-     # persist the built vector database to disk
-     vectordb.persist()
-
-
-
- def decode_2(user_description):
-
-     # build the vector database and persist it to disk
-     generate_vectordb()
-
-     # load the open-source embedding model
-     embeddings = HuggingFaceEmbeddings(model_name="/raid/home/specter/enhance_llm/qa_bot/embedding_model")
-
-     # persistence path of the vector database
-     persist_directory = '/raid/home/specter/enhance_llm/qa_bot/data_base/vector_db/chroma'
-
-     # load the database
-     vectordb = Chroma(
-         persist_directory=persist_directory,
-         embedding_function=embeddings
-     )
-
-     llm = VLLMChatModel(api_base_url="http://localhost:8002/v1")
-
-     # llm = QwenLM(model_path = "")  # fill in the local path yourself
-
-     # Prompt template (kept in Chinese since the model answers Chinese questions): it tells the model to
-     # answer from the given context, admit when it does not know, keep the answer concise, and always end
-     # with "谢谢你的提问!" ("Thanks for your question!")
-     template = """使用以下上下文来回答最后的问题。如果你不知道答案,就说你不知道,不要试图编造答案。尽量使答案简明扼要。总是在回答的最后说“谢谢你的提问!”。
- {context}
- 问题: {question}
- 有用的回答:"""
-
-     # instantiate a LangChain PromptTemplate object with the two variables context and question;
-     # at call time they are filled with the retrieved document chunks and the user question
-     QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)
-
-     qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever(), return_source_documents=True, chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})
-
-     question = user_description
-
-     result = qa_chain({"query": question})
-     print("Answer from the retrieval QA chain:")
-     print(result["result"])
-
-     return result["result"]
-
-     # # answer with the LLM only (no retrieval), for comparison
-     # result_2 = llm(question)
-     # print("Answer from the bare LLM:")
-     # print(result_2)
-
-
- if __name__ == '__main__':
-     decode_2('')  # fill in the user_description content