当前位置:   article > 正文

langchain自定义vllm chat model实现知识库RAG快速推理_langchain vllm

langchain vllm

原本方法:继承 `langchain.llms.base` 中的 `LLM` 类,实现一个自定义的 CustomLLM。

BaseLLM官方文档

我使用的是Qwen2模型,原本改写的模型:

  1. class QwenLM(LLM):
  2. # 基于本地 Qwen 自定义 LLM 类
  3. tokenizer : AutoTokenizer = None
  4. model: AutoModelForCausalLM = None
  5. def __init__(self, model_path :str):
  6. # model_path: Qwen 模型路径
  7. # 从本地初始化模型
  8. super().__init__()
  9. print("正在从本地加载模型...")
  10. model_dir = ''
  11. self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
  12. self.model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True).eval()
  13. # Specify hyperparameters for generation
  14. self.model.generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
  15. print("完成本地模型的加载")
  16. def history_to_messages(self,history):
  17. messages = [{'role': 'system', 'content': 'xxx'}]
  18. for h in history:
  19. messages.append({'role': 'user', 'content': h[0]})
  20. messages.append({'role': 'assistant', 'content': h[1]})
  21. return messages
  22. def model_chat(self,model, tokenizer, prompt, history=None):
  23. model_name_or_path = 'Qwen2'
  24. # Assume other models have chat method
  25. if "Qwen2" not in model_name_or_path:
  26. return model.chat(tokenizer, prompt, history=history)
  27. else:
  28. if history is None:
  29. history = []
  30. else:
  31. history = copy.deepcopy(history)
  32. messages = self.history_to_messages(history)
  33. messages.append({'role': 'user', 'content': prompt})
  34. text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  35. model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
  36. generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
  37. generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
  38. response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
  39. history.append((prompt, response))
  40. return response, history
  41. def _call(self, prompt : str, stop: Optional[List[str]] = None,
  42. run_manager: Optional[CallbackManagerForLLMRun] = None,
  43. **kwargs: Any):
  44. # 重写调用函数
  45. # response, history = self.model.chat(self.tokenizer, prompt , history=[])
  46. response,history = self.model_chat(self.model,self.tokenizer,prompt)
  47. return response
  48. @property
  49. def _llm_type(self) -> str:
  50. return "QwenLM"

但是发现用这种方式推理时,每次运行都要重新加载模型和 tokenizer,非常耗时。所以想到先用 vLLM 把模型部署成服务,再通过接口调用进行推理,这样能大幅提升推理效率。为此我改写并定义了 VLLMChatModel,如下:

  1. class VLLMChatModel(LLM):
  2. # 基于vLLM的自定义LLM类
  3. api_client: OpenAI = None
  4. def __init__(self, api_base_url: str):
  5. super().__init__()
  6. self.api_client = OpenAI(api_key="EMPTY", base_url=api_base_url)
  7. def history_to_messages(self, history):
  8. messages = [{'role': 'system', 'content': ''}] #system prompt自行补充
  9. for h in history:
  10. messages.append({'role': 'user', 'content': h[0]})
  11. messages.append({'role': 'assistant', 'content': h[1]})
  12. return messages
  13. def model_chat(self, prompt, history=None):
  14. if history is None:
  15. history = []
  16. messages = self.history_to_messages(history)
  17. messages.append({'role': 'user', 'content': prompt})
  18. # 调用vLLM API
  19. response = self.api_client.chat.completions.create(
  20. model="Qwen2-7B-Instruct_2",
  21. messages=messages,
  22. )
  23. generated_text = response.choices[0].message.content
  24. history.append((prompt, generated_text))
  25. return generated_text, history
  26. def _call(self, prompt: str, stop: Optional[List[str]] = None,
  27. run_manager: Optional[CallbackManagerForLLMRun] = None,
  28. **kwargs: Any):
  29. # 重写调用函数
  30. response, history = self.model_chat(prompt)
  31. return response
  32. @property
  33. def _llm_type(self) -> str:
  34. return "VLLMChatModel"

vllm启动的代码如下 :

  1. export CUDA_VISIBLE_DEVICES=4,5,6,7
  2. python -m vllm.entrypoints.openai.api_server \
  3. --served-model-name Qwen2-7B-Instruct_2 \ #这个对应定义函数中的model=""
  4. --model '' \ #Qwen2模型的local path
  5. --gpu-memory-utilization 0.9 \
  6. --tensor-parallel-size 4 \ #并行的卡数
  7. --port 8002 \
  8. --dtype float

调用定义好的VLLMChatModel:

  1. llm = VLLMChatModel(api_base_url="http://localhost:8002/v1")
  2. # llm = QwenLM(model_path = "") # the QwenLM class defined earlier can be used here instead

这样之后就可以实现在langchain框架下用自定义的vllm chat model来实现高效率的推理了

完整项目:我实现的是一个用langchain框架结合知识库RAG然后用Qwen2-7b-Instruct模型推理的项目。

langchain框架结合知识库RAG框架代码参考:

https://blog.csdn.net/FL1623863129/article/details/137900683?spm=1001.2014.3001.5506

完整代码:
 

  1. import os
  2. os.environ['CUDA_VISIBLE_DEVICES'] = '4,5,6,7'
  3. from langchain.document_loaders import UnstructuredFileLoader
  4. from langchain.document_loaders import UnstructuredMarkdownLoader
  5. from langchain_community.document_loaders.csv_loader import CSVLoader
  6. from langchain.text_splitter import RecursiveCharacterTextSplitter
  7. from langchain.vectorstores import Chroma
  8. from langchain.embeddings.huggingface import HuggingFaceEmbeddings
  9. from langchain.prompts import PromptTemplate
  10. from langchain.chains import RetrievalQA
  11. from tqdm import tqdm
  12. import os
  13. from langchain.llms.base import LLM
  14. from typing import Any, List, Optional
  15. from langchain.callbacks.manager import CallbackManagerForLLMRun
  16. from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
  17. from openai import OpenAI
  18. import copy
  19. class VLLMChatModel(LLM):
  20. # 基于vLLM的自定义LLM类
  21. api_client: OpenAI = None
  22. def __init__(self, api_base_url: str):
  23. super().__init__()
  24. self.api_client = OpenAI(api_key="EMPTY", base_url=api_base_url)
  25. def history_to_messages(self, history):
  26. messages = [{'role': 'system', 'content': ''}] #system prompt自行补充
  27. for h in history:
  28. messages.append({'role': 'user', 'content': h[0]})
  29. messages.append({'role': 'assistant', 'content': h[1]})
  30. return messages
  31. def model_chat(self, prompt, history=None):
  32. if history is None:
  33. history = []
  34. messages = self.history_to_messages(history)
  35. messages.append({'role': 'user', 'content': prompt})
  36. # 调用vLLM API
  37. response = self.api_client.chat.completions.create(
  38. model="Qwen2-7B-Instruct_2",
  39. messages=messages,
  40. )
  41. generated_text = response.choices[0].message.content
  42. history.append((prompt, generated_text))
  43. return generated_text, history
  44. def _call(self, prompt: str, stop: Optional[List[str]] = None,
  45. run_manager: Optional[CallbackManagerForLLMRun] = None,
  46. **kwargs: Any):
  47. # 重写调用函数
  48. response, history = self.model_chat(prompt)
  49. return response
  50. @property
  51. def _llm_type(self) -> str:
  52. return "VLLMChatModel"
  53. class QwenLM(LLM):
  54. # 基于本地 Qwen 自定义 LLM 类
  55. tokenizer : AutoTokenizer = None
  56. model: AutoModelForCausalLM = None
  57. def __init__(self, model_path :str):
  58. # model_path: Qwen 模型路径
  59. # 从本地初始化模型
  60. super().__init__()
  61. print("正在从本地加载模型...")
  62. model_dir = '' #Qwen2模型的local path
  63. self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
  64. self.model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True).eval()
  65. # Specify hyperparameters for generation
  66. self.model.generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
  67. print("完成本地模型的加载")
  68. def history_to_messages(self,history):
  69. messages = [{'role': 'system', 'content': ''}] #system prompt自行补充
  70. for h in history:
  71. messages.append({'role': 'user', 'content': h[0]})
  72. messages.append({'role': 'assistant', 'content': h[1]})
  73. return messages
  74. def model_chat(self,model, tokenizer, prompt, history=None):
  75. model_name_or_path = 'Qwen2'
  76. # Assume other models have chat method
  77. if "Qwen2" not in model_name_or_path:
  78. return model.chat(tokenizer, prompt, history=history)
  79. else:
  80. if history is None:
  81. history = []
  82. else:
  83. history = copy.deepcopy(history)
  84. messages = self.history_to_messages(history)
  85. messages.append({'role': 'user', 'content': prompt})
  86. text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  87. model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
  88. generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
  89. generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
  90. response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
  91. history.append((prompt, response))
  92. return response, history
  93. def _call(self, prompt : str, stop: Optional[List[str]] = None,
  94. run_manager: Optional[CallbackManagerForLLMRun] = None,
  95. **kwargs: Any):
  96. # 重写调用函数
  97. # response, history = self.model.chat(self.tokenizer, prompt , history=[])
  98. response,history = self.model_chat(self.model,self.tokenizer,prompt)
  99. return response
  100. @property
  101. def _llm_type(self) -> str:
  102. return "QwenLM"
  103. # 获取文件路径函数
  104. def get_files(dir_path):
  105. # args:dir_path,目标文件夹路径
  106. file_list = []
  107. for filepath, dirnames, filenames in os.walk(dir_path):
  108. # os.walk 函数将递归遍历指定文件夹
  109. for filename in filenames:
  110. # 通过后缀名判断文件类型是否满足要求
  111. if filename.endswith(".md"):
  112. # 如果满足要求,将其绝对路径加入到结果列表
  113. file_list.append(os.path.join(filepath, filename))
  114. elif filename.endswith(".txt"):
  115. file_list.append(os.path.join(filepath, filename))
  116. elif filename.endswith(".csv"):
  117. file_list.append(os.path.join(filepath, filename))
  118. return file_list
  119. # 加载文件函数
  120. def get_text(dir_path):
  121. # args:dir_path,目标文件夹路径
  122. # 首先调用上文定义的函数得到目标文件路径列表
  123. file_lst = get_files(dir_path)
  124. # docs 存放加载之后的纯文本对象
  125. docs = []
  126. # 遍历所有目标文件
  127. for one_file in tqdm(file_lst):
  128. file_type = one_file.split('.')[-1]
  129. if file_type == 'md':
  130. loader = UnstructuredMarkdownLoader(one_file)
  131. elif file_type == 'txt':
  132. loader = UnstructuredFileLoader(one_file)
  133. elif file_type == 'csv':
  134. loader = CSVLoader(one_file)
  135. else:
  136. # 如果是不符合条件的文件,直接跳过
  137. continue
  138. docs.extend(loader.load())
  139. return docs
  140. # 目标文件夹
  141. # tar_dir = [
  142. # "/root/autodl-tmp/qwen",
  143. # "/root/autodl-tmp/Qwen",
  144. # ]
  145. #构建向量数据库并持久化到磁盘上
  146. def generate_vectordb():
  147. tar_dir = ["/raid/home/specter/enhance_llm/qa_bot/hbv_data"]
  148. # 加载目标文件
  149. docs = []
  150. for dir_path in tar_dir:
  151. docs.extend(get_text(dir_path))
  152. # 对文本进行分块
  153. text_splitter = RecursiveCharacterTextSplitter(
  154. chunk_size=500, chunk_overlap=150)
  155. split_docs = text_splitter.split_documents(docs)
  156. # 加载开源词向量模型
  157. embeddings = HuggingFaceEmbeddings(model_name="/raid/home/specter/enhance_llm/qa_bot/embedding_model")
  158. # 构建向量数据库
  159. # 定义持久化路径
  160. persist_directory = '/raid/home/specter/enhance_llm/qa_bot/data_base/vector_db/chroma'
  161. # 加载数据库
  162. vectordb = Chroma.from_documents(
  163. documents=split_docs,
  164. embedding=embeddings,
  165. persist_directory=persist_directory # 允许我们将persist_directory目录保存到磁盘上
  166. )
  167. # 将加载的向量数据库持久化到磁盘上
  168. vectordb.persist()
  169. def decode_2(user_description):
  170. #构建向量数据库并持久化到磁盘上
  171. generate_vectordb()
  172. # 加载开源词向量模型
  173. embeddings = HuggingFaceEmbeddings(model_name="/raid/home/specter/enhance_llm/qa_bot/embedding_model")
  174. # 向量数据库持久化路径
  175. persist_directory = '/raid/home/specter/enhance_llm/qa_bot/data_base/vector_db/chroma'
  176. # 加载数据库
  177. vectordb = Chroma(
  178. persist_directory=persist_directory,
  179. embedding_function=embeddings
  180. )
  181. llm = VLLMChatModel(api_base_url="http://localhost:8002/v1")
  182. # llm = QwenLM(model_path = "") #loacl path自行补充
  183. # 我们所构造的 Prompt 模板
  184. template = """使用以下上下文来回答最后的问题。如果你不知道答案,就说你不知道,不要试图编造答案。尽量使答案简明扼要。总是在回答的最后说“谢谢你的提问!”。
  185. {context}
  186. 问题: {question}
  187. 有用的回答:"""
  188. # 调用 LangChain 的方法来实例化一个 Template 对象,该对象包含了 context 和 question 两个变量,在实际调用时,这两个变量会被检索到的文档片段和用户提问填充
  189. QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context","question"],template=template)
  190. qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectordb.as_retriever(),return_source_documents=True,chain_type_kwargs={"prompt":QA_CHAIN_PROMPT})
  191. question = user_description
  192. result = qa_chain({"query": question})
  193. print("检索问答链回答 question 的结果:")
  194. print(result["result"])
  195. return result["result"]
  196. # # 仅 LLM 回答效果
  197. # result_2 = llm(question)
  198. # print("大模型回答 question 的结果:")
  199. # print(result_2)
  200. if __name__ =='__main__':
  201. decode_2('') #补充user_description内容
声明:本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号