
LLM Large Language Models (7): Deploying ChatGLM3-6B with an HTTP Server


Contents

Highlights

Deploy ChatGLM3-6B and expose an HTTP server

Download the embedding model bge-large-zh-v1.5

Example: Q&A via the HTTP API

The LLM told an awkward joke~


Highlights

Serving the LLM (for example, exposing it through an HTTP server) is what makes it possible to build your own applications on top of it.

Deploy ChatGLM3-6B and expose an HTTP server

Download the embedding model bge-large-zh-v1.5

The server needs this embedding model at startup (api_server.py loads it alongside the LLM). Download it from ModelScope:

https://www.modelscope.cn/models/Xorbits/bge-large-zh-v1.5/files
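
If you prefer to script the download rather than fetch files from the web page, a minimal sketch using the ModelScope SDK is shown below. This is my own addition, not part of the original article; the cache_dir is only an example path.

# Hypothetical download script (not in the original article).
# Assumes `pip install modelscope` has been run; adjust cache_dir to your setup.
from modelscope import snapshot_download

local_dir = snapshot_download(
    "Xorbits/bge-large-zh-v1.5",  # model id from the ModelScope page above
    cache_dir="D:\\github",       # example path; the snapshot lands in a subfolder
)
print(local_dir)  # use this path for EMBEDDING_PATH below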

# set LLM path (change to your own path)
MODEL_PATH = os.environ.get('MODEL_PATH', 'D:\\github\\chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)

# embedding model path (change to your own path)
EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', "D:\\github\\bge-large-zh-v1.5")

The full server follows the demo provided by the official ChatGLM repository:

openai_api_demo/api_server.py

import os
import time
import tiktoken
import torch
import uvicorn

from fastapi import FastAPI, HTTPException, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union
from loguru import logger
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModel
from utils import process_response, generate_chatglm3, generate_stream_chatglm3
from sentence_transformers import SentenceTransformer
from sse_starlette.sse import EventSourceResponse

# Set up limit request time
EventSourceResponse.DEFAULT_PING_INTERVAL = 1000

# set LLM path
MODEL_PATH = os.environ.get('MODEL_PATH', 'D:\\github\\chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)

# set Embedding Model path
EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', "D:\\github\\bge-large-zh-v1.5")


@asynccontextmanager
async def lifespan(app: FastAPI):
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class FunctionCallResponse(BaseModel):
    name: Optional[str] = None
    arguments: Optional[str] = None


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system", "function"]
    content: str = None
    name: Optional[str] = None
    function_call: Optional[FunctionCallResponse] = None


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None
    function_call: Optional[FunctionCallResponse] = None


## for Embedding
class EmbeddingRequest(BaseModel):
    input: List[str]
    model: str


class CompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    data: list
    model: str
    object: str
    usage: CompletionUsage


# for ChatCompletionRequest
class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.8
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    tools: Optional[Union[dict, List[dict]]] = None
    repetition_penalty: Optional[float] = 1.1


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length", "function_call"]


class ChatCompletionResponseStreamChoice(BaseModel):
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length", "function_call"]]
    index: int


class ChatCompletionResponse(BaseModel):
    model: str
    id: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    usage: Optional[UsageInfo] = None


@app.get("/health")
async def health() -> Response:
    """Health check."""
    return Response(status_code=200)


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def get_embeddings(request: EmbeddingRequest):
    embeddings = [embedding_model.encode(text) for text in request.input]
    embeddings = [embedding.tolist() for embedding in embeddings]

    def num_tokens_from_string(string: str) -> int:
        """
        Returns the number of tokens in a text string.
        use cl100k_base tokenizer
        """
        encoding = tiktoken.get_encoding('cl100k_base')
        num_tokens = len(encoding.encode(string))
        return num_tokens

    response = {
        "data": [
            {
                "object": "embedding",
                "embedding": embedding,
                "index": index
            }
            for index, embedding in enumerate(embeddings)
        ],
        "model": request.model,
        "object": "list",
        "usage": CompletionUsage(
            prompt_tokens=sum(len(text.split()) for text in request.input),
            completion_tokens=0,
            total_tokens=sum(num_tokens_from_string(text) for text in request.input),
        )
    }
    return response


@app.get("/v1/models", response_model=ModelList)
async def list_models():
    model_card = ModelCard(
        id="chatglm3-6b"
    )
    return ModelList(
        data=[model_card]
    )


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    if len(request.messages) < 1 or request.messages[-1].role == "assistant":
        raise HTTPException(status_code=400, detail="Invalid request")

    gen_params = dict(
        messages=request.messages,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens or 1024,
        echo=False,
        stream=request.stream,
        repetition_penalty=request.repetition_penalty,
        tools=request.tools,
    )
    logger.debug(f"==== request ====\n{gen_params}")

    if request.stream:
        # Use the stream mode to read the first few characters; if it is not a function call, stream the output directly
        predict_stream_generator = predict_stream(request.model, gen_params)
        output = next(predict_stream_generator)
        if not contains_custom_function(output):
            return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")

        # Obtain the result directly at one time and determine whether tools need to be called.
        logger.debug(f"First result output:\n{output}")

        function_call = None
        if output and request.tools:
            try:
                function_call = process_response(output, use_tool=True)
            except:
                logger.warning("Failed to parse tool call")

        # CallFunction
        if isinstance(function_call, dict):
            function_call = FunctionCallResponse(**function_call)

            """
            In this demo, we did not register any tools.
            You can use the tools that have been implemented in our `tools_using_demo` and implement your own streaming tool implementation here.
            Similar to the following method:
                function_args = json.loads(function_call.arguments)
                tool_response = dispatch_tool(tool_name: str, tool_params: dict)
            """
            tool_response = ""

            if not gen_params.get("messages"):
                gen_params["messages"] = []

            gen_params["messages"].append(ChatMessage(
                role="assistant",
                content=output,
            ))
            gen_params["messages"].append(ChatMessage(
                role="function",
                name=function_call.name,
                content=tool_response,
            ))

            # Streaming output of results after function calls
            generate = predict(request.model, gen_params)
            return EventSourceResponse(generate, media_type="text/event-stream")

        else:
            # Handled to avoid exceptions in the above parsing function process.
            generate = parse_output_text(request.model, output)
            return EventSourceResponse(generate, media_type="text/event-stream")

    # Here is the handling of stream = False
    response = generate_chatglm3(model, tokenizer, gen_params)

    # Remove the first newline character
    if response["text"].startswith("\n"):
        response["text"] = response["text"][1:]
    response["text"] = response["text"].strip()

    usage = UsageInfo()
    function_call, finish_reason = None, "stop"
    if request.tools:
        try:
            function_call = process_response(response["text"], use_tool=True)
        except:
            logger.warning("Failed to parse tool call, maybe the response is not a tool call or has been answered.")

    if isinstance(function_call, dict):
        finish_reason = "function_call"
        function_call = FunctionCallResponse(**function_call)

    message = ChatMessage(
        role="assistant",
        content=response["text"],
        function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
    )
    logger.debug(f"==== message ====\n{message}")

    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=message,
        finish_reason=finish_reason,
    )
    task_usage = UsageInfo.model_validate(response["usage"])
    for usage_key, usage_value in task_usage.model_dump().items():
        setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)

    return ChatCompletionResponse(
        model=request.model,
        id="",  # for open_source model, id is empty
        choices=[choice_data],
        object="chat.completion",
        usage=usage
    )


async def predict(model_id: str, params: dict):
    global model, tokenizer

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    previous_text = ""
    for new_response in generate_stream_chatglm3(model, tokenizer, params):
        decoded_unicode = new_response["text"]
        delta_text = decoded_unicode[len(previous_text):]
        previous_text = decoded_unicode

        finish_reason = new_response["finish_reason"]
        if len(delta_text) == 0 and finish_reason != "function_call":
            continue

        function_call = None
        if finish_reason == "function_call":
            try:
                function_call = process_response(decoded_unicode, use_tool=True)
            except:
                logger.warning(
                    "Failed to parse tool call, maybe the response is not a tool call or has been answered.")

        if isinstance(function_call, dict):
            function_call = FunctionCallResponse(**function_call)

        delta = DeltaMessage(
            content=delta_text,
            role="assistant",
            function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
        )
        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=delta,
            finish_reason=finish_reason
        )
        chunk = ChatCompletionResponse(
            model=model_id,
            id="",
            choices=[choice_data],
            object="chat.completion.chunk"
        )
        yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(
        model=model_id,
        id="",
        choices=[choice_data],
        object="chat.completion.chunk"
    )
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    yield '[DONE]'


def predict_stream(model_id, gen_params):
    """
    The function call is compatible with stream mode output.
    The first seven characters are determined.
    If not a function call, the stream output is directly generated.
    Otherwise, the complete character content of the function call is returned.
    :param model_id:
    :param gen_params:
    :return:
    """
    output = ""
    is_function_call = False
    has_send_first_chunk = False
    for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
        decoded_unicode = new_response["text"]
        delta_text = decoded_unicode[len(output):]
        output = decoded_unicode

        # When it is not a function call and the character length is > 7,
        # try to judge whether it is a function call according to the special function prefix
        if not is_function_call and len(output) > 7:

            # Determine whether a function is called
            is_function_call = contains_custom_function(output)
            if is_function_call:
                continue

            # Non-function call, direct stream output
            finish_reason = new_response["finish_reason"]

            # Send an empty string first to avoid truncation by subsequent next() operations.
            if not has_send_first_chunk:
                message = DeltaMessage(
                    content="",
                    role="assistant",
                    function_call=None,
                )
                choice_data = ChatCompletionResponseStreamChoice(
                    index=0,
                    delta=message,
                    finish_reason=finish_reason
                )
                chunk = ChatCompletionResponse(
                    model=model_id,
                    id="",
                    choices=[choice_data],
                    created=int(time.time()),
                    object="chat.completion.chunk"
                )
                yield "{}".format(chunk.model_dump_json(exclude_unset=True))

            send_msg = delta_text if has_send_first_chunk else output
            has_send_first_chunk = True
            message = DeltaMessage(
                content=send_msg,
                role="assistant",
                function_call=None,
            )
            choice_data = ChatCompletionResponseStreamChoice(
                index=0,
                delta=message,
                finish_reason=finish_reason
            )
            chunk = ChatCompletionResponse(
                model=model_id,
                id="",
                choices=[choice_data],
                created=int(time.time()),
                object="chat.completion.chunk"
            )
            yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    if is_function_call:
        yield output
    else:
        yield '[DONE]'


async def parse_output_text(model_id: str, value: str):
    """
    Directly output the text content of value
    :param model_id:
    :param value:
    :return:
    """
    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant", content=value),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    yield '[DONE]'


def contains_custom_function(value: str) -> bool:
    """
    Determine whether 'function_call' according to a special function prefix.
    For example, the functions defined in "tools_using_demo/tool_register.py" are all "get_xxx" and start with "get_"
    [Note] This is not a rigorous judgment method, only for reference.
    :param value:
    :return:
    """
    return value and 'get_' in value


if __name__ == "__main__":
    # Load LLM
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True, device_map="auto").eval()

    # load Embedding
    embedding_model = SentenceTransformer(EMBEDDING_PATH, device="cuda")
    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
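
Once the server is up (running the api_server.py script starts uvicorn on port 8000, as configured above), any OpenAI-compatible client can talk to it. The sketch below uses the openai Python SDK purely as an illustration; the SDK choice, the base_url, and the api_key="EMPTY" placeholder are my assumptions, not something the original demo ships with.

# Hypothetical client-side check against the server defined above.
# Assumes `pip install openai` (v1.x) and that api_server.py is already running.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # the uvicorn endpoint started above
    api_key="EMPTY",                      # this demo server does not check keys
)

resp = client.chat.completions.create(
    model="chatglm3-6b",
    messages=[{"role": "user", "content": "你好"}],
    temperature=0.8,
)
print(resp.choices[0].message.content)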

Example: Q&A via the HTTP API

Ask the model for a joke (the "content" field below is Chinese for "tell me a joke"):

curl -H "Content-Type: application/json" -X POST -d '{
  "messages": [
    {
      "role": "user",
      "content": "给我讲个笑话"
    }
  ],
  "model": "chatglm3-6b"
}' http://localhost:8000/v1/chat/completions

Response:

HTTP/1.1 200 OK
date: Sat, 16 Mar 2024 13:16:00 GMT
server: uvicorn
content-length: 611
content-type: application/json
Connection: close

{
  "model": "chatglm3-6b",
  "id": "",
  "object": "chat.completion",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "好的,给您讲一个轻松的笑话:\n\n有一天,小明在公园里捡到一个神奇的灯笼。他捧着灯笼说了:“我希望我成为世界上最聪明的人!”突然,他变成了一个女人。\n\n这个笑话是在玩弄性别刻板印象,暗示女性比男性更聪明。希望这个笑话能带给您快乐!",
        "name": null,
        "function_call": null
      },
      "finish_reason": "stop"
    }
  ],
  "created": 1710594964,
  "usage": {
    "prompt_tokens": 11,
    "total_tokens": 83,
    "completion_tokens": 72
  }
}

The LLM told an awkward joke~
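
The same server also exposes the /v1/embeddings route backed by bge-large-zh-v1.5. A minimal sketch of calling it with the requests library follows; the library choice and the example sentences are my own assumptions, not from the original article.

# Hypothetical embeddings call against the /v1/embeddings route shown earlier.
# Assumes `pip install requests` and that api_server.py is running locally.
import requests

payload = {
    "model": "bge-large-zh-v1.5",
    "input": ["今天天气不错", "ChatGLM3-6B 部署笔记"],  # example sentences
}
r = requests.post("http://localhost:8000/v1/embeddings", json=payload, timeout=60)
r.raise_for_status()
data = r.json()
print(len(data["data"]), "embeddings,", len(data["data"][0]["embedding"]), "dims each")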

If the HTTP server fails while handling a request with a "failed to open nvrtc-builtins64_121.dll" error, see reference 1 below for the fix.

References

  1. ChatGLM3-6B standalone deployment with an HTTP service: failed to open nvrtc-builtins64_121.dll (CSDN blog)
  2. LLM Large Language Models (4): Using LangChain with ChatGLM3-6B (CSDN blog)
  3. LLM Large Language Models (1): Local deployment of ChatGLM3-6B (CSDN blog)