The bug found by zkz098 and the fix provided
```bash
cd Project
git clone https://github.com/THUDM/ChatGLM2-6B.git
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/THUDM/chatglm2-6b
# This clone only fetches LFS placeholders, not the real model weights
```
Download the [model files] and, once the download finishes, use them to replace the placeholders in the model implementation downloaded in step 2 (the GIT_LFS_SKIP_SMUDGE clone above).

[model files]: https://cloud.tsinghua.edu.cn/d/674208019e314311ab5c/?p=%2Fchatglm2-6b&mode=list
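Optionally, you can confirm that the real weights actually replaced the LFS placeholders before moving on. A minimal sketch, assuming the weights ended up in `./chatglm2-6b`; the path and the size threshold are my own, not part of the original steps:

```python
from pathlib import Path

# Hypothetical check, not part of the original post: LFS placeholder files are
# only a few hundred bytes, while real ChatGLM2-6B weight shards are hundreds of MB each.
model_dir = Path("chatglm2-6b")  # adjust to wherever you put the model
for f in sorted(model_dir.glob("*.bin")):
    size_mb = f.stat().st_size / 1024 / 1024
    status = "ok" if size_mb > 100 else "looks like a placeholder"
    print(f"{f.name}: {size_mb:.1f} MB ({status})")
```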
2023/8/20

Because openai_api.py does not return the `usage` field, calls to the API will raise an error. The following changes are needed:
```python
import time
import tiktoken
import torch
import uvicorn
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Literal, Optional, Union
from transformers import AutoTokenizer, AutoModel
from sse_starlette.sse import ServerSentEvent, EventSourceResponse


@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system"]
    content: str


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_length: Optional[int] = None
    stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length"]


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]]


class ChatCompletionResponse(BaseModel):
    model: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    # Populated for non-streaming responses; streaming chunks omit it, so it must stay optional.
    usage: Optional[dict] = None


@app.get("/v1/models", response_model=ModelList)
async def list_models():
    global model_args
    model_card = ModelCard(id="gpt-3.5-turbo")
    return ModelList(data=[model_card])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    if request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")
    query = request.messages[-1].content

    prev_messages = request.messages[:-1]
    if len(prev_messages) > 0 and prev_messages[0].role == "system":
        query = prev_messages.pop(0).content + query

    history = []
    if len(prev_messages) % 2 == 0:
        for i in range(0, len(prev_messages), 2):
            if prev_messages[i].role == "user" and prev_messages[i + 1].role == "assistant":
                history.append([prev_messages[i].content, prev_messages[i + 1].content])

    if request.stream:
        generate = predict(query, history, request.model)
        return EventSourceResponse(generate, media_type="text/event-stream")

    response, _ = model.chat(tokenizer, query, history=history)
    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )

    # Approximate token counts with tiktoken so the response carries an OpenAI-style usage block.
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    pt = len(encoding.encode(query))
    rt = len(encoding.encode(response))
    usage_data = {
        "prompt_tokens": pt,
        "completion_tokens": rt,
        "total_tokens": pt + rt
    }

    return ChatCompletionResponse(model=request.model, choices=[choice_data],
                                  object="chat.completion", usage=usage_data)


async def predict(query: str, history: List[List[str]], model_id: str):
    global model, tokenizer

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))

    current_length = 0

    for new_response, _ in model.stream_chat(tokenizer, query, history):
        if len(new_response) == current_length:
            continue

        new_text = new_response[current_length:]
        current_length = len(new_response)

        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(content=new_text),
            finish_reason=None
        )
        chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
        yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
    yield '[DONE]'


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).quantize(8).half().cuda()
    # model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).quantize(4).half().cuda()
    # model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).half().cuda()
    # Change "models/chatglm2-6b" to the path where you stored the model
    # Multi-GPU support: use the following two lines instead of the single line above,
    # and set num_gpus to the number of GPUs you actually have
    # from utils import load_model_on_gpus
    # model = load_model_on_gpus("THUDM/chatglm2-6b", num_gpus=2)
    model.eval()

    # Change the port to the one you chose; it has to match the nginx configuration below
    uvicorn.run(app, host='0.0.0.0', port=8080, workers=1, root_path="/ChatGLM/OpenAPI")
```
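To verify the patch, a quick client call against the running service should now show the `usage` block in the response. A rough sketch using `requests`; the host and port are assumptions taken from the `uvicorn.run` line above, so adjust them if you changed the port:

```python
import requests

# Assumes openai_api.py is running locally on port 8080 as configured above.
resp = requests.post(
    "http://127.0.0.1:8080/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "你好"}],
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
data = resp.json()
print(data["choices"][0]["message"]["content"])
# With the patch above, the response now carries token accounting:
print(data["usage"])  # {"prompt_tokens": ..., "completion_tokens": ..., "total_tokens": ...}
```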
Install nginx (skip this if it is already installed):

```bash
apt install nginx -y
```

Add the configuration (adjust to your own setup):

```nginx
# /etc/nginx/nginx.conf -> add the following inside the http {} block
include /etc/nginx/myHost/*.conf;
```

```bash
# Create the directory /etc/nginx/myHost (adjust to your setup)
cd /etc/nginx
mkdir myHost
cd myHost
# Create the file /etc/nginx/myHost/ChatGLM.conf
touch ChatGLM.conf
```

```nginx
# Add the following to ChatGLM.conf
server {
    listen 8080;
    server_name i-2.gpushare.com;  # Change this to your domain name

    location /ChatGLM/OpenAPI/ {  # Change this if you'd like to serve the API under a different path
        proxy_pass http://0.0.0.0:18203/;  # Change this if the API will be running on a different port
        proxy_redirect off;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;
    }
}
```
Create the environment, install the dependencies, and start the service:

```bash
conda create -n glm python=3.8 -y
conda activate glm
pip install -r requirements.txt
pip install tiktoken
nohup python openai_api.py &   # run in the background; output goes to nohup.out
```
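With nginx and the API both running, a quick check of the proxied endpoint confirms the whole chain works before wiring up the desktop pet. This is only a sketch: the domain, port, and path prefix below are the placeholders from the nginx config above and must be replaced with your own, and the uvicorn port in openai_api.py has to match the `proxy_pass` port in ChatGLM.conf:

```python
import requests

# All values below are the placeholders used earlier in this post; substitute your own.
base_url = "http://i-2.gpushare.com:8080/ChatGLM/OpenAPI"
resp = requests.get(f"{base_url}/v1/models", timeout=10)
print(resp.status_code, resp.json())  # expect 200 and a model list containing "gpt-3.5-turbo"
```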
Finally, update the desktop pet's chatapi settings: set the api-url to your own URL; the key can be left empty.