This post uses FastAPI and uvicorn to build an HTTP API around a locally deployed large language model, decoupling the model from downstream applications (such as LangChain). The setup has two parts: a server side and a client side.
# Server: load the model once at startup and expose it over HTTP.
import os

import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

# Restrict the process to specific GPUs before any CUDA work happens.
os.environ["CUDA_VISIBLE_DEVICES"] = "5,6,7"

app = FastAPI()

class Query(BaseModel):
    text: str

path = "/workdir/model/baichuan13b_chat/"
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained(path)

@app.post("/chat/")
async def chat(query: Query):
    input_ids = tokenizer([query.text]).input_ids
    output_ids = model.generate(
        torch.as_tensor(input_ids).cuda(),
        do_sample=False,  # greedy decoding; temperature is ignored when sampling is off
        temperature=0.1,
        repetition_penalty=1,
        max_new_tokens=1024)
    # Strip the prompt tokens so only the newly generated text is returned.
    output_ids = output_ids[0][len(input_ids[0]):]
    outputs = tokenizer.decode(
        output_ids, skip_special_tokens=True, spaces_between_special_tokens=False)
    return {"result": outputs}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=6667)
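One caveat: passing the raw user text straight into generate() bypasses Baichuan's chat prompt format. The official Baichuan-13B-Chat remote code ships a model.chat() helper that applies it. A minimal sketch of an endpoint built on that helper, assuming the official repository's remote code is loaded (the /chat_template/ route name is my own):

# Hypothetical alternative endpoint; assumes the model repo's remote code
# defines model.chat(tokenizer, messages), as the official Baichuan-13B-Chat does.
@app.post("/chat_template/")
async def chat_template(query: Query):
    messages = [{"role": "user", "content": query.text}]
    # model.chat() handles prompt formatting, generation, and decoding in one call.
    response = model.chat(tokenizer, messages)
    return {"result": response}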
# Client: send a prompt to the /chat/ endpoint and print the reply.
import requests

# 0.0.0.0 is a bind address for the server; the client must use a reachable host.
url = "http://127.0.0.1:6667/chat/"
query = {"text": "你好,请做一段自我介绍。"}  # "Hello, please introduce yourself."

response = requests.post(url, json=query)
if response.status_code == 200:
    result = response.json()
    print("BOT:", result["result"])
else:
    print("Error:", response.status_code, response.text)