1. ChatGLM3-6B code
https://github.com/THUDM/ChatGLM3
2. FastGPT
https://github.com/labring/FastGPT
3. Embedding model m3e (can be deployed with Docker)
https://huggingface.co/moka-ai/m3e-base
First install Anaconda3; it can be downloaded from the Tsinghua Open Source Mirror (the /anaconda/archive/ index).
Create a conda environment named chatglm3-demo:
- conda create -n chatglm3-demo python=3.10
- conda activate chatglm3-demo
ChatGLM3-6B requires Python 3.10 or later.
cd /ChatGLM3-main/composite_demo
Switch to the demo directory to test the model. Install requirements.txt first; you can delete torch from it and install it separately.
pip install -r requirements.txt
Code Interpreter also needs a Jupyter kernel. Create a system environment variable named IPYKERNEL with the value chatglm3-demo, then register the kernel:
ipython kernel install --name chatglm3-demo --user
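To confirm the kernel was registered, you can list the installed Jupyter kernels from Python (an optional check; jupyter_client is installed together with Jupyter):
- from jupyter_client.kernelspec import KernelSpecManager
-
- # List installed kernels and confirm that "chatglm3-demo" is among them.
- specs = KernelSpecManager().find_kernel_specs()
- print(specs)
- assert "chatglm3-demo" in specs, "kernel missing; rerun the ipython kernel install command"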
Install matching versions of PyTorch and torchvision:
torch-2.0.0+cu117-cp310-cp310-win_amd64.whl
torchvision-0.15.0+cu117-cp310-cp310-win_amd64.whl
Install from the directory where the wheels were downloaded:
- pip install torch-2.0.0+cu117-cp310-cp310-win_amd64.whl
- pip install torchvision-0.15.0+cu117-cp310-cp310-win_amd64.whl
You can search for the matching versions at https://download.pytorch.org/whl/
Verify that the GPU will be used:
- import torch
- from transformers import __version__ as transformers_version
- import torchvision
-
- print("PyTorch version:", torch.__version__)
- print("Transformers version:", transformers_version)
- print("TorchVision version:", torchvision.__version__)
-
- # Check whether a GPU is available
- if torch.cuda.is_available():
-     print("CUDA version:", torch.version.cuda)
-     print("GPU TRUE")
- else:
-     print("GPU FALSE")
-
- # Version checks for other libraries can be added here
Open composite_demo/client.py and change the model path.
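For reference, the change in client.py usually amounts to pointing the model path at the local weights. A minimal sketch, assuming the script reads MODEL_PATH from the environment (as the openai_api demo below does); the path is a placeholder and the exact variable names depend on the repository version:
- import os
-
- # Placeholder path; replace with the directory that holds the ChatGLM3-6B weights.
- MODEL_PATH = os.environ.get('MODEL_PATH', 'D:/models/chatglm3-6b')
- TOKENIZER_PATH = os.environ.get('TOKENIZER_PATH', MODEL_PATH)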
Run the command to test; install any modules reported as missing:
streamlit run main.py
If the GPU has less than 12 GB of memory, responses are far too slow; after switching to a quantized model, conversation works normally.
Once normal conversation works, the code and environment are confirmed to be working.
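If you want to force a quantized load yourself, the ChatGLM3 weights expose a quantize() helper on the model object; a minimal sketch with a placeholder model path, mirroring the int4 branch used in openai_api.py further below:
- from transformers import AutoModel, AutoTokenizer
-
- MODEL_PATH = "D:/models/chatglm3-6b"  # placeholder; point at the local weights
- tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
- # int4 quantization fits in roughly 5-6 GB of VRAM at the cost of some answer quality.
- model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).half().quantize(4).cuda()
- model = model.eval()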
Switch to D:\...\ChatGLM3-main\openai_api_demo
Edit openai_api.py to point to the local chatglm3 model path.
If you have other models, keep them in the same directory.
Test it with Postman using the request body below (an equivalent Python call follows the JSON):
- {
-   "model": "string",
-   "messages": [
-     {
-       "role": "user",
-       "content": "你好",
-       "name": "string",
-       "function_call": {
-         "name": "string",
-         "arguments": "string"
-       }
-     }
-   ],
-   "temperature": 0.8,
-   "top_p": 0.8,
-   "max_tokens": 0,
-   "stream": false,
-   "functions": {},
-   "repetition_penalty": 1.1
- }
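The same request can also be scripted with the Python requests library, which is handy for repeated checks; a sketch assuming the API server (started below with python openai_api.py) is listening on port 8000:
- import requests
-
- payload = {
-     "model": "chatglm3-6b",
-     "messages": [{"role": "user", "content": "你好"}],
-     "temperature": 0.8,
-     "top_p": 0.8,
-     "max_tokens": 256,
-     "stream": False,
-     "repetition_penalty": 1.1,
- }
- # POST to the OpenAI-compatible chat endpoint exposed by openai_api.py.
- resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
- resp.raise_for_status()
- print(resp.json()["choices"][0]["message"]["content"])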
Once that succeeds, run:
python openai_api.py
The openai_api.py code can be replaced with the following:
- # coding=utf-8
- # Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
- # Usage: python openai_api.py
- # Visit http://localhost:8000/docs for documents.
-
- # In OpenAI's API, max_tokens is equivalent to HuggingFace's max_new_tokens, not max_length.
- # For example, setting max_tokens = 8192 for the 6B model raises an error, because after subtracting
- # the history and prompt tokens the model cannot generate that many new tokens.
-
- import os
- import time
- import json
- from contextlib import asynccontextmanager
- from typing import List, Literal, Optional, Union
-
- import torch
- from torch.cuda import get_device_properties
- import uvicorn
- from fastapi import FastAPI, HTTPException
- from fastapi.middleware.cors import CORSMiddleware
- from loguru import logger
- from pydantic import BaseModel, Field
- from sse_starlette.sse import EventSourceResponse
- from transformers import AutoTokenizer, AutoModel
- from utils import process_response, generate_chatglm3, generate_stream_chatglm3
-
- MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM_chatglm3-6b')
- TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
- DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-
- @asynccontextmanager
- async def lifespan(app: FastAPI):  # collects GPU memory
-     yield
-     if torch.cuda.is_available():
-         torch.cuda.empty_cache()
-         torch.cuda.ipc_collect()
-
-
- app = FastAPI(lifespan=lifespan)
-
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
- )
-
-
- class ModelCard(BaseModel):
-     id: str
-     object: str = "model"
-     created: int = Field(default_factory=lambda: int(time.time()))
-     owned_by: str = "owner"
-     root: Optional[str] = None
-     parent: Optional[str] = None
-     permission: Optional[list] = None
-
-
- class ModelList(BaseModel):
-     object: str = "list"
-     data: List[ModelCard] = []
-
-
- class FunctionCallResponse(BaseModel):
-     name: Optional[str] = None
-     arguments: Optional[str] = None
-
-
- class ChatMessage(BaseModel):
-     role: Literal["user", "assistant", "system", "function"]
-     content: str = None
-     name: Optional[str] = None
-     function_call: Optional[FunctionCallResponse] = None
-
-
- class DeltaMessage(BaseModel):
-     role: Optional[Literal["user", "assistant", "system"]] = None
-     content: Optional[str] = None
-     function_call: Optional[FunctionCallResponse] = None
-
-
- class ChatCompletionRequest(BaseModel):
-     model: str
-     messages: List[ChatMessage]
-     temperature: Optional[float] = 0.8
-     top_p: Optional[float] = 0.8
-     max_tokens: Optional[int] = None
-     stream: Optional[bool] = False
-     functions: Optional[Union[dict, List[dict]]] = None
-     # Additional parameters
-     repetition_penalty: Optional[float] = 1.1
-
-
- class ChatCompletionResponseChoice(BaseModel):
-     index: int
-     message: ChatMessage
-     finish_reason: Literal["stop", "length", "function_call"]
-
-
- class ChatCompletionResponseStreamChoice(BaseModel):
-     index: int
-     delta: DeltaMessage
-     finish_reason: Optional[Literal["stop", "length", "function_call"]]
-
-
- class UsageInfo(BaseModel):
-     prompt_tokens: int = 0
-     total_tokens: int = 0
-     completion_tokens: Optional[int] = 0
-
-
- class ChatCompletionResponse(BaseModel):
-     model: str
-     object: Literal["chat.completion", "chat.completion.chunk"]
-     choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
-     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
-     usage: Optional[UsageInfo] = None
-
-
- @app.get("/v1/models", response_model=ModelList)
- async def list_models():
-     model_card = ModelCard(id="chatglm3-6b")
-     return ModelList(data=[model_card])
-
-
- @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
- async def create_chat_completion(request: ChatCompletionRequest):
-     global model, tokenizer
-
-     if len(request.messages) < 1 or request.messages[-1].role == "assistant":
-         raise HTTPException(status_code=400, detail="Invalid request")
-
-     gen_params = dict(
-         messages=request.messages,
-         temperature=request.temperature,
-         top_p=request.top_p,
-         max_tokens=request.max_tokens or 1024,
-         echo=False,
-         stream=request.stream,
-         repetition_penalty=request.repetition_penalty,
-         functions=request.functions,
-     )
-
-     logger.debug(f"==== request ====\n{gen_params}")
-
-     if request.stream:
-
-         # Use the stream mode to read the first few characters; if it is not a function call, stream the output directly.
-         predict_stream_generator = predict_stream(request.model, gen_params)
-         output = next(predict_stream_generator)
-         if not contains_custom_function(output):
-             return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
-
-         # Obtain the result directly at one time and determine whether tools needs to be called.
-         logger.debug(f"First result output:\n{output}")
-
-         function_call = None
-         if output and request.functions:
-             try:
-                 function_call = process_response(output, use_tool=True)
-             except:
-                 logger.warning("Failed to parse tool call")
-
-         # CallFunction
-         if isinstance(function_call, dict):
-             function_call = FunctionCallResponse(**function_call)
-
-             """
-             In this demo, we did not register any tools.
-             You can use the tools that have been implemented in our `tool_using` and implement your own streaming tool implementation here.
-             Similar to the following method:
-                 function_args = json.loads(function_call.arguments)
-                 tool_response = dispatch_tool(tool_name: str, tool_params: dict)
-             """
-             tool_response = ""
-
-             if not gen_params.get("messages"):
-                 gen_params["messages"] = []
-
-             gen_params["messages"].append(ChatMessage(
-                 role="assistant",
-                 content=output,
-             ))
-             gen_params["messages"].append(ChatMessage(
-                 role="function",
-                 name=function_call.name,
-                 content=tool_response,
-             ))
-
-             # Streaming output of results after function calls
-             generate = predict(request.model, gen_params)
-             return EventSourceResponse(generate, media_type="text/event-stream")
-
-         else:
-             # Handled to avoid exceptions in the above parsing function process.
-             generate = parse_output_text(request.model, output)
-             return EventSourceResponse(generate, media_type="text/event-stream")
-
-     # Here is the handling of stream = False
-     response = generate_chatglm3(model, tokenizer, gen_params)
-
-     # Remove the first newline character
-     if response["text"].startswith("\n"):
-         response["text"] = response["text"][1:]
-     response["text"] = response["text"].strip()
-     usage = UsageInfo()
-     function_call, finish_reason = None, "stop"
-     if request.functions:
-         try:
-             function_call = process_response(response["text"], use_tool=True)
-         except:
-             logger.warning("Failed to parse tool call, maybe the response is not a tool call or have been answered.")
-
-     if isinstance(function_call, dict):
-         finish_reason = "function_call"
-         function_call = FunctionCallResponse(**function_call)
-
-     message = ChatMessage(
-         role="assistant",
-         content=response["text"],
-         function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
-     )
-
-     logger.debug(f"==== message ====\n{message}")
-
-     choice_data = ChatCompletionResponseChoice(
-         index=0,
-         message=message,
-         finish_reason=finish_reason,
-     )
-     task_usage = UsageInfo.model_validate(response["usage"])
-     for usage_key, usage_value in task_usage.model_dump().items():
-         setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)
-     return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage)
-
-
- async def predict(model_id: str, params: dict):
-     global model, tokenizer
-
-     choice_data = ChatCompletionResponseStreamChoice(
-         index=0,
-         delta=DeltaMessage(role="assistant"),
-         finish_reason=None
-     )
-     chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
-     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-     previous_text = ""
-     for new_response in generate_stream_chatglm3(model, tokenizer, params):
-         decoded_unicode = new_response["text"]
-         delta_text = decoded_unicode[len(previous_text):]
-         previous_text = decoded_unicode
-
-         finish_reason = new_response["finish_reason"]
-         if len(delta_text) == 0 and finish_reason != "function_call":
-             continue
-
-         function_call = None
-         if finish_reason == "function_call":
-             try:
-                 function_call = process_response(decoded_unicode, use_tool=True)
-             except:
-                 logger.warning(
-                     "Failed to parse tool call, maybe the response is not a tool call or have been answered.")
-
-         if isinstance(function_call, dict):
-             function_call = FunctionCallResponse(**function_call)
-
-         delta = DeltaMessage(
-             content=delta_text,
-             role="assistant",
-             function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
-         )
-
-         choice_data = ChatCompletionResponseStreamChoice(
-             index=0,
-             delta=delta,
-             finish_reason=finish_reason
-         )
-         chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
-         yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-     choice_data = ChatCompletionResponseStreamChoice(
-         index=0,
-         delta=DeltaMessage(),
-         finish_reason="stop"
-     )
-     chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
-     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-     yield '[DONE]'
-
-
- def predict_stream(model_id, gen_params):
-     """
-     The function call is compatible with stream mode output.
-     The first seven characters are determined.
-     If not a function call, the stream output is directly generated.
-     Otherwise, the complete character content of the function call is returned.
-     :param model_id:
-     :param gen_params:
-     :return:
-     """
-     output = ""
-     is_function_call = False
-     has_send_first_chunk = False
-     for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
-         decoded_unicode = new_response["text"]
-         delta_text = decoded_unicode[len(output):]
-         output = decoded_unicode
-
-         # When it is not a function call and the character length is > 7,
-         # try to judge whether it is a function call according to the special function prefix
-         if not is_function_call and len(output) > 7:
-
-             # Determine whether a function is called
-             is_function_call = contains_custom_function(output)
-             if is_function_call:
-                 continue
-
-             # Non-function call, direct stream output
-             finish_reason = new_response["finish_reason"]
-
-             # Send an empty string first to avoid truncation by subsequent next() operations.
-             if not has_send_first_chunk:
-                 message = DeltaMessage(
-                     content="",
-                     role="assistant",
-                     function_call=None,
-                 )
-                 choice_data = ChatCompletionResponseStreamChoice(
-                     index=0,
-                     delta=message,
-                     finish_reason=finish_reason
-                 )
-                 chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
-                 yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-             send_msg = delta_text if has_send_first_chunk else output
-             has_send_first_chunk = True
-             message = DeltaMessage(
-                 content=send_msg,
-                 role="assistant",
-                 function_call=None,
-             )
-             choice_data = ChatCompletionResponseStreamChoice(
-                 index=0,
-                 delta=message,
-                 finish_reason=finish_reason
-             )
-             chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
-             yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-     if is_function_call:
-         yield output
-     else:
-         yield '[DONE]'
-
-
- async def parse_output_text(model_id: str, value: str):
-     """
-     Directly output the text content of value
-     :param model_id:
-     :param value:
-     :return:
-     """
-     choice_data = ChatCompletionResponseStreamChoice(
-         index=0,
-         delta=DeltaMessage(role="assistant", content=value),
-         finish_reason=None
-     )
-     chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
-     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-     choice_data = ChatCompletionResponseStreamChoice(
-         index=0,
-         delta=DeltaMessage(),
-         finish_reason="stop"
-     )
-     chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
-     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-     yield '[DONE]'
-
-
- def contains_custom_function(value: str) -> bool:
-     """
-     Determine whether 'function_call' according to a special function prefix.
-     For example, the functions defined in "tool_using/tool_register.py" are all "get_xxx" and start with "get_"
-     [Note] This is not a rigorous judgment method, only for reference.
-     :param value:
-     :return:
-     """
-     return value and 'get_' in value
-
-
- if __name__ == "__main__":
-     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
-     model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
-     if torch.cuda.is_available():
-         total_vram_in_gb = get_device_properties(0).total_memory / 1073741824
-         print(f'\033[32mGPU memory: {total_vram_in_gb:.2f} GB\033[0m')
-         with torch.cuda.device(f'cuda:{0}'):
-             torch.cuda.empty_cache()
-             torch.cuda.ipc_collect()
-         if total_vram_in_gb > 13:
-             model = model.half().cuda()
-             print('\033[32mRunning on GPU with fp16 precision\033[0m')
-         elif total_vram_in_gb > 10:
-             model = model.half().quantize(8).cuda()
-             print('\033[32mRunning on GPU with int8 quantization\033[0m')
-         elif total_vram_in_gb > 4.5:
-             model = model.half().quantize(4).cuda()
-             print('\033[32mRunning on GPU with int4 quantization\033[0m')
-         else:
-             model = model.float()
-             print('\033[32mRunning on CPU\033[0m')
-     else:
-         model = model.float()
-         print('\033[32mRunning on CPU\033[0m')
-     model = model.eval()
-
-     # bilibili@十字鱼 https://space.bilibili.com/893892  Thanks to 秋葉aaaki and 大江户战士 for reference
-
-     uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
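Because the service speaks the OpenAI chat-completions format, the official openai Python SDK (1.x) can be pointed straight at it; a minimal sketch with a dummy api_key, since this demo does not validate keys:
- from openai import OpenAI
-
- # Any non-empty key works; the demo server does not check it.
- client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
- completion = client.chat.completions.create(
-     model="chatglm3-6b",
-     messages=[{"role": "user", "content": "你好"}],
-     temperature=0.8,
- )
- print(completion.choices[0].message.content)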
One-API is the node used to call the various models. Its documentation recommends a Docker deployment; Ubuntu 20.04 works fine. Enable virtualization for the Windows host: here VirtualBox is used with VT-x/AMD-V enabled, which requires turning on virtualization in the BIOS (on some motherboards it sits under the security settings). Add network port-forwarding rules for 3000, 13000 and so on, adding more rules as needed.
Open Ubuntu 20.04; updating the software may take a while. Install Code and Terminator for the later steps. First sort out permissions: checking docker and the docker daemon involves permission issues. Add your user to the docker group, and run administrative commands with sudo:
sudo usermod -aG docker <username>
Open Code, create a new terminal, and pull the one-api image, mapping it to port 13000:
docker run --name one-api -d --restart always -p 13000:3000 -e TZ=Asia/Shanghai -v /home/ubuntu/data/one-api:/data justsong/one-api
Go to localhost:13000 and log in as root with the password 123456.
Add a channel for chatglm3 with Base URL: http://localhost:8000
Continue by adding an m3e channel with Base URL: http://localhost:6200
Add a new token and submit it. Copy the first entry under the arrow and paste it into a text file, for example: https://chat.oneapi.pro/#/?settings={"key":"sk-fAAfFClsyVXxvAgp57Ab758260124a958aF00a2d49CcB625","url":"http://localhost:3000"}
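To confirm that One-API is proxying to the ChatGLM3 backend, send a chat request to port 13000 with the new token; in this sketch the key is a placeholder for the token you just copied, and the model name must match what the channel defines (chatglm3 here, matching the FastGPT config later):
- import requests
-
- ONE_API_KEY = "sk-xxxx"  # placeholder; paste the token copied from One-API
- payload = {
-     "model": "chatglm3",
-     "messages": [{"role": "user", "content": "你好"}],
- }
- resp = requests.post(
-     "http://localhost:13000/v1/chat/completions",
-     headers={"Authorization": f"Bearer {ONE_API_KEY}"},
-     json=payload,
-     timeout=120,
- )
- print(resp.status_code)
- print(resp.json())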
Deploy the m3e model with Docker; by default it runs on the CPU:
docker run -d -p 6200:6008 --name=m3e-large-api registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/m3e-large-api:latest
To run it on the GPU:
docker run -d -p 6200:6008 --gpus all --name=m3e-large-api registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/m3e-large-api:latest
The original image:
docker run -d -p 6200:6008 --name=m3e-large-api stawky/m3e-large-api:latest
Test it once it is running; a successful deployment returns a set of embedding vectors:
- curl --location --request POST 'http://localhost:6200/v1/embeddings' \
- --header 'Authorization: Bearer sk-aaabbbcccdddeeefffggghhhiiijjjkkk' \
- --header 'Content-Type: application/json' \
- --data-raw '{
- "model": "m3e",
- "input": ["laf是什么"]
- }'
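The same check from Python; a successful deployment returns a list of floats, and printing its length shows the embedding dimension:
- import requests
-
- headers = {"Authorization": "Bearer sk-aaabbbcccdddeeefffggghhhiiijjjkkk"}
- payload = {"model": "m3e", "input": ["laf是什么"]}
- resp = requests.post("http://localhost:6200/v1/embeddings", headers=headers, json=payload, timeout=60)
- vec = resp.json()["data"][0]["embedding"]
- # Print the vector dimension and the first few values.
- print(len(vec), vec[:5])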
FastGPT is also deployed on Linux, again using Ubuntu 20.04. Open Code and create a new terminal.
Download the docker-compose file:
-
- curl -O https://raw.githubusercontent.com/labring/FastGPT/main/files/deploy/fastgpt/docker-compose.yml
Download the config file (config.json, not the compose file again):
-
- curl -O https://raw.githubusercontent.com/labring/FastGPT/main/projects/app/data/config.json
Pull the images: docker-compose pull
Start the containers in the background: docker-compose up -d
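Optionally, a small polling sketch can wait until FastGPT answers on the mapped port before you continue (this assumes the default 3000 mapping from the compose file):
- import time
- import requests
-
- # Poll http://localhost:3000 until FastGPT responds, or give up after about two minutes.
- for _ in range(24):
-     try:
-         requests.get("http://localhost:3000", timeout=5)
-         print("FastGPT is up")
-         break
-     except requests.RequestException:
-         time.sleep(5)
- else:
-     print("FastGPT did not come up; check docker-compose logs")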
Since FastGPT 4.6.8, the MongoDB replica set needs to be initialized manually:
- # Check that the mongo container is running
- docker ps
- # Enter the container
- docker exec -it mongo bash
-
- # Connect to the database
- mongo -u myname -p mypassword --authenticationDatabase admin
-
- # Initialize the replica set. For access from outside, mongo:27017 can be changed to ip:27017,
- # but the FastGPT connection string must then be changed to match
- # (MONGODB_URI=mongodb://myname:mypassword@mongo:27017/fastgpt?authSource=admin => MONGODB_URI=mongodb://myname:mypassword@ip:27017/fastgpt?authSource=admin)
- rs.initiate({
-   _id: "rs0",
-   members: [
-     { _id: 0, host: "mongo:27017" }
-   ]
- })
- # Check the status. If it reports the rs0 set, the replica set is running
- rs.status()
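The replica-set status can also be checked from Python with pymongo; a sketch assuming port 27017 is mapped to the host and the myname/mypassword credentials from the compose file are unchanged:
- from pymongo import MongoClient
-
- # directConnection avoids replica-set discovery against the internal hostname "mongo".
- client = MongoClient(
-     "mongodb://myname:mypassword@localhost:27017/?authSource=admin&directConnection=true"
- )
- status = client.admin.command("replSetGetStatus")
- # Expect the set name "rs0" and a PRIMARY member if initialization succeeded.
- print(status["set"], [m["stateStr"] for m in status["members"]])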
In the docker-compose file, set OPENAI_BASE_URL to http://localhost:13000/v1 so that FastGPT connects to the One-API port; change localhost to the host machine's address.
In the docker-compose file, set CHAT_API_KEY to the key copied from the One-API token.
Replace the contents of the config file with the following:
- {
-   "systemEnv": {
-     "openapiPrefix": "fastgpt",
-     "vectorMaxProcess": 15,
-     "qaMaxProcess": 15,
-     "pgHNSWEfSearch": 100
-   },
-   "llmModels": [
-     {
-       "model": "chatglm3",
-       "name": "chatglm3",
-       "maxContext": 4000,
-       "maxResponse": 4000,
-       "quoteMaxToken": 2000,
-       "maxTemperature": 1,
-       "vision": false,
-       "defaultSystemChatPrompt": ""
-     },
-     {
-       "model": "gpt-3.5-turbo-1106",
-       "name": "gpt-3.5-turbo",
-       "maxContext": 16000,
-       "maxResponse": 4000,
-       "quoteMaxToken": 13000,
-       "maxTemperature": 1.2,
-       "inputPrice": 0,
-       "outputPrice": 0,
-       "censor": false,
-       "vision": false,
-       "datasetProcess": false,
-       "toolChoice": true,
-       "functionCall": false,
-       "customCQPrompt": "",
-       "customExtractPrompt": "",
-       "defaultSystemChatPrompt": "",
-       "defaultConfig": {}
-     },
-     {
-       "model": "gpt-3.5-turbo-16k",
-       "name": "gpt-3.5-turbo-16k",
-       "maxContext": 16000,
-       "maxResponse": 16000,
-       "quoteMaxToken": 13000,
-       "maxTemperature": 1.2,
-       "inputPrice": 0,
-       "outputPrice": 0,
-       "censor": false,
-       "vision": false,
-       "datasetProcess": true,
-       "toolChoice": true,
-       "functionCall": false,
-       "customCQPrompt": "",
-       "customExtractPrompt": "",
-       "defaultSystemChatPrompt": "",
-       "defaultConfig": {}
-     },
-     {
-       "model": "gpt-4-0125-preview",
-       "name": "gpt-4-turbo",
-       "maxContext": 125000,
-       "maxResponse": 4000,
-       "quoteMaxToken": 100000,
-       "maxTemperature": 1.2,
-       "inputPrice": 0,
-       "outputPrice": 0,
-       "censor": false,
-       "vision": false,
-       "datasetProcess": false,
-       "toolChoice": true,
-       "functionCall": false,
-       "customCQPrompt": "",
-       "customExtractPrompt": "",
-       "defaultSystemChatPrompt": "",
-       "defaultConfig": {}
-     },
-     {
-       "model": "gpt-4-vision-preview",
-       "name": "gpt-4-vision",
-       "maxContext": 128000,
-       "maxResponse": 4000,
-       "quoteMaxToken": 100000,
-       "maxTemperature": 1.2,
-       "inputPrice": 0,
-       "outputPrice": 0,
-       "censor": false,
-       "vision": false,
-       "datasetProcess": false,
-       "toolChoice": true,
-       "functionCall": false,
-       "customCQPrompt": "",
-       "customExtractPrompt": "",
-       "defaultSystemChatPrompt": "",
-       "defaultConfig": {}
-     }
-   ],
-   "vectorModels": [
-     {
-       "model": "m3e",
-       "name": "m3e",
-       "price": 0.1,
-       "defaultToken": 500,
-       "maxToken": 1800
-     },
-     {
-       "model": "text-embedding-ada-002",
-       "name": "Embedding-2",
-       "inputPrice": 0,
-       "outputPrice": 0,
-       "defaultToken": 700,
-       "maxToken": 3000,
-       "weight": 100,
-       "defaultConfig": {}
-     }
-   ],
-   "reRankModels": [],
-   "audioSpeechModels": [
-     {
-       "model": "tts-1",
-       "name": "OpenAI TTS1",
-       "inputPrice": 0,
-       "outputPrice": 0,
-       "voices": [
-         { "label": "Alloy", "value": "alloy", "bufferId": "openai-Alloy" },
-         { "label": "Echo", "value": "echo", "bufferId": "openai-Echo" },
-         { "label": "Fable", "value": "fable", "bufferId": "openai-Fable" },
-         { "label": "Onyx", "value": "onyx", "bufferId": "openai-Onyx" },
-         { "label": "Nova", "value": "nova", "bufferId": "openai-Nova" },
-         { "label": "Shimmer", "value": "shimmer", "bufferId": "openai-Shimmer" }
-       ]
-     }
-   ],
-   "whisperModel": {
-     "model": "whisper-1",
-     "name": "Whisper1",
-     "inputPrice": 0,
-     "outputPrice": 0
-   }
- }
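After editing, it is worth confirming that the file is still valid JSON and that the chatglm3 and m3e entries are present; a quick check:
- import json
-
- with open("config.json", encoding="utf-8") as f:
-     cfg = json.load(f)
-
- # Should list chatglm3 plus the gpt models, and m3e plus text-embedding-ada-002.
- print([m["model"] for m in cfg["llmModels"]])
- print([m["model"] for m in cfg["vectorModels"]])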
If all the steps succeeded, open port 3000 (again replacing localhost with the host machine's address) to reach the FastGPT page. The local chatglm3 API must be running for the model to respond.