Key points:
1. With much of the code found online, post-training quantization with the native auto_gptq library may itself run fine, but online inference afterwards fails with errors such as a missing bin or tf file, i.e. the model weight file. The key difference from most code on the web is therefore to save the model's weight file in advance: for Baichuan-13B, save it before quantizing the model.
The code is as follows:
# Save the original fp16 weights as pytorch_model.bin in the target directory
def save_bin(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoModelForCausalLM
    import torch
    import os

    original_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_dir,
        trust_remote_code=True,
        torch_dtype=torch.float16,  # without this, the saved bin file is very large, roughly 50+ GB
        safetensors=True
    )
    print("Saving bin file...")
    model_path = os.path.join(quantized_model_dir, "pytorch_model" + ".bin")
    torch.save(original_model.state_dict(), model_path)
    print("Bin file saved.")
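A minimal usage sketch (the paths here are placeholders, not from the original script): torch.save does not create missing directories, so it helps to create quantized_model_dir before calling save_bin.

import os

# Placeholder paths -- replace with your own model and output directories.
pretrained_model_dir = "/path/to/Baichuan2-13B-Chat"
quantized_model_dir = "/path/to/baichuan2_autogptq"

os.makedirs(quantized_model_dir, exist_ok=True)  # ensure the output directory exists
save_bin(pretrained_model_dir, quantized_model_dir)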
Quantization code, using the native auto_gptq library:
def from_authority_autogptq(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    import logging
    import torch
    import os

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Load the tokenizer used to build the calibration examples
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_dir,
        use_fast=False,
        trust_remote_code=True
    )
    examples = [
        tokenizer(
            "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]

    # Quantization settings
    quantize_config = BaseQuantizeConfig(
        bits=4,          # quantize model to 4-bit
        group_size=128,  # it is recommended to set the value to 128
        desc_act=False,  # set to False can significantly speed up inference but the perplexity may be slightly worse
    )

    # load un-quantized model, by default, the model will always be loaded into CPU memory
    quantize_model = AutoGPTQForCausalLM.from_pretrained(
        pretrained_model_dir,
        quantize_config=quantize_config,
        trust_remote_code=True,
        device_map="auto",
    )

    print("Starting model quantization...")
    quantize_model.quantize(examples)

    # save model weights
    print("Saving quantized files...")
    quantize_model.save_quantized(quantized_model_dir)
    print("Quantized files saved.")
    print("Saving tokenizer...")
    tokenizer.save_pretrained(quantized_model_dir)
    print("Tokenizer saved.")
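For reference, the files written by save_quantized can also be reloaded directly through auto_gptq's own from_quantized API. A minimal sanity-check sketch (the device string and test prompt are assumptions, not part of the original post):

def check_quantized(quantized_model_dir):
    from transformers import AutoTokenizer
    from auto_gptq import AutoGPTQForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(
        quantized_model_dir, use_fast=False, trust_remote_code=True
    )
    # Load the GPTQ-quantized weights directly via auto_gptq
    model = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        device="cuda:0",          # assumed single-GPU setup
        trust_remote_code=True,
    )
    inputs = tokenizer("auto-gptq is", return_tensors="pt").to("cuda:0")
    print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))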
Following the steps above, the quantized model files have now been saved successfully; the next step is online inference with the model:
def get_baichuan2_autogptq(quantized_model_dir):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation.utils import GenerationConfig
    import torch

    # Model path
    model_id = quantized_model_dir

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_fast=False
    )

    '''
    warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default).
    This will lead to slow inference or training speed
    '''
    print("Loading quantized model...")
    quantized_model_4bit = AutoModelForCausalLM.from_pretrained(
        # model to load
        model_id,
        load_in_4bit=True,
        # use local files only, do not download from the hub
        local_files_only=True,
        # model precision
        torch_dtype=torch.float16,
        trust_remote_code=True,
        safetensors=True
    )

    print("Loading generation config...")
    quantized_model_4bit.generation_config = GenerationConfig.from_pretrained(
        model_id
    )

    # Quick test
    print("Generating...")
    messages = []
    messages.append({"role": "user", "content": "亚历山大为何如此厉害"})
    response = quantized_model_4bit.chat(tokenizer, messages)
    print(response)
    return response
Finally, the complete combined code:
'''The bin file stores the original model weights as loaded, before any quantization step;
otherwise loading will error out or fail!!!'''
def save_bin(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoModelForCausalLM
    import torch
    import os

    original_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_dir,
        trust_remote_code=True,
        torch_dtype=torch.float16,  # without this, the saved bin file is very large, roughly 50+ GB
        safetensors=True
    )
    print("Saving bin file...")
    model_path = os.path.join(quantized_model_dir, "pytorch_model" + ".bin")
    torch.save(original_model.state_dict(), model_path)
    print("Bin file saved.")


# Native auto_gptq library: quantization uses roughly 7-10 GB of GPU memory and takes about 23 minutes; inference uses about 18 GB
def from_authority_autogptq(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    import logging
    import torch
    import os

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Load the tokenizer used to build the calibration examples
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_dir,
        use_fast=False,
        trust_remote_code=True
    )
    examples = [
        tokenizer(
            "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]

    # Quantization settings
    quantize_config = BaseQuantizeConfig(
        bits=4,          # quantize model to 4-bit
        group_size=128,  # it is recommended to set the value to 128
        desc_act=False,  # set to False can significantly speed up inference but the perplexity may be slightly worse
    )

    # load un-quantized model, by default, the model will always be loaded into CPU memory
    quantize_model = AutoGPTQForCausalLM.from_pretrained(
        pretrained_model_dir,
        quantize_config=quantize_config,
        trust_remote_code=True,
        device_map="auto",
    )

    print("Starting model quantization...")
    quantize_model.quantize(examples)

    # save model weights
    print("Saving quantized files...")
    quantize_model.save_quantized(quantized_model_dir)
    print("Quantized files saved.")
    print("Saving tokenizer...")
    tokenizer.save_pretrained(quantized_model_dir)
    print("Tokenizer saved.")


# Method for loading the quantized model
def get_baichuan2_autogptq(quantized_model_dir):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation.utils import GenerationConfig
    import torch

    # Model path
    model_id = quantized_model_dir

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_fast=False
    )

    '''
    warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default).
    This will lead to slow inference or training speed
    '''
    print("Loading quantized model...")
    quantized_model_4bit = AutoModelForCausalLM.from_pretrained(
        # model to load
        model_id,
        load_in_4bit=True,
        # use local files only, do not download from the hub
        local_files_only=True,
        # model precision
        torch_dtype=torch.float16,
        trust_remote_code=True,
        safetensors=True
    )

    print("Loading generation config...")
    quantized_model_4bit.generation_config = GenerationConfig.from_pretrained(
        model_id
    )

    # Quick test
    print("Generating...")
    messages = []
    messages.append({"role": "user", "content": "```桥架\n1、名称:机房走线架(铝合金) 2、规格:300mm*100mm 3、含支吊架制作安装 4、其它:具体详见图纸、技术规范书、图集、招标文件、招标答疑、政府相关文件、规范等其它资料,满足验收要求```\n请仔细阅读上文,并从中分析出实体列表中的各实体。请使用json字典格式回答,其中,键为各实体名称,值为从文本中提取出的内容(若没有相应实体则值为'无')。\n实体列表如下(目标实体之间通过“;”隔开): ```名称;型号;材质;类型;规格;接地方式```"})
    response = quantized_model_4bit.chat(tokenizer, messages)
    print(response)
    return response


if __name__ == "__main__":
    # Quantize the model with the from_transformers_autogptq method
    # pretrained_model_dir = "/root/lk/big_model/Baichuan2-13B-Chat"
    # quantized_model_dir = "/root/lk/big_model/baichuan2_autogptq"
    # from_transformers_autogptq(pretrained_model_dir, quantized_model_dir)

    import datetime
    print("Program start time ------->>>>>>", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Paths
    pretrained_model_dir = "/root/lk/big_model/Baichuan2-13B-Chat"
    quantized_model_dir = "/root/lk/big_model/baichuan2_autogptq"

    # Step 1: save the original model's bin file first, then quantize (very important)
    # save_bin(pretrained_model_dir, quantized_model_dir)

    # Step 2: quantize the model with the native auto_gptq package
    # from_authority_autogptq(pretrained_model_dir, quantized_model_dir)

    # Step 3: run inference with the quantized model (requires adding the corresponding files)
    get_baichuan2_autogptq(quantized_model_dir)

    print("Program end time ------->>>>>>", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
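About the "(requires adding the corresponding files)" note in step 3: the inference function loads everything from quantized_model_dir with local_files_only=True and trust_remote_code=True, so the auxiliary files of the original checkpoint (generation_config.json and Baichuan's custom *.py modeling/tokenization files) usually have to be copied there as well. A hedged sketch, assuming the usual Baichuan2-13B-Chat file layout:

import glob
import os
import shutil

def copy_aux_files(pretrained_model_dir, quantized_model_dir):
    # Which files are needed is an assumption based on the usual Baichuan2-13B-Chat layout.
    wanted = ["generation_config.json"] + [
        os.path.basename(p) for p in glob.glob(os.path.join(pretrained_model_dir, "*.py"))
    ]
    for name in wanted:
        src = os.path.join(pretrained_model_dir, name)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(quantized_model_dir, name))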
Corresponding package versions:
auto-gptq==0.6.0
transformers==4.39.2
torch==2.0.1