
BaiChuan13B GPTQ Quantization in Detail


Key points:
1. With much of the code found online (for example, post-training quantization using the native auto_gptq library), the quantization step itself may run fine, but online inference then fails with errors such as the bin or tf model weight file not being found. The key difference from most of that code is that the model's weight file must be saved ahead of time: for BaiChuan13B, save it before running the quantization.
The code is as follows:

def save_bin(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoModelForCausalLM
    import torch
    import os

    original_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_dir,
        trust_remote_code=True,
        torch_dtype=torch.float16,      # without this, the saved bin file is huge, roughly 50+ GB
        safetensors=True
    )
    print("Saving bin file...")
    os.makedirs(quantized_model_dir, exist_ok=True)   # make sure the output directory exists
    model_path = os.path.join(quantized_model_dir, "pytorch_model.bin")
    torch.save(original_model.state_dict(), model_path)
    print("Finished saving bin file.")

Quantization code, using the native auto_gptq library:

def from_authority_autogptq(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoTokenizer
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Load the tokenizer used for quantization
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_dir,
        use_fast=False,
        trust_remote_code=True
    )

    # Calibration examples
    examples = [
        tokenizer(
            "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]

    # Quantization parameters
    quantize_config = BaseQuantizeConfig(
        bits=4,             # quantize model to 4-bit
        group_size=128,     # 128 is the recommended value
        desc_act=False,     # False significantly speeds up inference, at a slight cost in perplexity
    )

    # Load the un-quantized model; by default the model is loaded into CPU memory
    quantize_model = AutoGPTQForCausalLM.from_pretrained(
        pretrained_model_dir,
        quantize_config=quantize_config,
        trust_remote_code=True,
        device_map="auto",
    )

    print("Quantizing the model...")
    quantize_model.quantize(examples)

    # Save the quantized model weights
    print("Saving quantized files...")
    quantize_model.save_quantized(quantized_model_dir)
    print("Finished saving quantized files.")

    print("Saving tokenizer...")
    tokenizer.save_pretrained(quantized_model_dir)
    print("Finished saving tokenizer.")

With the steps above, the quantized model files are saved successfully. Next comes online inference:

def get_baichuan2_autogptq(quantized_model_dir):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation.utils import GenerationConfig
    import torch

    # Model path
    model_id = quantized_model_dir

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_fast=False
    )

    '''
    A warning like the following may appear:
    warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default).
    This will lead to slow inference or training speed')
    '''

    print("Loading quantized model...")
    quantized_model_4bit = AutoModelForCausalLM.from_pretrained(
        # Model to load
        model_id,
        load_in_4bit=True,
        # Use only local files, do not download from the network
        local_files_only=True,
        # Model precision
        torch_dtype=torch.float16,
        trust_remote_code=True,
        safetensors=True
    )

    print("Loading generation config...")
    quantized_model_4bit.generation_config = GenerationConfig.from_pretrained(
        model_id
    )

    # Quick test
    print("Generating...")
    messages = []
    messages.append({"role": "user", "content": "亚历山大为何如此厉害"})
    response = quantized_model_4bit.chat(tokenizer, messages)
    print(response)
    return response
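
As an alternative to loading through transformers with load_in_4bit, the GPTQ output can also be loaded with auto_gptq's own from_quantized interface. A minimal sketch, assuming a single-GPU setup and that the quantized directory contains the files written by save_quantized plus the tokenizer; it runs plain generation as a smoke test rather than the Baichuan chat() interface:

def get_baichuan2_autogptq_native(quantized_model_dir):
    # Minimal sketch using auto_gptq's own loader instead of transformers' 4-bit path
    from auto_gptq import AutoGPTQForCausalLM
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        quantized_model_dir,
        use_fast=False,
        trust_remote_code=True
    )
    model = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        device="cuda:0",           # assumption: single GPU
        trust_remote_code=True
    )

    # Plain-generation smoke test (no chat template applied)
    prompt = "亚历山大为何如此厉害"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")
    output_ids = model.generate(**inputs, max_new_tokens=256)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(text)
    return text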

Finally, the complete integrated code:

'''The bin file stores the original loaded model weights, with no quantization applied; otherwise errors occur or the model cannot be loaded!!!'''
def save_bin(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoModelForCausalLM
    import torch
    import os

    original_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_dir,
        trust_remote_code=True,
        torch_dtype=torch.float16,      # without this, the saved bin file is huge, roughly 50+ GB
        safetensors=True
    )
    print("Saving bin file...")
    os.makedirs(quantized_model_dir, exist_ok=True)   # make sure the output directory exists
    model_path = os.path.join(quantized_model_dir, "pytorch_model.bin")
    torch.save(original_model.state_dict(), model_path)
    print("Finished saving bin file.")



# Native auto_gptq library: quantization uses roughly 7-10 GB of GPU memory and takes about 23 minutes; inference uses about 18 GB
def from_authority_autogptq(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoTokenizer
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Load the tokenizer used for quantization
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_dir,
        use_fast=False,
        trust_remote_code=True
    )

    # Calibration examples
    examples = [
        tokenizer(
            "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]

    # Quantization parameters
    quantize_config = BaseQuantizeConfig(
        bits=4,             # quantize model to 4-bit
        group_size=128,     # 128 is the recommended value
        desc_act=False,     # False significantly speeds up inference, at a slight cost in perplexity
    )

    # Load the un-quantized model; by default the model is loaded into CPU memory
    quantize_model = AutoGPTQForCausalLM.from_pretrained(
        pretrained_model_dir,
        quantize_config=quantize_config,
        trust_remote_code=True,
        device_map="auto",
    )

    print("Quantizing the model...")
    quantize_model.quantize(examples)

    # Save the quantized model weights
    print("Saving quantized files...")
    quantize_model.save_quantized(quantized_model_dir)
    print("Finished saving quantized files.")

    print("Saving tokenizer...")
    tokenizer.save_pretrained(quantized_model_dir)
    print("Finished saving tokenizer.")



# Load the quantized model and run inference
def get_baichuan2_autogptq(quantized_model_dir):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation.utils import GenerationConfig
    import torch

    # Model path
    model_id = quantized_model_dir

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_fast=False
    )

    '''
    A warning like the following may appear:
    warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default).
    This will lead to slow inference or training speed')
    '''

    print("Loading quantized model...")
    quantized_model_4bit = AutoModelForCausalLM.from_pretrained(
        # Model to load
        model_id,
        load_in_4bit=True,
        # Use only local files, do not download from the network
        local_files_only=True,
        # Model precision
        torch_dtype=torch.float16,
        trust_remote_code=True,
        safetensors=True
    )

    print("Loading generation config...")
    quantized_model_4bit.generation_config = GenerationConfig.from_pretrained(
        model_id
    )

    # Quick test
    print("Generating...")
    messages = []
    messages.append({"role": "user", "content": "```桥架\n1、名称:机房走线架(铝合金) 2、规格:300mm*100mm 3、含支吊架制作安装 4、其它:具体详见图纸、技术规范书、图集、招标文件、招标答疑、政府相关文件、规范等其它资料,满足验收要求```\n请仔细阅读上文,并从中分析出实体列表中的各实体。请使用json字典格式回答,其中,键为各实体名称,值为从文本中提取出的内容(若没有相应实体则值为'无')。\n实体列表如下(目标实体之间通过“;”隔开): ```名称;型号;材质;类型;规格;接地方式```"})
    response = quantized_model_4bit.chat(tokenizer, messages)
    print(response)
    return response





if __name__ == "__main__":
    # Quantize the model with the from_transformers_autogptq method
    # pretrained_model_dir = "/root/lk/big_model/Baichuan2-13B-Chat"
    # quantized_model_dir = "/root/lk/big_model/baichuan2_autogptq"
    # from_transformers_autogptq(pretrained_model_dir, quantized_model_dir)

    import datetime
    print("Start time ------->>>>>>", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Paths
    pretrained_model_dir = "/root/lk/big_model/Baichuan2-13B-Chat"
    quantized_model_dir = "/root/lk/big_model/baichuan2_autogptq"

    # Step 1: save the original model's bin file, then quantize (very important)
    # save_bin(pretrained_model_dir, quantized_model_dir)

    # Step 2: quantize the model with the native auto_gptq package
    # from_authority_autogptq(pretrained_model_dir, quantized_model_dir)

    # Step 3: run inference with the quantized model (the corresponding files need to be added; see the file-copy sketch after this script)
    get_baichuan2_autogptq(quantized_model_dir)

    print("End time ------->>>>>>", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    

Package versions:

auto-gptq==0.6.0
transformers==4.39.2
torch==2.0.1