BaiChuan13B GPTQ Quantization in Detail

Key points:

1. With much of the code found online (for example, post-training quantization with the native auto_gptq library), the quantization itself may complete normally, but online inference then fails with errors such as the .bin or TensorFlow weight files (i.e., the model weight files) not being found. The key difference from most code online is therefore that the model's weight file must be saved ahead of time: for BaiChuan13B, save it before running the quantization.

The code is as follows:

def save_bin(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoModelForCausalLM
    import torch
    import os
    
    original_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_dir, 
        trust_remote_code=True,
        torch_dtype=torch.float16,      # without this, the saved bin file is huge, roughly 50+ GB
    )
    print("Saving bin file...")
    os.makedirs(quantized_model_dir, exist_ok=True)  # make sure the output directory exists
    model_path = os.path.join(quantized_model_dir, "pytorch_model.bin")
    torch.save(original_model.state_dict(), model_path)
    print("Finished saving bin file...")

Quantization code, using the native auto_gptq library:

def from_authority_autogptq(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    import logging
    import torch
    import os
    
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
    )


    # Load the tokenizer used for quantization
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_dir, 
        use_fast=False, 
        trust_remote_code=True
    )
    
    examples = [
        tokenizer(
            "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]
    
    # Quantization parameter configuration
    quantize_config = BaseQuantizeConfig(
        bits=4,             # quantize model to 4-bit
        group_size=128,     # it is recommended to set the value to 128
        desc_act=False,     # set to False to significantly speed up inference, at a slight cost in perplexity
    )

    # load un-quantized model, by default, the model will always be loaded into CPU memory
    quantize_model = AutoGPTQForCausalLM.from_pretrained(
        pretrained_model_dir, 
        quantize_config=quantize_config, 
        trust_remote_code=True,
        device_map="auto",
    )
    
    
    print("开始量化模型.......")
    quantize_model.quantize(examples)
    
    # save model weights
    print("保存量化文件...")
    quantize_model.save_quantized(quantized_model_dir)
    print("保存量化文件完成...")
    
    print("保存tokenizer...")
    tokenizer.save_pretrained(quantized_model_dir)
    print("保存tokenizer完成...")

After the steps above, the quantized model files have been saved successfully; the next step is online inference with the model.

def get_baichuan2_autogptq(quantized_model_dir):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation.utils import GenerationConfig
    import torch
    # Model path
    model_id = quantized_model_dir
    
    print("加载分词器tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, 
        trust_remote_code=True,
        use_fast=False
    )
    
    # Note: during inference you may see a warning like:
    #   "Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default).
    #    This will lead to slow inference or training speed."
    
    print("加载量化model...")
    quantized_model_4bit = AutoModelForCausalLM.from_pretrained(
        # 要载入的模型名称
        model_id, 
        load_in_4bit=True,
        # 仅使用本地模型,不通过网络下载模型
        local_files_only=True,
        # 指定模型精度
        torch_dtype=torch.float16,
        trust_remote_code=True,
        safetensors=True
    )
    
    print("加载config...")
    quantized_model_4bit.generation_config = GenerationConfig.from_pretrained(
        model_id
    )

    # Example test
    print("Generating...")
    messages = []
    messages.append({"role": "user", "content":"亚历山大为何如此厉害"})
    response = quantized_model_4bit.chat(tokenizer, messages)
    print(response)
    return response 
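
Note that this loading path goes through transformers with load_in_4bit (bitsandbytes), which is what triggers the Linear4bit warning quoted in the comment above. auto_gptq also provides AutoGPTQForCausalLM.from_quantized for loading the weights written by save_quantized directly; a minimal sketch of that alternative (not the method used in this article, and whether it works out of the box with Baichuan's custom code depends on which files are present in the directory):

def load_with_autogptq(quantized_model_dir):
    from auto_gptq import AutoGPTQForCausalLM
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        quantized_model_dir, use_fast=False, trust_remote_code=True
    )
    # Load the GPTQ-quantized weights produced by save_quantized().
    model = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        device="cuda:0",
        trust_remote_code=True,
    )
    return tokenizer, model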

Finally, the complete code put together:

'''The bin file stores the original model weights as loaded; it is not part of the quantization step itself. Without it, loading will report errors or the model cannot be loaded at all!!!'''
def save_bin(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoModelForCausalLM
    import torch
    import os
    
    original_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_dir, 
        trust_remote_code=True,
        torch_dtype=torch.float16,      # without this, the saved bin file is huge, roughly 50+ GB
    )
    print("Saving bin file...")
    os.makedirs(quantized_model_dir, exist_ok=True)  # make sure the output directory exists
    model_path = os.path.join(quantized_model_dir, "pytorch_model.bin")
    torch.save(original_model.state_dict(), model_path)
    print("Finished saving bin file...")



# Native auto_gptq library: quantization uses roughly 7-10 GB of GPU memory and takes about 23 minutes; inference uses about 18 GB
def from_authority_autogptq(pretrained_model_dir, quantized_model_dir):
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    import logging
    import torch
    import os
    
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
    )


    # Load the tokenizer used for quantization
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_dir, 
        use_fast=False, 
        trust_remote_code=True
    )
    
    examples = [
        tokenizer(
            "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]
    
    # Quantization parameter configuration
    quantize_config = BaseQuantizeConfig(
        bits=4,             # quantize model to 4-bit
        group_size=128,     # it is recommended to set the value to 128
        desc_act=False,     # set to False to significantly speed up inference, at a slight cost in perplexity
    )

    # load un-quantized model, by default, the model will always be loaded into CPU memory
    quantize_model = AutoGPTQForCausalLM.from_pretrained(
        pretrained_model_dir, 
        quantize_config=quantize_config, 
        trust_remote_code=True,
        device_map="auto",
    )
    
    
    print("开始量化模型.......")
    quantize_model.quantize(examples)
    
    # save model weights
    print("保存量化文件...")
    quantize_model.save_quantized(quantized_model_dir)
    print("保存量化文件完成...")
    
    print("保存tokenizer...")
    tokenizer.save_pretrained(quantized_model_dir)
    print("保存tokenizer完成...")



# Load the quantized model and run inference
def get_baichuan2_autogptq(quantized_model_dir):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation.utils import GenerationConfig
    import torch
    # Model path
    model_id = quantized_model_dir
    
    print("加载分词器tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, 
        trust_remote_code=True,
        use_fast=False
    )
    
    # Note: during inference you may see a warning like:
    #   "Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default).
    #    This will lead to slow inference or training speed."
    
    print("加载量化model...")
    quantized_model_4bit = AutoModelForCausalLM.from_pretrained(
        # 要载入的模型名称
        model_id, 
        load_in_4bit=True,
        # 仅使用本地模型,不通过网络下载模型
        local_files_only=True,
        # 指定模型精度
        torch_dtype=torch.float16,
        trust_remote_code=True,
        safetensors=True
    )
    
    print("加载config...")
    quantized_model_4bit.generation_config = GenerationConfig.from_pretrained(
        model_id
    )

    # Example test
    print("Generating...")
    messages = []
    messages.append({"role": "user", "content":"```桥架\n1、名称:机房走线架(铝合金) 2、规格:300mm*100mm 3、含支吊架制作安装 4、其它:具体详见图纸、技术规范书、图集、招标文件、招标答疑、政府相关文件、规范等其它资料,满足验收要求```\n请仔细阅读上文,并从中分析出实体列表中的各实体。请使用json字典格式回答,其中,键为各实体名称,值为从文本中提取出的内容(若没有相应实体则值为'无')。\n实体列表如下(目标实体之间通过";"隔开): ```名称;型号;材质;类型;规格;接地方式```"})
    response = quantized_model_4bit.chat(tokenizer, messages)
    print(response)
    return response 





if __name__ == "__main__":
    # Quantize the model with the from_transformers_autogptq method
    # pretrained_model_dir = "/root/lk/big_model/Baichuan2-13B-Chat"
    # quantized_model_dir = "/root/lk/big_model/baichuan2_autogptq"
    # from_transformers_autogptq(pretrained_model_dir, quantized_model_dir)
    
    import datetime
    print("程序开始时间------->>>>>>", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    
    
    # Paths
    pretrained_model_dir = "/root/lk/big_model/Baichuan2-13B-Chat"
    quantized_model_dir = "/root/lk/big_model/baichuan2_autogptq"
    
    # Step 1: save the original model's bin file first, then quantize (critical)
    # save_bin(pretrained_model_dir, quantized_model_dir)
    
    # Step 2: quantize the model with the native auto_gptq package
    # from_authority_autogptq(pretrained_model_dir, quantized_model_dir)
    
    # Step 3: run inference with the quantized model (the corresponding files need to be added)
    get_baichuan2_autogptq(quantized_model_dir)
    
    
    print("程序结束时间------->>>>>>", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

Package versions:

auto-gptq==0.6.0
transformers==4.39.2
torch==2.0.1
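
A matching environment can be installed with pip. Note that the load_in_4bit path above also requires bitsandbytes and accelerate, whose versions are not pinned in the original list; the command below is a minimal sketch assuming a CUDA-compatible torch 2.0.1 wheel is available from your package index.

pip install torch==2.0.1 transformers==4.39.2 auto-gptq==0.6.0
pip install bitsandbytes accelerate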