DeepSeek-V2-Chat多卡推理(不考虑性能)

@TOC

本文演示如何使用 accelerate 对 DeepSeek-V2-Chat 进行多卡推理(使用裁剪后的模型,仅演示如何将权重拆分到多张显卡上,不考虑性能)

代码

python 复制代码
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from accelerate import init_empty_weights
import sys
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from torch.cuda.amp import autocast
from torch.utils._python_dispatch import TorchDispatchMode
from dataclasses import dataclass
from typing import Any
import torch.cuda
import multiprocessing as mp

@dataclass
class _ProfilerState:
    """Pair a dispatch-mode class with the instance currently built from it."""
    # Class that TorchDumper instantiates on entry, called as cls(owner, **kwargs).
    cls: Any
    # Live instance of `cls`; None while no instance is active.
    # NOTE: the field name shadows the builtin `object` inside this class body;
    # it is kept because TorchDumper accesses the field by this name.
    object: Any = None

class TorchDumpDispatchMode(TorchDispatchMode):
    """Dispatch mode that puts an op's tensor arguments on a single GPU.

    Before each intercepted op, the positional arguments are scanned for
    tensors, including tensors nested inside list/tuple arguments (such as
    the input list of ``aten.cat``). If at least one tensor is on a CUDA
    device, every other tensor is moved to the CUDA device with the highest
    index among them, and the op is then executed with the original
    arguments. Keyword arguments are not inspected.
    """

    def __init__(self, parent):
        """Remember the creating object and reset the counters.

        Args:
            parent: Object that created this mode. It is stored on the
                instance but not otherwise used by this class.
        """
        super().__init__()
        self.parent = parent
        self.op_index = 0   # number of ops intercepted so far
        self.cvt_count = 0  # never updated inside this class

    def get_max_gpu_id(self, tensors):
        """Locate the tensor that lives on the highest-numbered CUDA device.

        Args:
            tensors: Sequence whose items may or may not be ``torch.Tensor``.

        Returns:
            A tuple ``(max_index, max_gpu_id, tensor_index)``:
            ``tensor_index`` lists the positions of all tensors in
            ``tensors``; ``max_index`` is the position of the first tensor on
            the highest CUDA device and ``max_gpu_id`` is that device index.
            Both are ``None`` when no tensor is on a CUDA device.
        """
        tensor_index = []
        max_gpu_id = -1
        max_index = -1
        for position, item in enumerate(tensors):
            if not isinstance(item, torch.Tensor):
                continue
            tensor_index.append(position)
            if item.is_cuda:
                gpu_id = item.get_device()
                if gpu_id > max_gpu_id:
                    max_gpu_id = gpu_id
                    max_index = position
        if max_gpu_id == -1:
            return None, None, tensor_index
        return max_index, max_gpu_id, tensor_index

    def convert(self, op_type, tensor_list):
        """Move every tensor in ``tensor_list`` to the highest CUDA device used.

        The move is done by assigning to ``.data``, so the tensor objects
        passed in are updated themselves rather than replaced by copies.
        Nothing happens when no tensor is on a CUDA device.

        Args:
            op_type: Name of the op being dispatched (informational only).
            tensor_list: Sequence of candidate arguments; non-tensor items
                are ignored.
        """
        index, gpu_id, tensor_index = self.get_max_gpu_id(tensor_list)
        if index is None:
            return
        device = torch.device(f"cuda:{gpu_id}")
        for position in tensor_index:
            if position == index:
                continue
            tensor = tensor_list[position]
            if tensor.device != device:
                # Author's note: inter-GPU transfers are serialized, so using
                # multiple streams would not improve throughput much.
                tensor.data = tensor.data.to(device, non_blocking=True)

    @staticmethod
    def _collect_tensors(values):
        """Return all tensors in ``values``, descending into lists/tuples."""
        found = []
        for value in values:
            if isinstance(value, torch.Tensor):
                found.append(value)
            elif isinstance(value, (list, tuple)):
                found.extend(TorchDumpDispatchMode._collect_tensors(value))
        return found

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        """Align the devices of the positional tensor arguments, then run ``func``.

        ``args`` always arrives as a tuple, so tensors held inside list or
        tuple arguments are gathered recursively; checking only the top
        level would leave such ops with tensors on different devices.
        """
        if kwargs is None:
            kwargs = {}
        self.op_index += 1
        self.convert(f"{func}", self._collect_tensors(args))
        return func(*args, **kwargs)

class TorchDumper:
    """Context manager that keeps a TorchDumpDispatchMode active for its body.

    Keyword arguments given to the constructor are forwarded to the mode's
    constructor when the context is first entered.
    """

    def __init__(self, **kwargs):
        """Record the mode class to use and the kwargs to build it with."""
        self.p = _ProfilerState(TorchDumpDispatchMode)
        self.kwargs = kwargs

    def __enter__(self):
        """Create and enter the dispatch mode; return this dumper."""
        mode = self.p.object
        if mode is None:
            mode = self.p.cls(self, **self.kwargs)
            mode.__enter__()
            # Stored only after a successful enter, so a failed enter leaves
            # the dumper in its inactive state.
            self.p.object = mode
        else:
            # Entered again while a mode is already active. Only call step()
            # when the mode class provides one: TorchDumpDispatchMode itself
            # defines no such method, so an unconditional call would raise
            # AttributeError.
            step = getattr(mode, "step", None)
            if callable(step):
                step()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the dispatch mode (if one is active) and forget it."""
        TorchDumper._CURRENT_Dumper = None
        mode = self.p.object
        if mode is not None:
            try:
                mode.__exit__(exc_type, exc_val, exc_tb)
            finally:
                # Reset explicitly instead of deleting the attribute, so the
                # state does not depend on the dataclass' class-level default
                # and is cleared even if the mode's __exit__ raises.
                self.p.object = None

# Local checkpoint directory; the tokenizer, model weights and generation
# config are all loaded from here.
model_name = "./models/deepseek-ai/DeepSeek-V2-Chat/"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Memory budget per device index 0-7, handed to accelerate below.
max_memory = {i: "23GB" for i in range(8)}
# Put the checkpoint directory on the import path
# (presumably for the checkpoint's custom modelling code -- TODO confirm).
sys.path.insert(0,model_name)

model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,attn_implementation="eager",torch_dtype=torch.bfloat16)
model=model.eval()

# Module classes that accelerate must keep whole on one device. Splitting at
# MLP/attention granularity lets a single decoder layer span two devices.
# Coarser alternative: no_split_module_classes = ['DeepseekV2DecoderLayer']
no_split_module_classes = ['DeepseekV2MLP','DeepseekV2Attention']

# Size the placement with the dtype the model was actually loaded in
# (bfloat16), so the device map is computed for the same weights.
device_map = infer_auto_device_map(
                    model,
                    max_memory=max_memory,
                    no_split_module_classes=no_split_module_classes,
                    dtype=torch.bfloat16)

model = dispatch_model(model, device_map=device_map)
model.generation_config = GenerationConfig.from_pretrained(model_name)
# Reuse the EOS token id as the pad token id for generation.
model.generation_config.pad_token_id = model.generation_config.eos_token_id

messages = [{"role": "user", "content": "Write a piece of quicksort code in C++"} ]
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
# Generate under the dispatch mode so that each op's tensor arguments are
# brought onto one GPU before the op runs.
with TorchDumper():
    outputs = model.generate(input_tensor.to(model.device), max_new_tokens=100)
# Decode only the newly generated tokens (everything after the prompt).
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)
相关推荐
伊布拉西莫21 分钟前
LangChain LCEL源码深度剖析
python·langchain
用心_承载未来22 分钟前
从“复制链接→打开APP“到“一键解析“:我做了个短视频去水印工具
python·去水印·短视频去水印
TYUT_xiaoming24 分钟前
yolo模型训练
人工智能·python·yolo
MageGojo1 小时前
百度热搜API接入实战:数据结构解析与工程化调用指南
python·数据抓取·api集成·热点数据·接口调试
TechWayfarer1 小时前
查IP归属地接入实战:保险理赔如何做动态风险监控与预警
网络·python·tcp/ip·安全·flask
(Charon)1 小时前
【C++ 面试高频:内存管理、RAII 和智能指针详解】
java·开发语言·word
speop2 小时前
AMD | task02
python
轻刀快马2 小时前
跨越软硬件的共鸣(二):从 Cache 写策略看 Redis 与 DB 的一致性博弈
java·开发语言·redis·计算机组成原理
lili00122 小时前
2026 企业 AI 选型新范式:OpenRouter Fusion 证明多模型融合性价比远超单模型,企业该如何重构技术栈? - 微元算力(weytoken)
java·人工智能·python·重构·ai编程
Keano Reurink2 小时前
搜索API与GSC数据对比:发现数据盲区
数据库·python·数据挖掘