本文基于DeepSeek-V2-Chat多卡推理,演示了几种不同的Profing方法

基于DeepSeek-V2-Chat多卡推理演示不同的Profing方法

本文基于DeepSeek-V2-Chat多卡推理,演示了几种不同的Profing方法
备注:

  • 1.torch prof.export_chrome_trace导出给ui.perfetto.dev可视化
  • 2.Nsight Compute可以给出性能瓶颈及优化建议

一.结果






二.操作步骤

bash 复制代码
tee prof.py <<-'EOF'
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from accelerate import init_empty_weights
import sys
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from torch.cuda.amp import autocast
import torch.cuda
import multiprocessing as mp
import inspect

model_name = "./models/deepseek-ai/DeepSeek-V2-Chat/"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
max_memory = {i: "23GB" for i in range(8)}
sys.path.insert(0,model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,attn_implementation="eager",torch_dtype=torch.bfloat16)
model=model.eval()
no_split_module_classes = ['DeepseekV2MLP','DeepseekV2Attention']
device_map = infer_auto_device_map(
                            model,max_memory=max_memory,
                            no_split_module_classes=no_split_module_classes,
                            dtype='float16')

model = dispatch_model(model, device_map=device_map)
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id
messages = [{"role": "user", "content": "Write a piece of quicksort code in C++"} ]

input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(input_tensor, max_new_tokens=100)
print("warm up done!")

def nsys_prof():
    torch.cuda.cudart().cudaProfilerStart()
    t0=time.time()
    outputs = model.generate(input_tensor, max_new_tokens=100)
    t1=time.time()
    print("e2e:",t1-t0)
    torch.cuda.cudart().cudaProfilerStop()

def torch_prof():
    from torch.profiler import profile
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True
    ) as prof:
        outputs = model.generate(input_tensor, max_new_tokens=100)
    prof.export_chrome_trace("torch_prof.json")

def torch_tensorboard_trace():
    prof = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            schedule=torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=0),
            on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
            record_shapes=True,
            profile_memory=True,
            with_stack=True)
    prof.start()
    for i in range(1):
        outputs = model.generate(input_tensor, max_new_tokens=100)
        torch.cuda.synchronize()
    prof.step()
    prof.stop()

def infer():
    outputs = model.generate(input_tensor, max_new_tokens=100)
    torch.cuda.synchronize()
        
def cprofile():
    import cProfile
    import pstats
    cProfile.run('infer()', 'restats')
    p = pstats.Stats('restats')
    p.sort_stats('cumulative').print_stats(50)

def torch_prof_threadpool():
    from torch.profiler import profile
    from concurrent.futures import ThreadPoolExecutor
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True
    ) as prof:
        def infer(index):
            past_key_values=None
            input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)            
            custom_stream = torch.cuda.Stream()
            t0=time.time()
            with torch.cuda.stream(custom_stream):
                outputs = model.generate(input_tensor,max_new_tokens=100)
            t1=time.time()
            print(index,t1-t0)
            return None
        with ThreadPoolExecutor(max_workers=4) as executor:
            results = list(executor.map(infer,range(0,4)))

    prof.export_chrome_trace("torch_prof_threadpool.json")

eval(sys.argv[1])()
EOF

nsys profile --stats=true -o cuda_profing_report.nsys-rep -f true -t cuda,nvtx \
                --gpu-metrics-device=0,1,2,3,4,5,6,7 \
                --capture-range=cudaProfilerApi \
                --capture-range-end=stop python prof.py nsys_prof
python prof.py torch_prof
mkdir log -p
python prof.py torch_tensorboard_trace
python prof.py cprofile
python prof.py torch_prof_threadpool

tee get_traceEvents.py <<-'EOF'
import json
import sys
filepath=sys.argv[1]
data=json.load(open(filepath,"r"))
f=open(f"traceEvents_{filepath}","w")
json.dump({"traceEvents":data['traceEvents']},f)
f.close()
EOF

python get_traceEvents.py torch_prof_threadpool.json
python get_traceEvents.py torch_prof.json
rm torch_prof.json torch_prof_threadpool.json -f

输出

bash 复制代码
 Time (%)  Total Time (ns)  Num Calls  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                      Name
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  ---------------------------------------------
     66.5       1387919547     200835    6910.7    6470.0      4070    518456       3693.0  cudaLaunchKernel
     18.8        391445205      20190   19388.1   17149.5      5360    793835      22195.6  cudaMemcpyAsync
      5.6        115988383      19115    6067.9    5810.0      3530    188359       3372.6  cudaMemsetAsync
      2.9         60523944      23956    2526.5    1970.0      1240    205099       2145.7  cudaStreamWaitEvent
      2.1         43275528       1904   22728.7    3120.0      2630   1948957      88939.7  cudaStreamSynchronize
      1.7         36032225      23956    1504.1    1260.0       930    477756       3194.3  cudaEventRecord
      1.4         28312023      23956    1181.8     950.0       700    481776       3321.9  cudaEventCreateWithFlags
      1.1         22670547      23956     946.3     730.0       570    465647       7535.8  cudaEventDestroy
      0.0           170028        100    1700.3    1640.0      1520      2060        140.6  cudaStreamIsCapturing_v10000
      0.0            33330         16    2083.1    1715.0      1390      4950        894.8  cudaOccupancyMaxActiveBlocksPerMultiprocessor
      0.0             7310          1    7310.0    7310.0      7310      7310          0.0  cuProfilerStart

[5/7] Executing 'cuda_gpu_kern_sum' stats report

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  --------  -----------  ----------------------------------------------------------------------------------------------------
     24.6        488154814      18117    26944.6    26720.0     25920     31935        892.4  ampere_bf16_s16816gemm_bf16_64x64_ldg8_f2f_stages_64x6_tn
     15.7        312293607      11187    27915.8    21408.0     21152    166178      15652.5  void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x2_tn_align8>(T1::Para...
     15.2        302994818       2970   102018.5   101168.5      9376    209283      92340.2  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n...
      9.8        195548625       3150    62078.9    41089.0      4064     91073      27330.1  void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x1_tn_align8>(T1::Para...
      8.7        172566020       1485   116206.1    40000.0     38976   1185823     285264.4  void gemv2T_kernel_val<int, int, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, float, (int)128, (int...
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report

 Time (%)  Total Time (ns)  Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)      Operation
 --------  ---------------  -----  --------  --------  --------  --------  -----------  ------------------
     53.3        847221455  13679   61935.9    1600.0      1120   1117006     155051.3  [CUDA memcpy DtoH]
     45.6        724555193  12181   59482.4    2528.0       448   1027370     143492.5  [CUDA memcpy HtoD]
      0.6         10113989  19115     529.1     480.0       447      1568        172.0  [CUDA memset]
      0.5          8129349   6308    1288.7    1280.0      1185      1728         90.1  [CUDA memcpy DtoD]

[7/7] Executing 'cuda_gpu_mem_size_sum' stats report

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)      Operation
 ----------  -----  --------  --------  --------  --------  -----------  ------------------
   6213.111  13679     0.454     0.010     0.000     6.963        1.129  [CUDA memcpy DtoH]
   6211.319  12181     0.510     0.010     0.000     6.963        1.184  [CUDA memcpy HtoD]
    134.408   6308     0.021     0.000     0.000     0.410        0.088  [CUDA memcpy DtoD]
     16.679  19115     0.001     0.000     0.000     0.035        0.005  [CUDA memset]

Generated:
    /home/autotrain/torch_prof/cuda_profing_report.nsys-rep
    /home/autotrain/torch_prof/cuda_profing_report.sqlite

warm up done!
e2e: 9.11670708656311

----------------------------------------------------------------------------------------------------------------
         3206138 function calls (2757755 primitive calls) in 6.868 seconds

   Ordered by: cumulative time
   List reduced from 330 to 50 due to restriction <50>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1        0.000    0.000    6.868    6.868 {built-in method builtins.exec}
        1        0.000    0.000    6.868    6.868 <string>:1(<module>)
        1        0.000    0.000    6.868    6.868 /home/autotrain/torch_prof/prof.py:75(infer)
   1401/1        0.019    0.000    6.868    6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/utils/_contextlib.py:112(decorate_context)
        1        0.000    0.000    6.868    6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:1440(generate)
        1        0.017    0.017    6.866    6.866 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:2310(_sample)
70800/100        0.064    0.000    6.661    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1528(_wrapped_call_impl)
70800/100        0.155    0.000    6.661    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1534(_call_impl)
58500/100        0.182    0.000    6.660    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:160(new_forward)
      100        0.001    0.000    6.613    0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1611(forward)
      100        0.007    0.000    6.594    0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1425(forward)
     1500        0.059    0.000    6.419    0.004 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1209(forward)
     1400        0.035    0.000    3.599    0.003 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:564(forward)
     1400        0.242    0.000    2.756    0.002 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:586(moe_infer)
     1500        0.253    0.000    2.093    0.001 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:799(forward)
     9900        0.229    0.000    1.973    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:386(forward)
    58500        0.070    0.000    1.388    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:316(pre_forward)
198455/117100    0.479    0.000    1.339    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:135(send_to_device)
    37300        0.076    0.000    1.090    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/linear.py:115(forward)
    38700        1.015    0.000    1.015    0.000 {built-in method torch._C._nn.linear}
    89249        0.789    0.000    0.789    0.000 {method 'to' of 'torch._C.TensorBase' objects}
     6100        0.278    0.000    0.687    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:103(forward)
61684/58699      0.068    0.000    0.657    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:73(honor_type)
127539/118584    0.051    0.000    0.557    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:181(<genexpr>)
     1400        0.073    0.000    0.370    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:418(forward)
     1500        0.161    0.000    0.357    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:337(apply_rotary_pos_emb)
738474/604072    0.117    0.000    0.261    0.000 {built-in method builtins.isinstance}
    58600        0.020    0.000    0.203    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:189(<dictcomp>)
     1400        0.193    0.000    0.193    0.000 {method 'cpu' of 'torch._C.TensorBase' objects}
     2985        0.010    0.000    0.160    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/cache_utils.py:123(update)
    67201        0.034    0.000    0.159    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:993(__instancecheck__)
     9900        0.008    0.000    0.142    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/activation.py:395(forward)
     9900        0.010    0.000    0.135    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/functional.py:2080(silu)
    67201        0.041    0.000    0.125    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:1154(__subclasscheck__)
     7570        0.125    0.000    0.125    0.000 {built-in method torch.cat}
   265447        0.079    0.000    0.124    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:44(is_torch_tensor)
     9900        0.122    0.000    0.122    0.000 {built-in method torch._C._nn.silu}
     3000        0.071    0.000    0.119    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:329(rotate_half)
   155900        0.114    0.000    0.114    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1696(__getattr__)
     3000        0.097    0.000    0.097    0.000 {built-in method torch.matmul}
     6100        0.086    0.000    0.086    0.000 {method 'mean' of 'torch._C.TensorBase' objects}
     5900        0.084    0.000    0.084    0.000 {method 'reshape' of 'torch._C.TensorBase' objects}
     6100        0.078    0.000    0.078    0.000 {method 'pow' of 'torch._C.TensorBase' objects}
     5600        0.072    0.000    0.072    0.000 {method 'type' of 'torch._C.TensorBase' objects}
     1400        0.072    0.000    0.072    0.000 {method 'argsort' of 'torch._C.TensorBase' objects}
      203        0.070    0.000    0.070    0.000 {built-in method torch.tensor}
    67202        0.025    0.000    0.069    0.000 {built-in method builtins.issubclass}
    70800        0.060    0.000    0.060    0.000 {built-in method torch._C._get_tracing_state}
      200        0.002    0.000    0.058    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/logits_process.py:72(__call__)
     6100        0.058    0.000    0.058    0.000 {built-in method torch.rsqrt}
----------------------------------------------------------------------------------------------------------------
相关推荐
5G全域通1 分钟前
面向5G复杂性的下一代运维技术体系:架构、工具与实践
大数据·运维·人工智能·5g·架构
你们补药再卷啦4 分钟前
人工智能算法概览
人工智能·算法
悟纤6 分钟前
续写卡在 2 秒?解决方案全解析|Suno 进阶指南|第 13 篇
人工智能·suno·suno ai·suno api·ai music
RockHopper202514 分钟前
企业运营认知机器人的落地规范说明 —— 一种以工程化实现/商业化落地为目的设计原则
人工智能·llm·认知机器人·认知导向
吃人陈乐游刘15 分钟前
05实战经验X-anylabelingAI自动标注数据集-onnx简单解绍(2025年12月)
人工智能·深度学习
AndrewHZ19 分钟前
【图像处理基石】什么是光栅化?
图像处理·人工智能·算法·计算机视觉·3d·图形渲染·光栅化
阿星AI工作室21 分钟前
第一次围观AI打牌,明星模型居然集体翻车?丨开源项目CATArena拆解
人工智能
jqrbcts28 分钟前
关于发那科机器人如何时时把角度发给PLC
java·服务器·网络·人工智能
Rainly200029 分钟前
深度学习旅程之数学统计底座
人工智能·深度学习
QBoson32 分钟前
AI设计RNA开关新突破:受限玻尔兹曼机让人工分子“听懂”代谢物信号
人工智能