本文基于DeepSeek-V2-Chat多卡推理,演示了几种不同的Profiling方法

基于DeepSeek-V2-Chat多卡推理演示不同的Profiling方法

本文基于DeepSeek-V2-Chat多卡推理,演示了几种不同的Profiling方法
备注:

  • 1.torch prof.export_chrome_trace导出给ui.perfetto.dev可视化
  • 2.Nsight Compute可以给出性能瓶颈及优化建议

一.结果






二.操作步骤

bash 复制代码
tee prof.py <<-'EOF'
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from accelerate import init_empty_weights
import sys
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from torch.cuda.amp import autocast
import torch.cuda
import multiprocessing as mp
import inspect

# --- Model setup: shard DeepSeek-V2-Chat across 8 GPUs and run one warm-up pass ---
model_name = "./models/deepseek-ai/DeepSeek-V2-Chat/"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Cap each of the 8 devices at 23GB so infer_auto_device_map balances layers.
max_memory = {i: "23GB" for i in range(8)}
# The model repo ships its own modeling code; make it importable.
sys.path.insert(0,model_name)
# "eager" attention keeps individual ops visible to the profilers (no fused SDPA).
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,attn_implementation="eager",torch_dtype=torch.bfloat16)
model=model.eval()
# Keep these submodules whole on a single device when splitting across GPUs.
no_split_module_classes = ['DeepseekV2MLP','DeepseekV2Attention']
# NOTE(review): dtype='float16' here while the weights load as bfloat16 —
# presumably only used for memory-size estimation (same byte width), but confirm.
device_map = infer_auto_device_map(
                            model,max_memory=max_memory,
                            no_split_module_classes=no_split_module_classes,
                            dtype='float16')

model = dispatch_model(model, device_map=device_map)
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id
messages = [{"role": "user", "content": "Write a piece of quicksort code in C++"} ]

input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
# Warm-up generate: triggers CUDA context init / lazy kernels so the
# profiled runs below measure steady-state behavior.
outputs = model.generate(input_tensor, max_new_tokens=100)
print("warm up done!")

def nsys_prof():
    """Profile a single generate() call with Nsight Systems.

    Brackets the generation between cudaProfilerStart/Stop so that
    ``nsys --capture-range=cudaProfilerApi`` records only this region,
    and prints the end-to-end wall-clock time.
    """
    cudart = torch.cuda.cudart()
    cudart.cudaProfilerStart()
    start = time.time()
    model.generate(input_tensor, max_new_tokens=100)
    elapsed = time.time() - start
    print("e2e:", elapsed)
    cudart.cudaProfilerStop()

def torch_prof():
    """Capture one generate() call with the PyTorch profiler and export a
    Chrome trace (torch_prof.json) viewable at ui.perfetto.dev.

    Records CPU + CUDA activity with shapes, memory, stacks and FLOPs.
    (Removed an unused ``from torch.profiler import profile`` — the code
    uses the fully-qualified ``torch.profiler.profile`` instead.)
    """
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True,
    ) as prof:
        model.generate(input_tensor, max_new_tokens=100)
    prof.export_chrome_trace("torch_prof.json")

def torch_tensorboard_trace():
    """Capture generate() with the PyTorch profiler scheduling API and write
    the trace via tensorboard_trace_handler into ./log.

    Bug fix: ``prof.step()`` must be called once per iteration *inside* the
    loop to advance the wait/warmup/active schedule; the original called it
    after the loop, which only worked by accident because the loop ran once.
    """
    prof = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            # wait=0, warmup=0, active=1: record the very first step only.
            schedule=torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=0),
            on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
            record_shapes=True,
            profile_memory=True,
            with_stack=True)
    prof.start()
    for _ in range(1):
        model.generate(input_tensor, max_new_tokens=100)
        # Drain queued CUDA work so the step's kernels are fully captured.
        torch.cuda.synchronize()
        prof.step()
    prof.stop()

def infer():
    """Run one generation pass and block until all queued CUDA work finishes,
    so wall-clock measurements around this call are meaningful."""
    model.generate(input_tensor, max_new_tokens=100)
    torch.cuda.synchronize()
        
def cprofile():
    """Profile one inference pass with cProfile and print the 50 functions
    with the largest cumulative time.

    Improvements over the original:
    - ``Profile.runcall(infer)`` profiles the callable directly instead of
      eval'ing the string ``'infer()'``.
    - Stats are read from the in-memory profiler, so no stray ``restats``
      temp file is left on disk.
    """
    import cProfile
    import pstats
    profiler = cProfile.Profile()
    profiler.runcall(infer)
    stats = pstats.Stats(profiler)
    stats.sort_stats('cumulative').print_stats(50)

def torch_prof_threadpool():
    """Profile 4 concurrent generate() requests, each issued on its own CUDA
    stream from a thread pool, and export a Chrome trace.

    Cleanups: dropped an unused ``from torch.profiler import profile`` and an
    unused ``past_key_values`` local; renamed the inner function so it no
    longer shadows the module-level ``infer``.
    """
    from concurrent.futures import ThreadPoolExecutor
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True
    ) as prof:
        def worker(index):
            # Each thread builds its own input and issues work on a private
            # CUDA stream so kernels from the 4 requests can overlap.
            inputs = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)
            stream = torch.cuda.Stream()
            t0 = time.time()
            with torch.cuda.stream(stream):
                model.generate(inputs, max_new_tokens=100)
            print(index, time.time() - t0)
            return None
        with ThreadPoolExecutor(max_workers=4) as executor:
            list(executor.map(worker, range(4)))

    prof.export_chrome_trace("torch_prof_threadpool.json")

# Dispatch to the profiling function named on the command line.
# Look the name up in module globals instead of eval()'ing arbitrary argv
# text; an unknown name fails fast with a KeyError.
globals()[sys.argv[1]]()
EOF

# Nsight Systems: record only the region bracketed by cudaProfilerStart/Stop
# (the nsys_prof function), collecting GPU metrics from all 8 devices.
nsys profile --stats=true -o cuda_profing_report.nsys-rep -f true -t cuda,nvtx \
                --gpu-metrics-device=0,1,2,3,4,5,6,7 \
                --capture-range=cudaProfilerApi \
                --capture-range-end=stop python prof.py nsys_prof
# torch profiler -> Chrome trace (torch_prof.json) for ui.perfetto.dev
python prof.py torch_prof
# TensorBoard trace handler writes its output into ./log
mkdir log -p
python prof.py torch_tensorboard_trace
# Python-level hotspots via cProfile
python prof.py cprofile
# 4 concurrent requests on separate CUDA streams
python prof.py torch_prof_threadpool

tee get_traceEvents.py <<-'EOF'
import json
import sys

# Strip a full torch-profiler Chrome trace down to just its "traceEvents"
# array so perfetto/chrome loads a much smaller file.
# Fix: the original leaked both file handles (json.load(open(...)) and a
# manually-closed writer); use context managers so they close on any error.
filepath = sys.argv[1]
with open(filepath, "r") as src:
    data = json.load(src)
with open(f"traceEvents_{filepath}", "w") as dst:
    json.dump({"traceEvents": data["traceEvents"]}, dst)
EOF

# Extract only the traceEvents arrays from the large Chrome traces, then
# delete the originals to reclaim disk space.
python get_traceEvents.py torch_prof_threadpool.json
python get_traceEvents.py torch_prof.json
rm torch_prof.json torch_prof_threadpool.json -f

输出

bash 复制代码
 Time (%)  Total Time (ns)  Num Calls  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                      Name
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  ---------------------------------------------
     66.5       1387919547     200835    6910.7    6470.0      4070    518456       3693.0  cudaLaunchKernel
     18.8        391445205      20190   19388.1   17149.5      5360    793835      22195.6  cudaMemcpyAsync
      5.6        115988383      19115    6067.9    5810.0      3530    188359       3372.6  cudaMemsetAsync
      2.9         60523944      23956    2526.5    1970.0      1240    205099       2145.7  cudaStreamWaitEvent
      2.1         43275528       1904   22728.7    3120.0      2630   1948957      88939.7  cudaStreamSynchronize
      1.7         36032225      23956    1504.1    1260.0       930    477756       3194.3  cudaEventRecord
      1.4         28312023      23956    1181.8     950.0       700    481776       3321.9  cudaEventCreateWithFlags
      1.1         22670547      23956     946.3     730.0       570    465647       7535.8  cudaEventDestroy
      0.0           170028        100    1700.3    1640.0      1520      2060        140.6  cudaStreamIsCapturing_v10000
      0.0            33330         16    2083.1    1715.0      1390      4950        894.8  cudaOccupancyMaxActiveBlocksPerMultiprocessor
      0.0             7310          1    7310.0    7310.0      7310      7310          0.0  cuProfilerStart

[5/7] Executing 'cuda_gpu_kern_sum' stats report

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  --------  -----------  ----------------------------------------------------------------------------------------------------
     24.6        488154814      18117    26944.6    26720.0     25920     31935        892.4  ampere_bf16_s16816gemm_bf16_64x64_ldg8_f2f_stages_64x6_tn
     15.7        312293607      11187    27915.8    21408.0     21152    166178      15652.5  void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x2_tn_align8>(T1::Para...
     15.2        302994818       2970   102018.5   101168.5      9376    209283      92340.2  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n...
      9.8        195548625       3150    62078.9    41089.0      4064     91073      27330.1  void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x1_tn_align8>(T1::Para...
      8.7        172566020       1485   116206.1    40000.0     38976   1185823     285264.4  void gemv2T_kernel_val<int, int, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, float, (int)128, (int...
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report

 Time (%)  Total Time (ns)  Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)      Operation
 --------  ---------------  -----  --------  --------  --------  --------  -----------  ------------------
     53.3        847221455  13679   61935.9    1600.0      1120   1117006     155051.3  [CUDA memcpy DtoH]
     45.6        724555193  12181   59482.4    2528.0       448   1027370     143492.5  [CUDA memcpy HtoD]
      0.6         10113989  19115     529.1     480.0       447      1568        172.0  [CUDA memset]
      0.5          8129349   6308    1288.7    1280.0      1185      1728         90.1  [CUDA memcpy DtoD]

[7/7] Executing 'cuda_gpu_mem_size_sum' stats report

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)      Operation
 ----------  -----  --------  --------  --------  --------  -----------  ------------------
   6213.111  13679     0.454     0.010     0.000     6.963        1.129  [CUDA memcpy DtoH]
   6211.319  12181     0.510     0.010     0.000     6.963        1.184  [CUDA memcpy HtoD]
    134.408   6308     0.021     0.000     0.000     0.410        0.088  [CUDA memcpy DtoD]
     16.679  19115     0.001     0.000     0.000     0.035        0.005  [CUDA memset]

Generated:
    /home/autotrain/torch_prof/cuda_profing_report.nsys-rep
    /home/autotrain/torch_prof/cuda_profing_report.sqlite

warm up done!
e2e: 9.11670708656311

----------------------------------------------------------------------------------------------------------------
         3206138 function calls (2757755 primitive calls) in 6.868 seconds

   Ordered by: cumulative time
   List reduced from 330 to 50 due to restriction <50>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1        0.000    0.000    6.868    6.868 {built-in method builtins.exec}
        1        0.000    0.000    6.868    6.868 <string>:1(<module>)
        1        0.000    0.000    6.868    6.868 /home/autotrain/torch_prof/prof.py:75(infer)
   1401/1        0.019    0.000    6.868    6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/utils/_contextlib.py:112(decorate_context)
        1        0.000    0.000    6.868    6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:1440(generate)
        1        0.017    0.017    6.866    6.866 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:2310(_sample)
70800/100        0.064    0.000    6.661    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1528(_wrapped_call_impl)
70800/100        0.155    0.000    6.661    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1534(_call_impl)
58500/100        0.182    0.000    6.660    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:160(new_forward)
      100        0.001    0.000    6.613    0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1611(forward)
      100        0.007    0.000    6.594    0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1425(forward)
     1500        0.059    0.000    6.419    0.004 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1209(forward)
     1400        0.035    0.000    3.599    0.003 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:564(forward)
     1400        0.242    0.000    2.756    0.002 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:586(moe_infer)
     1500        0.253    0.000    2.093    0.001 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:799(forward)
     9900        0.229    0.000    1.973    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:386(forward)
    58500        0.070    0.000    1.388    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:316(pre_forward)
198455/117100    0.479    0.000    1.339    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:135(send_to_device)
    37300        0.076    0.000    1.090    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/linear.py:115(forward)
    38700        1.015    0.000    1.015    0.000 {built-in method torch._C._nn.linear}
    89249        0.789    0.000    0.789    0.000 {method 'to' of 'torch._C.TensorBase' objects}
     6100        0.278    0.000    0.687    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:103(forward)
61684/58699      0.068    0.000    0.657    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:73(honor_type)
127539/118584    0.051    0.000    0.557    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:181(<genexpr>)
     1400        0.073    0.000    0.370    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:418(forward)
     1500        0.161    0.000    0.357    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:337(apply_rotary_pos_emb)
738474/604072    0.117    0.000    0.261    0.000 {built-in method builtins.isinstance}
    58600        0.020    0.000    0.203    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:189(<dictcomp>)
     1400        0.193    0.000    0.193    0.000 {method 'cpu' of 'torch._C.TensorBase' objects}
     2985        0.010    0.000    0.160    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/cache_utils.py:123(update)
    67201        0.034    0.000    0.159    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:993(__instancecheck__)
     9900        0.008    0.000    0.142    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/activation.py:395(forward)
     9900        0.010    0.000    0.135    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/functional.py:2080(silu)
    67201        0.041    0.000    0.125    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:1154(__subclasscheck__)
     7570        0.125    0.000    0.125    0.000 {built-in method torch.cat}
   265447        0.079    0.000    0.124    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:44(is_torch_tensor)
     9900        0.122    0.000    0.122    0.000 {built-in method torch._C._nn.silu}
     3000        0.071    0.000    0.119    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:329(rotate_half)
   155900        0.114    0.000    0.114    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1696(__getattr__)
     3000        0.097    0.000    0.097    0.000 {built-in method torch.matmul}
     6100        0.086    0.000    0.086    0.000 {method 'mean' of 'torch._C.TensorBase' objects}
     5900        0.084    0.000    0.084    0.000 {method 'reshape' of 'torch._C.TensorBase' objects}
     6100        0.078    0.000    0.078    0.000 {method 'pow' of 'torch._C.TensorBase' objects}
     5600        0.072    0.000    0.072    0.000 {method 'type' of 'torch._C.TensorBase' objects}
     1400        0.072    0.000    0.072    0.000 {method 'argsort' of 'torch._C.TensorBase' objects}
      203        0.070    0.000    0.070    0.000 {built-in method torch.tensor}
    67202        0.025    0.000    0.069    0.000 {built-in method builtins.issubclass}
    70800        0.060    0.000    0.060    0.000 {built-in method torch._C._get_tracing_state}
      200        0.002    0.000    0.058    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/logits_process.py:72(__call__)
     6100        0.058    0.000    0.058    0.000 {built-in method torch.rsqrt}
----------------------------------------------------------------------------------------------------------------
相关推荐
大学生毕业题目33 分钟前
毕业项目推荐:83-基于yolov8/yolov5/yolo11的农作物杂草检测识别系统(Python+卷积神经网络)
人工智能·python·yolo·目标检测·cnn·pyqt·杂草识别
居7然38 分钟前
美团大模型“龙猫”登场,能否重塑本地生活新战局?
人工智能·大模型·生活·美团
说私域1 小时前
社交新零售时代本地化微商的发展路径研究——基于开源AI智能名片链动2+1模式S2B2C商城小程序源的创新实践
人工智能·开源·零售
IT_陈寒1 小时前
Python性能优化:5个被低估的魔法方法让你的代码提速50%
前端·人工智能·后端
Deng_Xian_Sheng1 小时前
有哪些任务可以使用无监督的方式训练深度学习模型?
人工智能·深度学习·无监督
数据科学作家4 小时前
学数据分析必囤!数据分析必看!清华社9本书覆盖Stata/SPSS/Python全阶段学习路径
人工智能·python·机器学习·数据分析·统计·stata·spss
CV缝合救星5 小时前
【Arxiv 2025 预发行论文】重磅突破!STAR-DSSA 模块横空出世:显著性+拓扑双重加持,小目标、大场景统统拿下!
人工智能·深度学习·计算机视觉·目标跟踪·即插即用模块
TDengine (老段)7 小时前
从 ETL 到 Agentic AI:工业数据管理变革与 TDengine IDMP 的治理之道
数据库·数据仓库·人工智能·物联网·时序数据库·etl·tdengine
蓝桉8028 小时前
如何进行神经网络的模型训练(视频代码中的知识点记录)
人工智能·深度学习·神经网络
星期天要睡觉8 小时前
深度学习——数据增强(Data Augmentation)
人工智能·深度学习