本文基于DeepSeek-V2-Chat多卡推理,演示了几种不同的Profing方法

基于DeepSeek-V2-Chat多卡推理演示不同的Profing方法

一.结果
二.操作步骤

本文基于DeepSeek-V2-Chat多卡推理,演示了几种不同的Profing方法
备注:

1.torch prof.export_chrome_trace导出给ui.perfetto.dev可视化
2.Nsight Compute可以给出性能瓶颈及优化建议

一.结果

二.操作步骤

bash 复制代码

tee prof.py <<-'EOF'
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from accelerate import init_empty_weights
import sys
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from torch.cuda.amp import autocast
import torch.cuda
import multiprocessing as mp
import inspect

model_name = "./models/deepseek-ai/DeepSeek-V2-Chat/"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
max_memory = {i: "23GB" for i in range(8)}
sys.path.insert(0,model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,attn_implementation="eager",torch_dtype=torch.bfloat16)
model=model.eval()
no_split_module_classes = ['DeepseekV2MLP','DeepseekV2Attention']
device_map = infer_auto_device_map(
                            model,max_memory=max_memory,
                            no_split_module_classes=no_split_module_classes,
                            dtype='float16')

model = dispatch_model(model, device_map=device_map)
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id
messages = [{"role": "user", "content": "Write a piece of quicksort code in C++"} ]

input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(input_tensor, max_new_tokens=100)
print("warm up done!")

def nsys_prof():
    torch.cuda.cudart().cudaProfilerStart()
    t0=time.time()
    outputs = model.generate(input_tensor, max_new_tokens=100)
    t1=time.time()
    print("e2e:",t1-t0)
    torch.cuda.cudart().cudaProfilerStop()

def torch_prof():
    from torch.profiler import profile
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True
    ) as prof:
        outputs = model.generate(input_tensor, max_new_tokens=100)
    prof.export_chrome_trace("torch_prof.json")

def torch_tensorboard_trace():
    prof = torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            schedule=torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=0),
            on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
            record_shapes=True,
            profile_memory=True,
            with_stack=True)
    prof.start()
    for i in range(1):
        outputs = model.generate(input_tensor, max_new_tokens=100)
        torch.cuda.synchronize()
    prof.step()
    prof.stop()

def infer():
    outputs = model.generate(input_tensor, max_new_tokens=100)
    torch.cuda.synchronize()
        
def cprofile():
    import cProfile
    import pstats
    cProfile.run('infer()', 'restats')
    p = pstats.Stats('restats')
    p.sort_stats('cumulative').print_stats(50)

def torch_prof_threadpool():
    from torch.profiler import profile
    from concurrent.futures import ThreadPoolExecutor
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True
    ) as prof:
        def infer(index):
            past_key_values=None
            input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)            
            custom_stream = torch.cuda.Stream()
            t0=time.time()
            with torch.cuda.stream(custom_stream):
                outputs = model.generate(input_tensor,max_new_tokens=100)
            t1=time.time()
            print(index,t1-t0)
            return None
        with ThreadPoolExecutor(max_workers=4) as executor:
            results = list(executor.map(infer,range(0,4)))

    prof.export_chrome_trace("torch_prof_threadpool.json")

eval(sys.argv[1])()
EOF

nsys profile --stats=true -o cuda_profing_report.nsys-rep -f true -t cuda,nvtx \
                --gpu-metrics-device=0,1,2,3,4,5,6,7 \
                --capture-range=cudaProfilerApi \
                --capture-range-end=stop python prof.py nsys_prof
python prof.py torch_prof
mkdir log -p
python prof.py torch_tensorboard_trace
python prof.py cprofile
python prof.py torch_prof_threadpool

tee get_traceEvents.py <<-'EOF'
import json
import sys
filepath=sys.argv[1]
data=json.load(open(filepath,"r"))
f=open(f"traceEvents_{filepath}","w")
json.dump({"traceEvents":data['traceEvents']},f)
f.close()
EOF

python get_traceEvents.py torch_prof_threadpool.json
python get_traceEvents.py torch_prof.json
rm torch_prof.json torch_prof_threadpool.json -f

输出

bash 复制代码

 Time (%)  Total Time (ns)  Num Calls  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                      Name
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  ---------------------------------------------
     66.5       1387919547     200835    6910.7    6470.0      4070    518456       3693.0  cudaLaunchKernel
     18.8        391445205      20190   19388.1   17149.5      5360    793835      22195.6  cudaMemcpyAsync
      5.6        115988383      19115    6067.9    5810.0      3530    188359       3372.6  cudaMemsetAsync
      2.9         60523944      23956    2526.5    1970.0      1240    205099       2145.7  cudaStreamWaitEvent
      2.1         43275528       1904   22728.7    3120.0      2630   1948957      88939.7  cudaStreamSynchronize
      1.7         36032225      23956    1504.1    1260.0       930    477756       3194.3  cudaEventRecord
      1.4         28312023      23956    1181.8     950.0       700    481776       3321.9  cudaEventCreateWithFlags
      1.1         22670547      23956     946.3     730.0       570    465647       7535.8  cudaEventDestroy
      0.0           170028        100    1700.3    1640.0      1520      2060        140.6  cudaStreamIsCapturing_v10000
      0.0            33330         16    2083.1    1715.0      1390      4950        894.8  cudaOccupancyMaxActiveBlocksPerMultiprocessor
      0.0             7310          1    7310.0    7310.0      7310      7310          0.0  cuProfilerStart

[5/7] Executing 'cuda_gpu_kern_sum' stats report

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  --------  -----------  ----------------------------------------------------------------------------------------------------
     24.6        488154814      18117    26944.6    26720.0     25920     31935        892.4  ampere_bf16_s16816gemm_bf16_64x64_ldg8_f2f_stages_64x6_tn
     15.7        312293607      11187    27915.8    21408.0     21152    166178      15652.5  void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x2_tn_align8>(T1::Para...
     15.2        302994818       2970   102018.5   101168.5      9376    209283      92340.2  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n...
      9.8        195548625       3150    62078.9    41089.0      4064     91073      27330.1  void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x1_tn_align8>(T1::Para...
      8.7        172566020       1485   116206.1    40000.0     38976   1185823     285264.4  void gemv2T_kernel_val<int, int, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, float, (int)128, (int...
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report

 Time (%)  Total Time (ns)  Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)      Operation
 --------  ---------------  -----  --------  --------  --------  --------  -----------  ------------------
     53.3        847221455  13679   61935.9    1600.0      1120   1117006     155051.3  [CUDA memcpy DtoH]
     45.6        724555193  12181   59482.4    2528.0       448   1027370     143492.5  [CUDA memcpy HtoD]
      0.6         10113989  19115     529.1     480.0       447      1568        172.0  [CUDA memset]
      0.5          8129349   6308    1288.7    1280.0      1185      1728         90.1  [CUDA memcpy DtoD]

[7/7] Executing 'cuda_gpu_mem_size_sum' stats report

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)      Operation
 ----------  -----  --------  --------  --------  --------  -----------  ------------------
   6213.111  13679     0.454     0.010     0.000     6.963        1.129  [CUDA memcpy DtoH]
   6211.319  12181     0.510     0.010     0.000     6.963        1.184  [CUDA memcpy HtoD]
    134.408   6308     0.021     0.000     0.000     0.410        0.088  [CUDA memcpy DtoD]
     16.679  19115     0.001     0.000     0.000     0.035        0.005  [CUDA memset]

Generated:
    /home/autotrain/torch_prof/cuda_profing_report.nsys-rep
    /home/autotrain/torch_prof/cuda_profing_report.sqlite

warm up done!
e2e: 9.11670708656311

----------------------------------------------------------------------------------------------------------------
         3206138 function calls (2757755 primitive calls) in 6.868 seconds

   Ordered by: cumulative time
   List reduced from 330 to 50 due to restriction <50>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1        0.000    0.000    6.868    6.868 {built-in method builtins.exec}
        1        0.000    0.000    6.868    6.868 <string>:1(<module>)
        1        0.000    0.000    6.868    6.868 /home/autotrain/torch_prof/prof.py:75(infer)
   1401/1        0.019    0.000    6.868    6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/utils/_contextlib.py:112(decorate_context)
        1        0.000    0.000    6.868    6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:1440(generate)
        1        0.017    0.017    6.866    6.866 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:2310(_sample)
70800/100        0.064    0.000    6.661    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1528(_wrapped_call_impl)
70800/100        0.155    0.000    6.661    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1534(_call_impl)
58500/100        0.182    0.000    6.660    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:160(new_forward)
      100        0.001    0.000    6.613    0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1611(forward)
      100        0.007    0.000    6.594    0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1425(forward)
     1500        0.059    0.000    6.419    0.004 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1209(forward)
     1400        0.035    0.000    3.599    0.003 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:564(forward)
     1400        0.242    0.000    2.756    0.002 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:586(moe_infer)
     1500        0.253    0.000    2.093    0.001 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:799(forward)
     9900        0.229    0.000    1.973    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:386(forward)
    58500        0.070    0.000    1.388    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:316(pre_forward)
198455/117100    0.479    0.000    1.339    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:135(send_to_device)
    37300        0.076    0.000    1.090    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/linear.py:115(forward)
    38700        1.015    0.000    1.015    0.000 {built-in method torch._C._nn.linear}
    89249        0.789    0.000    0.789    0.000 {method 'to' of 'torch._C.TensorBase' objects}
     6100        0.278    0.000    0.687    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:103(forward)
61684/58699      0.068    0.000    0.657    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:73(honor_type)
127539/118584    0.051    0.000    0.557    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:181(<genexpr>)
     1400        0.073    0.000    0.370    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:418(forward)
     1500        0.161    0.000    0.357    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:337(apply_rotary_pos_emb)
738474/604072    0.117    0.000    0.261    0.000 {built-in method builtins.isinstance}
    58600        0.020    0.000    0.203    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:189(<dictcomp>)
     1400        0.193    0.000    0.193    0.000 {method 'cpu' of 'torch._C.TensorBase' objects}
     2985        0.010    0.000    0.160    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/cache_utils.py:123(update)
    67201        0.034    0.000    0.159    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:993(__instancecheck__)
     9900        0.008    0.000    0.142    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/activation.py:395(forward)
     9900        0.010    0.000    0.135    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/functional.py:2080(silu)
    67201        0.041    0.000    0.125    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:1154(__subclasscheck__)
     7570        0.125    0.000    0.125    0.000 {built-in method torch.cat}
   265447        0.079    0.000    0.124    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:44(is_torch_tensor)
     9900        0.122    0.000    0.122    0.000 {built-in method torch._C._nn.silu}
     3000        0.071    0.000    0.119    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:329(rotate_half)
   155900        0.114    0.000    0.114    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1696(__getattr__)
     3000        0.097    0.000    0.097    0.000 {built-in method torch.matmul}
     6100        0.086    0.000    0.086    0.000 {method 'mean' of 'torch._C.TensorBase' objects}
     5900        0.084    0.000    0.084    0.000 {method 'reshape' of 'torch._C.TensorBase' objects}
     6100        0.078    0.000    0.078    0.000 {method 'pow' of 'torch._C.TensorBase' objects}
     5600        0.072    0.000    0.072    0.000 {method 'type' of 'torch._C.TensorBase' objects}
     1400        0.072    0.000    0.072    0.000 {method 'argsort' of 'torch._C.TensorBase' objects}
      203        0.070    0.000    0.070    0.000 {built-in method torch.tensor}
    67202        0.025    0.000    0.069    0.000 {built-in method builtins.issubclass}
    70800        0.060    0.000    0.060    0.000 {built-in method torch._C._get_tracing_state}
      200        0.002    0.000    0.058    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/logits_process.py:72(__call__)
     6100        0.058    0.000    0.058    0.000 {built-in method torch.rsqrt}
----------------------------------------------------------------------------------------------------------------