本文基于DeepSeek-V2-Chat多卡推理,演示了几种不同的Profiling方法

基于DeepSeek-V2-Chat多卡推理演示不同的Profiling方法

本文基于DeepSeek-V2-Chat多卡推理,演示了几种不同的Profiling方法
备注:

  • 1.torch prof.export_chrome_trace导出给ui.perfetto.dev可视化
  • 2.Nsight Compute可以给出性能瓶颈及优化建议(注意:下文使用的nsys命令属于Nsight Systems,而非Nsight Compute)

一.结果






二.操作步骤

bash 复制代码
tee prof.py <<-'EOF'
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from accelerate import init_empty_weights
import sys
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from torch.cuda.amp import autocast
import torch.cuda
import multiprocessing as mp
import inspect

# Local checkpoint directory used for the tokenizer, the weights and the generation config.
model_name = "./models/deepseek-ai/DeepSeek-V2-Chat/"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Memory budget keyed by 0..7; passed to infer_auto_device_map below.
max_memory = {i: "23GB" for i in range(8)}
# Put the checkpoint directory first on the import path
# (presumably so its bundled modeling code can be imported - confirm).
sys.path.insert(0,model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,attn_implementation="eager",torch_dtype=torch.bfloat16)
model=model.eval()
# Module classes that must not be split across devices when the map is planned.
no_split_module_classes = ['DeepseekV2MLP','DeepseekV2Attention']
# NOTE(review): the weights are loaded as bfloat16 above but the map is planned with
# dtype='float16'; both are 2 bytes per element, so presumably the plan is the same - confirm.
device_map = infer_auto_device_map(
                            model,max_memory=max_memory,
                            no_split_module_classes=no_split_module_classes,
                            dtype='float16')

# Place the model's modules according to the planned device map.
model = dispatch_model(model, device_map=device_map)
model.generation_config = GenerationConfig.from_pretrained(model_name)
# Reuse the EOS token id as the padding token id.
model.generation_config.pad_token_id = model.generation_config.eos_token_id
messages = [{"role": "user", "content": "Write a piece of quicksort code in C++"} ]

# Tokenize the chat prompt once; every profiling function below reuses input_tensor.
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
# One un-profiled generation before any measurement is taken.
outputs = model.generate(input_tensor, max_new_tokens=100)
print("warm up done!")

def nsys_prof():
    """Time one 100-token generation inside a CUDA profiler capture range.

    The generation is bracketed by ``cudaProfilerStart`` / ``cudaProfilerStop``
    so that an external profiler started with a ``cudaProfilerApi`` capture
    range records only this call. The wall-clock duration is printed as
    ``e2e: <seconds>``.
    """
    def _sync_all_devices():
        # Kernel launches are asynchronous; wait on every visible GPU so the
        # clock only covers work that has actually finished.
        for device_index in range(torch.cuda.device_count()):
            torch.cuda.synchronize(device_index)

    cudart = torch.cuda.cudart()
    # Keep any still-running earlier work out of the capture range.
    _sync_all_devices()
    cudart.cudaProfilerStart()
    try:
        t0 = time.perf_counter()
        model.generate(input_tensor, max_new_tokens=100)
        _sync_all_devices()
        t1 = time.perf_counter()
        print("e2e:", t1 - t0)
    finally:
        # Always close the capture range, even when generation raises.
        cudart.cudaProfilerStop()

def torch_prof():
    """Profile one 100-token generation and export ``torch_prof.json``.

    CPU and CUDA activity is recorded with shape, memory, Python-stack and
    FLOP collection switched on; the result is written as a Chrome trace.
    """
    from torch.profiler import ProfilerActivity, profile

    profiler_options = dict(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True,
    )
    with profile(**profiler_options) as session:
        model.generate(input_tensor, max_new_tokens=100)
    session.export_chrome_trace("torch_prof.json")

def torch_tensorboard_trace():
    """Profile one 100-token generation and hand the trace to TensorBoard.

    CPU and CUDA activity is recorded with shapes, memory and Python stacks.
    The schedule has a single active step and the trace handler writes into
    ``./log``.
    """
    one_step = torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=0)
    # The context manager starts the profiler on entry and stops it on exit,
    # so it is also stopped when generation raises.
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=one_step,
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    ) as prof:
        model.generate(input_tensor, max_new_tokens=100)
        torch.cuda.synchronize()
        # Mark the end of the single scheduled step.
        prof.step()

def infer():
    """Run one 100-token generation, then call ``torch.cuda.synchronize()``.

    The generated tokens are discarded; the function exists to be profiled.
    """
    model.generate(input_tensor, max_new_tokens=100)
    torch.cuda.synchronize()
        
def cprofile():
    """Profile ``infer()`` with cProfile and print a cumulative-time report.

    Raw statistics are written to the file ``restats``; the report is sorted
    by cumulative time and restricted to 50 entries.
    """
    import cProfile
    import pstats

    stats_file = 'restats'
    cProfile.run('infer()', stats_file)
    pstats.Stats(stats_file).sort_stats('cumulative').print_stats(50)

def torch_prof_threadpool():
    """Profile four concurrent generations and export ``torch_prof_threadpool.json``.

    Four tasks run on a four-thread pool. Each task tokenizes the prompt,
    runs a 100-token generation under its own ``torch.cuda.Stream`` and
    prints ``<index> <seconds>``. CPU and CUDA activity for the whole pool is
    exported as a Chrome trace.
    """
    from concurrent.futures import ThreadPoolExecutor
    from torch.profiler import ProfilerActivity, profile

    def infer(index):
        # Per-thread copy of the prompt tensor.
        prompt_ids = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        stream = torch.cuda.Stream()
        t0 = time.time()
        with torch.cuda.stream(stream):
            model.generate(prompt_ids, max_new_tokens=100)
        t1 = time.time()
        print(index, t1 - t0)

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True,
    ) as session:
        with ThreadPoolExecutor(max_workers=4) as pool:
            # Drain the iterator so an exception raised in a worker surfaces here.
            list(pool.map(infer, range(4)))

    session.export_chrome_trace("torch_prof_threadpool.json")

# Map the command-line argument to a profiling function through an explicit
# table instead of eval(), so only the functions listed here can be invoked.
_PROFILERS = {
    "nsys_prof": nsys_prof,
    "torch_prof": torch_prof,
    "torch_tensorboard_trace": torch_tensorboard_trace,
    "infer": infer,
    "cprofile": cprofile,
    "torch_prof_threadpool": torch_prof_threadpool,
}
if len(sys.argv) != 2 or sys.argv[1] not in _PROFILERS:
    sys.exit("usage: python prof.py <" + "|".join(_PROFILERS) + ">")
_PROFILERS[sys.argv[1]]()
EOF

# Run the nsys_prof entry point under nsys. With --capture-range=cudaProfilerApi
# the capture is tied to the cudaProfilerStart/cudaProfilerStop calls made by
# prof.py; the report is written to cuda_profing_report.nsys-rep (-f true overwrites).
nsys profile --stats=true -o cuda_profing_report.nsys-rep -f true -t cuda,nvtx \
                --gpu-metrics-device=0,1,2,3,4,5,6,7 \
                --capture-range=cudaProfilerApi \
                --capture-range-end=stop python prof.py nsys_prof
# torch.profiler run; prof.py writes torch_prof.json.
python prof.py torch_prof
# Create the ./log directory that the TensorBoard trace handler in prof.py writes to.
mkdir log -p
python prof.py torch_tensorboard_trace
# cProfile run; prints a cumulative-time report and writes the file restats.
python prof.py cprofile
# Four-thread torch.profiler run; prof.py writes torch_prof_threadpool.json.
python prof.py torch_prof_threadpool

tee get_traceEvents.py <<-'EOF'
import json
import os
import sys


def extract_trace_events(filepath):
    """Copy only the ``traceEvents`` entry of a JSON trace into a new file.

    The output is written next to the input as ``traceEvents_<filename>``.

    Args:
        filepath: Path to a JSON file whose top-level object has a
            ``traceEvents`` key.

    Returns:
        The path of the file that was written.

    Raises:
        KeyError: If the input has no ``traceEvents`` key.
    """
    with open(filepath, "r") as src:
        data = json.load(src)
    # Prefix the file name only, so inputs given with a directory part work too.
    directory, filename = os.path.split(filepath)
    out_path = os.path.join(directory, f"traceEvents_{filename}")
    with open(out_path, "w") as dst:
        json.dump({"traceEvents": data["traceEvents"]}, dst)
    return out_path


if __name__ == "__main__":
    extract_trace_events(sys.argv[1])
EOF

# Write traceEvents_<name>.json copies that contain only the "traceEvents" key
# of each exported trace, then delete the original exports.
python get_traceEvents.py torch_prof_threadpool.json
python get_traceEvents.py torch_prof.json
rm torch_prof.json torch_prof_threadpool.json -f

输出

bash 复制代码
 Time (%)  Total Time (ns)  Num Calls  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                      Name
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  ---------------------------------------------
     66.5       1387919547     200835    6910.7    6470.0      4070    518456       3693.0  cudaLaunchKernel
     18.8        391445205      20190   19388.1   17149.5      5360    793835      22195.6  cudaMemcpyAsync
      5.6        115988383      19115    6067.9    5810.0      3530    188359       3372.6  cudaMemsetAsync
      2.9         60523944      23956    2526.5    1970.0      1240    205099       2145.7  cudaStreamWaitEvent
      2.1         43275528       1904   22728.7    3120.0      2630   1948957      88939.7  cudaStreamSynchronize
      1.7         36032225      23956    1504.1    1260.0       930    477756       3194.3  cudaEventRecord
      1.4         28312023      23956    1181.8     950.0       700    481776       3321.9  cudaEventCreateWithFlags
      1.1         22670547      23956     946.3     730.0       570    465647       7535.8  cudaEventDestroy
      0.0           170028        100    1700.3    1640.0      1520      2060        140.6  cudaStreamIsCapturing_v10000
      0.0            33330         16    2083.1    1715.0      1390      4950        894.8  cudaOccupancyMaxActiveBlocksPerMultiprocessor
      0.0             7310          1    7310.0    7310.0      7310      7310          0.0  cuProfilerStart

[5/7] Executing 'cuda_gpu_kern_sum' stats report

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  --------  -----------  ----------------------------------------------------------------------------------------------------
     24.6        488154814      18117    26944.6    26720.0     25920     31935        892.4  ampere_bf16_s16816gemm_bf16_64x64_ldg8_f2f_stages_64x6_tn
     15.7        312293607      11187    27915.8    21408.0     21152    166178      15652.5  void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x2_tn_align8>(T1::Para...
     15.2        302994818       2970   102018.5   101168.5      9376    209283      92340.2  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n...
      9.8        195548625       3150    62078.9    41089.0      4064     91073      27330.1  void cutlass::Kernel<cutlass_80_wmma_tensorop_bf16_s161616gemm_bf16_16x16_128x1_tn_align8>(T1::Para...
      8.7        172566020       1485   116206.1    40000.0     38976   1185823     285264.4  void gemv2T_kernel_val<int, int, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, float, (int)128, (int...
[6/7] Executing 'cuda_gpu_mem_time_sum' stats report

 Time (%)  Total Time (ns)  Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)      Operation
 --------  ---------------  -----  --------  --------  --------  --------  -----------  ------------------
     53.3        847221455  13679   61935.9    1600.0      1120   1117006     155051.3  [CUDA memcpy DtoH]
     45.6        724555193  12181   59482.4    2528.0       448   1027370     143492.5  [CUDA memcpy HtoD]
      0.6         10113989  19115     529.1     480.0       447      1568        172.0  [CUDA memset]
      0.5          8129349   6308    1288.7    1280.0      1185      1728         90.1  [CUDA memcpy DtoD]

[7/7] Executing 'cuda_gpu_mem_size_sum' stats report

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)      Operation
 ----------  -----  --------  --------  --------  --------  -----------  ------------------
   6213.111  13679     0.454     0.010     0.000     6.963        1.129  [CUDA memcpy DtoH]
   6211.319  12181     0.510     0.010     0.000     6.963        1.184  [CUDA memcpy HtoD]
    134.408   6308     0.021     0.000     0.000     0.410        0.088  [CUDA memcpy DtoD]
     16.679  19115     0.001     0.000     0.000     0.035        0.005  [CUDA memset]

Generated:
    /home/autotrain/torch_prof/cuda_profing_report.nsys-rep
    /home/autotrain/torch_prof/cuda_profing_report.sqlite

warm up done!
e2e: 9.11670708656311

----------------------------------------------------------------------------------------------------------------
         3206138 function calls (2757755 primitive calls) in 6.868 seconds

   Ordered by: cumulative time
   List reduced from 330 to 50 due to restriction <50>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1        0.000    0.000    6.868    6.868 {built-in method builtins.exec}
        1        0.000    0.000    6.868    6.868 <string>:1(<module>)
        1        0.000    0.000    6.868    6.868 /home/autotrain/torch_prof/prof.py:75(infer)
   1401/1        0.019    0.000    6.868    6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/utils/_contextlib.py:112(decorate_context)
        1        0.000    0.000    6.868    6.868 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:1440(generate)
        1        0.017    0.017    6.866    6.866 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/utils.py:2310(_sample)
70800/100        0.064    0.000    6.661    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1528(_wrapped_call_impl)
70800/100        0.155    0.000    6.661    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1534(_call_impl)
58500/100        0.182    0.000    6.660    0.067 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:160(new_forward)
      100        0.001    0.000    6.613    0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1611(forward)
      100        0.007    0.000    6.594    0.066 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1425(forward)
     1500        0.059    0.000    6.419    0.004 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:1209(forward)
     1400        0.035    0.000    3.599    0.003 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:564(forward)
     1400        0.242    0.000    2.756    0.002 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:586(moe_infer)
     1500        0.253    0.000    2.093    0.001 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:799(forward)
     9900        0.229    0.000    1.973    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:386(forward)
    58500        0.070    0.000    1.388    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/hooks.py:316(pre_forward)
198455/117100    0.479    0.000    1.339    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:135(send_to_device)
    37300        0.076    0.000    1.090    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/linear.py:115(forward)
    38700        1.015    0.000    1.015    0.000 {built-in method torch._C._nn.linear}
    89249        0.789    0.000    0.789    0.000 {method 'to' of 'torch._C.TensorBase' objects}
     6100        0.278    0.000    0.687    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:103(forward)
61684/58699      0.068    0.000    0.657    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:73(honor_type)
127539/118584    0.051    0.000    0.557    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:181(<genexpr>)
     1400        0.073    0.000    0.370    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:418(forward)
     1500        0.161    0.000    0.357    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:337(apply_rotary_pos_emb)
738474/604072    0.117    0.000    0.261    0.000 {built-in method builtins.isinstance}
    58600        0.020    0.000    0.203    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:189(<dictcomp>)
     1400        0.193    0.000    0.193    0.000 {method 'cpu' of 'torch._C.TensorBase' objects}
     2985        0.010    0.000    0.160    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/cache_utils.py:123(update)
    67201        0.034    0.000    0.159    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:993(__instancecheck__)
     9900        0.008    0.000    0.142    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/activation.py:395(forward)
     9900        0.010    0.000    0.135    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/functional.py:2080(silu)
    67201        0.041    0.000    0.125    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/typing.py:1154(__subclasscheck__)
     7570        0.125    0.000    0.125    0.000 {built-in method torch.cat}
   265447        0.079    0.000    0.124    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/accelerate/utils/operations.py:44(is_torch_tensor)
     9900        0.122    0.000    0.122    0.000 {built-in method torch._C._nn.silu}
     3000        0.071    0.000    0.119    0.000 /root/.cache/huggingface/modules/transformers_modules/modeling_deepseek.py:329(rotate_half)
   155900        0.114    0.000    0.114    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/torch/nn/modules/module.py:1696(__getattr__)
     3000        0.097    0.000    0.097    0.000 {built-in method torch.matmul}
     6100        0.086    0.000    0.086    0.000 {method 'mean' of 'torch._C.TensorBase' objects}
     5900        0.084    0.000    0.084    0.000 {method 'reshape' of 'torch._C.TensorBase' objects}
     6100        0.078    0.000    0.078    0.000 {method 'pow' of 'torch._C.TensorBase' objects}
     5600        0.072    0.000    0.072    0.000 {method 'type' of 'torch._C.TensorBase' objects}
     1400        0.072    0.000    0.072    0.000 {method 'argsort' of 'torch._C.TensorBase' objects}
      203        0.070    0.000    0.070    0.000 {built-in method torch.tensor}
    67202        0.025    0.000    0.069    0.000 {built-in method builtins.issubclass}
    70800        0.060    0.000    0.060    0.000 {built-in method torch._C._get_tracing_state}
      200        0.002    0.000    0.058    0.000 /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/generation/logits_process.py:72(__call__)
     6100        0.058    0.000    0.058    0.000 {built-in method torch.rsqrt}
----------------------------------------------------------------------------------------------------------------
相关推荐
boooo_hhh23 分钟前
深度学习笔记16-VGG-16算法-Pytorch实现人脸识别
pytorch·深度学习·机器学习
AnnyYoung27 分钟前
华为云deepseek大模型平台:deepseek满血版
人工智能·ai·华为云
INDEMIND1 小时前
INDEMIND:AI视觉赋能服务机器人,“零”碰撞避障技术实现全天候安全
人工智能·视觉导航·服务机器人·商用机器人
慕容木木1 小时前
【全网最全教程】使用最强DeepSeekR1+联网的火山引擎,没有生成长度限制,DeepSeek本体的替代品,可本地部署+知识库,注册即可有750w的token使用
人工智能·火山引擎·deepseek·deepseek r1
南 阳1 小时前
百度搜索全面接入DeepSeek-R1满血版:AI与搜索的全新融合
人工智能·chatgpt
企鹅侠客2 小时前
开源免费文档翻译工具 可支持pdf、word、excel、ppt
人工智能·pdf·word·excel·自动翻译
冰淇淋百宝箱2 小时前
AI 安全时代:SDL与大模型结合的“王炸组合”——技术落地与实战指南
人工智能·安全
Elastic 中国社区官方博客3 小时前
Elasticsearch Open Inference API 增加了对 Jina AI 嵌入和 Rerank 模型的支持
大数据·人工智能·elasticsearch·搜索引擎·ai·全文检索·jina
美狐美颜sdk3 小时前
直播美颜工具架构设计与性能优化实战:美颜SDK集成与实时处理
深度学习·美颜sdk·第三方美颜sdk·视频美颜sdk·美颜api
AWS官方合作商3 小时前
Amazon Lex:AI对话引擎重构企业服务新范式
人工智能·ai·机器人·aws