mindie运行Qwen2.5-7B-Instruct正常,量化版Qwen2.5-72B-Instruct-GPTQ-Int4报错

(base) root@huawei:/disk1/models# pwd

/disk1/models

(base) root@huawei:/disk1/models# cat /etc/issue

Ubuntu 20.04 LTS \n \l

(base) root@huawei:/disk1/models# free -g

total used free shared buff/cache available

Mem: 754 8 389 0 356 741

Swap: 7 0 7

(base) root@huawei:/disk1/models# lscpu|grep CPU

CPU op-mode(s): 64-bit

CPU(s): 192

On-line CPU(s) list: 0-191

NUMA node0 CPU(s): 0-23

NUMA node1 CPU(s): 24-47

NUMA node2 CPU(s): 48-71

NUMA node3 CPU(s): 72-95

NUMA node4 CPU(s): 96-119

NUMA node5 CPU(s): 120-143

NUMA node6 CPU(s): 144-167

NUMA node7 CPU(s): 168-191

(base) root@huawei:/disk1/models# df -hT

Filesystem Type Size Used Avail Use% Mounted on

udev devtmpfs 377G 0 377G 0% /dev

tmpfs tmpfs 76G 4.6M 76G 1% /run

/dev/sda2 ext4 439G 159G 258G 39% /

tmpfs tmpfs 378G 4.3M 378G 1% /dev/shm

tmpfs tmpfs 5.0M 0 5.0M 0% /run/lock

tmpfs tmpfs 378G 0 378G 0% /sys/fs/cgroup

/dev/sda1 vfat 511M 3.5M 508M 1% /boot/efi

/dev/loop7 squashfs 49M 49M 0 100% /snap/core18/2848

/dev/loop0 squashfs 69M 69M 0 100% /snap/core22/1720

/dev/loop6 squashfs 100M 100M 0 100% /snap/lxd/31572

/dev/loop2 squashfs 101M 101M 0 100% /snap/lxd/31822

/dev/loop3 squashfs 39M 39M 0 100% /snap/snapd/23546

/dev/loop4 squashfs 69M 69M 0 100% /snap/core22/1752

/dev/loop5 squashfs 49M 49M 0 100% /snap/core18/2857

overlay overlay 439G 159G 258G 39% /var/lib/docker/overlay2/3fb838ad167298740a56ca0038f073f7e3a212a7b4d5e7f295b85bd7130428aa/merged

/dev/loop1 squashfs 39M 39M 0 100% /snap/snapd/23772

/dev/mapper/testvg-testlv ext4 1.5T 226G 1.2T 17% /disk1

overlay overlay 439G 159G 258G 39% /var/lib/docker/overlay2/27007413f47cdafb51bbef36aa09298d95f6f9870d2ba16f3f74dfcbf1d7f5a9/merged

tmpfs tmpfs 76G 0 76G 0% /run/user/0

(base) root@huawei:/disk1/models# npu-smi info

+------------------------------------------------------------------------------------------------+

| npu-smi 23.0.0 Version: 23.0.0 |

+---------------------------+---------------+----------------------------------------------------+

| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|

| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |

+===========================+===============+====================================================+

| 0 910PremiumA | OK | 98.6 75 0 / 0 |

| 0 | 0000:C1:00.0 | 0 1225 / 13553 1 / 32768 |

+===========================+===============+====================================================+

| 1 910PremiumA | OK | 102.6 75 0 / 0 |

| 0 | 0000:81:00.0 | 0 1973 / 15665 1 / 32768 |

+===========================+===============+====================================================+

| 2 910PremiumA | OK | 102.4 75 0 / 0 |

| 0 | 0000:41:00.0 | 0 2237 / 15665 1 / 32768 |

+===========================+===============+====================================================+

| 3 910PremiumA | OK | 100.0 75 0 / 0 |

| 0 | 0000:01:00.0 | 0 2944 / 15567 1 / 32768 |

+===========================+===============+====================================================+

| 4 910PremiumA | OK | 100.4 74 0 / 0 |

| 0 | 0000:C2:00.0 | 0 1415 / 13553 1 / 32768 |

+===========================+===============+====================================================+

| 5 910PremiumA | OK | 104.7 75 0 / 0 |

| 0 | 0000:82:00.0 | 0 1708 / 15665 1 / 32768 |

+===========================+===============+====================================================+

| 6 910PremiumA | OK | 101.1 75 0 / 0 |

| 0 | 0000:42:00.0 | 0 2342 / 15665 0 / 32768 |

+===========================+===============+====================================================+

| 7 910PremiumA | OK | 99.3 75 0 / 0 |

| 0 | 0000:02:00.0 | 0 2898 / 15567 1 / 32768 |

+===========================+===============+====================================================+

+---------------------------+---------------+----------------------------------------------------+

| NPU Chip | Process id | Process name | Process memory(MB) |

+===========================+===============+====================================================+

| No running processes found in NPU 0 |

+===========================+===============+====================================================+

| No running processes found in NPU 1 |

+===========================+===============+====================================================+

| No running processes found in NPU 2 |

+===========================+===============+====================================================+

| No running processes found in NPU 3 |

+===========================+===============+====================================================+

| No running processes found in NPU 4 |

+===========================+===============+====================================================+

| No running processes found in NPU 5 |

+===========================+===============+====================================================+

| No running processes found in NPU 6 |

+===========================+===============+====================================================+

| No running processes found in NPU 7 |

+===========================+===============+====================================================+

(base) root@huawei:/disk1/models# ll /disk1/models

total 220140

drwxrwxrwx 5 root root 4096 Mar 7 07:37 ./

drwxr-xr-x 4 root root 4096 Mar 7 06:11 ../

-rw-r--r-- 1 root root 4807602 Mar 7 01:59 Ascend-mindie-atb-models_1.0.0_linux-aarch64_py310_torch2.1.0-abi0.tar.gz

-rw-r--r-- 1 root root 4944832 Mar 7 01:59 Ascend-mindie-atb-models_1.0.0_linux-aarch64_py310_torch2.1.0-abi1.tar.gz

-rw-r--r-- 1 root root 4813371 Mar 7 01:59 Ascend-mindie-atb-models_1.0.0_linux-aarch64_py310_torch2.3.1-abi0.tar.gz

-rw-r--r-- 1 root root 4734426 Mar 7 01:59 Ascend-mindie-atb-models_1.0.0_linux-aarch64_py310_torch2.3.1-abi1.tar.gz

-rw-r--r-- 1 root root 4808762 Mar 7 01:59 Ascend-mindie-atb-models_1.0.0_linux-aarch64_py311_torch2.1.0-abi0.tar.gz

-rw-r--r-- 1 root root 4945450 Mar 7 01:59 Ascend-mindie-atb-models_1.0.0_linux-aarch64_py311_torch2.1.0-abi1.tar.gz

-rw-r--r-- 1 root root 4813791 Mar 7 01:59 Ascend-mindie-atb-models_1.0.0_linux-aarch64_py311_torch2.3.1-abi0.tar.gz

-rw-r--r-- 1 root root 4734373 Mar 7 01:59 Ascend-mindie-atb-models_1.0.0_linux-aarch64_py311_torch2.3.1-abi1.tar.gz

drwxrwxrwx 3 root root 4096 Mar 6 00:56 deepseek-ai/

-rw------- 1 root root 368 Mar 7 07:36 .msc

drwxrwxrwx 7 root root 4096 Mar 7 07:38 Qwen/

drwxrwxrwx 4 root root 4096 Mar 7 07:36 ._____temp/

-rw-r--r-- 1 root root 84138364 Oct 6 2023 torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

-rw-r--r-- 1 root root 89791945 Jul 24 2024 torch-2.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

-rw-r--r-- 1 root root 12845038 Mar 7 01:30 torch_npu-2.4.0.post2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

(base) root@huawei:/disk1/models#

运行容器:

docker run -it -d --name mindie-910a-t71 --ipc=host --net=host --shm-size=200g \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci4 \
--device=/dev/davinci5 \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
--device=/dev/davinci_manager \
--device=/dev/hisi_hdc \
--device=/dev/devmm_svm \
--entrypoint=bash \
-w /usr/local/Ascend/mindie/latest/mindie-llm/logs \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/sbin:/usr/local/sbin \
-v /usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/common \
-v /usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/driver \
-v /etc/hccn.conf:/etc/hccn.conf \
-v /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /etc/vnpu.cfg:/etc/vnpu.cfg \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /disk1/models:/models \
swr.cn-central-221.ovaijisuan.com/wh-aicc-fae/mindie:910A-ascend_24.1.rc3-cann_8.0.t63-py_3.10-ubuntu_20.04-aarch64-mindie_1.0.T71.02

进入容器测试

docker exec -it mindie-910a-t71 bash

另外一个镜像(mindie_1.0.t65)的容器也可运行:

docker run -it -d --name mindie-910a-t65 --ipc=host --net=host --shm-size=200g \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci4 \
--device=/dev/davinci5 \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
--device=/dev/davinci_manager \
--device=/dev/hisi_hdc \
--device=/dev/devmm_svm \
--entrypoint=bash \
-w /usr/local/Ascend/mindie/latest/mindie-llm/logs \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/sbin:/usr/local/sbin \
-v /usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/common \
-v /usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/driver \
-v /etc/hccn.conf:/etc/hccn.conf \
-v /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /etc/vnpu.cfg:/etc/vnpu.cfg \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /disk1/models:/models \
swr.cn-central-221.ovaijisuan.com/wh-aicc-fae/mindie:910a-ascend_23.0.0-cann_8.0.rc3-py_3.10-ubuntu_22.04-aarch64-mindie_1.0.t65

docker exec -it mindie-910a-t65 bash

torchrun --nproc_per_node 2 --master_port 20030 -m examples.run_pa --model_path /models/Qwen/Qwen2___5-7B-Instruct --input_texts "你好,请介绍一下武汉" --max_batch_size 2

测试结果:

1.运行Qwen2.5-7B-Instruct正常:

(Python310) root@huawei:/usr/local/Ascend/atb-models# torchrun --nproc_per_node 2 --master_port 20030 -m examples.run_pa --model_path /models/Qwen/Qwen2___5-7B-Instruct --input_texts "你好,请介绍一下武汉" --max_batch_size 2

[2025-03-07 16:32:36,351] torch.distributed.run: [WARNING]

[2025-03-07 16:32:36,351] torch.distributed.run: [WARNING] *****************************************

[2025-03-07 16:32:36,351] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.

[2025-03-07 16:32:36,351] torch.distributed.run: [WARNING] *****************************************

2025-03-07 16:32:46,307 22204 281473125748752 llm INFOlogging.py-227 : Skip binding cpu.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

2025-03-07 16:32:46,855 22204 281473125748752 llm INFOlogging.py-227 : model_runner.quantize: None, model_runner.kv_quant_type: None, model_runner.fa_quant_type: None, model_runner.dtype: torch.float16

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

2025-03-07 16:32:54,824 22204 281473125748752 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:32:54,826 22204 281473125748752 llm INFOlogging.py-227 : init tokenizer done: Qwen2TokenizerFast(name_or_path='/models/Qwen/Qwen2___5-7B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={

151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

}

2025-03-07 16:32:54,834 22204 281473125748752 llm INFOlogging.py-227 : NPUSocInfo(soc_name='', soc_version=100, need_nz=True, matmul_nd_nz=False)

2025-03-07 16:32:55,027 22204 281473125748752 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

2025-03-07 16:32:55,130 22205 281472994160656 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:32:55,324 22205 281472994160656 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

2025-03-07 16:33:33,519 22205 281472994160656 llm INFOcache.py-98 : kv cache will allocate 0.0615234375GB memory

2025-03-07 16:33:34,434 22205 281472994160656 llm INFOflash_causal_qwen2.py-435 : <<<<<<<after transdata k_caches0.shape=torch.Size([18, 16, 128, 16])

2025-03-07 16:33:41,789 22204 281473125748752 llm INFOlogging.py-227 : model:

FlashQwen2ForCausalLM(

(rotary_embedding): PositionRotaryEmbedding()

(attn_mask): AttentionMask()

(transformer): FlashQwenModel(

(wte): TensorParallelEmbedding()

(h): ModuleList(

(0-27): 28 x FlashQwenLayer(

(attn): FlashQwenAttention(

(rotary_emb): PositionRotaryEmbedding()

(c_attn): TensorParallelColumnLinear(

(linear): FastLinear()

)

(c_proj): TensorParallelRowLinear(

(linear): FastLinear()

)

)

(mlp): QwenMLP(

(act): SiLU()

(w2_w1): TensorParallelColumnLinear(

(linear): FastLinear()

)

(c_proj): TensorParallelRowLinear(

(linear): FastLinear()

)

)

(ln_1): QwenRMSNorm()

(ln_2): QwenRMSNorm()

)

)

(ln_f): QwenRMSNorm()

)

(lm_head): TensorParallelHead(

(linear): FastLinear()

)

)

2025-03-07 16:33:43,496 22204 281473125748752 llm INFOlogging.py-227 : hbm_capacity(GB): 13.2353515625, init_memory(GB): 1.323535155504942

2025-03-07 16:33:43,496 22204 281473125748752 llm INFOlogging.py-227 : pa_runner: PARunner(model_path=/models/Qwen/Qwen2___5-7B-Instruct, input_text=None, max_position_embeddings=None, max_input_length=1024, max_output_length=20, max_prefill_tokens=-1, load_tokenizer=True, enable_atb_torch=False, max_prefill_batch_size=None, max_batch_size=2, dtype=torch.float16, block_size=128, model_config=ModelConfig(num_heads=14, num_kv_heads=2, num_kv_heads_origin=4, head_size=128, k_head_size=128, v_head_size=128, num_layers=28, device=npu:0, dtype=torch.float16, soc_info=NPUSocInfo(soc_name='', soc_version=100, need_nz=True, matmul_nd_nz=False), kv_quant_type=None, fa_quant_type=None, mapping=Mapping(world_size=2, rank=0, pp_rank=0, pp_groups=[0, 1], micro_batch_size=2) ), cla_share_factor=1, , max_memory=14211350528,

2025-03-07 16:33:43,497 22204 281473125748752 llm INFOlogging.py-227 : ---------------begin warm_up---------------

2025-03-07 16:33:43,497 22204 281473125748752 llm INFOcache.py-98 : kv cache will allocate 0.0615234375GB memory

2025-03-07 16:33:43,499 22204 281473125748752 llm INFOlogging.py-227 : ------total req num: 2, infer start--------

2025-03-07 16:33:43,504 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,505 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,505 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,505 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,506 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,506 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,507 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,507 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,508 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,508 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,508 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,508 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,509 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,509 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,509 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,510 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,510 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,510 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,511 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,511 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,511 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,512 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,512 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,512 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,513 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,513 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,513 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,514 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,514 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,514 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,514 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,515 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,515 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,515 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,516 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,516 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,516 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,517 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,517 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,517 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,518 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,518 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,518 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,519 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,519 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,519 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,520 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,520 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,520 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,521 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,521 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,521 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,522 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,522 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,522 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,522 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,523 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,523 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,523 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,524 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,524 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,524 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,525 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,525 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,525 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,526 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,526 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,526 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,527 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,527 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,527 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,528 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,528 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,528 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,529 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,529 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,529 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,530 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,530 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,530 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,531 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,531 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,531 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,531 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,532 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,532 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,532 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,533 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,533 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,533 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,534 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,534 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,534 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,535 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,535 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,535 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,536 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,536 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,536 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,536 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,537 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,537 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,537 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,538 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,538 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,538 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,539 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,539 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,539 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,540 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,540 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,540 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:43,541 22204 281473125748752 llm INFOlogging.py-227 : trans to 29

2025-03-07 16:33:44,367 22204 281473125748752 llm INFOlogging.py-227 : <<<<<<< ori k_caches0.shape=torch.Size([18, 16, 128, 16])

2025-03-07 16:33:44,376 22204 281473125748752 llm INFOflash_causal_qwen2.py-435 : <<<<<<<after transdata k_caches0.shape=torch.Size([18, 16, 128, 16])

2025-03-07 16:33:44,376 22204 281473125748752 llm INFOlogging.py-227 : >>>>>>id of kcache is 281470252742704 id of vcache is 281470252742784

2025-03-07 16:33:46,979 22204 281473125748752 llm INFOlogging.py-227 : warmup_memory(GB): 1.32

2025-03-07 16:33:46,979 22204 281473125748752 llm INFOlogging.py-227 : ---------------end warm_up---------------

2025-03-07 16:33:46,979 22204 281473125748752 llm INFOlogging.py-227 : ---------------begin inference---------------

2025-03-07 16:33:47,060 22204 281473125748752 llm INFOlogging.py-227 : ------total req num: 2, infer start--------

2025-03-07 16:33:48,480 22204 281473125748752 llm INFOlogging.py-227 : ---------------end inference---------------

2025-03-07 16:33:48,480 22204 281473125748752 llm INFOlogging.py-227 : Answer0: 大学的历史和特色。

武汉大学是中国著名的高等学府之一,位于湖北省武汉市,创建于

2025-03-07 16:33:48,480 22204 281473125748752 llm INFOlogging.py-227 : Generate0 token num: (0, 20)

2025-03-07 16:33:48,480 22204 281473125748752 llm INFOlogging.py-227 : Answer1: 大学的历史和特色。

武汉大学是中国著名的高等学府之一,位于湖北省武汉市,创建于

2025-03-07 16:33:48,480 22204 281473125748752 llm INFOlogging.py-227 : Generate1 token num: (1, 40)

(Python310) root@huawei:/usr/local/Ascend/atb-models# ll /models/Qwen/

total 40

drwxrwxrwx 7 root root 4096 Mar 7 15:38 ./

drwxrwxrwx 5 root root 4096 Mar 7 15:37 ../

drwxr-xr-x 3 root root 4096 Mar 7 15:38 Qwen2.5-72B-Instruct/

lrwxrwxrwx 1 root root 72 Mar 6 00:57 Qwen2.5-72B-Instruct-GPTQ-Int4 -> /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-72B-Instruct-GPTQ-Int4

lrwxrwxrwx 1 root root 61 Mar 6 14:19 Qwen2.5-7B-Instruct -> /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct

lrwxrwxrwx 1 root root 65 Mar 6 05:42 Qwen2.5-VL-72B-Instruct -> /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-VL-72B-Instruct

drwxr-xr-x 2 root root 4096 Mar 7 14:04 Qwen2___5-72B-Instruct/

drwxr-x--- 2 root root 4096 Mar 7 15:28 Qwen2___5-72B-Instruct-GPTQ-Int4/

drwxr-x--- 2 root root 4096 Mar 6 16:18 Qwen2___5-7B-Instruct/

drwxr-x--- 2 root root 4096 Mar 6 05:42 Qwen2___5-VL-72B-Instruct/

2.运行Qwen2.5-72B-Instruct-GPTQ-Int4报错:

(Python310) root@huawei:/usr/local/Ascend/atb-models# torchrun --nproc_per_node 8 --master_port 20030 -m examples.run_pa --model_path "/models/Qwen/Qwen2___5-72B-Instruct-GPTQ-Int4" --input_texts "你好,请介绍一下武汉" --max_batch_size 8

[2025-03-07 16:36:38,408] torch.distributed.run: [WARNING]

[2025-03-07 16:36:38,408] torch.distributed.run: [WARNING] *****************************************

[2025-03-07 16:36:38,408] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.

[2025-03-07 16:36:38,408] torch.distributed.run: [WARNING] *****************************************

2025-03-07 16:36:49,200 24163 281473876656144 llm INFOlogging.py-227 : Skip binding cpu.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

2025-03-07 16:36:49,770 24163 281473876656144 llm INFOlogging.py-227 : model_runner.quantize: None, model_runner.kv_quant_type: None, model_runner.fa_quant_type: None, model_runner.dtype: torch.float16

2025-03-07 16:36:57,840 24166 281473450606608 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:36:58,085 24167 281473718341648 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:36:58,084 24170 281472927670288 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:36:58,111 24168 281473527169040 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:36:58,285 24166 281473450606608 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

2025-03-07 16:36:58,472 24167 281473718341648 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

2025-03-07 16:36:58,579 24170 281472927670288 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

2025-03-07 16:36:58,598 24168 281473527169040 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

2025-03-07 16:36:58,637 24164 281473344917520 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:36:58,698 24169 281472867508240 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:36:58,975 24163 281473876656144 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:36:59,001 24163 281473876656144 llm INFOlogging.py-227 : init tokenizer done: Qwen2TokenizerFast(name_or_path='/models/Qwen/Qwen2___5-72B-Instruct-GPTQ-Int4', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={

151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),

}

2025-03-07 16:36:59,018 24164 281473344917520 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

2025-03-07 16:36:58,930 24165 281473872347152 llm INFOdist.py-81 : initialize_distributed has been Set

2025-03-07 16:36:59,066 24163 281473876656144 llm INFOlogging.py-227 : NPUSocInfo(soc_name='', soc_version=100, need_nz=True, matmul_nd_nz=False)

2025-03-07 16:36:59,212 24169 281472867508240 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 196, in _run_module_as_main

return _run_code(code, main_globals, None,

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 86, in _run_code

exec(code, run_globals)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 500, in <module>

pa_runner = PARunner(**input_dict)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 97, in __init__

self.model.load_weights(**kw_args)

File "/usr/local/Ascend/atb-models/atb_llm/runner/model_runner.py", line 161, in load_weights

self.model = self.model_cls(self.config,

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/flash_causal_qwen2.py", line 32, in __init__

self.transformer = FlashQwenModel(config, weights, model_prefix=model_prefix, lmhead_prefix=lmhead_prefix)

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 407, in __init__

[

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 408, in <listcomp>

FlashQwenLayer(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 278, in __init__

self.attn = FlashQwenAttention(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 149, in __init__

self.c_attn = load_column_multi(

File "/usr/local/Ascend/atb-models/atb_llm/utils/layers/__init__.py", line 48, in load_column_multi

weight = weights.get_multi_weights_col(prefixes, quantize=quantize, dim=0, gqa_size=head_size)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in get_multi_weights_col

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in <listcomp>

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 331, in get_sharded

slice_ = self._get_slice(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 718, in _get_slice

filename, tensor_name = self.get_filename(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 146, in get_filename

raise AssertionError(f"weight {tensor_name} does not exist")

AssertionError: weight model.layers.0.self_attn.q_proj.weight does not exist

2025-03-07 16:36:59,423 24163 281473876656144 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 196, in _run_module_as_main

return _run_code(code, main_globals, None,

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 86, in _run_code

exec(code, run_globals)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 500, in <module>

pa_runner = PARunner(**input_dict)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 97, in __init__

self.model.load_weights(**kw_args)

File "/usr/local/Ascend/atb-models/atb_llm/runner/model_runner.py", line 161, in load_weights

self.model = self.model_cls(self.config,

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/flash_causal_qwen2.py", line 32, in __init__

self.transformer = FlashQwenModel(config, weights, model_prefix=model_prefix, lmhead_prefix=lmhead_prefix)

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 407, in __init__

[

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 408, in <listcomp>

FlashQwenLayer(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 278, in __init__

self.attn = FlashQwenAttention(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 149, in __init__

self.c_attn = load_column_multi(

File "/usr/local/Ascend/atb-models/atb_llm/utils/layers/__init__.py", line 48, in load_column_multi

weight = weights.get_multi_weights_col(prefixes, quantize=quantize, dim=0, gqa_size=head_size)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in get_multi_weights_col

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in <listcomp>

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 331, in get_sharded

slice_ = self._get_slice(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 718, in _get_slice

filename, tensor_name = self.get_filename(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 146, in get_filename

raise AssertionError(f"weight {tensor_name} does not exist")

AssertionError: weight model.layers.0.self_attn.q_proj.weight does not exist

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 196, in _run_module_as_main

return _run_code(code, main_globals, None,

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 86, in _run_code

exec(code, run_globals)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 500, in <module>

pa_runner = PARunner(**input_dict)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 97, in __init__

self.model.load_weights(**kw_args)

File "/usr/local/Ascend/atb-models/atb_llm/runner/model_runner.py", line 161, in load_weights

self.model = self.model_cls(self.config,

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/flash_causal_qwen2.py", line 32, in __init__

self.transformer = FlashQwenModel(config, weights, model_prefix=model_prefix, lmhead_prefix=lmhead_prefix)

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 407, in __init__

[

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 408, in <listcomp>

FlashQwenLayer(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 278, in __init__

self.attn = FlashQwenAttention(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 149, in __init__

self.c_attn = load_column_multi(

File "/usr/local/Ascend/atb-models/atb_llm/utils/layers/__init__.py", line 48, in load_column_multi

weight = weights.get_multi_weights_col(prefixes, quantize=quantize, dim=0, gqa_size=head_size)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in get_multi_weights_col

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in <listcomp>

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 331, in get_sharded

slice_ = self._get_slice(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 718, in _get_slice

filename, tensor_name = self.get_filename(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 146, in get_filename

raise AssertionError(f"weight {tensor_name} does not exist")

AssertionError: weight model.layers.0.self_attn.q_proj.weight does not exist

2025-03-07 16:36:59,662 24165 281473872347152 llm INFOflash_causal_qwen2.py-122 : >>>> qwen_QwenDecoderModel is called.

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 196, in _run_module_as_main

return _run_code(code, main_globals, None,

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 86, in _run_code

exec(code, run_globals)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 500, in <module>

pa_runner = PARunner(**input_dict)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 97, in __init__

self.model.load_weights(**kw_args)

File "/usr/local/Ascend/atb-models/atb_llm/runner/model_runner.py", line 161, in load_weights

self.model = self.model_cls(self.config,

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/flash_causal_qwen2.py", line 32, in __init__

self.transformer = FlashQwenModel(config, weights, model_prefix=model_prefix, lmhead_prefix=lmhead_prefix)

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 407, in __init__

[

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 408, in <listcomp>

FlashQwenLayer(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 278, in __init__

self.attn = FlashQwenAttention(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 149, in __init__

self.c_attn = load_column_multi(

File "/usr/local/Ascend/atb-models/atb_llm/utils/layers/__init__.py", line 48, in load_column_multi

weight = weights.get_multi_weights_col(prefixes, quantize=quantize, dim=0, gqa_size=head_size)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in get_multi_weights_col

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in <listcomp>

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 331, in get_sharded

slice_ = self._get_slice(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 718, in _get_slice

filename, tensor_name = self.get_filename(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 146, in get_filename

raise AssertionError(f"weight {tensor_name} does not exist")

AssertionError: weight model.layers.0.self_attn.q_proj.weight does not exist

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 196, in _run_module_as_main

return _run_code(code, main_globals, None,

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 86, in _run_code

exec(code, run_globals)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 500, in <module>

pa_runner = PARunner(**input_dict)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 97, in __init__

self.model.load_weights(**kw_args)

File "/usr/local/Ascend/atb-models/atb_llm/runner/model_runner.py", line 161, in load_weights

self.model = self.model_cls(self.config,

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/flash_causal_qwen2.py", line 32, in __init__

self.transformer = FlashQwenModel(config, weights, model_prefix=model_prefix, lmhead_prefix=lmhead_prefix)

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 407, in __init__

[

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 408, in <listcomp>

FlashQwenLayer(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 278, in __init__

self.attn = FlashQwenAttention(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 149, in __init__

self.c_attn = load_column_multi(

File "/usr/local/Ascend/atb-models/atb_llm/utils/layers/__init__.py", line 48, in load_column_multi

weight = weights.get_multi_weights_col(prefixes, quantize=quantize, dim=0, gqa_size=head_size)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in get_multi_weights_col

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in <listcomp>

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 331, in get_sharded

slice_ = self._get_slice(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 718, in _get_slice

filename, tensor_name = self.get_filename(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 146, in get_filename

raise AssertionError(f"weight {tensor_name} does not exist")

AssertionError: weight model.layers.0.self_attn.q_proj.weight does not exist

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 196, in _run_module_as_main

return _run_code(code, main_globals, None,

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 86, in _run_code

exec(code, run_globals)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 500, in <module>

pa_runner = PARunner(**input_dict)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 97, in __init__

self.model.load_weights(**kw_args)

File "/usr/local/Ascend/atb-models/atb_llm/runner/model_runner.py", line 161, in load_weights

self.model = self.model_cls(self.config,

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/flash_causal_qwen2.py", line 32, in __init__

self.transformer = FlashQwenModel(config, weights, model_prefix=model_prefix, lmhead_prefix=lmhead_prefix)

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 407, in __init__

[

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 408, in <listcomp>

FlashQwenLayer(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 278, in __init__

self.attn = FlashQwenAttention(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 149, in __init__

self.c_attn = load_column_multi(

File "/usr/local/Ascend/atb-models/atb_llm/utils/layers/__init__.py", line 48, in load_column_multi

weight = weights.get_multi_weights_col(prefixes, quantize=quantize, dim=0, gqa_size=head_size)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in get_multi_weights_col

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in <listcomp>

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 331, in get_sharded

slice_ = self._get_slice(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 718, in _get_slice

filename, tensor_name = self.get_filename(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 146, in get_filename

raise AssertionError(f"weight {tensor_name} does not exist")

AssertionError: weight model.layers.0.self_attn.q_proj.weight does not exist

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 196, in _run_module_as_main

return _run_code(code, main_globals, None,

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 86, in _run_code

exec(code, run_globals)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 500, in <module>

pa_runner = PARunner(**input_dict)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 97, in __init__

self.model.load_weights(**kw_args)

File "/usr/local/Ascend/atb-models/atb_llm/runner/model_runner.py", line 161, in load_weights

self.model = self.model_cls(self.config,

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/flash_causal_qwen2.py", line 32, in __init__

self.transformer = FlashQwenModel(config, weights, model_prefix=model_prefix, lmhead_prefix=lmhead_prefix)

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 407, in __init__

[

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 408, in <listcomp>

FlashQwenLayer(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 278, in __init__

self.attn = FlashQwenAttention(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 149, in __init__

self.c_attn = load_column_multi(

File "/usr/local/Ascend/atb-models/atb_llm/utils/layers/__init__.py", line 48, in load_column_multi

weight = weights.get_multi_weights_col(prefixes, quantize=quantize, dim=0, gqa_size=head_size)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in get_multi_weights_col

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in <listcomp>

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 331, in get_sharded

slice_ = self._get_slice(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 718, in _get_slice

filename, tensor_name = self.get_filename(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 146, in get_filename

raise AssertionError(f"weight {tensor_name} does not exist")

AssertionError: weight model.layers.0.self_attn.q_proj.weight does not exist

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 196, in _run_module_as_main

return _run_code(code, main_globals, None,

File "/root/miniconda3/envs/Python310/lib/python3.10/runpy.py", line 86, in _run_code

exec(code, run_globals)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 500, in <module>

pa_runner = PARunner(**input_dict)

File "/usr/local/Ascend/atb-models/examples/run_pa.py", line 97, in __init__

self.model.load_weights(**kw_args)

File "/usr/local/Ascend/atb-models/atb_llm/runner/model_runner.py", line 161, in load_weights

self.model = self.model_cls(self.config,

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/flash_causal_qwen2.py", line 32, in __init__

self.transformer = FlashQwenModel(config, weights, model_prefix=model_prefix, lmhead_prefix=lmhead_prefix)

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 407, in __init__

[

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 408, in <listcomp>

FlashQwenLayer(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 278, in __init__

self.attn = FlashQwenAttention(

File "/usr/local/Ascend/atb-models/atb_llm/models/qwen2/modeling_qwen2.py", line 149, in __init__

self.c_attn = load_column_multi(

File "/usr/local/Ascend/atb-models/atb_llm/utils/layers/__init__.py", line 48, in load_column_multi

weight = weights.get_multi_weights_col(prefixes, quantize=quantize, dim=0, gqa_size=head_size)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in get_multi_weights_col

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 657, in <listcomp>

w = [self.get_sharded(f"{p}.weight", dim=dim, gqa_size=gqa_size) for p in prefixes]

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 331, in get_sharded

slice_ = self._get_slice(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 718, in _get_slice

filename, tensor_name = self.get_filename(tensor_name)

File "/usr/local/Ascend/atb-models/atb_llm/utils/weights.py", line 146, in get_filename

raise AssertionError(f"weight {tensor_name} does not exist")

AssertionError: weight model.layers.0.self_attn.q_proj.weight does not exist

ERROR 2025-03-07-16:37:05 (PID:24166, Device:3, RankID:-1) ERR99999 UNKNOWN application exception

ERROR 2025-03-07-16:37:05 (PID:24168, Device:5, RankID:-1) ERR99999 UNKNOWN application exception

ERROR 2025-03-07-16:37:05 (PID:24170, Device:7, RankID:-1) ERR99999 UNKNOWN application exception

ERROR 2025-03-07-16:37:05 (PID:24167, Device:4, RankID:-1) ERR99999 UNKNOWN application exception

ERROR 2025-03-07-16:37:06 (PID:24164, Device:1, RankID:-1) ERR99999 UNKNOWN application exception

ERROR 2025-03-07-16:37:06 (PID:24163, Device:0, RankID:-1) ERR99999 UNKNOWN application exception

ERROR 2025-03-07-16:37:06 (PID:24169, Device:6, RankID:-1) ERR99999 UNKNOWN application exception

ERROR 2025-03-07-16:37:07 (PID:24165, Device:2, RankID:-1) ERR99999 UNKNOWN application exception

2025-03-07 16:37:13,455 torch.distributed.elastic.multiprocessing.api: WARNING Sending process 24163 closing signal SIGTERM

2025-03-07 16:37:13,487 torch.distributed.elastic.multiprocessing.api: ERROR failed (exitcode: 1) local_rank: 1 (pid: 24164) of binary: /root/miniconda3/envs/Python310/bin/python

Traceback (most recent call last):

File "/root/miniconda3/envs/Python310/bin/torchrun", line 8, in <module>

sys.exit(main())

File "/root/miniconda3/envs/Python310/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper

return f(*args, **kwargs)

File "/root/miniconda3/envs/Python310/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main

run(args)

File "/root/miniconda3/envs/Python310/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run

elastic_launch(

File "/root/miniconda3/envs/Python310/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__

return launch_agent(self._config, self._entrypoint, list(args))

File "/root/miniconda3/envs/Python310/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent

raise ChildFailedError(

torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

============================================================

examples.run_pa FAILED


Failures:

1:

time : 2025-03-07_16:37:13

host : huawei

rank : 2 (local_rank: 2)

exitcode : 1 (pid: 24165)

error_file: <N/A>

traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

2:

time : 2025-03-07_16:37:13

host : huawei

rank : 3 (local_rank: 3)

exitcode : 1 (pid: 24166)

error_file: <N/A>

traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

3:

time : 2025-03-07_16:37:13

host : huawei

rank : 4 (local_rank: 4)

exitcode : 1 (pid: 24167)

error_file: <N/A>

traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

4:

time : 2025-03-07_16:37:13

host : huawei

rank : 5 (local_rank: 5)

exitcode : 1 (pid: 24168)

error_file: <N/A>

traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

5:

time : 2025-03-07_16:37:13

host : huawei

rank : 6 (local_rank: 6)

exitcode : 1 (pid: 24169)

error_file: <N/A>

traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

6:

time : 2025-03-07_16:37:13

host : huawei

rank : 7 (local_rank: 7)

exitcode : 1 (pid: 24170)

error_file: <N/A>

traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html


Root Cause (first observed failure):

0:

time : 2025-03-07_16:37:13

host : huawei

rank : 1 (local_rank: 1)

exitcode : 1 (pid: 24164)

error_file: <N/A>

traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

============================================================

(Python310) root@huawei:/usr/local/Ascend/atb-models#

执行量化版报错:

镜像仓库网

镜像仓库网

相关推荐
顾籽黎3 个月前
MindIE 启动模型开启思考分离说明
mindie·deepseekv3.1·思考分离