GPU训练代码 - 技术栈

python 复制代码

import os
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from accelerate import Accelerator

# 设置环境变量，避免显存碎片化问题
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:64'

# 设定数据路径和模型路径
data_path = '/data/sft_qwen2_7B/'  # 假设你已经有处理好的数据集
model_path = '/data/model/'  # 模型路径

# 初始化 Accelerator
accelerator = Accelerator()

# 检查可用的 GPU 设备
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")

# padding id 设置为 -100，用于忽略损失计算
PaddingID = -100

def preprocess_inputs(examples, max_len=8192):
    """
    预处理输入数据，将 question 和 answer 转换为模型可接受的格式。
    """
    prompt_template = '<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n'
    system_prompt = "你是一个知识渊博的人，请根据问题做出全面且正确的回答。"

    model_inputs = {'input_ids': [], 'labels': []}
    for i in range(len(examples['question'])):
        prompt = prompt_template.format(
                    system_prompt=system_prompt,
                    user_prompt=examples['question'][i]
                )
        # 编码 question 部分
        a_ids = tokenizer.encode(prompt)
        # 编码 answer 部分，不加 special token，末尾加 eos_token_id
        b_ids = tokenizer.encode(f"{examples['answer'][i]}", add_special_tokens=False) + [tokenizer.eos_token_id]
        input_ids = a_ids + b_ids

        # 如果长度超过 max_len，进行截断
        if len(input_ids) > max_len:
            input_ids = input_ids[:max_len]

        # 填充 input_ids 和 labels 到 max_len
        pad_length = max_len - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * pad_length
        labels = [PaddingID] * len(a_ids) + b_ids + [PaddingID] * pad_length

        model_inputs['input_ids'].append(input_ids)
        model_inputs['labels'].append(labels)

    return model_inputs

if __name__ == "__main__":
    # 加载 tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # 加载数据集，使用 `load_from_disk` 读取本地磁盘上的数据集
    dataset = load_from_disk(data_path)

    # 预处理数据集
    train_dataset = dataset['train'].map(
        preprocess_inputs, 
        batched=True, 
        num_proc=1, 
        load_from_cache_file=False
    )

    # 使用 float16 来降低显存使用，提升训练效率
    data_type = torch.float16 if torch.cuda.is_available() else torch.float32

    # 加载模型，使用 `Accelerate` 来简化模型加载和设备映射
    model = AutoModelForCausalLM.from_pretrained(
        model_path, 
        torch_dtype=data_type, 
        trust_remote_code=True
    )

    # 使用 Accelerator 自动处理模型和数据在多 GPU 上的分布
    model, train_dataset = accelerator.prepare(model, train_dataset)

    # 启用梯度检查点，减少显存消耗
    model.gradient_checkpointing_enable()

    # 使用 DataCollatorForSeq2Seq 进行批处理数据的动态填充
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer, 
        model=model, 
        label_pad_token_id=PaddingID, 
        padding=True
    )

    # 定义训练参数
    training_args = TrainingArguments(
        output_dir='./outputs',
        per_device_train_batch_size=1,           # 每个设备的批次大小
        logging_steps=10,
        gradient_accumulation_steps=16,          # 梯度累积，提升训练效率
        num_train_epochs=3,                      # 训练轮数
        weight_decay=0.01,                       # 权重衰减
        warmup_steps=100,                        # 预热步数
        learning_rate=5e-5,                      # 学习率
        lr_scheduler_type='cosine',              # 余弦学习率调度
        save_strategy='steps',                   # 每隔一定步数保存
        save_steps=500,                          # 保存步数
        bf16=False,                              # 关闭 bf16 (16 位浮点数) 支持
        report_to='wandb',                       # 使用 Weights & Biases 进行训练跟踪
        logging_dir='./logs',                    # 日志文件保存路径
        dataloader_num_workers=4,                # 数据加载线程数
        fp16=True,                               # 使用 16 位浮点数来减少显存使用
    )

    # 创建 Trainer 实例
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # 开始训练
    trainer.train()

报错1：

shell 复制代码

torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 448.00 MiB (GPU 6; 79.15 GiB total capacity; 1.72 GiB already allocated; 185.69 MiB free; 1.73 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.50 GiB (GPU 6; 79.15 GiB total capacity; 1.40 GiB already allocated; 421.69 MiB free; 1.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

报错2：

shell 复制代码

pyarrow.lib.ArrowInvalid: Column 9 named input_ids expected length 1000 but got length 978

报错3：

shell 复制代码

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!