```python
import os
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from accelerate import Accelerator

# Configure the CUDA allocator to reduce GPU memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:64'

# Data and model paths
data_path = '/data/sft_qwen2_7B/'  # assumes the dataset has already been prepared
model_path = '/data/model/'        # model path

# Initialize the Accelerator
accelerator = Accelerator()

# Check the available GPU devices
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"Available GPU devices: {device_count}")

# Label padding id set to -100 so those positions are ignored by the loss
PaddingID = -100
def preprocess_inputs(examples, max_len=8192):
    """
    Preprocess the input data: convert each question/answer pair into the format the model expects.
    """
    prompt_template = '<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n'
    system_prompt = "You are a knowledgeable assistant; please answer the question comprehensively and correctly."
    model_inputs = {'input_ids': [], 'labels': []}
    for i in range(len(examples['question'])):
        prompt = prompt_template.format(
            system_prompt=system_prompt,
            user_prompt=examples['question'][i]
        )
        # Encode the question part
        a_ids = tokenizer.encode(prompt)
        # Encode the answer part without special tokens and append eos_token_id
        b_ids = tokenizer.encode(f"{examples['answer'][i]}", add_special_tokens=False) + [tokenizer.eos_token_id]
        input_ids = a_ids + b_ids
        # Truncate if the sequence exceeds max_len
        if len(input_ids) > max_len:
            input_ids = input_ids[:max_len]
        # Pad input_ids and labels up to max_len
        pad_length = max_len - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * pad_length
        labels = [PaddingID] * len(a_ids) + b_ids + [PaddingID] * pad_length
        model_inputs['input_ids'].append(input_ids)
        model_inputs['labels'].append(labels)
    return model_inputs
if __name__ == "__main__":
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Load the dataset from local disk with `load_from_disk`
    dataset = load_from_disk(data_path)

    # Preprocess the dataset
    train_dataset = dataset['train'].map(
        preprocess_inputs,
        batched=True,
        num_proc=1,
        load_from_cache_file=False
    )

    # Use float16 to reduce GPU memory usage and speed up training
    data_type = torch.float16 if torch.cuda.is_available() else torch.float32

    # Load the model; `Accelerate` simplifies model loading and device mapping
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=data_type,
        trust_remote_code=True
    )

    # Let the Accelerator handle distributing the model and data across GPUs
    model, train_dataset = accelerator.prepare(model, train_dataset)

    # Enable gradient checkpointing to reduce GPU memory consumption
    model.gradient_checkpointing_enable()

    # Use DataCollatorForSeq2Seq for dynamic padding of each batch
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        label_pad_token_id=PaddingID,
        padding=True
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./outputs',
        per_device_train_batch_size=1,   # batch size per device
        logging_steps=10,
        gradient_accumulation_steps=16,  # gradient accumulation for a larger effective batch size
        num_train_epochs=3,              # number of training epochs
        weight_decay=0.01,               # weight decay
        warmup_steps=100,                # warmup steps
        learning_rate=5e-5,              # learning rate
        lr_scheduler_type='cosine',      # cosine learning-rate schedule
        save_strategy='steps',           # save a checkpoint every `save_steps` steps
        save_steps=500,                  # save interval
        bf16=False,                      # disable bf16
        report_to='wandb',               # track training with Weights & Biases
        logging_dir='./logs',            # log directory
        dataloader_num_workers=4,        # number of dataloader workers
        fp16=True,                       # use fp16 to reduce GPU memory usage
    )

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Start training
    trainer.train()
```
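For reference, a schematic (with made-up token ids, not real ones) of what one row produced by `preprocess_inputs` looks like: only the answer tokens and the trailing eos carry labels, while the prompt and the padding positions are masked with -100 and ignored by the loss.

```python
# Schematic of one preprocessed example; all ids below are illustrative.
a_ids = [11, 12, 13]      # prompt tokens (system + user + assistant header)
b_ids = [21, 22, 23, 99]  # answer tokens, with 99 standing in for eos_token_id
pad = 0                   # stands in for tokenizer.pad_token_id

input_ids = a_ids + b_ids + [pad, pad]                  # padded up to max_len
labels    = [-100] * len(a_ids) + b_ids + [-100, -100]  # prompt/padding masked out
```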
Error 1:
```shell
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 448.00 MiB (GPU 6; 79.15 GiB total capacity; 1.72 GiB already allocated; 185.69 MiB free; 1.73 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.50 GiB (GPU 6; 79.15 GiB total capacity; 1.40 GiB already allocated; 421.69 MiB free; 1.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
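Both messages report GPU 6 with 79.15 GiB total capacity but well under half a GiB free, while this process has reserved less than 2 GiB, so most of the card's memory is held outside the current process. A minimal diagnostic sketch (standalone, not part of the training script) that prints free/total memory per visible GPU with `torch.cuda.mem_get_info`:

```python
import torch

# Print free/total memory for every visible GPU; cards that are already
# mostly occupied (as the OOM on GPU 6 suggests) can then be excluded via
# CUDA_VISIBLE_DEVICES before launching the training script.
for idx in range(torch.cuda.device_count()):
    free_bytes, total_bytes = torch.cuda.mem_get_info(idx)
    print(f"cuda:{idx}: {free_bytes / 1024**3:.2f} GiB free / {total_bytes / 1024**3:.2f} GiB total")
```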
Error 2:
```shell
pyarrow.lib.ArrowInvalid: Column 9 named input_ids expected length 1000 but got length 978
```
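With `batched=True`, `Dataset.map` writes the returned columns back into the Arrow table, so each returned column must contain exactly one row per input example unless the original columns are removed; here the writer expected 1000 rows of `input_ids` for the batch but received 978. A small sanity check, assuming it is run inside the script's `__main__` after the tokenizer and dataset are loaded, that calls `preprocess_inputs` directly on a raw batch and verifies the row counts and per-row lengths:

```python
# Sanity check (sketch): preprocess a small raw batch and confirm it returns
# one row per input example, with input_ids and labels of equal length.
raw_batch = dataset['train'][:16]        # a small slice in batched (dict-of-lists) form
processed = preprocess_inputs(raw_batch)

assert len(processed['input_ids']) == len(raw_batch['question'])
assert len(processed['labels']) == len(raw_batch['question'])
for ids, lbls in zip(processed['input_ids'], processed['labels']):
    assert len(ids) == len(lbls), (len(ids), len(lbls))
```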
Error 3:
```shell
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!
```
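This error means that tensors taking part in a single operation ended up on both cuda:0 and cuda:1. One plausible trigger in this script is preparing the model with a standalone `Accelerator` while also handing it to `Trainer`, which manages device placement itself. As a first check, a small sketch (run right before `trainer.train()`) that lists which devices the model's parameters actually occupy:

```python
# List the devices the model parameters currently live on; more than one
# entry here (e.g. {cuda:0, cuda:1}) is consistent with the error above.
param_devices = {p.device for p in model.parameters()}
print("parameter devices:", param_devices)
```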