【llm 微调code-llama 训练自己的数据集 一个小案例】

这也是一个通用的方案,使用peft微调LLM。

准备自己的数据集

按自己的数据情况修改即可。输出为 jsonl 格式,每行一条记录,包含三个字段:context、question、answer。

python 复制代码
import pandas as pd
import random
import json


def _write_jsonl(rows, path):
    """Write ``[prompt, code]`` rows to *path* as JSON Lines records.

    Each output line is a JSON object with the keys ``context`` (always an
    empty string), ``question`` (the first item of the row) and ``answer``
    (the second item of the row).
    """
    with open(path, 'w', encoding='utf-8') as f:
        for question, answer in rows:
            record = {
                'context': '',
                'question': question,
                'answer': answer,
            }
            f.write(json.dumps(record) + '\n')


data = pd.read_csv('dataset.csv')
# A row with a missing prompt or code cell would be serialised as the bare
# token NaN, which is not valid JSON, so such rows are dropped up front.
train_data = data[['prompt', 'Code']].dropna()
train_data = train_data.values.tolist()

random.shuffle(train_data)

# The first 80% of the shuffled rows are used for training, the rest for
# validation.
train_num = int(0.8 * len(train_data))

_write_jsonl(train_data[:train_num], 'train_data.jsonl')
_write_jsonl(train_data[train_num:], 'val_data.jsonl')

初始化

python 复制代码
from datetime import datetime
import os
import sys

import torch

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)
from transformers import (AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM,
                          TrainingArguments, Trainer, DataCollatorForSeq2Seq)

# 加载自己的数据集
from datasets import load_dataset

# Name/path of the checkpoint shared by the model and the tokenizer.
base_model = 'CodeLlama-7b-Instruct-hf'

# Load the two JSONL files written earlier; both are requested with
# split='train'.
train_dataset, eval_dataset = (
    load_dataset('json', data_files=jsonl_path, split='train')
    for jsonl_path in ('train_data.jsonl', 'val_data.jsonl')
)

# Options for loading the base model: 8-bit weights, fp16 dtype and
# automatic device placement.
model_load_options = dict(
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)
model = AutoModelForCausalLM.from_pretrained(base_model, **model_load_options)

tokenizer = AutoTokenizer.from_pretrained(base_model)

微调前的效果

python 复制代码
# The batched tokenizer call below pads its inputs, so a pad token is set
# first (the EOS token is reused for that purpose).
tokenizer.pad_token = tokenizer.eos_token
prompt = """You are programming coder.

Now answer the question:

{}"""
# Use a few fixed rows as samples, skipping any index past the end of the
# dataset so that a small dataset does not raise IndexError.
sample_indices = [i for i in (1, 20, 32, 45, 67) if i < len(train_dataset)]
prompts = [prompt.format(train_dataset[i]['question']) for i in sample_indices]

# Place the inputs on the device the model lives on instead of assuming "cuda".
model_input = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)


model.eval()
with torch.no_grad():
    outputs = model.generate(**model_input, max_new_tokens=300)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(outputs)

进行微调

python 复制代码
# Tokenizer settings applied before the training examples are built.
tokenizer.padding_side = "left"  # pad at the start of each sequence
tokenizer.pad_token_id = 0  # NOTE(review): presumably the <unk> id in the Llama vocab; confirm
tokenizer.add_eos_token = True  # ask the tokenizer to append EOS to encoded text


def tokenize(prompt):
    """Tokenize *prompt* and attach labels for causal-LM training.

    The text is truncated to at most 512 tokens and is not padded here.
    The returned encoding gets a ``labels`` entry that is a copy of its
    ``input_ids``.
    """
    encoded = tokenizer(
        prompt,
        max_length=512,
        truncation=True,
        padding=False,
        return_tensors=None,
    )

    # The training targets are the input tokens themselves.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded


def generate_and_tokenize_prompt(data_point):
    """Render one dataset record into the training prompt and tokenize it.

    ``data_point`` is read through its ``"question"`` and ``"answer"`` keys.
    The question is placed under the ``### Input:`` header and the answer
    under ``### Response:``; the rendered text is passed to ``tokenize`` and
    its result is returned.
    """
    question = data_point["question"]
    answer = data_point["answer"]

    instruction = (
        "You are a powerful programming model. Your job is to answer questions about a database. You are given a question.\n"
        "\n"
        "You must output the code that answers the question.\n"
        "\n"
    )
    input_section = "### Input:\n" + f"{question}\n" + "\n"
    response_section = "### Response:\n" + f"{answer}\n"

    return tokenize(instruction + input_section + response_section)


# Apply the prompt builder/tokenizer to every record of both splits
# (training split first, then validation).
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
)


# Switch the model to training mode, then prepare the 8-bit model for training.
model.train()
model = prepare_model_for_int8_training(model)

# LoRA is attached to the q/k/v/o projection modules.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)
model = get_peft_model(model, config)

# When more than one GPU is visible, flag the model as model-parallel; this
# keeps Trainer from trying its own DataParallelism.
if torch.cuda.device_count() > 1:
    for flag in ("is_parallelizable", "model_parallel"):
        setattr(model, flag, True)



output_dir = "code-llama-ft"

# Gradients are accumulated so that batch_size // per_device_train_batch_size
# (128 // 32 = 4) per-device batches contribute to each optimizer step.
batch_size = 128
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size

training_args = TrainingArguments(
    output_dir=output_dir,
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    report_to="none",  # no external experiment tracker
    # Batching.
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,  # batch sequences of roughly the same length together
    # Optimisation schedule.
    optim="adamw_torch",
    learning_rate=3e-4,
    warmup_steps=100,
    max_steps=400,
    fp16=True,
    # Logging, evaluation and checkpointing cadence, all counted in steps.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    load_best_model_at_end=False,
)

# Padding happens per batch in the collator (the tokenize step leaves
# sequences unpadded).
collator = DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=collator,
)

开始训练

python 复制代码
# Disable the generation cache while training.
model.config.use_cache = False

# While training runs, make model.state_dict() return only the PEFT adapter
# weights.
# NOTE(review): recent peft/transformers releases save adapters natively and
# this patch is reported to yield empty checkpoint adapters there; confirm
# against the installed versions and drop the patch if so.
old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)

# torch.compile is deliberately not applied at this point: `trainer` was
# constructed with its own reference to the model, so rebinding the name
# `model` to a compiled wrapper would not change what gets trained.
trainer.train()

# Undo the patch so the adapter is exported from the real state dict, then
# write it into output_dir itself. Trainer checkpoints are written to
# output_dir/checkpoint-<step>, whereas the inference code loads the adapter
# directly from output_dir.
model.state_dict = old_state_dict
model.save_pretrained(output_dir)

进行测试

python 复制代码
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

# Reload the base model in 8-bit with automatic device placement.
base_model = 'CodeLlama-7b-Instruct-hf'
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)


# Attach the fine-tuned adapter stored under output_dir to the base model.
output_dir = "code-llama-ft"
model = PeftModel.from_pretrained(model, output_dir)


# Same instruction template as the training examples, with the response
# section left empty for the model to complete.
eval_prompt = """You are a powerful programming model. Your job is to answer questions about a database. You are given a question.

You must output the code that answers the question.

### Input:
Write a function in Java that takes an array and returns the sum of the numbers in the array, or 0 if the array is empty. Except the number 13 is very unlucky, so it does not count any 13, or any number that immediately follows a 13.

### Response:
"""

# Place the inputs on the device the model lives on instead of assuming "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

model.eval()
with torch.no_grad():
    # generate() returns a batch; take the single sequence.
    outputs = model.generate(**model_input, max_new_tokens=100)[0]
print(tokenizer.decode(outputs, skip_special_tokens=True))

主要参考:https://zhuanlan.zhihu.com/p/660933421

相关推荐
蔗理苦5 小时前
2025-04-05 吴恩达机器学习5——逻辑回归(2):过拟合与正则化
人工智能·python·机器学习·逻辑回归
MobiCetus10 小时前
Deep Reinforcement Learning for Robotics翻译解读2
人工智能·深度学习·神经网络·机器学习·生成对抗网络·计算机视觉·数据挖掘
前端不能无12 小时前
从零开始本地部署Open WebUI完整指南
llm·github
IT古董13 小时前
【漫话机器学习系列】178.多元逻辑回归(Multinomial Logistic Regression)
人工智能·机器学习·逻辑回归
明月看潮生14 小时前
青少年编程与数学 02-015 大学数学知识点 02课题、线性代数
线性代数·机器学习·青少年编程·编程与数学
Agile.Zhou16 小时前
使用 AutoGen Studio 打造你的私有团队
llm·deepseek
L_cl16 小时前
【NLP 54、大模型训练相关知识】
人工智能·机器学习·自然语言处理
xidianjiapei00116 小时前
LLM架构解析:循环神经网络(RNN)(第三部分)—— 从基础原理到实践应用的深度探索
人工智能·rnn·深度学习·神经网络·机器学习·llm
JinYoMo19 小时前
【手把手教你从零开始YOLOv8-入门篇】YOLOv8 数据集构建
算法·机器学习·计算机视觉
求知呀20 小时前
MCP 模型上下文协议
llm·mcp