deepseek-glm4-grpo训练

一、目录

1.grpo 重新训练已经微调的glm4模型

二、实现

1.grpo 重新训练已经微调的glm4模型

1.1 指令:

复制代码
 CUDA_VISIBLE_DEVICES=1 nohup python test.py --model_name_or_path /home/LLaMA-Factory/saves/glm4-9b-lora-alpaca_reference_train20250115_01_merge \
        --dataset_name /home/LLaMA-Factory/data/alpca_all_simple.json \
        --learning_rate 5.0e-6 \
        --num_train_epochs 2   \
        --per_device_train_batch_size 2  \
        --num_generations 4 \
        --gradient_accumulation_steps 4 \
         --logging_steps 25 \
        --eval_strategy steps \
        --eval_steps 50 \
        --use_peft 1 \
        --lora_r 32 \
        --lora_alpha 16 \
        --output_dir /saves/glm4-9b-grpo >grop_output.log 2>&1 &

1.2 遇到问题及解决

复制代码
1. tokenizer no padding_side 字段
解决:脚本中添加该字段  padding_side: Optional[str] = None,
 def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            padding_side: Optional[str] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
2. model no num_logits_to_keep 字段
修改模型脚本,进行添加该字段,以及相关功能。
 def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
            num_logits_to_keep: int = 0
    ):
        lm_logits = self.transformer.output_layer(hidden_states[:, -num_logits_to_keep:, :])

1.3 脚本

复制代码
#coding="utf8"
import json
import argparse
from typing import Optional
from dataclasses import dataclass, field
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config


@dataclass
class GRPOScriptArguments(ScriptArguments):
    """
    Script arguments for the GRPO training script.

    Args:
        reward_model_name_or_path (`str` or `None`):
            Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
            directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
    """

    reward_model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Reward model id of a pretrained model hosted inside a model repo on huggingface.co or "
            "local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`."
        },
    )


class MyDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]["instruction"]
        return {"prompt": text}

def get_dataset(path):
    import json
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    dataset = MyDataset(data[:-200])
    evaldataset = MyDataset(data[-200:])
    return dataset, evaldataset


def main(script_args, training_args, model_args):
    # Load a pretrained model
    print(model_args.model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )

    def reward_len(completions, **kwargs):
        #输出奖励
        data = []
        for completion in completions:
           
            try:
                completion = json.loads(completion)
                data.append(1.0)
            except:
                
                data.append(0.0)
         
        return data
    
    # Load the dataset
    dataset, evaldataset = get_dataset(script_args.dataset_name)

    # Initialize the GRPO trainer
    trainer = GRPOTrainer(
        model=model,
        reward_funcs = reward_len,
        args=training_args,
        train_dataset = dataset,
        eval_dataset = evaldataset if training_args.eval_strategy != "no" else None,
        processing_class=tokenizer,
        peft_config=get_peft_config(model_args),
    )

    # Train and push the model to the Hub
    trainer.train()

    # Save and push to hub
    trainer.save_model(training_args.output_dir)
    # if training_args.push_to_hub:
    #     trainer.push_to_hub(dataset_name=script_args.dataset_name)


def make_parser(subparsers: argparse._SubParsersAction = None):
    dataclass_types = (GRPOScriptArguments, GRPOConfig, ModelConfig)
    if subparsers is not None:
        parser = subparsers.add_parser("grpo", help="Run the GRPO training script", dataclass_types=dataclass_types)
    else:
        parser = TrlParser(dataclass_types)
    return parser


if __name__ == "__main__":
    parser = make_parser()
    script_args, training_args, model_args = parser.parse_args_and_config()
    main(script_args, training_args, model_args)
相关推荐
碣石潇湘无限路12 分钟前
【AI篇】当Transformer模型开始学习《孙子兵法》
人工智能·学习
看到我,请让我去学习23 分钟前
OpenCV开发-初始概念
人工智能·opencv·计算机视觉
汀沿河23 分钟前
8.1 prefix Tunning与Prompt Tunning模型微调方法
linux·运维·服务器·人工智能
陈敬雷-充电了么-CEO兼CTO33 分钟前
大模型技术原理 - 基于Transformer的预训练语言模型
人工智能·深度学习·语言模型·自然语言处理·chatgpt·aigc·transformer
学术 学术 Fun39 分钟前
✨ OpenAudio S1:影视级文本转语音与语音克隆Mac整合包
人工智能·语音识别
风铃喵游1 小时前
让大模型调用MCP服务变得超级简单
前端·人工智能
booooooty2 小时前
基于Spring AI Alibaba的多智能体RAG应用
java·人工智能·spring·多智能体·rag·spring ai·ai alibaba
PyAIExplorer2 小时前
基于 OpenCV 的图像 ROI 切割实现
人工智能·opencv·计算机视觉
风口猪炒股指标2 小时前
技术分析、超短线打板模式与情绪周期理论,在市场共识的形成、分歧、瓦解过程中缘起性空的理解
人工智能·博弈论·群体博弈·人生哲学·自我引导觉醒
ai_xiaogui3 小时前
一键部署AI工具!用AIStarter快速安装ComfyUI与Stable Diffusion
人工智能·stable diffusion·部署ai工具·ai应用市场教程·sd快速部署·comfyui一键安装