deepseek-glm4-grpo训练

一、目录

1.grpo 重新训练已经微调的glm4模型

二、实现

1.grpo 重新训练已经微调的glm4模型

1.1 指令:

复制代码
 CUDA_VISIBLE_DEVICES=1 nohup python test.py --model_name_or_path /home/LLaMA-Factory/saves/glm4-9b-lora-alpaca_reference_train20250115_01_merge \
        --dataset_name /home/LLaMA-Factory/data/alpca_all_simple.json \
        --learning_rate 5.0e-6 \
        --num_train_epochs 2   \
        --per_device_train_batch_size 2  \
        --num_generations 4 \
        --gradient_accumulation_steps 4 \
         --logging_steps 25 \
        --eval_strategy steps \
        --eval_steps 50 \
        --use_peft 1 \
        --lora_r 32 \
        --lora_alpha 16 \
        --output_dir /saves/glm4-9b-grpo >grpo_output.log 2>&1 &

1.2 遇到问题及解决

复制代码
1. tokenizer 的 _pad 方法缺少 padding_side 参数
解决:在 tokenizer 脚本的 _pad 方法签名中添加该参数 padding_side: Optional[str] = None,如下:
 def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            padding_side: Optional[str] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
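2. 模型的 forward 方法缺少 num_logits_to_keep 参数
解决:修改模型脚本,在 forward 的签名中添加该参数,并在计算 lm_logits 时只保留最后 num_logits_to_keep 个位置的 hidden_states(默认值 0 时切片 [-0:] 等价于保留全部位置):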
 def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
            num_logits_to_keep: int = 0
    ):
        lm_logits = self.transformer.output_layer(hidden_states[:, -num_logits_to_keep:, :])

1.3 脚本

复制代码
# -*- coding: utf-8 -*-
import json
import argparse
from typing import Optional
from dataclasses import dataclass, field
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config


@dataclass
class GRPOScriptArguments(ScriptArguments):
    """
    Extra command-line arguments for the GRPO script, layered on `ScriptArguments`.

    Args:
        reward_model_name_or_path (`str` or `None`):
            Optional reward model to load: either a model id hosted on huggingface.co or a local
            directory holding weights written by [`~transformers.PreTrainedModel.save_pretrained`].
            Defaults to `None`.
    """

    # NOTE(review): nothing in the visible script reads this field; the reward
    # used for training is a Python function, not a reward model.
    reward_model_name_or_path: Optional[str] = field(
        metadata=dict(
            help=(
                "Reward model id of a pretrained model hosted inside a model repo on huggingface.co or "
                "local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`."
            ),
        ),
        default=None,
    )


class MyDataset(Dataset):
    """Map-style dataset that serves each record's ``"instruction"`` as a prompt.

    Wraps an indexable collection of mapping-like records. Indexing returns a
    one-key dict ``{"prompt": <instruction text>}``; every other key of the
    underlying record is ignored.
    """

    def __init__(self, dataset):
        # Keep a reference to the caller's collection; it is not copied.
        self.dataset = dataset

    def __len__(self):
        """Return the number of wrapped records."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``{"prompt": ...}`` built from the record at position ``idx``."""
        record = self.dataset[idx]
        return dict(prompt=record["instruction"])

def get_dataset(path, eval_size=200):
    """Load a JSON dataset file and split it into training and evaluation sets.

    The file is parsed with ``json.load`` and must contain a sliceable sequence
    of records (a JSON array). The last ``eval_size`` records are held out for
    evaluation; everything before them is used for training.

    Args:
        path: Path of the UTF-8 encoded JSON file to read.
        eval_size: Number of trailing records reserved for evaluation.
            Defaults to 200, the split this script has always used.

    Returns:
        A ``(train_dataset, eval_dataset)`` tuple of ``MyDataset`` instances.

    Raises:
        ValueError: If ``eval_size`` is negative, or if the file does not hold
            more than ``eval_size`` records (the training set would be empty).
    """
    if eval_size < 0:
        raise ValueError(f"eval_size must be non-negative, got {eval_size}")

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Use an explicit split index rather than negative slicing: data[:-0]
    # would be empty and data[-0:] would be everything, which is the opposite
    # of what eval_size=0 means.
    split = len(data) - eval_size
    if split <= 0:
        raise ValueError(
            f"{path} holds {len(data)} records, which leaves no training data "
            f"after reserving {eval_size} for evaluation"
        )
    return MyDataset(data[:split]), MyDataset(data[split:])


def main(script_args, training_args, model_args):
    """Train a model with GRPO using a "completion is valid JSON" reward.

    Loads the model and tokenizer from ``model_args.model_name_or_path``,
    builds the train/eval datasets from the JSON file named by
    ``script_args.dataset_name``, runs ``GRPOTrainer.train()`` and saves the
    result to ``training_args.output_dir``.

    Args:
        script_args: Parsed script arguments; only ``dataset_name`` is read
            here, and it is used as a path to a local JSON file.
        training_args: Trainer configuration passed to ``GRPOTrainer`` as
            ``args``; ``eval_strategy`` and ``output_dir`` are also read here.
        model_args: Model configuration; ``model_name_or_path`` selects the
            checkpoint and the whole object is given to ``get_peft_config``.
    """
    # Load the pretrained model and its tokenizer from the same checkpoint.
    print(model_args.model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )

    # NOTE(review): despite its name this reward checks JSON validity, not
    # length. The name is kept because the trainer presumably labels logged
    # reward metrics with the function's name - confirm before renaming.
    def reward_len(completions, **kwargs):
        """Return 1.0 for each completion that parses as JSON, otherwise 0.0."""
        rewards = []
        for completion in completions:
            try:
                json.loads(completion)
            except (TypeError, ValueError, RecursionError):
                # ValueError covers json.JSONDecodeError (malformed JSON),
                # TypeError covers non-string completions, RecursionError
                # covers pathologically nested output. Anything else, such as
                # KeyboardInterrupt, is deliberately allowed to propagate.
                rewards.append(0.0)
            else:
                rewards.append(1.0)
        return rewards

    # Load the dataset
    dataset, evaldataset = get_dataset(script_args.dataset_name)

    # Initialize the GRPO trainer; the eval split is only supplied when
    # evaluation is enabled.
    trainer = GRPOTrainer(
        model=model,
        reward_funcs=reward_len,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=evaldataset if training_args.eval_strategy != "no" else None,
        processing_class=tokenizer,
        peft_config=get_peft_config(model_args),
    )

    # Train the model
    trainer.train()

    # Save the trained model locally.
    # NOTE: pushing to the Hub is intentionally not done by this script.
    trainer.save_model(training_args.output_dir)


def make_parser(subparsers: argparse._SubParsersAction = None):
    """Build the argument parser for the GRPO script.

    Args:
        subparsers: Optional sub-parser collection. When given, a ``"grpo"``
            sub-command is registered on it and that sub-parser is returned.
            When omitted, a standalone ``TrlParser`` is created instead.

    Returns:
        The parser configured with the script, training and model dataclasses.
    """
    arg_dataclasses = (GRPOScriptArguments, GRPOConfig, ModelConfig)

    # Standalone use: no parent command to attach to.
    if subparsers is None:
        return TrlParser(arg_dataclasses)

    # NOTE(review): ``dataclass_types`` is forwarded as a keyword to
    # ``add_parser``, so the sub-parser class must accept it - confirm.
    return subparsers.add_parser(
        "grpo",
        help="Run the GRPO training script",
        dataclass_types=arg_dataclasses,
    )


if __name__ == "__main__":
    # Script entry point: parse the three argument groups from the command
    # line (and optional config), then hand them to main().
    cli_parser = make_parser()
    parsed_script, parsed_training, parsed_model = cli_parser.parse_args_and_config()
    main(
        script_args=parsed_script,
        training_args=parsed_training,
        model_args=parsed_model,
    )
相关推荐
吴佳浩29 分钟前
什么?有人手写 Skill?Agent Skill?Skill?
人工智能·llm·agent
俊哥V5 小时前
每日 AI 研究简报 · 2026-05-21
人工智能·ai
2601_957884845 小时前
深度拆解:大模型RAG架构下,GEO优化的技术实现路径
人工智能·架构
这个DBA有点耶5 小时前
DBA的AI助手:向量检索与NL2SQL入门
数据库·人工智能·postgresql·学习方法·dba
YOLO数据集集合5 小时前
无人机航拍林业树种分割|单木树冠检测|三维点云|遥感影像数据集10059期
人工智能·yolo·目标检测·无人机
Pocker_Spades_A6 小时前
工业智能化的时序选型指南:当数据底座遇见机器学习
人工智能·机器学习
2601_955781986 小时前
飞书远程控机:OpenClaw配置全攻略
人工智能·开源·github·飞书·open claw安装·open claw部署
Inhand陈工6 小时前
游轮WiFi覆盖方案复盘:6台5G CPE + AP实现全船高速上网
人工智能·物联网·网络协议·网络安全·信息与通信·iot
程序猿追6 小时前
在 HarmonyOS 模拟器上种出斐波那契螺旋线
大数据·人工智能·microsoft·华为·harmonyos
:mnong6 小时前
跟着 Sesame Robot 项目学习
人工智能·robot·esp