deepseek-glm4-grpo training

一、Contents

1. Retraining the already fine-tuned GLM4 model with GRPO

二、Implementation

1. Retraining the already fine-tuned GLM4 model with GRPO

1.1 Command:

CUDA_VISIBLE_DEVICES=1 nohup python test.py --model_name_or_path /home/LLaMA-Factory/saves/glm4-9b-lora-alpaca_reference_train20250115_01_merge \
        --dataset_name /home/LLaMA-Factory/data/alpca_all_simple.json \
        --learning_rate 5.0e-6 \
        --num_train_epochs 2 \
        --per_device_train_batch_size 2 \
        --num_generations 4 \
        --gradient_accumulation_steps 4 \
        --logging_steps 25 \
        --eval_strategy steps \
        --eval_steps 50 \
        --use_peft 1 \
        --lora_r 32 \
        --lora_alpha 16 \
        --output_dir /saves/glm4-9b-grpo > grpo_output.log 2>&1 &
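With these flags, one optimizer step consumes per_device_train_batch_size × gradient_accumulation_steps = 2 × 4 = 8 prompts per device, and GRPO samples num_generations = 4 completions for each prompt to form the group whose rewards are compared. use_peft, lora_r and lora_alpha are TRL ModelConfig fields, so the run trains a LoRA adapter rather than updating all 9B parameters.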

1.2 Problems encountered and fixes

1. The tokenizer's _pad has no padding_side parameter: TRL pads prompts with padding_side="left", but the custom GLM4 tokenizer overrides _pad without accepting that argument, so the call fails with a TypeError.
Fix: add the parameter to the _pad signature in the tokenizer script, padding_side: Optional[str] = None:
    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,  # newly added parameter
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
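For illustration, a minimal sketch of how the added parameter can be consumed inside _pad (an assumption for illustration only; the actual GLM4 _pad body performs its own left-padding):

        # Sketch only: fall back to the tokenizer-level default when the caller
        # does not pass padding_side explicitly (TRL's GRPOTrainer passes "left").
        padding_side = padding_side if padding_side is not None else self.padding_side
        assert padding_side == "left", "the GLM4 tokenizer only supports left padding"
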
2. The model's forward has no num_logits_to_keep parameter.
Fix: modify the model script to add the parameter and the corresponding logic:
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        return_last_logit: Optional[bool] = False,
        num_logits_to_keep: int = 0,  # newly added parameter
    ):
        ...
        # compute logits only for the last num_logits_to_keep positions
        lm_logits = self.transformer.output_layer(hidden_states[:, -num_logits_to_keep:, :])
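Note the slicing convention this relies on: because -0 == 0 in Python, num_logits_to_keep=0 keeps logits for every position, while a positive value keeps only the last N positions (TRL passes a positive value so logits are not materialized for the whole prompt). A quick standalone check:

import torch

h = torch.randn(2, 5, 8)   # (batch, seq_len, hidden)
print(h[:, -0:, :].shape)  # torch.Size([2, 5, 8]) -> 0 keeps every position
print(h[:, -1:, :].shape)  # torch.Size([2, 1, 8]) -> keep only the last position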

1.3 Script

# -*- coding: utf-8 -*-
import json
import argparse
from typing import Optional
from dataclasses import dataclass, field
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config


@dataclass
class GRPOScriptArguments(ScriptArguments):
    """
    Script arguments for the GRPO training script.

    Args:
        reward_model_name_or_path (`str` or `None`):
            Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
            directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
    """

    reward_model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Reward model id of a pretrained model hosted inside a model repo on huggingface.co or "
            "local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`."
        },
    )


class MyDataset(Dataset):
    """Wraps the alpaca-style records; each item exposes only the prompt text."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]["instruction"]
        return {"prompt": text}

def get_dataset(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # hold out the last 200 records for evaluation
    dataset = MyDataset(data[:-200])
    evaldataset = MyDataset(data[-200:])
    return dataset, evaldataset
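# Assumed layout of alpca_all_simple.json (field values hypothetical): an
# alpaca-style JSON list in which only "instruction" is consumed as the prompt, e.g.
#   [{"instruction": "Extract the order fields as JSON.", "input": "", "output": "{...}"}, ...]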


def main(script_args, training_args, model_args):
    # Load a pretrained model
    print(model_args.model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )
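    # trust_remote_code is required because GLM4 ships custom tokenization/modeling
    # files; these are the same files patched in section 1.2 above.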

    def reward_len(completions, **kwargs):
        # Reward on the output: despite the name, this scores format rather than
        # length, giving 1.0 to completions that parse as JSON and 0.0 otherwise.
        data = []
        for completion in completions:
            try:
                json.loads(completion)
                data.append(1.0)
            except (json.JSONDecodeError, TypeError):
                data.append(0.0)
        return data
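    # Expected behavior (hypothetical inputs):
    #   reward_len(['{"a": 1}', 'not json'])  ->  [1.0, 0.0]
    # GRPOTrainer calls this with the sampled completions and uses the returned
    # floats as per-completion rewards when computing group-relative advantages.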
    
    # Load the dataset
    dataset, evaldataset = get_dataset(script_args.dataset_name)

    # Initialize the GRPO trainer
    trainer = GRPOTrainer(
        model=model,
        reward_funcs=reward_len,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=evaldataset if training_args.eval_strategy != "no" else None,
        processing_class=tokenizer,
        peft_config=get_peft_config(model_args),
    )

    # Train the model
    trainer.train()

    # Save the final model (pushing to the Hub is left disabled)
    trainer.save_model(training_args.output_dir)
    # if training_args.push_to_hub:
    #     trainer.push_to_hub(dataset_name=script_args.dataset_name)


def make_parser(subparsers: argparse._SubParsersAction = None):
    dataclass_types = (GRPOScriptArguments, GRPOConfig, ModelConfig)
    if subparsers is not None:
        parser = subparsers.add_parser("grpo", help="Run the GRPO training script", dataclass_types=dataclass_types)
    else:
        parser = TrlParser(dataclass_types)
    return parser


if __name__ == "__main__":
    parser = make_parser()
    script_args, training_args, model_args = parser.parse_args_and_config()
    main(script_args, training_args, model_args)
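Saved as test.py (the name used in the 1.1 command), the script is driven entirely by TrlParser: model_name_or_path, use_peft, lora_r and lora_alpha are ModelConfig fields, dataset_name belongs to the ScriptArguments subclass, and the remaining flags are standard GRPOConfig training arguments.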