deepseek-glm4-grpo训练

一、目录

1.grpo 重新训练已经微调的glm4模型

二、实现

1.grpo 重新训练已经微调的glm4模型

1.1 指令:

复制代码
 # Launch the GRPO training script (test.py) in the background with nohup,
 # restricted to GPU index 1 via CUDA_VISIBLE_DEVICES. stdout and stderr are
 # both redirected to grop_output.log.
 # The run passes --use_peft 1 together with --lora_r 32 / --lora_alpha 16,
 # evaluates every 50 steps and logs every 25 steps.
 # NOTE(review): "grop_output.log" looks like a typo for "grpo_output.log",
 # and "alpca_all_simple.json" for "alpaca_..." - confirm before renaming.
 CUDA_VISIBLE_DEVICES=1 nohup python test.py --model_name_or_path /home/LLaMA-Factory/saves/glm4-9b-lora-alpaca_reference_train20250115_01_merge \
        --dataset_name /home/LLaMA-Factory/data/alpca_all_simple.json \
        --learning_rate 5.0e-6 \
        --num_train_epochs 2   \
        --per_device_train_batch_size 2  \
        --num_generations 4 \
        --gradient_accumulation_steps 4 \
         --logging_steps 25 \
        --eval_strategy steps \
        --eval_steps 50 \
        --use_peft 1 \
        --lora_r 32 \
        --lora_alpha 16 \
        --output_dir /saves/glm4-9b-grpo >grop_output.log 2>&1 &

1.2 遇到问题及解决

复制代码
1. tokenizer 的 _pad 方法缺少 padding_side 参数
解决:在 tokenizer 脚本的 _pad 方法签名中添加该参数  padding_side: Optional[str] = None,
 def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            padding_side: Optional[str] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
2. 模型的 forward 方法缺少 num_logits_to_keep 参数
解决:修改模型脚本,在 forward 的签名中添加该参数,并实现相应功能(只对最后 num_logits_to_keep 个位置的 hidden_states 计算 logits)。
 def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
            num_logits_to_keep: int = 0
    ):
        lm_logits = self.transformer.output_layer(hidden_states[:, -num_logits_to_keep:, :])

1.3 脚本

复制代码
# -*- coding: utf-8 -*-
import json
import argparse
from typing import Optional
from dataclasses import dataclass, field
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config


@dataclass
class GRPOScriptArguments(ScriptArguments):
    """
    Extra command-line arguments for the GRPO script, added on top of `ScriptArguments`.

    Args:
        reward_model_name_or_path (`str` or `None`):
            Identifier of a pretrained reward model in a huggingface.co model repo, or a local directory
            holding weights written by [`~transformers.PreTrainedModel.save_pretrained`]. Defaults to
            `None`. `main` in this script does not read it: the reward there is a Python function.
    """

    reward_model_name_or_path: Optional[str] = field(
        metadata=dict(
            help=(
                "Reward model id of a pretrained model hosted inside a model repo "
                "on huggingface.co or local path to a directory containing model "
                "weights saved using `PreTrainedModel.save_pretrained`."
            ),
        ),
        default=None,
    )


class MyDataset(Dataset):
    """Dataset wrapper that exposes each record's instruction as a prompt.

    Wraps an indexable collection of records. Item ``idx`` is returned as a
    one-key dict ``{"prompt": record["instruction"]}``.
    """

    def __init__(self, dataset):
        # Keep a reference to the raw records; nothing is copied or validated.
        self.dataset = dataset

    def __getitem__(self, idx):
        # Only the "instruction" field of the record is used.
        record = self.dataset[idx]
        return dict(prompt=record["instruction"])

    def __len__(self):
        return len(self.dataset)

def get_dataset(path, eval_size=200):
    """Load a JSON list of records and split it into train and eval datasets.

    The last ``eval_size`` records are held out for evaluation; everything
    before them is used for training.

    Args:
        path: Path of a UTF-8 encoded JSON file whose top-level value is a
            list of records.
        eval_size: Number of trailing records to hold out for evaluation.
            Defaults to 200, the split the script used before this was a
            parameter.

    Returns:
        A ``(train_dataset, eval_dataset)`` tuple of ``MyDataset`` objects.

    Raises:
        TypeError: If the JSON top-level value is not a list.
        ValueError: If ``eval_size`` is negative, or if holding out
            ``eval_size`` records would leave no record for training.
    """
    if eval_size < 0:
        raise ValueError(f"eval_size must be non-negative, got {eval_size}")

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise TypeError(
            f"expected a JSON list of records in {path!r}, got {type(data).__name__}"
        )

    # Use an explicit split index instead of negative slicing: `data[:-0]`
    # would be empty and `data[-0:]` would be the whole list.
    split = len(data) - eval_size
    if split <= 0:
        # Fail here with a clear message rather than handing the trainer an
        # empty training set.
        raise ValueError(
            f"{path!r} has {len(data)} records; holding out {eval_size} for "
            "evaluation leaves nothing to train on"
        )

    dataset = MyDataset(data[:split])
    evaldataset = MyDataset(data[split:])
    return dataset, evaldataset


def reward_len(completions, **kwargs):
    """Score each completion by whether it is valid JSON.

    NOTE(review): despite the name, this does not measure length. The name is
    kept unchanged in case the trainer derives logged metric names from the
    function name - confirm before renaming.

    Args:
        completions: Iterable of generated completions (expected to be strings).
        **kwargs: Extra keyword arguments passed by the caller; ignored.

    Returns:
        A list with one float per completion: 1.0 if ``json.loads`` accepts
        it, 0.0 otherwise (including non-string completions).
    """
    rewards = []
    for completion in completions:
        try:
            json.loads(completion)
        except (ValueError, TypeError, RecursionError):
            # ValueError covers json.JSONDecodeError (malformed text),
            # TypeError covers non-string completions, RecursionError covers
            # pathologically nested output. Anything else (for example
            # KeyboardInterrupt) is deliberately not swallowed.
            rewards.append(0.0)
        else:
            rewards.append(1.0)
    return rewards


def main(script_args, training_args, model_args):
    """Run GRPO training with the JSON-validity reward and save the result.

    Args:
        script_args: Parsed script arguments; ``dataset_name`` is used as the
            path of the JSON dataset file.
        training_args: Trainer configuration passed to ``GRPOTrainer``; its
            ``eval_strategy`` and ``output_dir`` are also read here.
        model_args: Model configuration; ``model_name_or_path`` selects the
            model and tokenizer, and the PEFT config is derived from it.
    """
    # Load a pretrained model and its tokenizer.
    # NOTE(security): trust_remote_code=True lets the checkpoint directory
    # supply Python code that is executed locally; only point this at
    # checkpoints you trust.
    print(model_args.model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )

    # Load the dataset
    dataset, evaldataset = get_dataset(script_args.dataset_name)

    # Initialize the GRPO trainer
    trainer = GRPOTrainer(
        model=model,
        reward_funcs=reward_len,
        args=training_args,
        train_dataset=dataset,
        # Only hand over the eval split when evaluation is enabled.
        eval_dataset=evaldataset if training_args.eval_strategy != "no" else None,
        processing_class=tokenizer,
        peft_config=get_peft_config(model_args),
    )

    # Train
    trainer.train()

    # Save the trained model to the output directory (Hub push is disabled below)
    trainer.save_model(training_args.output_dir)
    # if training_args.push_to_hub:
    #     trainer.push_to_hub(dataset_name=script_args.dataset_name)


def make_parser(subparsers: argparse._SubParsersAction = None):
    """Build the argument parser for the GRPO script.

    Args:
        subparsers: Optional sub-parser collection. When given, a "grpo"
            sub-command is registered on it; when ``None``, a standalone
            ``TrlParser`` is created instead.

    Returns:
        The parser covering ``GRPOScriptArguments``, ``GRPOConfig`` and
        ``ModelConfig``.
    """
    dataclass_types = (GRPOScriptArguments, GRPOConfig, ModelConfig)

    # Standalone use: no parent CLI to attach to.
    if subparsers is None:
        return TrlParser(dataclass_types)

    return subparsers.add_parser(
        "grpo",
        help="Run the GRPO training script",
        dataclass_types=dataclass_types,
    )


if __name__ == "__main__":
    # Parse (script_args, training_args, model_args) from the command line
    # and/or config, then forward them to main() in that order.
    parsed_args = make_parser().parse_args_and_config()
    main(*parsed_args)
相关推荐
ZzT1 小时前
怎么做才不会被 AI 替代?
人工智能·程序员
道友可好1 小时前
从今天开始:你的第一个 Harness Engineering 实践
前端·人工智能·后端
小姜前线技术2 小时前
AI回答代码块高亮加一键复制
人工智能
洛阳泰山2 小时前
从 0 到 1.6K Star:一个 Java 开源项目的增长复盘
人工智能·后端·开源
米小虾3 小时前
Agent Skill 设计模式完全指南
人工智能·agent
饼干哥哥3 小时前
保姆级教程:用Image2 + Seedance2.0 做长视频,以品牌广告为例
人工智能
米小虾4 小时前
Agent Skill 规范与 Skill-Creator 核心思想
人工智能·agent
ZhengEnCi4 小时前
09e-斯坦福CS336作业四:大规模语言模型训练数据收集与处理
人工智能
oil欧哟4 小时前
Codex 最佳实践(超级长文):先搞懂 AI,再用好 AI
前端·人工智能·后端
甲维斯4 小时前
日本发布比肩Fable5的模型?Fugu Ultra初探!
人工智能·ai编程