deepseek-glm4-grpo训练

一、目录

1.grpo 重新训练已经微调的glm4模型

二、实现

1.grpo 重新训练已经微调的glm4模型

1.1 指令:

复制代码
 CUDA_VISIBLE_DEVICES=1 nohup python test.py --model_name_or_path /home/LLaMA-Factory/saves/glm4-9b-lora-alpaca_reference_train20250115_01_merge \
        --dataset_name /home/LLaMA-Factory/data/alpca_all_simple.json \
        --learning_rate 5.0e-6 \
        --num_train_epochs 2   \
        --per_device_train_batch_size 2  \
        --num_generations 4 \
        --gradient_accumulation_steps 4 \
         --logging_steps 25 \
        --eval_strategy steps \
        --eval_steps 50 \
        --use_peft 1 \
        --lora_r 32 \
        --lora_alpha 16 \
        --output_dir /saves/glm4-9b-grpo >grop_output.log 2>&1 &

1.2 遇到问题及解决

复制代码
1. tokenizer no padding_side 字段
解决:脚本中添加该字段  padding_side: Optional[str] = None,
 def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            padding_side: Optional[str] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
2. model no num_logits_to_keep 字段
修改模型脚本,进行添加该字段,以及相关功能。
 def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
            num_logits_to_keep: int = 0
    ):
        lm_logits = self.transformer.output_layer(hidden_states[:, -num_logits_to_keep:, :])

1.3 脚本

复制代码
#coding="utf8"
import json
import argparse
from typing import Optional
from dataclasses import dataclass, field
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config


@dataclass
class GRPOScriptArguments(ScriptArguments):
    """
    Script arguments for the GRPO training script.

    Args:
        reward_model_name_or_path (`str` or `None`):
            Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
            directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
    """

    reward_model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Reward model id of a pretrained model hosted inside a model repo on huggingface.co or "
            "local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`."
        },
    )


class MyDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]["instruction"]
        return {"prompt": text}

def get_dataset(path):
    import json
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    dataset = MyDataset(data[:-200])
    evaldataset = MyDataset(data[-200:])
    return dataset, evaldataset


def main(script_args, training_args, model_args):
    # Load a pretrained model
    print(model_args.model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=True
    )

    def reward_len(completions, **kwargs):
        #输出奖励
        data = []
        for completion in completions:
           
            try:
                completion = json.loads(completion)
                data.append(1.0)
            except:
                
                data.append(0.0)
         
        return data
    
    # Load the dataset
    dataset, evaldataset = get_dataset(script_args.dataset_name)

    # Initialize the GRPO trainer
    trainer = GRPOTrainer(
        model=model,
        reward_funcs = reward_len,
        args=training_args,
        train_dataset = dataset,
        eval_dataset = evaldataset if training_args.eval_strategy != "no" else None,
        processing_class=tokenizer,
        peft_config=get_peft_config(model_args),
    )

    # Train and push the model to the Hub
    trainer.train()

    # Save and push to hub
    trainer.save_model(training_args.output_dir)
    # if training_args.push_to_hub:
    #     trainer.push_to_hub(dataset_name=script_args.dataset_name)


def make_parser(subparsers: argparse._SubParsersAction = None):
    dataclass_types = (GRPOScriptArguments, GRPOConfig, ModelConfig)
    if subparsers is not None:
        parser = subparsers.add_parser("grpo", help="Run the GRPO training script", dataclass_types=dataclass_types)
    else:
        parser = TrlParser(dataclass_types)
    return parser


if __name__ == "__main__":
    parser = make_parser()
    script_args, training_args, model_args = parser.parse_args_and_config()
    main(script_args, training_args, model_args)
相关推荐
大嘴带你水论文1 天前
震惊!仅用10张照片就能随意编辑3D人脸?韩国KAIST最新黑科技FFaceNeRF解析!
论文阅读·人工智能·python·科技·计算机视觉·3d·transformer
IT_陈寒1 天前
🔥3分钟掌握JavaScript性能优化:从V8引擎原理到5个实战提速技巧
前端·人工智能·后端
格林威1 天前
棱镜的技术加持:线扫相机如何同时拍RGB和SWIR?
人工智能·深度学习·数码相机·yolo·计算机视觉
JoinApper1 天前
小白学OpenCV系列3-图像算数运算
人工智能·opencv·计算机视觉
张小九991 天前
ThermoSeek:热稳定蛋白数据库
人工智能
wzy-6661 天前
DINOv3 新颖角度解释
人工智能
jie*1 天前
小杰机器学习(two)——导数、损失函数、斜率极值最值、微分规则、切平面与偏导数、梯度。
人工智能·机器学习
Niuguangshuo1 天前
深度学习:归一化技术
人工智能·深度学习
302AI1 天前
Claude 断供中国之际,Kimi-K2-0905 低调上线:时势造英雄
人工智能·llm·ai编程
却道天凉_好个秋1 天前
计算机视觉(九):图像轮廓
人工智能·opencv·计算机视觉·图像轮廓