In DeepSpeed-Chat RLHF 阶段代码解读(0) ------ 原始 PPO 代码解读 - 掘金 (juejin.cn) we covered the principles and implementation of PPO. In this article we look at how the Reward Model in RLHF is implemented, as preparation for applying PPO to RLHF. In vanilla PPO, the reward obtained by acting under a policy is determined by the environment; you can think of it as built in. In RLHF, however, we want to use PPO with human feedback to align an LLM with human preferences, so we have to assign rewards to the LLM's generations ourselves: generations that better match human preferences receive higher rewards, and generations that do not are penalized (negative rewards). Where do these rewards come from? From a reward function, and that is exactly why the RLHF reward model exists.
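As background, the reward model trained here follows the standard pairwise (Bradley-Terry style) formulation used in InstructGPT-style RLHF; the DeepSpeed-Chat code below implements a per-token variant of this loss:

latex
\mathcal{L}(\theta) = -\,\mathbb{E}_{(x,\,y_c,\,y_r)}\left[\log \sigma\big(r_\theta(x, y_c) - r_\theta(x, y_r)\big)\right]

Here r_θ(x, y) is the scalar score the reward model assigns to response y for prompt x, y_c is the chosen response, and y_r the rejected one.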
Training Data
prompt -> the user's input
chosen -> the response that better matches human preferences
rejected -> the response that does not match human preferences

To make the whole flow easier to follow, we construct a batch of size 1 below, called one_batch, i.e. training data of length 1, in the same format as the training data of DeepSpeed-Chat's reward model.
python
import torch
from transformers import AutoTokenizer

model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
end_of_conversation_token = "<|endoftext|>"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, padding_side='right')
tokenizer.add_special_tokens(
    {'additional_special_tokens': [end_of_conversation_token]})

chosen_dataset = []
rejected_dataset = []
# The prompt asks for the first line of the poem "Quiet Night Thought";
# "chosen" actually answers it, "rejected" is an unhelpful reply ("Hold on.")
data = [{
    "prompt": "背诵静夜思第一句",
    "chosen": "床前明月光,疑是地上霜。",
    "rejected": "等等。"
}]
# Each training sample is prompt + answer + end_of_conversation_token
chosen_sentence = data[0]["prompt"] + data[0]["chosen"] + end_of_conversation_token
reject_sentence = data[0]["prompt"] + data[0]["rejected"] + end_of_conversation_token
chosen_dataset.append(tokenizer(
    chosen_sentence,
    max_length=48,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
))
rejected_dataset.append(tokenizer(
    reject_sentence,
    max_length=48,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
))
# Chosen and rejected are concatenated along the batch dimension:
# the first half of the rows is chosen, the second half is rejected
one_batch = {
    "input_ids": torch.cat([chosen_dataset[0]["input_ids"], rejected_dataset[0]["input_ids"]], dim=0),
    "attention_mask": torch.cat([chosen_dataset[0]["attention_mask"], rejected_dataset[0]["attention_mask"]], dim=0),
}
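As a quick sanity check (the exact token ids depend on the tokenizer, but the shapes should look like this), the batch stacks the chosen sequence on top of the rejected one:

python
print(one_batch["input_ids"].shape)       # torch.Size([2, 48])
print(one_batch["attention_mask"].shape)  # torch.Size([2, 48])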
Reward Model
With the training data in place, let's look at what the reward model looks like; once you see the code, it is easy to understand how it is trained. The harder parts are explained with comments directly in the code, so they are not repeated here.
python
import torch
import torch.nn as nn


class RewardModel(nn.Module):

    def __init__(self,
                 base_model,
                 tokenizer,
                 num_padding_at_beginning=0,
                 compute_fp32_loss=False):
        super().__init__()
        self.config = base_model.config
        self.num_padding_at_beginning = num_padding_at_beginning
        # The extra linear head the reward model adds on top of the base model
        # to turn each hidden state into a scalar reward
        self.v_head = nn.Linear(self.config.hidden_size, 1, bias=False)
        self.rwtransformer = base_model
        self.PAD_ID = tokenizer.pad_token_id
        self.compute_fp32_loss = compute_fp32_loss

    def gradient_checkpointing_enable(self):
        self.rwtransformer.gradient_checkpointing_enable()

    def gradient_checkpointing_disable(self):
        self.rwtransformer.gradient_checkpointing_disable()

    def forward(self,
                input_ids=None,
                past_key_values=None,
                attention_mask=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                use_cache=False):
        loss = None
        # Output of the base model
        transformer_outputs = self.rwtransformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache)
        # Take the last hidden layer and feed it into v_head.
        # For one_batch the shapes go [2, 48, 2048] -> [2, 48, 1] -> [2, 48],
        # i.e. every token gets its own reward
        hidden_states = transformer_outputs[0]
        print(f"hidden_states.shape: {hidden_states.shape}")
        rewards = self.v_head(hidden_states).squeeze(-1)
        print(f"rewards.shape: {rewards.shape}")
        chosen_mean_scores = []
        rejected_mean_scores = []

        # Split the inputs and rewards into two parts, chosen and rejected
        assert len(input_ids.shape) == 2
        bs = input_ids.shape[0] // 2  # for one_batch, bs is 1
        seq_len = input_ids.shape[1]

        chosen_ids = input_ids[:bs]  # bs x seq
        rejected_ids = input_ids[bs:]
        chosen_rewards = rewards[:bs]
        rejected_rewards = rewards[bs:]

        # Compute pairwise loss. Only backprop on the different tokens before padding
        loss = 0.
        for i in range(bs):
            chosen_id = chosen_ids[i]
            rejected_id = rejected_ids[i]
            chosen_reward = chosen_rewards[i]
            rejected_reward = rejected_rewards[i]

            c_inds = (chosen_id == self.PAD_ID).nonzero()
            print(f"c_inds: {c_inds}")
            # c_ind is where the chosen sequence ends;
            # c_inds[self.num_padding_at_beginning].item() is the position of the first padding token
            c_ind = c_inds[self.num_padding_at_beginning].item() if len(
                c_inds
            ) > self.num_padding_at_beginning else seq_len  # OPT model pads the first token, so we need to use the second padding token as the end of the sequence
            # Find the first token at which chosen and rejected differ
            check_divergence = (chosen_id != rejected_id).nonzero()

            # If no token differs, c_ind and r_ind are the same
            if len(check_divergence) == 0:
                end_ind = rejected_reward.size(-1)
                divergence_ind = end_ind - 1
                r_ind = c_ind
            else:
                # Check if there is any padding otherwise take length of sequence
                r_inds = (rejected_id == self.PAD_ID).nonzero()
                r_ind = r_inds[self.num_padding_at_beginning].item(
                ) if len(r_inds) > self.num_padding_at_beginning else seq_len
                # If the sequences differ, the larger of c_ind and r_ind marks the end position
                end_ind = max(c_ind, r_ind)
                # divergence_ind is the position of the first differing token;
                # the rewards also start to differ from this position
                divergence_ind = check_divergence[0]
            assert divergence_ind > 0
            c_truncated_reward = chosen_reward[divergence_ind:end_ind]
            print("c_truncated_reward last_token reward: ", c_truncated_reward[-1])
            r_truncated_reward = rejected_reward[divergence_ind:end_ind]
            chosen_mean_scores.append(
                chosen_reward[c_ind - 1])  # use the end score for reference
            rejected_mean_scores.append(rejected_reward[r_ind - 1])

            if self.compute_fp32_loss:
                c_truncated_reward = c_truncated_reward.float()
                r_truncated_reward = r_truncated_reward.float()
            loss += -torch.nn.functional.logsigmoid(c_truncated_reward -
                                                    r_truncated_reward).mean()
            # The loss here averages over the rewards of all (diverging) tokens,
            # but there are also implementations that only use the last token's reward;
            # both variants should work
            # loss += -torch.nn.functional.logsigmoid(c_truncated_reward[-1] -
            #                                         r_truncated_reward[-1])

        loss = loss / bs
        chosen_mean_scores = torch.stack(chosen_mean_scores)
        rejected_mean_scores = torch.stack(rejected_mean_scores)
        # chosen_mean_scores is actually the reward of the end_of_conversation_token,
        # so the name "mean" is a bit odd here
        return {
            "loss": loss,
            "chosen_mean_scores": chosen_mean_scores,
            "rejected_mean_scores": rejected_mean_scores,
        }
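Concretely, with divergence index d_i and end index e_i as computed in the loop above, the loss that is actually implemented averages the pairwise term over every position in the diverging span (this is simply a restatement of the code above in formula form):

latex
\mathcal{L} = \frac{1}{bs}\sum_{i=1}^{bs}\left(-\frac{1}{e_i - d_i}\sum_{t=d_i}^{e_i-1}\log\sigma\big(r^{c}_{i,t} - r^{r}_{i,t}\big)\right)

where r^c_{i,t} and r^r_{i,t} are the per-token rewards of the chosen and rejected sequences of the i-th pair.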
forward
Running forward on one_batch, we can see that the printed results match the shapes noted in the comments above.
python
import math
from transformers import AutoModel

base_model = AutoModel.from_pretrained(model_path)
# Resize the embedding table so the vocabulary size
# (after adding the special token) is a multiple of 8
base_model.resize_token_embeddings(int(
    8 *
    math.ceil(len(tokenizer) / 8.0)))
critic_model = RewardModel(base_model, tokenizer)
critic_model(**one_batch)
"""
Output:
hidden_states.shape: torch.Size([2, 48, 2048])
rewards.shape: torch.Size([2, 48])
c_inds: tensor([[37],
[38],
[39],
[40],
[41],
[42],
[43],
[44],
[45],
[46],
[47]])
c_truncated_reward last_token reward: tensor(0.0040, grad_fn=<SelectBackward0>)
{'loss': tensor(0.4015, grad_fn=<DivBackward0>),
'chosen_mean_scores': tensor([0.0040], grad_fn=<StackBackward0>),
'rejected_mean_scores': tensor([0.2623], grad_fn=<StackBackward0>)}
"""
Loss Optimization
TODO.
That wraps up our walkthrough of the Reward Model. The next article will cover how to use PPO together with the Reward Model to run RLHF and align an LLM with human preferences. If you want to run the code above yourself, see: RLXF/reward_model.ipynb at main · amulil/RLXF (github.com)