- # applications/DeepSpeed-Chat/train.py
- parser.add_argument(
- "--actor-model",
- # actor模型只能选用“facebook/opt”系列的1.3b~66b
- # 但即使是1.3b也仍对设备显存有一定的要求
- # 对于使用较弱设备进行学习的用户不是很友好
- type=lambda x: x.replace("facebook/opt-", ""),
- default="1.3b",
- choices=("1.3b", "6.7b", "13b", "66b"),
- help="Which facebook/opt-* model to use for Actor (step 1)",
- )
数据格式名称 | 说明 | 样例 |
chosen_sentence | 人类偏好的完整对话,由prompt衔接偏好应答chosen得到,适用于phase1和phase2。 | “Human: Please tell me about Microsoft in a few sentence? Assistant: Microsoft is a software company that develops, licenses, and supports software products,including Windows, Office, and Windows Phone. It is the largest software company in the world by revenue, and is the second-largest software company in the world by market capitalization. Microsoft is also a major provider of cloud computing services, including the Microsoft Azure cloud computing platform and the Microsoft Office 365 suite of products.” |
模型将基于自回归语言建模任务对形如上述样例的数据进行训练,最后将得到具备更佳对话能力的模型SFT/ref(Supervised Fine-Tuning)。
更多的数据格式可见【上篇】的“1.2.1 数据格式基本概念”。
- 在此简单讲述UML时序图的元素含义:
- - 箭头表示信息传递:实线表示调用,虚线表示返回;
- - alt表示假设分支,其后方“[]”中的内容表示“条件”;
- - loop表示循环;
- - 淡蓝色区域即为高亮部分。
main1.pyutils.pymodel_utils.pylora.pydata_utils.pytransformersload_hf_tokenizer()1tokenizer2create_hf_model()3model4convert_linear_layer_to_lora()5model6only_optimize_lora_parameters()7model8alt[only_optimize_lora]alt[lora_dim > 0]create_prompt_dataset()9train_dataset, eval_dataset10train_dataloader, eval_dataloader11DeepSpeedEngine: model, opt, lrs12evaluation()13perplexity14model.forward()15LinearLayer_LoRA.forward()16output17alt[lora_dim > 0]loss18backward()19step()20loop[step]evaluation()21perplexity22save model23loop[epoch]main1.pyutils.pymodel_utils.pylora.pydata_utils.pytransformers
$$p(x) = \prod_{t=1}^{T} p(x_t \mid x_{<t})$$
- # applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
- """
- 模型调用create_hf_model方法进行构建,
- 参数指定有AutoModelForCausalLM
- """
- model = create_hf_model(AutoModelForCausalLM, ···)
- # applications/DeepSpeed-Chat/training/utils/model/model_utils.py
- def create_hf_model(model_class, ···):
- ···
- """model_class=AutoModelForCausalLM"""
- model = model_class.from_pretrained(
- model_name_or_path,
- from_tf=bool(".ckpt" in model_name_or_path),
- config=model_config)
- ···

- # applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
- # 判断是否启用LoRA模式
- if args.lora_dim > 0:
- """
- 如果启用,则对名称中含有“decoder.layers.”且为线性层的结构部分引入LoRA旁路(实现先降维后升维的2个线性层),
- 这类结构基本都是attention、信息交互用的inner线性层,
- 这类结构的Weight参数将被冻结,转而优化LoRA旁路的参数。
- """
- args.lora_module_name = "decoder.layers."
- model = convert_linear_layer_to_lora(model, args.lora_module_name,
- args.lora_dim)
- # applications/DeepSpeed-Chat/training/utils/module/lora.py
- def convert_linear_layer_to_lora(model,
- part_module_name,
- lora_dim=0,
- lora_scaling=1,
- lora_droppout=0):
- """
- 将名称中带有"decoder.layers."的线性层转换为lora层
- """
- """取出模型中参数名含有decoder.layers.的线性层"""
- repalce_name = []
- for name, module in model.named_modules():
- if isinstance(module, nn.Linear) and part_module_name in name:
- repalce_name.append(name)
- for name in repalce_name:
- """recursive_getattr实现了从model中根据属性名取出对应原始结构"""
- module = recursive_getattr(model, name)
- """纳入原始结构的参数,实例化lora层"""
- tmp = LinearLayer_LoRA(
- module.weight, lora_dim, lora_scaling, lora_droppout,
- module.bias).to(module.weight.device).to(module.weight.dtype)
- """recursive_getattr实现了将model对应属性的结构换成lora层实例"""
- recursive_setattr(model, name, tmp)
- return model
- # applications/DeepSpeed-Chat/training/utils/module/lora.py
- class LinearLayer_LoRA(nn.Module):
- """具体的lora层"""
- def __init__(...):
- ...
- """此处的weight和bias即为原始结构中的参数"""
- self.weight = weight
- self.bias = bias
- ···
- """冻结weight部分的参数"""
- self.weight.requires_grad = False
- ···
- self.lora_right_weight = nn.Parameter(torch.zeros(columns, lora_dim))
- self.lora_left_weight = nn.Parameter(torch.zeros(lora_dim, rows))
- ···
- """初始化LoRA线性层的参数"""
- self.reset_parameters()
- def reset_parameters(self):
- """初始化LoRA线性层的参数"""
- # 降维矩阵使用kaiming均匀分布初始化,
- # 服从均匀分布U(-\sqrt{1/in_feature}, +\sqrt{1/in_feature})
- # 与LoRA原始定义所用的(0,\sigma^2)正态分布初始化不同
- nn.init.kaiming_uniform_(self.lora_right_weight, a=math.sqrt(5))
- # 升维矩阵使用全0初始化
- nn.init.zeros_(self.lora_left_weight)
- def forward(self, input):
- """LoRA的正向传播"""
- ···
- else:
- return F.linear(input, self.weight, self.bias)
- + (self.lora_dropout(input) @ self.lora_right_weight @ self.lora_left_weight) * self.lora_scaling

- OPTForCausalLM(
- (model): OPTModel(
- (decoder): OPTDecoder(
- (embed_tokens): Embedding(50272, 768, padding_idx=1)
- (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
- (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
- (layers): ModuleList(
- (0-11): 12 x OPTDecoderLayer(
- (self_attn): OPTAttention(
- (k_proj): LinearLayer_LoRA(
- (lora_dropout): Identity()
- )
- (v_proj): LinearLayer_LoRA(
- (lora_dropout): Identity()
- )
- (q_proj): LinearLayer_LoRA(
- (lora_dropout): Identity()
- )
- (out_proj): LinearLayer_LoRA(
- (lora_dropout): Identity()
- )
- )
- (activation_fn): ReLU()
- (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
- (fc1): LinearLayer_LoRA(
- (lora_dropout): Identity()
- )
- (fc2): LinearLayer_LoRA(
- (lora_dropout): Identity()
- )
- (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
- )
- )
- )
- )
- (lm_head): Linear(in_features=768, out_features=50272, bias=False)
- )

LoRA层的正向传播由`LinearLayer_LoRA`的`forward()`方法所定义(可见下方代码块),即“LoRA层的两条分支结果进行加和”。在代码中体现为`F.linear(input, self.weight, self.bias) + (self.lora_dropout(input) @ self.lora_right_weight @ self.lora_left_weight) * self.lora_scaling`。
- # applications/DeepSpeed-Chat/training/utils/module/lora.py
- class LinearLayer_LoRA(nn.Module):
- """具体的lora层"""
- ···
- def forward(self, input):
- """LoRA的正向传播"""
- ···
- else:
- return F.linear(
- input, self.weight,
- self.bias) + (self.lora_dropout(input) @ self.lora_right_weight
- @ self.lora_left_weight) * self.lora_scaling
Supervised fine-tuning (SFT) has indeed made significant progress in the field of large language models (LLMs). However, unexpected behaviors such as repeating content generation and inconsistency between perplexity (PPL) scores and generation capabilities can still occur.
$$perplexity = \left(\prod_{t=1}^{T} p_t\right)^{-\frac{1}{T}}$$
其中,输出的句子共有 $T$ 个token,第 $t$ 个token的置信概率值为 $p_t$。
$$loss = -\frac{1}{T} \sum_{t=1}^{T} \log{p_t}$$
其中,输出的句子共有 $T$ 个token,第 $t$ 个token的置信概率值为 $p_t$。
$$perplexity = \exp(loss)$$
- def evaluation(model, eval_dataloader):
- """
- 以困惑度perplexity为评估指标进行验证
- """
- model.eval()
- losses = 0
- for step, batch in enumerate(eval_dataloader):
- """
- batch: 由input_ids、attention_mask、labels共3个部分组成的dict。
- 其中每个部分的shape均为(bs, max_seq_len)
- """
- batch = to_device(batch, device)
- with torch.no_grad():
- outputs = model(**batch)
- """Causal LM 的损失函数为交叉熵损失"""
- loss = outputs.loss
- losses += loss.float()
- losses = losses / (step + 1)
- try:
- """困惑度perplexity通常可以通过exp(CELoss)计算得到"""
- perplexity = torch.exp(losses)
- except OverflowError:
- perplexity = float("inf")
- try:
- """
- - get_all_reduce_mean中调用了torch.distributed.all_reduce(perplexity, op=torch.distributed.ReduceOp.SUM)
- - 对所有进程、或者说GPU(因为通常情况下就是单个进程控制单个GPU)中的perplexity进行求和
- - 然后再除以全局进程数torch.distributed.get_world_size()得到平均的perplexity结果
- """
- perplexity = get_all_reduce_mean(perplexity).item()
- except:
- pass
- return perplexity

单轮或多轮 | 样例 |
单轮prompt | "Human: Please tell me about Microsoft in a few sentence? Assistant: " |
多轮prompt | “Human: I’m buying a new video game console for the first time since in a decade, but I forget their setups. What do I need in addition to a Playstation 5 in order to play the console? Assistant: You need to buy a Playstation 5 first. Then you’ll also need a TV with HDMI input. It’s possible that HDMI to HDMI cables will also work, but it’s more likely that you’ll need a physical HDMI cord. You also might want to buy an extra power adapter, since the ones that come with new Playstation 5’s are quite short. Are you looking to play on a PC or any other system? That might affect what other hardware you need to buy. Human: Playstation 5’s cables aren’t short, but that’s good information. Can any television with an HDMI input play PS5? Assistant:” |
单轮chosen_sentence | “Human: Please tell me about Microsoft in a few sentence? Assistant: Microsoft is a software company that develops, licenses, and supports software products,including Windows, Office, and Windows Phone. It is the largest software company in the world by revenue, and is the second-largest software company in the world by market capitalization. Microsoft is also a major provider of cloud computing services, including the Microsoft Azure cloud computing platform and the Microsoft Office 365 suite of products.” |
多轮chosen_sentence | “Human: I’m buying a new video game console for the first time since in a decade, but I forget their setups. What do I need in addition to a Playstation 5 in order to play the console? Assistant: You need to buy a Playstation 5 first. Then you’ll also need a TV with HDMI input. It’s possible that HDMI to HDMI cables will also work, but it’s more likely that you’ll need a physical HDMI cord. You also might want to buy an extra power adapter, since the ones that come with new Playstation 5’s are quite short. Are you looking to play on a PC or any other system? That might affect what other hardware you need to buy. Human: Playstation 5’s cables aren’t short, but that’s good information. Can any television with an HDMI input play PS5? Assistant: So you’ve got a Playstation 5 and a TV that you’re going to connect together with an HDMI cable, and you want to know if that’s going to work? It’s definitely possible for the two to work together, and you might need an additional power adapter if your TV only came with a shorter adapter. However, it may be difficult to determine if it will work for sure. This is one area where troubleshooting and making educated guesses may be necessary. You should still be able to easily use your console, but it may be necessary to troubleshoot first.” |
From InstructGPT work, it is recommended to train the model for overfitting (aka longer epochs) for better human-preferred answers. Through our exploration, we have found this to be particularly helpful for smaller model finetuning, such as OPT-1.3B.
数据格式名称 | 说明 | 样例 |
chosen_sentence | 人类偏好的完整对话,由prompt衔接偏好应答chosen得到,适用于phase1和phase2。 | “Human: Please tell me about Microsoft in a few sentence? Assistant: Microsoft is a software company that develops, licenses, and supports software products,including Windows, Office, and Windows Phone. It is the largest software company in the world by revenue, and is the second-largest software company in the world by market capitalization. Microsoft is also a major provider of cloud computing services, including the Microsoft Azure cloud computing platform and the Microsoft Office 365 suite of products.” |
reject_sentence | 人类排斥的完整对话,由prompt衔接排斥应答rejected得到,适用于phase2。 | “Human: Please tell me about Microsoft in a few sentence? Assistant: I’m not sure what you mean.” |
模型将基于排序损失对形如上述样例的数据对进行训练,最后将得到具备类人评分能力的RM(Reward Model)。
更多的数据格式可见【上篇】的“1.2.1 数据格式基本概念”。
- 在此简单讲述UML时序图的元素含义:
- - 箭头表示信息传递:实线表示调用,虚线表示返回;
- - alt表示假设分支,其后方“[]”中的内容表示“条件”;
- - loop表示循环;
- - 淡蓝色区域即为高亮部分。
main2.pyutils.pymodel_utils.pyreward_model.pylora.pydata_utils.pytransformersload_hf_tokenizer()1tokenizer2create_critic_model()3create_hf_model()4critic_model5RewardModel()6critic_model7rm_model(critic_model)8convert_linear_layer_to_lora()9model10only_optimize_lora_parameters()11model12alt[only_optimize_lora]alt[lora_dim > 0]create_prompt_dataset()13train_dataset, eval_dataset14DataCollatorReward()15data_collator16train_dataloader, eval_dataloader17DeepSpeedEngine: rm_model, opt, lrs18evaluation_reward()19reward_score, acc20critic_model.forward()21rwtranrsformer.forward()22LinearLayer_LoRA.forward()23output24alt[lora_dim > 0]output25loss26backward()27step()28loop[step]evaluation_reward()29reward_score, acc30save model31loop[epoch]main2.pyutils.pymodel_utils.pyreward_model.pylora.pydata_utils.pytransformers
- # applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py
- """
- rm_model调用了create_critic_model进行载入
- 默认情况下rm_model是不启用dropout的
- """
- rm_model = create_critic_model(···)
- # applications/DeepSpeed-Chat/training/utils/model/model_utils.py
- def create_critic_model(···):
- """此处的模型读取方法用的是“AutoModel”,因此此处critic_model只有主干部分"""
- critic_model = create_hf_model(AutoModel, ···)
- """
- critic_model传入RewardModel,将额外得到线性层输出头,
- 因此此处的critic_model结构为“v_head + 主干部分”
- """
- critic_model = RewardModel(critic_model, ···)
- ...
- return critic_model
- # applications/DeepSpeed-Chat/training/utils/model/reward_model.py
- class RewardModel(nn.Module):
- """
- 将读取得到的model的结构修改为适用于RewardModel的形式,
- 总的来说即是使用载入的主干网络进行特征提取,
- 其所提取的特征(最后层的各位置输出特征hidden_states)将被传入线性层,输出得到1个数值,
- 该数值即为分值,因此max_seq_len维度的每个位置均会得到1个分值
- """
- def __init__(self, base_model, ...):
- super().__init__()
- ···
- if hasattr(self.config, "word_embed_proj_dim"):
- """
- OPT系列模型的word_embed_proj_dim为embedding层的输出维度,
- 通常在transformer模型中也就等于 hidden_size,
- v_head将基于主干网络的输出特征 hidden_state 进行分值预测,共输出max_seq_len个分值
- """
- self.v_head = nn.Linear(self.config.word_embed_proj_dim,
- 1,
- bias=False)
- ···
- """base_model即为主干网络,因此RM最终由1个主干网络和1个线性层构成"""
- self.rwtranrsformer = base_model

- RewardModel(
- (v_head): Linear(in_features=768, out_features=1, bias=False)
- (rwtranrsformer): OPTModel(
- (decoder): OPTDecoder(
- (embed_tokens): Embedding(50272, 768, padding_idx=1)
- (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
- (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
- (layers): ModuleList(
- (0-11): 12 x OPTDecoderLayer(
- (self_attn): OPTAttention(
- (k_proj): Linear(in_features=768, out_features=768, bias=True)
- (v_proj): Linear(in_features=768, out_features=768, bias=True)
- (q_proj): Linear(in_features=768, out_features=768, bias=True)
- (out_proj): Linear(in_features=768, out_features=768, bias=True)
- )
- (activation_fn): ReLU()
- (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
- (fc1): Linear(in_features=768, out_features=3072, bias=True)
- (fc2): Linear(in_features=3072, out_features=768, bias=True)
- (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
- )
- )
- )
- )
- )

phase2使用的数据整理器data_collator为DataCollatorReward(),本阶段取出的单个样本example实际上是一个chosen-rejected数据对(见下方代码块),即1个大小为batch_size的batch取出了batch_size个数据对,data_collator将把数据对拆成chosen_sentence和reject_sentence(example一分为二),因此实际上1个batch真正输入模型的数据量大小应当为“batch_size * 2”。
- # applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py
- """phase2使用的data_collator为DataCollatorReward()"""
- data_collator = DataCollatorReward()
- # applications/DeepSpeed-Chat/training/utils/data/data_utils.py
- class DataCollatorReward:
- def __call__(self, data):
- """
- 对dataloader取到的数据 data 进一步整理,将数据整理成batch输入形式
- 入参 data 的具体样式可见下个代码块
- """
- batch = {}
- """f为data中的1个tuple,tuple的第0个元素和第2个元素
- 分别为chosen_sentence和reject_sentence的input_ids"""
- batch["input_ids"] = torch.cat([f[0] for f in data] +
- [f[2] for f in data],
- dim=0)
- """f为data中的1个tuple,tuple的第1个元素和第3个元素
- 分别为chosen_sentence和reject_sentence的attention_mask"""
- batch["attention_mask"] = torch.cat([f[1] for f in data] +
- [f[3] for f in data],
- dim=0)
- """batch的具体样式可见下个代码块"""
- return batch

- 输入的data为一个batch的数据列表,其中的 每个元素 为一对chosen-rejected数据:
- (
- chosen_sentence_input_ids,
- chosen_sentence_attention_mask,
- reject_sentence_input_ids,
- reject_sentence_attention_mask
- )
- 每组数据的第0个元素和第2个元素为input_ids,第1个元素和第3个元素为attention_mask。
- 输出的batch为字典:{“input_ids”: tensor([...]), "attention_mask": tensor([...])}
- 并且字典值中chosen位于前半部分,rejected位于后半部分:
- {
- "input_ids": [
- chosen_sentence_1_input_ids,
- chosen_sentence_2_input_ids,
- ...,
- reject_sentence_1_input_ids,
- reject_sentence_2_input_ids,
- ...
- ]
- "attention_mask": [
- chosen_sentence_1_attention_mask,
- chosen_sentence_2_attention_mask,
- ...,
- reject_sentence_1_attention_mask,
- reject_sentence_2_attention_mask,
- ...
- ]
- }
- 后续输入模型后,直接将数据切分出前半部分和后半部分进行并列,即可获得对应的chosen-rejected数据对。

成对排序损失(Pairwise Ranking Loss)
$$loss(\theta) = E_{(x, y_c, y_r) \sim D} \left[ -\log\left( \sigma\left( r_\theta(x, y_c) - r_\theta(x, y_r) \right) \right) \right]$$
其中,$r_\theta$ 为RM,$x$ 为prompt,$y_c$ 为chosen,$y_r$ 为rejected,$(x, y_c)$ 和 $(x, y_r)$ 则分别为chosen_sentence和reject_sentence。
该损失函数的目的在于最大化“chosen/好的/排序靠前的”和“rejected/坏的/排序靠后的”的差值,由此促使 $r_\theta$ 学习到相应的排序模式。
DeepSpeed-Chat在实现这部分时,$r_\theta(x, y_c)$ 和 $r_\theta(x, y_r)$ 分别选择了chosen_sentence和reject_sentence两者answer的对齐部分,通过文字叙述略显抽象,查看下方的代码块有助于你理解这个概念:
- max_seq_len为10,pad_token_id为0,
- 有同属同个prompt的chosen_sentence和reject_sentence:
- prompt: [11, 22, 33]
- chosen_sentence: [11, 22, 33, 44, 55, 66, 0, 0, 0, 0]
- reject_sentence: [11, 22, 33, 40, 50, 0, 0, 0, 0, 0]
- “两者answer的对齐部分”即为“非prompt部分也非padding部分、但长度要对齐”:
- chosen_truncated: [44, 55, 66]
- reject_truncated: [40, 50, 0]
- chosen_sentence的answer比较长,所以reject_sentence在取相应部分时要取至与chosen部分等长为止;
- reject_sentence的answer较长时同理。
- pad_token_id = 0
- conversation = [11, 22, 33, 44, 55, 66, 0, 0, 0, 0]
- conversation_rewards = [2.01, 0.23, 2.89, 0.66, 0.33, 2.25, 0.36, 0.99, 1.32, 1.62]
- token_id为66的token作为该对话的最后1个有效token,
- 其对应的reward“2.25”将被用于表示整个对话的reward。
- # applications/DeepSpeed-Chat/training/utils/model/reward_model.py
- class RewardModel(nn.Module):
- def __init__(self, ···):
- ···
- ···
- def forward(self, input_ids=None, ···):
- """获得主干网络的输出的特征"""
- transformer_outputs = self.rwtranrsformer(···)
- """
- 取最后一层的输出特征
- hidden_states.shape: (bs*2, max_seq_len, hidden_size)
- """
- hidden_states = transformer_outputs[0]
- """
- 将特征送入全连接层得到分数回归值
- rewards.shape: (bs*2, max_seq_len)
- """
- rewards = self.v_head(hidden_states).squeeze(-1)
- """先前提及过,实际的bs应该是输入bs的一半"""
- bs = input_ids.shape[0] // 2
- """区分出chosen和reject"""
- chosen_ids = input_ids[:bs]
- rejected_ids = input_ids[bs:]
- chosen_rewards = rewards[:bs]
- rejected_rewards = rewards[bs:]
- loss = 0
- for i in range(bs):
- """
- 取出同组chosen和rejected的token_id和分值reward
- chosen_id.shape: (max_seq_len, )
- """
- chosen_id = chosen_ids[i]
- rejected_id = rejected_ids[i]
- chosen_reward = chosen_rewards[i]
- rejected_reward = rejected_rewards[i]
- """
- 下方本应有各种取index相关的操作,
- 基于源码解读的可读性考量,且这些部分只是逻辑形式上的弯弯绕绕,与相关原理并不存在直接关系,
- 所以我选择暂且将它们忽略。
- """
- """
- c_ind为chosen_sentence的answer后的第一个pad_token的index
- 例如pad_token_id=0,sentence[11,22,33,44,55,66,0,0,0,0],c_ind即为第一个pad_token的index=6。
- """
- c_ind = ···
- """r_ind同理,为reject_sentence的answer后的第一个pad_token的index"""
- r_ind = ···
- """end_ind则为两者的较大者"""
- end_ind = max(c_ind, r_ind)
- # 取chosen和rejected第一个不同的地方的index,可以理解为“response中两个回答自由发挥的第1个token的index”
- """divergence_ind为chosen_sentence和reject_sentence两者answer的第1个token的index"""
- divergence_ind = ···
- """
- 以chosen_sentence和reject_sentence最先不同的地方为起始、生成结束的地方为终止,取两者在这个片段的对应分值
- 这部分其实就是上个代码块提及的“对齐部分”
- """
- c_truncated_reward = chosen_reward[divergence_ind:end_ind]
- r_truncated_reward = rejected_reward[divergence_ind:end_ind]
- """
- (c_truncated_reward - r_truncated_reward).shape: (truncated_seq_len,)
- 计算损失时使用了rank loss的形式,并且是对chosen和rejected“对齐片段”进行计算的
- """
- loss += -torch.log(
- torch.sigmoid(c_truncated_reward - r_truncated_reward)).mean()
- loss = loss / bs
- """取代表结束的pad token所在位置的前一个位置(可以理解为的最后一个有效token的位置)的分值作为参考分值"""
- chosen_mean_scores.append(
- chosen_reward[c_ind - 1]) #use the end score for reference
- rejected_mean_scores.append(rejected_reward[r_ind - 1])
- chosen_mean_scores = torch.stack(chosen_mean_scores)
- rejected_mean_scores = torch.stack(rejected_mean_scores)
- """返回损失和参考分值"""
- return {
- "loss": loss,
- "chosen_mean_scores": chosen_mean_scores,
- "rejected_mean_scores": rejected_mean_scores,
- }
- ···

- def evaluation_reward(model, eval_dataloader):
- model.eval()
- """统计预测(赋分)正确的结果
- 即 chosen_reward > rejected_reward 的结果数"""
- correct_predictions = 0
- """统计预测总数"""
- total_predictions = 0
- scores = 0
- for step, batch in enumerate(eval_dataloader):
- batch = to_device(batch, device)
- with torch.no_grad():
- """outputs: {'loss':tensor(),
- 'chosen_mean_scores':tensor(bs,),
- 'rejected_mean_scores':tensor(bs,)}"""
- outputs = model(**batch)
- """chosen.shape: (bs,)"""
- chosen = outputs["chosen_mean_scores"]
- """rejected.shape: (bs,)"""
- rejected = outputs["rejected_mean_scores"]
- """"赋分正确"即为chosen分值大于rejected分值"""
- correct_predictions += (chosen > rejected).sum()
- total_predictions += chosen.shape[0]
- """累加每个step的平均chosen分值"""
- scores += outputs["chosen_mean_scores"].mean().float()
- if step == 99: # For faster evaluation and debugging
- break
- """计算acc指标"""
- acc = correct_predictions / total_predictions
- """计算当前step的平均chosen分值"""
- scores = scores / (step + 1)
- try:
- """多进程结果求和求平均"""
- acc = get_all_reduce_mean(acc).item()
- scores = get_all_reduce_mean(scores).item()
- except:
- pass
- return scores, acc

In our implementation, we use either the end token of the sequence or the first padding token as the aggregated score and compare them. Others may also use the average score for the entire answer as an alternative.
