Continuing the analysis of the __call__ function in data_collator.py:
if self.mlm:
    # special_tokens_mask = None
    batch["input_ids"], batch["labels"] = self.mask_tokens(
        batch["input_ids"], special_tokens_mask=special_tokens_mask
    )
Here we need to step into self.mask_tokens:
def mask_tokens(
    self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    print('data/data_collator.py mask_tokens')
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
    """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
    r"""
    labels = tensor(
        [[  101,   169,   107,  ..., 10539,   107,   102],
         [  101,   169,   107,  ...,   100,   100,   102],
         ...,
         [  101,   169,   107,  ...,   117,   169,   102]])
    """
    probability_matrix = torch.full(labels.shape, self.mlm_probability)
    r"""
    probability_matrix = tensor([[0.1500, 0.1500, ...],
                                 [0.1500, 0.1500, ...],
                                 ...])
    """
    if special_tokens_mask is None:
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
    else:
        special_tokens_mask = special_tokens_mask.bool()

    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
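Before moving on, it helps to see why the 0.8 and 0.5 bernoulli draws above give the 80% [MASK] / 10% random / 10% unchanged split: P(MASK) = 0.8, P(random) = 0.2 * 0.5 = 0.1, P(keep) = 0.1. A minimal sanity check with toy numbers (not part of the original code):

import torch

torch.manual_seed(0)
n = 1_000_000
masked_indices = torch.ones(n, dtype=torch.bool)  # pretend every position was selected for masking
indices_replaced = torch.bernoulli(torch.full((n,), 0.8)).bool() & masked_indices
indices_random = torch.bernoulli(torch.full((n,), 0.5)).bool() & masked_indices & ~indices_replaced
indices_kept = masked_indices & ~indices_replaced & ~indices_random
print(indices_replaced.float().mean(), indices_random.float().mean(), indices_kept.float().mean())
# -> roughly 0.80, 0.10, 0.10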
First, labels is created as a copy of the inputs:
labels = inputs.clone()
labels = tensor(
[[ 101, 169, 107, ..., 10539, 107, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 100, 100, 102],
...,
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 117, 169, 102]]
)
Next, the probability_matrix is built:
probability_matrix = torch.full(labels.shape,self.mlm_probability)
The resulting probability_matrix:
probability_matrix =
tensor([[0.1500,0.1500,...],
[0.1500,0.1500,...],
..................
[0.1500,0.1500,...]])
Next, look at how the special_tokens_mask used on the way to masked_indices is built:
if special_tokens_mask is None:
    special_tokens_mask = [
        self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    print('special_tokens_mask1 = ')
    print(special_tokens_mask)
    special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
    print('special_tokens_mask2 = ')
    print(special_tokens_mask)
else:
    special_tokens_mask = special_tokens_mask.bool()
The printed output is:
special_tokens_mask1 =
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
.....................
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1]]
The corresponding special_tokens_mask2 is:
special_tokens_mask2 =
tensor([[ True, False, False, ..., False, False, True],
...,
[ True, False, False, ..., False, False, True]])
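To make the next steps of mask_tokens concrete (masked_fill_, bernoulli sampling, and setting unmasked labels to -100), here is a toy walk-through; the token ids and sequence length are made-up assumptions, not taken from the real batch:

import torch

torch.manual_seed(0)
labels = torch.tensor([[101, 169, 107, 117, 102]])               # one short toy sequence
special_tokens_mask = torch.tensor([[1, 0, 0, 0, 1]]).bool()     # [CLS] ... [SEP]
probability_matrix = torch.full(labels.shape, 0.15)
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)  # never mask special tokens
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100                                   # loss is only computed on masked positions
print(masked_indices)
print(labels)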
Building special_tokens_mask1 goes through the tokenizer's get_special_tokens_mask function:
special_tokens_mask = [
    self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
Stepping into self.tokenizer.get_special_tokens_mask, note that self.tokenizer here is an instance of PreTrainedTokenizer.
See the PreTrainedTokenizer class in transformers/tokenization_utils.py.
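As a hedged illustration of what get_special_tokens_mask returns (loading the public bert-base-chinese vocab here as a stand-in for the local nezha-chinese-base vocab used in this post):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
ids = tokenizer.encode("预训练", add_special_tokens=True)          # [CLS] 预 训 练 [SEP]
mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
print(ids)    # e.g. [101, ..., 102]
print(mask)   # 1 marks special tokens: [1, 0, 0, 0, 1]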
Going back to the start of __call__, the padding branch is where self.tokenizer gets printed:
if isinstance(examples[0], (dict, BatchEncoding)):
    batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
print('|||self.tokenizer = |||')
print(self.tokenizer)
|||self.tokenizer = |||
PreTrainedTokenizer is_fast
PreTrainedTokenizer is_fast
PreTrainedTokenizer(name_or_path='/home/xiaoguzai/数据/nezha-chinese-base/vocab.txt', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
Note that PreTrainedTokenizer necessarily goes through is_fast here, because is_fast is declared as a @property:
@property
def is_fast(self) -> bool:
    return False
It is not immediately obvious why is_fast gets called here; since the printed repr above contains is_fast=False, the property is most likely accessed while the tokenizer's repr is being built (see the sketch below).
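A minimal, self-contained illustration (not the transformers source) of how a @property gets triggered simply by printing an object whose __repr__ reads it:

class Tok:
    @property
    def is_fast(self) -> bool:
        print("is_fast accessed")
        return False

    def __repr__(self) -> str:
        # building the repr reads self.is_fast, so every print(tok) hits the property
        return f"Tok(is_fast={self.is_fast})"

print(Tok())  # prints "is_fast accessed" first, then "Tok(is_fast=False)"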
self.pad_to_multiple_of = None
The call here is PreTrainedTokenizer.pad, which is in fact the pad method inherited from PreTrainedTokenizerBase in tokenization_utils_base:
batch =
{'input_ids': tensor(
    [[101, 169, ..., 102],
     ...,
     [101, 169, ..., 102]]),
 'attention_mask': tensor(
    [[1, 1, ..., 1, 1],
     ...,
     [1, 1, ..., 1, 1]])}
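A minimal sketch of how such a batch falls out of the pad call, again using bert-base-chinese as a stand-in vocab and two hypothetical short examples:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
examples = [{"input_ids": [101, 169, 107, 102]},
            {"input_ids": [101, 169, 102]}]
batch = tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=None)
print(batch["input_ids"])        # the shorter sequence is right-padded with 0 ([PAD])
print(batch["attention_mask"])   # 1 for real tokens, 0 for padding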
Here is an overview of the complete batch:
batch = {
    'input_ids': tensor([[  101,   169,   107,  ...,   100,   100,   102],
                         [  101,   169,   107,  ...,   100,   100,   102],
                         [  101,   169,   107,  ...,   107, 10539,   102],
                         ...,
                         [  101,   169,   107,  ...,   131,   107,   102],
                         [  101,   169,   103,  ...,   100,   124,   102],
                         [  101,   169,   107,  ...,   171,   117,   102]]),
    'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
                              [1, 1, 1,  ..., 1, 1, 1],
                              [1, 1, 1,  ..., 1, 1, 1],
                              ...,
                              [1, 1, 1,  ..., 1, 1, 1],
                              [1, 1, 1,  ..., 1, 1, 1],
                              [1, 1, 1,  ..., 1, 1, 1]]),
    'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
                      [-100, -100, -100,  ..., -100, -100, -100],
                      [-100, -100, -100,  ..., -100, -100, -100],
                      ...,
                      [-100, -100, -100,  ..., -100, -100, -100],
                      [-100, -100,  107,  ..., -100, -100, -100],
                      [-100, -100,  107,  ..., -100, -100, -100]])}
This batch of data should then go into the model for training.
The pretraining forward code that these inputs feed into is:
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
# prediction_scores.shape = torch.Size([32, 90, 21128])
outputs = (prediction_scores,) + outputs[2:]
# outputs[0].shape = torch.Size([32, 90, 21128])
masked_lm_labels = None
if labels is not None:
    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    # prediction_scores.shape = ([32, 90, 21128])
    # prediction_scores.view(-1, vocab_size).shape = ([2880, 21128])
    # labels.view(-1).shape = ([2880])
    # labels.view(-1) = tensor([-100, -100, ..., 117, -100, -100], device='cuda:0')
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs
return outputs  # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
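To see the view(-1, vocab_size) / view(-1) reshaping concretely, here is a toy version with small shapes (2x3x5 instead of 32x90x21128); all values are made up:

import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, vocab_size = 2, 3, 5
prediction_scores = torch.randn(batch_size, seq_len, vocab_size)
labels = torch.tensor([[-100, 2, -100],
                       [-100, -100, 4]])
print(prediction_scores.view(-1, vocab_size).shape)   # torch.Size([6, 5])  (cf. [2880, 21128])
print(labels.view(-1).shape)                          # torch.Size([6])     (cf. [2880])
loss = CrossEntropyLoss()(prediction_scores.view(-1, vocab_size), labels.view(-1))
print(loss)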
Finally, let's read through the code around the loss function used in pretraining:
outputs = self.bert(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    head_mask=head_mask,
    inputs_embeds=inputs_embeds,
    encoder_hidden_states=encoder_hidden_states,
    encoder_attention_mask=encoder_attention_mask,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
outputs = (prediction_scores,) + outputs[2:]
masked_lm_labels = None
if labels is not None:
    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs
Here self.bert is the NeZhaModel (the surrounding model also carries the cls head); the next call is:
prediction_scores = self.cls(sequence_output)
The layer being called here is:
self.cls = BertOnlyMLMHead(config)
Step into the BertOnlyMLMHead(config) module to see the call chain:
class BertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores
This in turn calls the BertLMPredictionHead module, so step into BertLMPredictionHead (together with its BertPredictionHeadTransform):
class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            # config.hidden_act = 'gelu', so the first branch is taken
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            print('BertPredictionHeadTransform situation2')
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states
The complete stack of layers that gets applied is (a sketch of the same stack follows below):
Linear(config.hidden_size, config.hidden_size)
'gelu' activation
LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
Linear(config.hidden_size, config.vocab_size, bias=False)
(the bias of the last Linear is a separate zero-initialized Parameter tied to decoder.bias)
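A minimal sketch of that same stack with assumed sizes (hidden_size=768, vocab_size=21128, eps=1e-12); the real BertLMPredictionHead additionally ties decoder.weight to the input embedding matrix:

import torch
import torch.nn as nn

hidden_size, vocab_size = 768, 21128
transform = nn.Sequential(
    nn.Linear(hidden_size, hidden_size),
    nn.GELU(),
    nn.LayerNorm(hidden_size, eps=1e-12),
)
decoder = nn.Linear(hidden_size, vocab_size, bias=False)
bias = nn.Parameter(torch.zeros(vocab_size))

sequence_output = torch.randn(32, 90, hidden_size)         # [batch, seq_len, hidden]
prediction_scores = decoder(transform(sequence_output)) + bias
print(prediction_scores.shape)                             # torch.Size([32, 90, 21128])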
Finally, the CrossEntropyLoss() criterion is applied:
if labels is not None:
    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs
!!! Note: the modeling.py being read here must be the one under pretrain_code/modeling/modeling_nezha/modeling.py.
In other words, what is essentially being computed here is the cross-entropy between the prediction_scores produced from the batch's input_ids (prediction_scores.view(-1, vocab_size), shape [2880, 21128]) and labels.view(-1) (shape [2880]), most of which are -100. Since CrossEntropyLoss uses ignore_index=-100 by default, the positions labeled -100 are simply skipped and contribute nothing to the loss, so only the genuinely masked positions are scored.
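A small numeric check (toy values) that CrossEntropyLoss really skips the -100 positions: the loss over all flattened positions equals the loss computed only over the positions whose label is not -100, because ignore_index defaults to -100.

import torch
from torch.nn import CrossEntropyLoss

torch.manual_seed(0)
scores = torch.randn(6, 5)                                 # 6 flattened token positions, vocab of 5
labels = torch.tensor([-100, 2, -100, -100, 4, -100])
full_loss = CrossEntropyLoss()(scores, labels)
masked_only = CrossEntropyLoss()(scores[labels != -100], labels[labels != -100])
print(full_loss, masked_only)                              # identical values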