if self.mlm:
#special_tokens_mask = None
batch["input_ids"], batch["labels"] = self.mask_tokens(
batch["input_ids"], special_tokens_mask=special_tokens_mask
def mask_tokens(
    self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

    Args:
        inputs: Tensor of token ids. It is modified in place (selected positions are
            overwritten) and the same object is returned.
        special_tokens_mask: Optional tensor, truthy at positions that must never be
            selected. When ``None`` it is built row by row with
            ``self.tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)``.

    Returns:
        ``(inputs, labels)`` where ``labels`` is a copy of the original ids with every
        non-selected position set to ``-100``.
    """
    labels = inputs.clone()

    # We sample a few tokens in each sequence for MLM training
    # (with probability `self.mlm_probability`), e.g. a matrix filled with 0.15.
    probability_matrix = torch.full(labels.shape, self.mlm_probability)

    if special_tokens_mask is None:
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
            for val in labels.tolist()
        ]
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
    else:
        special_tokens_mask = special_tokens_mask.bool()

    # Special tokens get probability 0 so they can never be selected.
    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word.
    # Half of the remaining 20% is 10% overall, hence the 0.5 draw.
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
labels = inputs.clone()
labels = tensor(
[[ 101, 169, 107, ..., 10539, 107, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 117, 169, 102]]
probability_matrix = torch.full(labels.shape,self.mlm_probability)
probability_matrix =
if special_tokens_mask is None:
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
print('special_tokens_mask1 = ')
special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
print('special_tokens_mask2 = ')
special_tokens_mask = special_tokens_mask.bool()
special_tokens_mask1 =
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1]]
special_tokens_mask2 =
tensor([[ True, False, False, ..., False, False, True],
[ True, False, False, ..., False, False, True]])
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
if isinstance(examples[0], (dict, BatchEncoding)):
batch = self.tokenizer.pad(examples,return_tensors="pt",pad_to_multiple_of
print('|||self.tokenizer = |||')
|||self.tokenizer = |||
PreTrainedTokenizer is_fast
PreTrainedTokenizer is_fast
PreTrainedTokenizer(name_or_path='/home/xiaoguzai/数据/nezha-chinese-base/vocab.txt', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
def is_fast(self) -> bool:
    """Return the ``is_fast`` flag; this implementation always reports ``False``."""
    return False
self.pad_to_multiple_of = None
batch =
batch = {'input_ids': tensor([[ 101, 169, 107, ..., 100, 100, 102], [ 101, 169, 107, ..., 100, 100, 102], [ 101, 169, 107, ..., 107, 10539, 102], ..., [ 101, 169, 107, ..., 131, 107, 102], [ 101, 169, 103, ..., 100, 124, 102], [ 101, 169, 107, ..., 171, 117, 102]]), 'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1], ..., [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1]]), 'labels': tensor([[-100, -100, -100, ..., -100, -100, -100], [-100, -100, -100, ..., -100, -100, -100], [-100, -100, -100, ..., -100, -100, -100], ..., [-100, -100, -100, ..., -100, -100, -100], [-100, -100, 107, ..., -100, -100, -100], [-100, -100, 107, ..., -100, -100, -100]])}
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
#prediction_scores = torch.Size([32,90,21128])
outputs = (prediction_scores,) + outputs[2:]
#outputs[0].shape = torch.Size([32,90,21128])
masked_lm_labels = None
if labels is not None:
loss_fct = CrossEntropyLoss() # -100 index = padding token
#prediction_scores = ([32,90,21128])
#prediction_scores.view = ([2880,21128])
#labels.view(-1) = ([2880])
#labels.view = tensor([-100,-100,...,117,-100,-100],device='cuda:0')
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
outputs = (masked_lm_loss,) + outputs
return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) outputs = (prediction_scores,) + outputs[2:] masked_lm_labels = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs
prediction_scores = self.cls(sequence_output)
self.cls = BertOnlyMLMHead(config)
class BertOnlyMLMHead(nn.Module):
    """Masked-LM head that delegates scoring to a ``BertLMPredictionHead``."""

    def __init__(self, config):
        """Build the prediction sub-module from ``config``."""
        # nn.Module must be initialised first; assigning a sub-module before
        # this call raises AttributeError.
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        """Return the prediction scores computed from ``sequence_output``."""
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores
class BertPredictionHeadTransform(nn.Module):
    """Dense layer, activation and LayerNorm applied before the vocabulary decoder."""

    def __init__(self, config):
        """Create the layers from ``config.hidden_size``, ``config.hidden_act`` and ``config.layer_norm_eps``."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            # A string such as 'gelu' is looked up in the activation table.
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            # Anything else is used directly as the activation callable.
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        """Apply dense -> activation -> LayerNorm and return the result."""
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    """Transform hidden states, then project them to ``config.vocab_size`` scores."""

    def __init__(self, config):
        """Create the transform, the bias-free decoder and the separate output bias."""
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return the decoder output for the transformed ``hidden_states``."""
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states
if labels is not None:
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
outputs = (masked_lm_loss,) + outputs
batch = {'input_ids': tensor([[ 101, 169, 107, ..., 100, 100, 102], [ 101, 169, 107, ..., 100, 100, 102], [ 101, 169, 107, ..., 107, 10539, 102], ..., [ 101, 169, 107, ..., 131, 107, 102], [ 101, 169, 103, ..., 100, 124, 102], [ 101, 169, 107, ..., 171, 117, 102]]), 'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1], ..., [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1]]), 'labels': tensor([[-100, -100, -100, ..., -100, -100, -100], [-100, -100, -100, ..., -100, -100, -100], [-100, -100, -100, ..., -100, -100, -100], ..., [-100, -100, -100, ..., -100, -100, -100], [-100, -100, 107, ..., -100, -100, -100], [-100, -100, 107, ..., -100, -100, -100]])}
在上面的 batch 中,由 input_ids 得到的 prediction_scores.view(-1, vocab_size) 形状为 [2880, 21128],labels.view(-1) 形状为 [2880],其中大部分标签都是 -100。CrossEntropyLoss 默认 ignore_index=-100,因此这些位置不参与交叉熵损失的计算,只有被 mask 选中的位置才会计入损失。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。