Masked language modeling (MLM) is one of the most important pretraining tasks for language models, and it requires masking parts of the original text. The transformers library already implements the masking mechanism used during pretraining; this post analyzes two of its data collators, DataCollatorForLanguageModeling and DataCollatorForWholeWordMask.
DataCollatorForLanguageModeling implements the mask mechanism proposed for the MLM task in BERT. Below I walk through the original code from the transformers library.
@dataclass
class DataCollatorForLanguageModeling:
    """
    Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
    are not all of the same length.

    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        mlm (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to use masked language modeling. If set to :obj:`False`, the labels are the same as the
            inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for
            non-masked tokens and the value to predict for the masked token.
        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
            The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.

    .. note::

        For best performance, this data collator should be used with a dataset having items that are dictionaries or
        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
        argument :obj:`return_special_tokens_mask=True`.
    """

    tokenizer: PreTrainedTokenizerBase
    mlm: bool = True
    mlm_probability: float = 0.15
    pad_to_multiple_of: Optional[int] = None

    def __post_init__(self):
        if self.mlm and self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "You should pass `mlm=False` to train on causal language modeling instead."
            )

    def __call__(
        self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            batch = {
                "input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
            }

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["labels"] = self.mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["labels"] = labels
        return batch

    def mask_tokens(
        self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        # inputs must be a torch.Tensor.
        # labels records which tokens get masked, i.e. the prediction targets.
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        # Build a matrix filled with 0.15 (mlm_probability), with the same shape as inputs.
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        # Special tokens here means [CLS], [SEP], [PAD], etc. special_tokens_mask has the same shape as inputs:
        # True at special-token positions, False everywhere else.
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        # masked_fill_ writes 0 into probability_matrix wherever special_tokens_mask is True.
        # Since the special positions ([CLS] etc.) are True, their masking probability becomes 0,
        # so special tokens are never masked.
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        # BERT masks 15% of the tokens at random: each token is masked (1) with probability 0.15
        # and kept (0) with probability 0.85, which is exactly a Bernoulli distribution with p = 0.15.
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
import torch
from transformers import BertTokenizer, BertForMaskedLM
from transformers.data.data_collator import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask

path = '../PTM/bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(path)
model = BertForMaskedLM.from_pretrained(path)

sent = "我爱北京天安门,天安门上太阳升"

# Create a collator instance; the only required argument is the tokenizer.
datacollecter = DataCollatorForLanguageModeling(tokenizer)

encoded_dict = tokenizer.encode_plus(
    sent,                         # input text
    add_special_tokens=True,      # add '[CLS]' and '[SEP]'
    max_length=32,                # pad & truncate to this length
    truncation=True,
    pad_to_max_length=True,
    return_attention_mask=True,   # return attention masks
)
print(encoded_dict)

input_ids = [torch.tensor(encoded_dict['input_ids'])]
# The collator takes the input_ids as tensors and returns input_ids and labels.
# Positions whose label is -100 were not masked.
output = datacollecter(input_ids)
print(output)

'''
{'input_ids': tensor([[ 101, 2769, 4263, 1266,  776, 1921,  103, 7305, 8024, 1921, 2128, 7305,
          677, 1922, 7345, 1285,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]),
 'labels': tensor([[-100, -100, -100, -100, -100, -100, 2128 (the masked token), -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])}
'''
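As a quick sanity check, the masked positions can be mapped back to tokens. This small follow-up sketch reuses the tokenizer and the output variable from the snippet above.

# Positions where labels != -100 are exactly the masked positions.
label_row = output['labels'][0]
input_row = output['input_ids'][0]
for pos in (label_row != -100).nonzero(as_tuple=True)[0].tolist():
    original = tokenizer.convert_ids_to_tokens(label_row[pos].item())
    now = tokenizer.convert_ids_to_tokens(input_row[pos].item())
    print(pos, original, '->', now)   # e.g. the original character replaced by [MASK] or a random token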
DataCollatorForWholeWordMask implements the whole word mask (wwm) mechanism and inherits from DataCollatorForLanguageModeling. For Chinese text it has to be used together with prepare_ref, another function provided officially by transformers for the Chinese case, whose job is to return the positions of the sub-word tokens (the characters that form the non-initial part of a whole word).
def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
    '''
    sent = "我爱北京天安门,天安门上太阳升"
    bert_tokens = ['[CLS]', '我', '爱', '北', '##京', '天', '##安', '##门', ',',
                   '天', '##安', '##门', '上', '太', '##阳', '##升']
    return: [4, 6, 7, 10, 11, 14, 15]   # positions of 京, 安, 门, 安, 门, 阳, 升
    '''
    ltp_res = []

    for i in range(0, len(lines), 100):
        res = ltp_tokenizer.seg(lines[i : i + 100])[0]
        res = [get_chinese_word(r) for r in res]
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)

    bert_res = []
    for i in range(0, len(lines), 100):
        res = bert_tokenizer(lines[i : i + 100], add_special_tokens=True, truncation=True, max_length=512)
        bert_res.extend(res["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):
        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # We only save pos of chinese subwords start with ##, which mean is part of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # save chinese tokens' pos
                if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
                    ref_id.append(i)

        ref_ids.append(ref_id)

    assert len(ref_ids) == len(bert_res)
    return ref_ids
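prepare_ref relies on a few helpers that are not shown here (get_chinese_word, add_sub_symbol, _is_chinese_char). For orientation, _is_chinese_char is essentially BERT's CJK code-point check; the sketch below is a simplified version written under that assumption and may not list every range the official script covers.

def _is_chinese_char(cp: int) -> bool:
    # Returns True if the code point cp falls in the main CJK ideograph blocks.
    # Simplified sketch: the official helper enumerates additional extension blocks.
    return (
        (0x4E00 <= cp <= 0x9FFF)      # CJK Unified Ideographs
        or (0x3400 <= cp <= 0x4DBF)   # CJK Unified Ideographs Extension A
        or (0xF900 <= cp <= 0xFAFF)   # CJK Compatibility Ideographs
    )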
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
    """
    Data collator used for language modeling.

    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for masked language modeling
    """

    def __call__(
        self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        if isinstance(examples[0], (dict, BatchEncoding)):
            input_ids = [e["input_ids"] for e in examples]
        else:
            input_ids = examples
            examples = [{"input_ids": e} for e in examples]

        batch_input = _collate_batch(input_ids, self.tokenizer)

        mask_labels = []
        for e in examples:
            ref_tokens = []
            for id in tolist(e["input_ids"]):
                token = self.tokenizer._convert_id_to_token(id)
                ref_tokens.append(token)

            # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜,##欢]
            if "chinese_ref" in e:
                ref_pos = tolist(e["chinese_ref"])
                len_seq = len(e["input_ids"])
                for i in range(len_seq):
                    if i in ref_pos:
                        ref_tokens[i] = "##" + ref_tokens[i]
            mask_labels.append(self._whole_word_mask(ref_tokens))
        batch_mask = _collate_batch(mask_labels, self.tokenizer)
        inputs, labels = self.mask_tokens(batch_input, batch_mask)
        return {"input_ids": inputs, "labels": labels}

    def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
        """
        Get 0/1 labels for masked tokens with whole word mask proxy
        """
        cand_indexes = []
        for (i, token) in enumerate(input_tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue

            if len(cand_indexes) >= 1 and token.startswith("##"):
                cand_indexes[-1].append(i)
            else:
                cand_indexes.append([i])

        random.shuffle(cand_indexes)
        # The 15% budget is still counted in characters (sub-tokens), not in whole words:
        # masking is applied word by word, but whether 15% has been reached is counted per character.
        num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))
        masked_lms = []
        covered_indexes = set()
        for index_set in cand_indexes:
            if len(masked_lms) >= num_to_predict:
                break
            # If adding a whole-word mask would exceed the maximum number of
            # predictions, then just skip this candidate.
            if len(masked_lms) + len(index_set) > num_to_predict:
                continue
            is_any_index_covered = False
            for index in index_set:
                if index in covered_indexes:
                    is_any_index_covered = True
                    break
            if is_any_index_covered:
                continue
            for index in index_set:
                covered_indexes.add(index)
                # masked_lms counts characters, so the 15% quota is measured in characters, not words.
                masked_lms.append(index)

        assert len(covered_indexes) == len(masked_lms)
        mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
        return mask_labels

    def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
        'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
        """
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "Remove the --mlm flag if you want to use this tokenizer."
            )
        labels = inputs.clone()
        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
        # mask_labels already plays the role of masked_indices (which DataCollatorForLanguageModeling draws
        # from a Bernoulli distribution): the 15% whole-word selection was done in _whole_word_mask.
        probability_matrix = mask_labels

        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if self.tokenizer._pad_token is not None:
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)

        masked_indices = probability_matrix.bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
from transformers import BertTokenizer, BertForMaskedLM
from transformers.data.data_collator import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from ltp import LTP

from prepare import prepare_ref   # local copy of the official prepare_ref shown above

path = '../PTM/bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(path)
model = BertForMaskedLM.from_pretrained(path)

# seed_val = 42
# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# torch.cuda.manual_seed_all(seed_val)

'''
This collator also implements the BERT-style task of randomly masking 15% of the tokens.
Since I am fine-tuning BERT on an in-domain dataset, I still use the language-model training approach.
'''

ltp = LTP()
sent = "我爱北京天安门,天安门上太阳升"
ref = prepare_ref([sent], ltp, tokenizer)[0]
print(ref)

encoded_dict = tokenizer.encode_plus(
    sent,                         # input text
    add_special_tokens=True,      # add '[CLS]' and '[SEP]'
    max_length=32,                # pad & truncate to this length
    truncation=True,
    pad_to_max_length=True,
    return_attention_mask=True,   # return attention masks
    # return_tensors='pt'
)

datacollecter = DataCollatorForWholeWordMask(tokenizer)

# Attach the sub-word information via 'chinese_ref'; note that input_ids is passed as a List, not a tensor.
features = [{'input_ids': encoded_dict['input_ids'], 'chinese_ref': ref}]
tmp = datacollecter(features)
print(tmp)

'''
{'input_ids': tensor([[ 101, 2769, 4263, 1266,  776, 1921, 2128, 7305, 8024,  103,  103,  103,
          677, 1922, 7345, 1285,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]),
 'labels': tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, 1921, 2128, 7305 (the second "天安门" masked as one whole word),
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])}
'''
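To look at the whole-word grouping in isolation, _whole_word_mask can also be called directly on a hand-built, subword-marked token list; this sketch reuses the datacollecter instance created above, and the token list is written out by hand for illustration.

# Subwords are already marked with "##", as the collator does internally via chinese_ref.
ref_tokens = ['[CLS]', '我', '爱', '北', '##京', '天', '##安', '##门', ',',
              '天', '##安', '##门', '上', '太', '##阳', '##升', '[SEP]']
print(datacollecter._whole_word_mask(ref_tokens))
# The returned 0/1 list selects whole words: the positions of one word (e.g. 北/##京)
# are either all 1 or all 0, and the total number of 1s stays near 15% of the tokens.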