import pandas as pd
import os
import random
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AdamW, BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, Dataset
from typing import Tuple, List
from torch.nn.utils.rnn import pad_sequence
transformers is an open-source package of pretrained language models, https://github.com/huggingface/transformers. It provides many pretrained language models for tasks such as text classification, information extraction, question answering, summarization, and translation. This article uses BERT from transformers to continue pretraining on a domain-specific corpus with the masked language modeling (MLM) objective.
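As a quick illustration (not part of the original post) of the MLM objective that the continued pretraining targets, the fill-mask pipeline shows how an off-the-shelf MLM-pretrained BERT fills a masked position:
from transformers import pipeline

# Illustrative sketch only: query the stock MLM head before any continued pretraining.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")
print(fill_mask("this movie was absolutely [MASK]."))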
base = "./Data/imdb"
pretrain_model = "bert-base-uncased"
max_length = 512
epochs = 3
seed = 900
random.seed(seed)
np.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
train = pd.read_csv(os.path.join(base, "labeledTrainData.tsv"), header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(base, "testData.tsv"), header=0, delimiter="\t", quoting=3)
train['review'] = train['review'].apply(lambda r: r.strip("\""))
test['review'] = test['review'].apply(lambda r: r.strip("\""))
examples = list(train['review'])+list(test['review'])
tokenizer = BertTokenizer.from_pretrained(pretrain_model)
model = BertForMaskedLM.from_pretrained(pretrain_model)
class LineByLineTextDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length):
        self.examples = tokenizer.batch_encode_plus(examples, add_special_tokens=True,
                                                    max_length=max_length, truncation=True)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)
dataset = LineByLineTextDataset(examples, tokenizer, max_length)
print(" ".join(tokenizer.convert_ids_to_tokens(dataset[5])))
Output:
[CLS] i don ##t know why people think this is such a bad movie . its got a pretty good plot , some good action , and the change of location for harry does not hurt either . sure some of its offensive and gr ##at ##uit ##ous but this is not the only movie like that . eastwood is in good form as dirty harry , and i liked pat hi ##ng ##le in this movie as the small town cop . if you liked dirty harry , then you should see this one , its a lot better than the dead pool . 4 / 5 [SEP]
def collate(examples: List[torch.Tensor]):
    if tokenizer._pad_token is None:
        return pad_sequence(examples, batch_first=True)
    return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
dataloader = DataLoader(dataset, shuffle=True, batch_size=8, collate_fn=collate)
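For intuition, here is a small standalone check (not part of the original post) of what the collate function returns for a toy batch of two arbitrary token-id sequences of different lengths:
# Illustrative sketch: the shorter sequence is right-padded with tokenizer.pad_token_id.
toy_batch = [torch.tensor([101, 2023, 102]), torch.tensor([101, 2023, 3185, 2001, 102])]
print(collate(toy_batch))  # shape (2, 5); padded positions hold the pad id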
class Trainer:
    def __init__(self, model, dataloader, tokenizer, mlm_probability=0.15, lr=1e-4,
                 with_cuda=True, cuda_devices=None, log_freq=100):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model
        self.is_parallel = False
        self.dataloader = dataloader
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability
        self.log_freq = log_freq
        # Multi-GPU training
        if with_cuda and torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUS for BERT")
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)
            self.is_parallel = True
        self.model.train()
        self.model.to(self.device)
        self.optim = AdamW(self.model.parameters(), lr=lr)
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.dataloader)

    def iteration(self, epoch, dataloader, train=True):
        str_code = 'Train'
        total_loss = 0.0
        for i, batch in tqdm(enumerate(dataloader), desc="Training"):
            inputs, labels = self._mask_tokens(batch)
            inputs = inputs.to(self.device)
            labels = labels.to(self.device)
            # older transformers API: forward returns (loss, prediction_scores) when masked_lm_labels is given
            lm_loss, output = self.model(inputs, masked_lm_labels=labels)
            loss = lm_loss.mean()
            if train:
                self.model.zero_grad()
                self.optim.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optim.step()
            total_loss += loss.item()
            post_fix = {
                "iter": i,
                "ave_loss": total_loss / (i + 1)
            }
            if i % self.log_freq == 0:
                print(post_fix)
        print(f"EP{epoch}_{str_code},avg_loss={total_loss/len(dataloader)}")

    def _mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Prepare inputs/labels for masked language modeling: 80% [MASK], 10% random token, 10% unchanged."""
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token, which is necessary for masked language modeling."
            )
        labels = inputs.clone()
        # Fill a tensor with mlm_probability
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        # Get the special-token mask
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        # Zero out the probability at special-token positions
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if self.tokenizer._pad_token is not None:
            # Zero out the probability at padding positions
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        # Sample the tokens to mask
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # loss is only computed on masked tokens
        # With 80% probability, replace the masked token with [MASK]
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        # With 10% probability, replace the masked token with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]
        # The remaining 10% are left unchanged
        return inputs, labels

trainer = Trainer(model, dataloader, tokenizer)
Output:
Using 2 GPUS for BERT
Total Parameters: 109514298
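Before running the full loop, a quick sanity check (again not from the original post) makes the 80/10/10 masking scheme visible on a single batch:
# Illustrative sketch: roughly 15% of non-special, non-padding tokens should be selected,
# and most of the selected positions in `masked_inputs` should show [MASK].
batch = next(iter(dataloader))
masked_inputs, mlm_labels = trainer._mask_tokens(batch.clone())
print(" ".join(tokenizer.convert_ids_to_tokens(masked_inputs[0][:50].tolist())))
print("tokens contributing to the MLM loss:", (mlm_labels[0] != -100).sum().item())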
for epoch in range(epochs):
    trainer.train(epoch)
Output:
{'iter': 0, 'ave_loss': 2.836127758026123}
{'iter': 100, 'ave_loss': 2.370666817863389}
{'iter': 200, 'ave_loss': 2.3373566693927517}
...
{'iter': 6200, 'ave_loss': 2.2395244782787236}
EP0_Train,avg_loss=2.239439134082794
model.save_pretrained(".")
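The saved checkpoint can then be loaded into a downstream model for fine-tuning. A minimal sketch, assuming a binary sentiment head on the IMDB labels (BertForSequenceClassification and num_labels=2 are my assumptions, not part of the original post):
from transformers import BertForSequenceClassification

# The encoder weights come from the continued-pretrained checkpoint saved above;
# the classification head is freshly initialised and still needs supervised fine-tuning.
clf_model = BertForSequenceClassification.from_pretrained(".", num_labels=2)
clf_tokenizer = BertTokenizer.from_pretrained(pretrain_model)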