[NLP] [PyTorch] Continuing Pretraining of BERT on a Domain Corpus

import pandas as pd
import os
import random
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AdamW, BertTokenizer, BertForMaskedLM  # on recent transformers versions, import AdamW from torch.optim instead
from torch.utils.data import DataLoader, Dataset
from typing import Tuple, List
from torch.nn.utils.rnn import pad_sequence

1. Transformers

transformers is an open-source package of pretrained language models: https://github.com/huggingface/transformers. It provides pretrained models for many tasks, including text classification, information extraction, question answering, summarization, and translation. This article uses BERT from transformers to continue pretraining on a domain-specific corpus with the masked language modeling (MLM) objective.
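
The rest of this article builds the MLM data pipeline and training loop by hand. For reference, newer releases of transformers can run the same continued pretraining with the built-in DataCollatorForLanguageModeling and Trainer classes. The sketch below is a minimal outline of that route, under the assumption of a recent transformers version; the texts list and output_dir are placeholders, and this Trainer is the library's own class, unrelated to the Trainer class defined later in the article.

from transformers import (BertTokenizerFast, BertForMaskedLM,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)
import torch

tok = BertTokenizerFast.from_pretrained("bert-base-uncased")
mlm_model = BertForMaskedLM.from_pretrained("bert-base-uncased")

texts = ["an example sentence from the domain corpus"]   # placeholder corpus
enc = tok(texts, truncation=True, max_length=512)

class MLMDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids):
        self.input_ids = input_ids
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return {"input_ids": self.input_ids[idx]}

# the collator pads each batch and dynamically masks 15% of the tokens
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=True, mlm_probability=0.15)

hf_trainer = Trainer(
    model=mlm_model,
    args=TrainingArguments(output_dir="./mlm-out", num_train_epochs=3, per_device_train_batch_size=8),
    train_dataset=MLMDataset(enc["input_ids"]),
    data_collator=collator,
)
# hf_trainer.train()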

2. Setting the parameters

base = "./Data/imdb"                    # directory containing the IMDB tsv files
pretrain_model = "bert-base-uncased"    # checkpoint to continue pretraining from
max_length = 512                        # maximum sequence length for BERT
epochs = 3
seed = 900

# fix random seeds for reproducibility
random.seed(seed)
np.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

3. Reading and preprocessing the data

# the IMDB review texts from both the train and test files serve as the domain corpus
# (the sentiment labels are not needed for MLM pretraining)
train = pd.read_csv(os.path.join(base, "labeledTrainData.tsv"), header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(base, "testData.tsv"), header=0, delimiter="\t", quoting=3)
train['review'] = train['review'].apply(lambda r: r.strip("\""))   # drop the surrounding quotes
test['review'] = test['review'].apply(lambda r: r.strip("\""))
examples = list(train['review']) + list(test['review'])

4. Loading the pretrained model and tokenizer

tokenizer = BertTokenizer.from_pretrained(pretrain_model)
model = BertForMaskedLM.from_pretrained(pretrain_model)

5. Defining the Dataset

class LineByLineTextDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length):
        self.examples = tokenizer.batch_encode_plus(examples, add_special_tokens=True,
                                                    max_length=max_length, truncation=True)["input_ids"]
        
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)
    
dataset = LineByLineTextDataset(examples, tokenizer, max_length)
print(" ".join(tokenizer.convert_ids_to_tokens(dataset[5])))

Output:

[CLS] i don ##t know why people think this is such a bad movie . its got a pretty good plot , some good action , and the change of location for harry does not hurt either . sure some of its offensive and gr ##at ##uit ##ous but this is not the only movie like that . eastwood is in good form as dirty harry , and i liked pat hi ##ng ##le in this movie as the small town cop . if you liked dirty harry , then you should see this one , its a lot better than the dead pool . 4 / 5 [SEP]
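
A side note: batch_encode_plus is an older API. On recent transformers versions the same encoding can presumably be produced by calling the tokenizer object directly, as in the snippet below (an assumption about your installed version, not the code used above):

# equivalent to batch_encode_plus on recent transformers versions
input_ids = tokenizer(examples, add_special_tokens=True,
                      max_length=max_length, truncation=True)["input_ids"]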

6. Defining the DataLoader

def collate(examples: List[torch.Tensor]):
    if tokenizer._pad_token is None:
        return pad_sequence(examples, batch_first=True)
    return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

dataloader = DataLoader(dataset, shuffle=True, batch_size=8, collate_fn=collate)
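
As a quick illustration of what the collate function does, padding two made-up id sequences of different lengths gives the result below (a tiny illustrative check, not part of the original pipeline):

# the shorter sequence is padded to the longer one with the [PAD] id (0 for bert-base-uncased)
demo_batch = [torch.tensor([101, 2023, 102]), torch.tensor([101, 2023, 2003, 1037, 3231, 102])]
padded = collate(demo_batch)
print(padded.shape)   # torch.Size([2, 6])
print(padded[0])      # tensor([ 101, 2023,  102,    0,    0,    0])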

7. Defining the Trainer

class Trainer:
    def __init__(self, model, dataloader, tokenizer, mlm_probability=0.15, lr=1e-4, with_cuda=True, cuda_devices=None, log_freq=100):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model
        self.is_parallel = False
        self.dataloader = dataloader
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability
        self.log_freq = log_freq
        
        # multi-GPU training
        if with_cuda and torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUS for BERT")
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)
            self.is_parallel = True
        self.model.train()
        self.model.to(self.device)
        self.optim = AdamW(self.model.parameters(), lr=lr)  # use the lr argument instead of a hard-coded value
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
        
    def train(self, epoch):
        self.iteration(epoch, self.dataloader)
        
    def iteration(self, epoch, dataloader, train=True):
        str_code = 'Train'
        total_loss = 0.0
        for i, batch in tqdm(enumerate(dataloader), desc="Training"):
            inputs, labels = self._mask_tokens(batch)
            inputs = inputs.to(self.device)   # .to() is not in-place; the result must be assigned back
            labels = labels.to(self.device)
            outputs = self.model(input_ids=inputs, labels=labels)  # older transformers versions use masked_lm_labels=
            lm_loss = outputs[0]
            loss = lm_loss.mean()  # average across GPUs when running under DataParallel
            
            if train:
                self.model.zero_grad()
                self.optim.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optim.step()
                
            total_loss += loss.item()
            post_fix = {
                "iter": i,
                "ave_loss": total_loss/(i+1)
            }
            if i % self.log_freq == 0:
                print(post_fix)
                
        print(f"EP{epoch}_{str_code},avg_loss={total_loss/len(dataloader)}")
        
    def _mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """ Masked Language Model """
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
            )
            
        labels = inputs.clone()
        # fill a tensor with mlm_probability
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        # mask for special tokens
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        # set the probability of special-token positions to 0
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if self.tokenizer._pad_token is not None:
            # padding mask
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            # set the probability of padding positions to 0
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        
        # sample the tokens to mask
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # loss is computed only on masked tokens

        # 80% of the time, replace masked tokens with [MASK]
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, replace masked tokens with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # the remaining 10% are left unchanged
        return inputs, labels
    
trainer = Trainer(model, dataloader, tokenizer)

Output:

Using 2 GPUS for BERT
Total Parameters: 109514298
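
Before launching the full training, it can be useful to sanity-check the dynamic masking on a single batch. The snippet below is purely illustrative (it calls the internal _mask_tokens method directly, and the masked positions are random):

# peek at one masked batch: ~15% of non-special tokens are masked,
# and labels keep the original ids at those positions (-100 everywhere else)
sample = next(iter(dataloader))
masked_inputs, mlm_labels = trainer._mask_tokens(sample.clone())
print(tokenizer.convert_ids_to_tokens(masked_inputs[0][:20].tolist()))
print(mlm_labels[0][:20])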

8. Training

for epoch in range(epochs):
    trainer.train(epoch)

Output:

{'iter': 0, 'ave_loss': 2.836127758026123}
{'iter': 100, 'ave_loss': 2.370666817863389}
{'iter': 200, 'ave_loss': 2.3373566693927517}
...
{'iter': 6200, 'ave_loss': 2.2395244782787236}
EP0_Train,avg_loss=2.239439134082794

9. Saving the model

model.save_pretrained(".")
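
save_pretrained writes the model weights and configuration to the current directory. A common follow-up, sketched below under the assumption that the goal is to fine-tune the domain-adapted encoder on the IMDB sentiment task, is to save the tokenizer alongside the weights and later reload the checkpoint for classification:

# also save the vocabulary so the checkpoint is self-contained
tokenizer.save_pretrained(".")

# later: reload the domain-adapted encoder for fine-tuning;
# the MLM head is discarded and a fresh classification head is initialized
from transformers import BertForSequenceClassification
clf = BertForSequenceClassification.from_pretrained(".", num_labels=2)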