NLP Project 7: Text Classification (Sentiment Analysis) with distilbert-base-uncased

1. Tokenizer

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)
print(tokenizer)
DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
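A quick round trip on a single sentence (a minimal illustrative check) shows the special tokens the tokenizer adds:

ids = tokenizer.encode('hello, everyone, today is a good day')
print(ids)                   # starts with 101 ([CLS]) and ends with 102 ([SEP])
print(tokenizer.decode(ids)) # roughly '[CLS] hello, everyone, today is a good day [SEP]'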

2. Batch Encoding

tokenizer.batch_encode_plus([
    'hello, everyone, today is a good day',
    'how are you , fine thank you , and you?'])
{'input_ids': [[101, 7592, 1010, 3071, 1010, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2024, 2017, 1010, 2986, 4067, 2017, 1010, 1998, 2017, 1029, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
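By default batch_encode_plus returns un-padded Python lists, as seen above. For a quick fixed-shape check, padding and tensor output can be requested directly (a minimal sketch; the training pipeline below pads dynamically with DataCollatorWithPadding instead):

batch = tokenizer.batch_encode_plus(
    ['hello, everyone, today is a good day',
     'how are you , fine thank you , and you?'],
    padding=True,           # pad to the longest sequence in this batch
    return_tensors='pt')    # return PyTorch tensors instead of lists
print(batch['input_ids'].shape)  # torch.Size([2, 13]), matching the 13-token second sentence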

3. Loading the Dataset

The GLUE CoLA dataset (Corpus of Linguistic Acceptability) serves as the binary classification task here; each sentence carries a label of 1 or 0.

from datasets import load_dataset
dataset = load_dataset(path='glue', name='cola')
dataset
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

dataset['train'][0]
{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}
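The label column is a ClassLabel, so the class names can be read off directly; for CoLA the printout should look roughly like this:

print(dataset['train'].features['label'])
# ClassLabel(num_classes=2, names=['unacceptable', 'acceptable'], id=None)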

4. Dataset Preprocessing: Sentence → input_ids

def f(examples, tokenizer):
    return tokenizer.batch_encode_plus(examples['sentence'], truncation=True)
dataset = dataset.map(f,
                     batched=True,
                     batch_size=1000,
                     num_proc=1, # num_proc=1 is faster here: with this little data, spawning worker processes costs more time than it saves
                     remove_columns=['sentence', 'idx'],
                     fn_kwargs={'tokenizer': tokenizer})
  
print(dataset['train'][0])
{'label': 1, 'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
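truncation=True caps sequences at the tokenizer's model_max_length (512); a quick sanity check confirms CoLA sentences sit far below that limit:

lengths = [len(x) for x in dataset['train']['input_ids']]
print(max(lengths), sum(lengths) / len(lengths))  # longest and mean tokenized length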

5. Building the DataLoader

import torch
from transformers.data.data_collator import DataCollatorWithPadding
loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    collate_fn=DataCollatorWithPadding(tokenizer),
    shuffle=True,
    drop_last=True,)
for data in loader:
    break
data
{'input_ids': tensor([[  101,  1996,  2062,  4180,  2098,  2057, 14688,  2000,  2022,  1010,
          1996,  2062,  2057,  3473,  4854,  2012,  1996,  7435,  1012,   102],
        [  101,  2040, 17749,  2073,  2057,  4149,  2054,  1029,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 18500,  2245,  2008,  2002,  2001,  1996,  3159,  1997,  4768,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1999,  1996,  4020,  2045, 28374,  1037,  2543,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2198, 14140,  2070,  2769,  1999,  1996,  2924,  2006,  5958,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2320,  9965,  2187,  1010,  5965,  2150,  2035,  1996, 13675,
         16103,  2121,  1012,   102,     0,     0,     0,     0,     0,     0],
        [  101,  2023,  2311,  2288, 12283,  1998, 12283,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  3389,  2097,  2196,  2681,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1])}
        
len(loader)
1068

With batch_size=8 and drop_last=True, the 8551 training examples give 8551 // 8 = 1068 batches per epoch.
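Because DataCollatorWithPadding pads each batch only to its own longest sequence, the second dimension of input_ids changes from batch to batch; a quick way to see this:

for i, batch in enumerate(loader):
    print(batch['input_ids'].shape)  # width varies with the longest sentence in the batch
    if i == 2:
        break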

6. Loading the Pretrained Model Weights

from transformers import AutoModelForSequenceClassification, DistilBertModel
parameters = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
parameters
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
        (1): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
        (2): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
        (3): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
        (4): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
        (5): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
      )
    )
  )
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
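The pre_classifier and classifier layers at the bottom of this printout form a freshly initialized classification head on top of the pretrained encoder. Its overall size and label count can be checked directly:

print(sum(p.numel() for p in parameters.parameters()))  # roughly 67M parameters
print(parameters.config.num_labels)                     # 2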

7. Defining the Downstream Task Model

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # pretrained DistilBERT encoder (no classification head)
        self.pretrained = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # classification head mirroring DistilBertForSequenceClassification:
        # pre_classifier -> ReLU -> Dropout -> classifier
        self.fc = torch.nn.Sequential(torch.nn.Linear(768, 768),
                                      torch.nn.ReLU(),
                                      torch.nn.Dropout(p=0.2),
                                      torch.nn.Linear(768, 2))
        # copy the head weights from the reference model so both start from the same initialization
        parameters = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
        self.fc[0].load_state_dict(parameters.pre_classifier.state_dict())
        self.fc[3].load_state_dict(parameters.classifier.state_dict())
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        out = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        # hidden state of the [CLS] token: [batch_size, 768] -> [batch_size, 2]
        logits = self.fc(out.last_hidden_state[:, 0])
        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)
        return {'loss': loss, 'logits': logits}

model = Model()
print(sum(i.numel() for i in model.parameters()))
66955010

out = model(**data)
out['loss'], out['logits'].shape
(tensor(0.6949, grad_fn=<NllLossBackward0>), torch.Size([8, 2]))
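If class probabilities are wanted rather than raw logits, a softmax can be applied (purely illustrative; training only needs the logits and the cross-entropy loss):

probs = torch.softmax(out['logits'], dim=1)  # shape [8, 2], each row sums to 1
print(probs.argmax(dim=1))                   # predicted class per example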

dataset['test'][0]
{'label': -1,
 'input_ids': [101, 3021, 26265, 2627, 1996, 2160, 1012, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

Note that GLUE withholds test-set labels (label = -1), so the evaluation below uses the validation split instead.
 
dataset['validation'][0]
{'label': 1,
 'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

8. Test 1: Accuracy Before Fine-Tuning

def test(model):
    model.eval()
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['validation'],
        batch_size=16,
        collate_fn=DataCollatorWithPadding(tokenizer),
        shuffle=True,
        drop_last=True)
    outs = []
    labels = []
    for i, data in enumerate(loader_test):
#         print(data)
        with torch.no_grad():
            out = model(**data) 
        outs.append(out['logits'].argmax(dim=1)) # logits: [16, 2] -> predicted class per example
        labels.append(data['labels'])
        if i % 10 == 0:
            print(i)   
        if i == 50:
            break # stop after 51 batches (816 validation examples)
    outs = torch.cat(outs)
    labels = torch.cat(labels)
#     print('test labels:', labels)
    accuracy = (outs == labels).sum().item() / len(labels)
    print('accuracy: ', accuracy)
test(model)
0
10
20
30
40
50
accuracy:  0.5502450980392157
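Accuracy is easy to interpret, but the official GLUE metric for CoLA is the Matthews correlation coefficient, which is more informative on this imbalanced dataset. A minimal sketch with scikit-learn (assumes scikit-learn is installed and that test() is modified to return its outs and labels tensors):

from sklearn.metrics import matthews_corrcoef

def mcc(preds, refs):
    # preds / refs: 1-D tensors of predicted and gold labels,
    # e.g. the outs and labels built inside test() if it is changed to return them
    return matthews_corrcoef(refs.numpy(), preds.numpy())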

9. Training

from transformers import AdamW # deprecated in recent transformers releases; torch.optim.AdamW is the drop-in replacement
from transformers.optimization import get_scheduler
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)

def train():
    optimizer = AdamW(model.parameters(), betas=(0.9, 0.999), eps=1e-8, lr=2e-5) # define the optimizer
    scheduler = get_scheduler(name='linear',
                             num_warmup_steps=0,
                             num_training_steps=len(loader),
                             optimizer=optimizer)
    model.to(device)
    model.train()
    for i, data in enumerate(loader):
        input_ids, attention_mask, labels  = data['input_ids'], data['attention_mask'], data['labels']
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        out = model(input_ids=input_ids, 
                    attention_mask=attention_mask, 
                    labels=labels)
        loss = out['loss']
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            out = out['logits'].argmax(dim=1)
            accuracy = (labels == out).sum().item() / len(labels) # per-batch training accuracy
            print(i, loss.item(), lr, accuracy)
            print()
train()
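After training it is worth persisting the fine-tuned weights so evaluation or inference does not require retraining; a minimal sketch (the file name is arbitrary):

torch.save(model.state_dict(), 'distilbert_cola_finetuned.pt')  # weights only
# later: model = Model(); model.load_state_dict(torch.load('distilbert_cola_finetuned.pt'))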

10. Test 2: Accuracy After Fine-Tuning

test(model.to('cpu'))
0
10
20
30
40
50
accuracy:  0.7781862745098039
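Finally, a quick single-sentence prediction with the fine-tuned model (a minimal sketch; the example sentence is arbitrary):

model.eval()
enc = tokenizer('the boy quickly ran across the finish line.', return_tensors='pt')
with torch.no_grad():
    pred = model(input_ids=enc['input_ids'],
                 attention_mask=enc['attention_mask'])['logits'].argmax(dim=1)
print(pred.item())  # 1 = acceptable, 0 = unacceptable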