from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)
print(tokenizer)
DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
tokenizer.batch_encode_plus([
'hello, everyone, today is a good day',
'how are you , fine thank you , and you?'])
{'input_ids': [[101, 7592, 1010, 3071, 1010, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2024, 2017, 1010, 2986, 4067, 2017, 1010, 1998, 2017, 1029, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
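To sanity-check what those ids correspond to, the fast tokenizer can map them back (a quick sketch; output not reproduced from a run):
ids = tokenizer('hello, everyone, today is a good day')['input_ids']
print(tokenizer.convert_ids_to_tokens(ids))  # expected: ['[CLS]', 'hello', ',', 'everyone', ',', 'today', 'is', 'a', 'good', 'day', '[SEP]']
print(tokenizer.decode(ids))                 # expected: '[CLS] hello, everyone, today is a good day [SEP]'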
from datasets import load_dataset
dataset = load_dataset(path='glue', name='cola')
dataset
DatasetDict({
train: Dataset({
features: ['sentence', 'label', 'idx'],
num_rows: 8551
})
validation: Dataset({
features: ['sentence', 'label', 'idx'],
num_rows: 1043
})
test: Dataset({
features: ['sentence', 'label', 'idx'],
num_rows: 1063
})
})
dataset['train'][0]
{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
'label': 1,
'idx': 0}
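The label column is a ClassLabel; in CoLA, 1 marks a grammatically acceptable sentence. A quick way to inspect the schema (a sketch; exact repr may differ by datasets version):
print(dataset['train'].features['label'])             # ClassLabel with names ['unacceptable', 'acceptable']
print(dataset['train'].features['label'].int2str(1))  # expected: 'acceptable'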
def f(examples, tokenizer):
return tokenizer.batch_encode_plus(examples['sentence'], truncation=True)
dataset = dataset.map(f,
batched=True,
batch_size=1000,
num_proc=1, # num_proc=1 is faster here: with this little data, spawning worker processes costs more time than it saves
remove_columns=['sentence', 'idx'],
fn_kwargs={'tokenizer': tokenizer})
print(dataset['train'][0])
{'label': 1, 'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
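With batched=True, map hands f a dict of lists, which is why batch_encode_plus is applied to examples['sentence'] as a whole. The same mapping can be written with the tokenizer's __call__ interface, which dispatches to batch encoding for a list of strings (equivalent sketch):
def f_alt(examples, tokenizer):
    # same result as f above: tokenize a whole batch of sentences at once
    return tokenizer(examples['sentence'], truncation=True)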
import torch
from transformers.data.data_collator import DataCollatorWithPadding
loader = torch.utils.data.DataLoader(
dataset=dataset['train'],
batch_size=8,
collate_fn=DataCollatorWithPadding(tokenizer),
shuffle=True,
drop_last=True,)
for data in loader:
break
data
{'input_ids': tensor([[ 101, 1996, 2062, 4180, 2098, 2057, 14688, 2000, 2022, 1010,
1996, 2062, 2057, 3473, 4854, 2012, 1996, 7435, 1012, 102],
[ 101, 2040, 17749, 2073, 2057, 4149, 2054, 1029, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 18500, 2245, 2008, 2002, 2001, 1996, 3159, 1997, 4768,
1012, 102, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 1999, 1996, 4020, 2045, 28374, 1037, 2543, 1012, 102,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 2198, 14140, 2070, 2769, 1999, 1996, 2924, 2006, 5958,
1012, 102, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 2320, 9965, 2187, 1010, 5965, 2150, 2035, 1996, 13675,
16103, 2121, 1012, 102, 0, 0, 0, 0, 0, 0],
[ 101, 2023, 2311, 2288, 12283, 1998, 12283, 1012, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 3389, 2097, 2196, 2681, 1012, 102, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1])}
len(loader)
1068
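The length follows from batch_size=8 with drop_last=True: 8551 // 8 = 1068, with the 7 leftover examples discarded. Note also that DataCollatorWithPadding pads each batch only to its own longest sequence and renames the label column to labels, which is why the batch above carries 'labels' even though the dataset column is 'label'.
print(8551 // 8)                # 1068 batches per epoch
print(data['input_ids'].shape)  # torch.Size([8, 20]) for the batch shown above; other batches may be shorter or longer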
from transformers import AutoModelForSequenceClassification, DistilBertModel
parameters = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
parameters
DistilBertForSequenceClassification(
(distilbert): DistilBertModel(
(embeddings): Embeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(transformer): Transformer(
(layer): ModuleList(
(0): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
(activation): GELUActivation()
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(1): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
(activation): GELUActivation()
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(2): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
(activation): GELUActivation()
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(3): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
(activation): GELUActivation()
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(4): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
(activation): GELUActivation()
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(5): TransformerBlock(
(attention): MultiHeadSelfAttention(
(dropout): Dropout(p=0.1, inplace=False)
(q_lin): Linear(in_features=768, out_features=768, bias=True)
(k_lin): Linear(in_features=768, out_features=768, bias=True)
(v_lin): Linear(in_features=768, out_features=768, bias=True)
(out_lin): Linear(in_features=768, out_features=768, bias=True)
)
(sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(ffn): FFN(
(dropout): Dropout(p=0.1, inplace=False)
(lin1): Linear(in_features=768, out_features=3072, bias=True)
(lin2): Linear(in_features=3072, out_features=768, bias=True)
(activation): GELUActivation()
)
(output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
)
)
)
(pre_classifier): Linear(in_features=768, out_features=768, bias=True)
(classifier): Linear(in_features=768, out_features=2, bias=True)
(dropout): Dropout(p=0.2, inplace=False)
)
class Model(torch.nn.Module):
def __init__(self):
super().__init__()
self.pretrained = DistilBertModel.from_pretrained('distilbert-base-uncased')
self.fc = torch.nn.Sequential(torch.nn.Linear(768, 768),
torch.nn.ReLU(),
torch.nn.Dropout(p=0.2),
torch.nn.Linear(768, 2))
parameters = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
self.fc[0].load_state_dict(parameters.pre_classifier.state_dict())
self.fc[3].load_state_dict(parameters.classifier.state_dict())
self.criterion = torch.nn.CrossEntropyLoss()
def forward(self, input_ids, attention_mask, labels=None):
logits = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
logits = logits.last_hidden_state[:, 0]
logits = self.fc(logits)
loss = None
if labels is not None:
# print('logits: ', logits)
# print('labels: ', labels)
loss = self.criterion(logits, labels)
return {'loss': loss, 'logits': logits}
model = Model()
print(sum(i.numel() for i in model.parameters()))
66955010
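The count matches the DistilBERT encoder plus the two-layer head whose weights were copied from the sequence-classification checkpoint's pre_classifier and classifier:
# encoder: 66,362,880   fc[0]: 768*768 + 768 = 590,592   fc[3]: 768*2 + 2 = 1,538
print(66_362_880 + 590_592 + 1_538)  # 66955010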
out = model(**data)
out['loss'], out['logits'].shape
(tensor(0.6949, grad_fn=<NllLossBackward0>), torch.Size([8, 2]))
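Before any fine-tuning, the loss of the randomly initialized head sits near chance level for two classes, ln 2 ≈ 0.693, which is what the 0.6949 above reflects:
import math
print(math.log(2))  # ≈ 0.6931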
data
{'input_ids': tensor([[ 101, 1996, 2062, 4180, 2098, 2057, 14688, 2000, 2022, 1010,
1996, 2062, 2057, 3473, 4854, 2012, 1996, 7435, 1012, 102],
[ 101, 2040, 17749, 2073, 2057, 4149, 2054, 1029, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 18500, 2245, 2008, 2002, 2001, 1996, 3159, 1997, 4768,
1012, 102, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 1999, 1996, 4020, 2045, 28374, 1037, 2543, 1012, 102,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 2198, 14140, 2070, 2769, 1999, 1996, 2924, 2006, 5958,
1012, 102, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 2320, 9965, 2187, 1010, 5965, 2150, 2035, 1996, 13675,
16103, 2121, 1012, 102, 0, 0, 0, 0, 0, 0],
[ 101, 2023, 2311, 2288, 12283, 1998, 12283, 1012, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 3389, 2097, 2196, 2681, 1012, 102, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1])}
dataset['test'][0]
{'label': -1,
'input_ids': [101, 3021, 26265, 2627, 1996, 2160, 1012, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
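GLUE ships the test split without gold labels (every label is -1), which is why the evaluation below uses the validation split instead. A quick check (sketch):
print(set(dataset['test']['label']))  # expected: {-1}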
dataset['validation'][0]
{'label': 1,
'input_ids': [101,
1996,
11279,
8469,
1996,
9478,
3154,
1997,
1996,
5749,
1012,
102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
def test(model):
model.eval()
loader_test = torch.utils.data.DataLoader(
dataset=dataset['validation'],
batch_size=16,
collate_fn=DataCollatorWithPadding(tokenizer),
shuffle=True,
drop_last=True)
outs = []
labels = []
for i, data in enumerate(loader_test):
# print(data)
with torch.no_grad():
out = model(**data)
outs.append(out['logits'].argmax(dim=1)) # logits:[8, 2]
labels.append(data['labels'])
if i % 10 == 0:
print(i)
if i == 50:
break
outs = torch.cat(outs)
labels = torch.cat(labels)
# print('test labels:', labels)
accuracy = (outs == labels).sum().item() / len(labels)
print('accuracy: ', accuracy)
test(model)
0
10
20
30
40
50
accuracy: 0.5502450980392157
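An accuracy around 0.55 is roughly what the untrained head is expected to produce. For reference, the class balance of the validation split can be checked like this (a sketch; counts not reproduced here):
from collections import Counter
print(Counter(dataset['validation']['label']))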
from transformers import AdamW  # transformers' AdamW; in newer releases torch.optim.AdamW is the drop-in replacement
from transformers.optimization import get_scheduler
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)
def train():
optimizer = AdamW(model.parameters(), betas=(0.9, 0.999), eps=1e-8, lr=2e-5) # define the optimizer
scheduler = get_scheduler(name='linear',
num_warmup_steps=0,
num_training_steps=len(loader),
optimizer=optimizer)
model.to(device)
model.train()
for i, data in enumerate(loader):
input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)
out = model(input_ids=input_ids,
attention_mask=attention_mask,
labels=labels)
loss = out['loss']
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
model.zero_grad()
if i % 50 == 0:
lr = optimizer.state_dict()['param_groups'][0]['lr']
out = out['logits'].argmax(dim=1)
accuracy = (labels==out).sum().item() / 8
print(i, loss.item(), lr, accuracy)
print()
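With num_warmup_steps=0 and num_training_steps=len(loader)=1068, the linear schedule simply decays the learning rate from 2e-5 to 0 over the single epoch. A small illustrative helper (not part of the training code above) showing the expected value at step t:
def expected_lr(step, base_lr=2e-5, total_steps=1068):
    # linear schedule with zero warmup: decays from base_lr to 0 over total_steps
    return base_lr * max(0.0, 1 - step / total_steps)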
train()
test(model.to('cpu'))
0
10
20
30
40
50
accuracy: 0.7781862745098039
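To keep the fine-tuned weights around for later use, a minimal sketch (the file name is just an example):
# persist and reload the fine-tuned classifier (file name is illustrative)
torch.save(model.state_dict(), 'distilbert_cola_finetuned.pt')
# model.load_state_dict(torch.load('distilbert_cola_finetuned.pt'))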