
Fine-tuning a Transformers Model (PyTorch)

For learning and exchange only; please let me know if anything here infringes.

This post fine-tunes DistilBERT on the IMDB sentiment dataset using a plain PyTorch training loop.

import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated in recent releases; the torch optimizer works as a drop-in here
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_name = "distilbert-base-uncased"



def read_imdb_split(split_dir):
    # Read one split (train/ or test/) of the aclImdb dataset:
    # each review is a single .txt file under pos/ or neg/.
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text(encoding="utf-8"))  # explicit encoding avoids platform-dependent defaults
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels
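
The function assumes the standard layout of the extracted archive, with one review per .txt file:

aclImdb/
    train/
        pos/
        neg/
    test/
        pos/
        neg/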

# Large Movie Review Dataset
# http://ai.stanford.edu/~amaas/data/sentiment/
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, shuffle=True, random_state=42)

class IMDBdataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Wrap each tokenizer field (input_ids, attention_mask) plus the label as tensors
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
        
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = IMDBdataset(train_encodings, train_labels)
val_dataset = IMDBdataset(val_encodings, val_labels)
test_dataset = IMDBdataset(test_encodings, test_labels)
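
As a quick sanity check (my addition, not in the original post), each dataset item should now be a dict of tensors ready for the model:

sample = train_dataset[0]
print(sample.keys())    # dict_keys(['input_ids', 'attention_mask', 'labels'])
print(sample['input_ids'].shape, sample['labels'])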

model = DistilBertForSequenceClassification.from_pretrained(model_name)  # same checkpoint as the tokenizer
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

num_train_epochs = 2
for epoch in range(num_train_epochs):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the cross-entropy loss directly
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
model.eval()
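
The loop above never measures test performance. A minimal evaluation sketch on the held-out test set (the accuracy loop below is my addition, not part of the original post):

test_loader = DataLoader(test_dataset, batch_size=16)
correct = 0
total = 0
with torch.no_grad():   # no gradients needed at inference time
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1)   # class with the highest logit wins
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"test accuracy: {correct / total:.4f}")

Alternatively, the transformers Trainer API can replace the manual loop entirely. A minimal sketch, where output_dir and the hyperparameters are illustrative choices rather than values from the original post:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',            # where checkpoints are written (illustrative path)
    num_train_epochs=2,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()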