import torch
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.optim import AdamW  # the AdamW in transformers is deprecated; the torch version is a drop-in replacement here
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_name = "distilbert-base-uncased"

# Large Movie Review Dataset
# http://ai.stanford.edu/~amaas/data/sentiment/
def read_imdb_split(split_dir):
    """Read the pos/neg subfolders of an aclImdb split into parallel text/label lists."""
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels

train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

# Hold out 20% of the training set for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, shuffle=True, random_state=42)

class IMDBdataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = IMDBdataset(train_encodings, train_labels)
val_dataset = IMDBdataset(val_encodings, val_labels)
test_dataset = IMDBdataset(test_encodings, test_labels)

model = DistilBertForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

num_train_epochs = 2
for epoch in range(num_train_epochs):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]  # when labels are passed, the loss is the first output
        loss.backward()
        optim.step()

model.eval()
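The script switches the model to eval mode but stops there. As a minimal sketch (not part of the original post), test accuracy could be computed like this; the batch size of 64 is an arbitrary assumption:

eval_loader = DataLoader(test_dataset, batch_size=64)  # batch size is an assumption
correct = 0
total = 0
with torch.no_grad():  # no gradients needed during evaluation
    for batch in eval_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1)  # pick the class with the highest logit
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"test accuracy: {correct / total:.4f}")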
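The original import list also pulled in Trainer and TrainingArguments without using them. For reference, here is a hedged sketch of the same fine-tuning driven by the Trainer API instead of the manual loop; output_dir='./results' and logging_steps are assumptions, not values from the post:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # checkpoint directory (an assumption)
    num_train_epochs=2,              # matches the manual loop above
    per_device_train_batch_size=16,  # matches the DataLoader batch size
    learning_rate=5e-5,              # matches the AdamW setting
    logging_steps=100,               # assumed logging cadence
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()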