赞
踩
最近用transformers用的比较多,用transformers加载预训练模型,然后做预测的教程挺多的,但是加载自己的数据进行训练的挺少的,我这里分享一下我的实现:
数据来自于kaggle上面情感分析的数据,地址为:
https://www.kaggle.com/lava18/google-play-store-apps?select=googleplaystore_user_reviews.csv
import transformers from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup import torch import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix, classification_report from collections import defaultdict from textwrap import wrap from torch import nn, optim from torch.utils.data import Dataset, DataLoader import torch.nn.functional as F RANDOM_SEED = 42 np.random.seed(RANDOM_SEED) torch.manual_seed(RANDOM_SEED) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
df = pd.read_csv("archive/googleplaystore_user_reviews.csv")
df=df.dropna()
def to_sentiment(rating):
if rating == 'Positive':
return 2
elif rating == 'Neutral':
return 1
return 0
df['sentiment'] = df.Sentiment.apply(to_sentiment)
class_names=["Negative","Neutral","Positive"]
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)
class GPReviewDataset(Dataset): def __init__(self, reviews, targets, tokenizer, max_len): self.reviews = reviews self.targets = targets self.tokenizer = tokenizer self.max_len = max_len def __len__(self): return len(self.reviews) def __getitem__(self, item): review = str(self.reviews[item]) target = self.targets[item] encoding = self.tokenizer.encode_plus( review, add_special_tokens=True, max_length=self.max_len, return_token_type_ids=False, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt', ) # print(target) return { 'review_text': review, 'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'targets': torch.tensor(target, dtype=torch.long) } def create_data_loader(df, tokenizer, max_len, batch_size): ds = GPReviewDataset( reviews=df.Translated_Review.to_numpy(), targets=df.sentiment.to_numpy(), tokenizer=tokenizer, max_len=max_len ) return DataLoader( ds, batch_size=batch_size, num_workers=4 ) BATCH_SIZE = 16 MAX_LEN = 160 PRE_TRAINED_MODEL_NAME = 'bert-base-uncased' tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME) train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE) val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE) test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.3)
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
def forward(self, input_ids, attention_mask):
_, pooled_output = self.bert(
input_ids=input_ids,
attention_mask=attention_mask
)
output = self.drop(pooled_output)
return self.out(output)
model = SentimentClassifier(len(class_names)) model = model.to(device) EPOCHS = 10 optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False) total_steps = len(train_data_loader) * EPOCHS scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps ) loss_fn = nn.CrossEntropyLoss().to(device)
训练函数
def train_epoch( model, data_loader, loss_fn, optimizer, device, scheduler, n_examples ): model = model.train() losses = [] correct_predictions = 0 for d in data_loader: input_ids = d["input_ids"].to(device) attention_mask = d["attention_mask"].to(device) targets = d["targets"].to(device) outputs = model( input_ids=input_ids, attention_mask=attention_mask ) _, preds = torch.max(outputs, dim=1) loss = loss_fn(outputs, targets) correct_predictions += torch.sum(preds == targets) losses.append(loss.item()) loss.backward() nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() scheduler.step() optimizer.zero_grad() return correct_predictions.double() / n_examples, np.mean(losses)
验证函数:
def eval_model(model, data_loader, loss_fn, device, n_examples): model = model.eval() losses = [] correct_predictions = 0 with torch.no_grad(): for d in data_loader: input_ids = d["input_ids"].to(device) attention_mask = d["attention_mask"].to(device) targets = d["targets"].to(device) outputs = model( input_ids=input_ids, attention_mask=attention_mask ) _, preds = torch.max(outputs, dim=1) loss = loss_fn(outputs, targets) correct_predictions += torch.sum(preds == targets) losses.append(loss.item()) return correct_predictions.double() / n_examples, np.mean(losses)
调用他们来进行训练:
history = defaultdict(list) best_accuracy = 0 for epoch in range(EPOCHS): print(f'Epoch {epoch + 1}/{EPOCHS}') print('-' * 10) train_acc, train_loss = train_epoch( model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train) ) print(f'Train loss {train_loss} accuracy {train_acc}') val_acc, val_loss = eval_model( model, val_data_loader, loss_fn, device, len(df_val) ) print(f'Val loss {val_loss} accuracy {val_acc}') print() history['train_acc'].append(train_acc) history['train_loss'].append(train_loss) history['val_acc'].append(val_acc) history['val_loss'].append(val_loss) if val_acc > best_accuracy: torch.save(model.state_dict(), 'best_model_state.bin') best_accuracy = val_acc
结束,虽然预测只需要加载模型,然后把输入变成id就行了。但是训练过程还是比预测过程麻烦很多,我这里只提供了一种基础方法,还有很多需要完善的地方,欢迎改进。
[1].Google Play Store Apps. https://www.kaggle.com/lava18/google-play-store-apps?select=googleplaystore_user_reviews.csv
[2].Sentiment Analysis with BERT and Transformers by Hugging Face using PyTorch and Python. https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
[3].Sentiment Analysis with BERT. https://github.com/curiousily/Getting-Things-Done-with-Pytorch/blob/master/08.sentiment-analysis-with-bert.ipynb
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。