This post is based on the research paper "How to Fine-Tune BERT for Text Classification?" from Prof. Xipeng Qiu's group at Fudan University.
Paper: https://arxiv.org/pdf/1905.05583.pdf
https://mp.weixin.qq.com/s/9MrgIz2bchiCjUGpz6MbGQ
The paper explores different BERT fine-tuning methods for text classification and provides a general-purpose fine-tuning recipe. It investigates three directions:
(1) fine-tuning strategies for BERT itself, including long-text handling, learning rates, and which layers to use (a minimal sketch of layer-wise learning rates follows this list);
(2) further pre-training BERT on the target task, within the domain, and across domains;
(3) multi-task learning. The fine-tuned BERT achieved state-of-the-art results at the time on seven English datasets and on the Sogou Chinese dataset.
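One of the strategies in route (1), layer-wise (discriminative) learning rates, can be sketched in a few lines of PyTorch. This is a minimal illustration rather than the authors' implementation; the base learning rate 2e-5 and decay factor 0.95 are example values, not the paper's final settings.
- import torch
- from transformers import BertModel
-
- model = BertModel.from_pretrained("bert-base-uncased")
- base_lr, decay = 2e-5, 0.95  # example values only
-
- # the embedding layer sits at the bottom, followed by the 12 encoder layers
- layers = [model.embeddings] + list(model.encoder.layer)
- param_groups = []
- for i, layer in enumerate(layers):
-     # the closer a layer is to the output, the larger its learning rate
-     lr = base_lr * decay ** (len(layers) - 1 - i)
-     param_groups.append({"params": layer.parameters(), "lr": lr})
- # the pooler (and any task-specific head) trains at the full base learning rate
- param_groups.append({"params": model.pooler.parameters(), "lr": base_lr})
-
- optimizer = torch.optim.AdamW(param_groups, lr=base_lr)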
Authors' implementation: https://github.com/xuyige/BERT4doc-Classification
Dataset source: https://www.kaggle.com/shivanandmn/multilabel-classification-dataset?select=train.csv
Project page: https://www.kaggle.com/shivanandmn/multilabel-classification-dataset
The dataset has six labels (computer science, physics, mathematics, statistics, biology, finance), and the task is to classify research papers from their title and abstract. A value of 1 in a label column means the paper carries that label, and a single paper can carry several labels.
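To make the label layout concrete, the snippet below shows how the label columns relate to the CSV header and what a multi-hot target looks like for one paper; the exact column names are assumptions based on the Kaggle page, and the [3:-1] slice mirrors the one used in get_data() further down.
- import pandas as pd
-
- df = pd.read_csv("data/train.csv")
- # merge title and abstract into TEXT, appended as the last column (as get_data() does)
- df["TEXT"] = df["TITLE"] + df["ABSTRACT"]
-
- # the header is roughly: ID, TITLE, ABSTRACT, <six label columns>, TEXT,
- # so [3:-1] selects exactly the six label columns
- label_columns = df.columns.tolist()[3:-1]
- print(label_columns)                     # e.g. ['Computer Science', 'Physics', ...]
- print(df[label_columns].iloc[0].values)  # a multi-hot vector such as [1 0 0 0 0 0]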
- 2.1 Import
-
- # For installing torch, see https://blog.csdn.net/Checkmate9949/article/details/119494673?spm=1001.2014.3001.5501
- import torch
- from transformers import BertTokenizerFast as BertTokenizer
- from utils.plot_results import plot_results
- from resources.train_val_model import train_model
- from resources.get_data import get_data
- from resources.build_model import BertClassifier
- from resources.test_model import test_model
- from resources.build_dataloader import build_dataloader
- 2.2 Get data
-
- ##################################
- # get data
- ##################################
-
- # get_data() is defined in 2.2.1 below
- train_df, val_df, test_df = get_data()
-
- # fixed parameters
- # label columns: from the 4th column (index 3) up to, but not including, the last column (the merged TEXT)
- label_columns = train_df.columns.tolist()[3:-1]
-
- num_labels = len(label_columns)
- max_token_len = 30
-
-
- # BERT_MODEL_NAME = "bert-base-uncased"
- # bert-base-uncased is for English; use bert-base-chinese for Chinese text
- BERT_MODEL_NAME = "model/bert-base-uncased"
- # load the matching tokenizer
- tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
- 2.2.1 get_data
-
- import pandas as pd
- from sklearn.model_selection import train_test_split
-
- def get_data():
-     df = pd.read_csv("data/train.csv")
-     # merge title and abstract into a single TEXT column
-     df["TEXT"] = df["TITLE"] + df["ABSTRACT"]
-
-     label_columns = df.columns.tolist()[3:-1]
-     print(df[label_columns].sum().sort_values())
-     # split off the training set: with test_size=0.8 the second value returned
-     # holds 80% of the rows, which becomes the training set
-     test_df, train_df = train_test_split(df, test_size=0.8, random_state=42)
-     # split the remaining 20% evenly into validation and test sets (10% each)
-     test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
-     # return the three splits
-     return train_df, val_df, test_df
- ##################################
- # build data loaders
- ##################################
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # the training set uses a random sampler
- # validation and test sets use sequential samplers
- train_dataloader = build_dataloader(
- train_df, label_columns, tokenizer, max_token_len, trainset=True
- )
- val_dataloader = build_dataloader(val_df, label_columns, tokenizer, max_token_len)
- test_dataloader = build_dataloader(test_df, label_columns, tokenizer, max_token_len)
- build_dataloader
-
- import os
- import torch
- from torch.utils.data import Dataset, DataLoader
- from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-
- #df=train_df
-
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-
- class text_dataset(Dataset):
-     def __init__(self, df, label_columns, tokenizer, max_token_len):
-         self.data = df
-         self.label_columns = label_columns
-         self.tokenizer = tokenizer
-         self.max_token_len = max_token_len
-
-     # number of samples
-     def __len__(self):
-         return len(self.data)
-
-     # fetch and encode one sample by index
-     def __getitem__(self, index):
-         data_row = self.data.iloc[index]
-         text = data_row["TEXT"]
-         labels = data_row[self.label_columns]
-         encoding = self.tokenizer.encode_plus(
-             text,
-             add_special_tokens=True,
-             max_length=self.max_token_len,
-             return_token_type_ids=False,
-             padding="max_length",
-             truncation=True,
-             return_attention_mask=True,
-             return_tensors="pt",
-         )
-         return dict(
-             text=text,
-             input_ids=encoding["input_ids"].flatten(),
-             attention_mask=encoding["attention_mask"].flatten(),
-             labels=torch.FloatTensor(labels),
-         )
-     # e.g. dataset[3] returns one encoded sample as a dict
-
- def build_dataloader(df, label_columns, tokenizer, max_token_len, trainset=False):
-     dataset = text_dataset(df, label_columns, tokenizer, max_token_len)
-
-     # training data is drawn in random order
-     if trainset:
-         sampler = RandomSampler(dataset)
-     # validation/test data is drawn sequentially
-     else:
-         sampler = SequentialSampler(dataset)
-
-     return DataLoader(dataset, batch_size=10, sampler=sampler)
- ##################################
- # build model
- ##################################
-
- bert_classifier = BertClassifier(
- num_labels=num_labels, BERT_MODEL_NAME=BERT_MODEL_NAME, freeze_bert=False
- )
- import torch
- import torch.nn as nn
- from transformers import BertModel
-
- class BertClassifier(nn.Module):
- def __init__(self, num_labels: int, BERT_MODEL_NAME, freeze_bert=False):
- super().__init__()
- self.num_labels = num_labels
- self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
-
- # hidden size of BERT, hidden size of our classifier, and number of labels to classify
- D_in, H, D_out = self.bert.config.hidden_size, 50, num_labels
-
-         # A small two-layer feed-forward classification head on top of BERT
- self.classifier = nn.Sequential(
- nn.Dropout(p=0.3),
- nn.Linear(D_in, H),
- nn.ReLU(),
- nn.Dropout(p=0.3),
- nn.Linear(H, D_out),
- )
- # loss
- self.loss_func = nn.BCEWithLogitsLoss()
-
- if freeze_bert:
- print("freezing bert parameters")
- for param in self.bert.parameters():
- param.requires_grad = False
-
- def forward(self, input_ids, attention_mask, labels=None):
- outputs = self.bert(input_ids, attention_mask=attention_mask)
-
- # Extract the last hidden state of the token `[CLS]` for classification task
- last_hidden_state_cls = outputs[0][:, 0, :]
-
- logits = self.classifier(last_hidden_state_cls)
-
-         if labels is not None:
-             # BCEWithLogitsLoss applies the sigmoid internally, so pass the raw logits
-             loss = self.loss_func(
-                 logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
-             )
-             return loss
- else:
- return logits
- ##################################
- # train and validate model
- ##################################
-
- trained_model, training_stats, train_loss_set = train_model(
- bert_classifier,
- train_dataloader,
- val_dataloader=val_dataloader,
- epochs=5,
- evaluation=True,
- )
-
- plot_results(training_stats, train_loss_set)
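- # plot_results is imported from utils/plot_results.py and is not shown in the original post.
- # Below is a minimal sketch of what it might look like; it only assumes the training_stats /
- # train_loss_set structures returned by train_model, everything else is illustrative.
- import matplotlib.pyplot as plt
-
- def plot_results(training_stats, train_loss_set):
-     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
-
-     # per-batch training loss
-     ax1.plot(train_loss_set)
-     ax1.set_xlabel("Batch")
-     ax1.set_ylabel("Loss")
-     ax1.set_title("Training loss per batch")
-
-     # per-epoch training vs. validation loss
-     epochs = [s["epoch"] for s in training_stats]
-     ax2.plot(epochs, [s["Training Loss"] for s in training_stats], label="train")
-     ax2.plot(epochs, [s["Valid. Loss"] for s in training_stats], label="validation")
-     ax2.set_xlabel("Epoch")
-     ax2.set_ylabel("Loss")
-     ax2.legend()
-     ax2.set_title("Loss per epoch")
-
-     plt.tight_layout()
-     plt.show()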
- import time
- import random
- import numpy as np
- import torch
- from utils.helper_functions import format_time
- from transformers import AdamW, get_linear_schedule_with_warmup
-
-
- def train_model(
- model, train_dataloader, val_dataloader=None, epochs=5, evaluation=False
- ):
- """Train and validate the BertClassifier model."""
- training_stats = []
- train_loss_set = []
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model.to(device)
- optimizer, scheduler = build_optimizer_scheduler(
- model=model, epochs=epochs, train_dataloader=train_dataloader
- )
- print("Start training...\n")
- for epoch_i in range(epochs):
- # =======================================
- # Training
- # =======================================
- print(
- f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
- )
- print("-" * 70)
- t0 = time.time()
- t0_epoch, t0_batch = time.time(), time.time()
- total_loss, batch_loss, batch_counts = 0, 0, 0
-
- model.train()
-
- for step, batch in enumerate(train_dataloader):
- batch_counts += 1
- b_input_ids = batch["input_ids"].to(device)
- b_attention_mask = batch["attention_mask"].to(device)
- b_labels = batch["labels"].to(device)
- # b_attention_mask, b_labels = tuple(t.to(device) for t in batch)
-
- model.zero_grad()
-
- loss = model(
- input_ids=b_input_ids,
- attention_mask=b_attention_mask,
- labels=b_labels,
- )
- batch_loss += loss.item()
- total_loss += loss.item()
- train_loss_set.append(loss.item())
-
- loss.backward()
-
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
- optimizer.step()
- scheduler.step()
-
- # Print the loss values and time elapsed for every 20 batches
- if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
- time_elapsed = time.time() - t0_batch
- print(
- f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}"
- )
- # Reset batch tracking variables
- batch_loss, batch_counts = 0, 0
- t0_batch = time.time()
-
- # Calculate the average loss over the entire training data
- avg_train_loss = total_loss / len(train_dataloader)
- training_time = format_time(time.time() - t0)
-
- print("-" * 70)
-
- # =======================================
- # Evaluation
- # =======================================
-         if evaluation:
- avg_val_loss, avg_val_accuracy, validation_time = evaluate(
- model, val_dataloader
- )
- time_elapsed = time.time() - t0_epoch
- print(
- f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {avg_val_loss:^10.6f} | {avg_val_accuracy:^9.2f} | {time_elapsed:^9.2f}"
- )
- print("-" * 70)
-
-             # save a checkpoint whenever the validation loss improves
-             if (
-                 len(training_stats) == 0
-                 or training_stats[-1]["Valid. Loss"] > avg_val_loss
-             ):
-                 model_dir = "model/model.pt"
-                 torch.save(model.state_dict(), model_dir)
-
- training_stats.append(
- {
- "epoch": epoch_i + 1,
- "Training Loss": avg_train_loss,
- "Valid. Loss": avg_val_loss,
- "Valid. Accur.": avg_val_accuracy,
- "Training Time": training_time,
- "Validation Time": validation_time,
- }
- )
-
- print("\n")
- print("Training complete!")
- return model, training_stats, train_loss_set
-
-
- def evaluate(model, val_dataloader):
- """After the completion of each training epoch, measure the model's performance
- on our validation set.
- """
- # Put the model into the evaluation mode. The dropout layers are disabled during
- # the test time.
- t0 = time.time()
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model.to(device)
- model.eval()
-
- # Tracking variables
- avg_val_accuracy = []
- avg_val_loss = []
-
- # For each batch in our validation set...
- for batch in val_dataloader:
- b_input_ids = batch["input_ids"].to(device)
- b_attention_mask = batch["attention_mask"].to(device)
- b_labels = batch["labels"].to(device)
-
- # Compute logits
- with torch.no_grad():
- logits = model(b_input_ids, b_attention_mask)
-
-         # Compute loss (BCEWithLogitsLoss applies the sigmoid itself, so pass raw logits)
-         loss_func = model.loss_func
-         predictions = torch.sigmoid(logits)
-         loss = loss_func(
-             logits.view(-1, model.num_labels), b_labels.view(-1, model.num_labels)
-         )
-         avg_val_loss.append(loss.item())
-
- # Get the predictions
- preds = torch.round(predictions)
-
-         # Element-wise accuracy over all label positions in the batch
- accuracy = (preds == b_labels).cpu().numpy().mean() * 100
- avg_val_accuracy.append(accuracy)
-
- # Compute the average accuracy and loss over the validation set.
- avg_val_loss = np.mean(avg_val_loss)
- avg_val_accuracy = np.mean(avg_val_accuracy)
- validation_time = format_time(time.time() - t0)
- return avg_val_loss, avg_val_accuracy, validation_time
-
-
- def build_optimizer_scheduler(model, epochs, train_dataloader):
-
-     # Group parameters so weight decay is applied to everything except biases and
-     # LayerNorm weights, and build a linear warmup/decay scheduler further below.
-     param_optimizer = list(model.named_parameters())
-     no_decay = ["bias", "LayerNorm.weight"]
-     optimizer_grouped_parameters = [
-         {
-             "params": [
-                 p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
-             ],
-             "weight_decay": 0.01,
-         },
-         {
-             "params": [
-                 p for n, p in param_optimizer if any(nd in n for nd in no_decay)
-             ],
-             "weight_decay": 0.0,
-         },
-     ]
-
- optimizer = AdamW(
- optimizer_grouped_parameters,
- lr=5e-5, # Default learning rate
- eps=1e-8, # Default epsilon value
- )
-
- total_steps = len(train_dataloader) * epochs
- scheduler = get_linear_schedule_with_warmup(
- optimizer,
- num_warmup_steps=0, # Default value
- num_training_steps=total_steps,
- )
-
- return optimizer, scheduler
- ##################################
- # test model
- ##################################
-
- test_model(
- test_dataloader=test_dataloader,
- BERT_MODEL_NAME=BERT_MODEL_NAME,
- num_labels=num_labels,
- label_columns=label_columns,
- )
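- # test_model is imported from resources/test_model.py in the authors' repo and is not reproduced
- # in this post. The sketch below shows what it could look like; it relies only on the BertClassifier
- # defined above and on the "model/model.pt" checkpoint saved in train_model, and every other
- # detail is an assumption rather than the authors' implementation.
- import numpy as np
- import torch
- from resources.build_model import BertClassifier
-
- def test_model(test_dataloader, BERT_MODEL_NAME, num_labels, label_columns):
-     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-     # rebuild the classifier and load the best checkpoint saved during training
-     model = BertClassifier(num_labels=num_labels, BERT_MODEL_NAME=BERT_MODEL_NAME)
-     model.load_state_dict(torch.load("model/model.pt", map_location=device))
-     model.to(device)
-     model.eval()
-
-     all_preds, all_labels = [], []
-     for batch in test_dataloader:
-         with torch.no_grad():
-             logits = model(
-                 batch["input_ids"].to(device),
-                 attention_mask=batch["attention_mask"].to(device),
-             )
-         all_preds.append(torch.round(torch.sigmoid(logits)).cpu().numpy())
-         all_labels.append(batch["labels"].numpy())
-
-     preds = np.concatenate(all_preds)
-     labels = np.concatenate(all_labels)
-     # per-label accuracy over the whole test set
-     for i, name in enumerate(label_columns):
-         print(f"{name}: {(preds[:, i] == labels[:, i]).mean() * 100:.2f}%")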