
[BERT] Multi-Label Text Classification


1. Algorithm Overview

1.1 References

This walkthrough follows the paper "How to Fine-Tune BERT for Text Classification?" from Prof. Xipeng Qiu's group at Fudan University.

Paper: https://arxiv.org/pdf/1905.05583.pdf

Related write-up (in Chinese): https://mp.weixin.qq.com/s/9MrgIz2bchiCjUGpz6MbGQ

1.2 Approach of the Paper

The paper explores different BERT fine-tuning methods for text classification and aims to provide a general-purpose BERT fine-tuning recipe. It investigates three directions:

  • (1) Fine-tuning strategies for BERT itself, including long-text handling, the learning rate, and the choice of layers (see the sketch after this list);

  • (2) Further pre-training of BERT within the target task, within the domain, and across domains;

  • (3) Multi-task learning. The fine-tuned BERT achieves state-of-the-art results on seven English datasets and the Sogou Chinese dataset.
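
One concrete example of strategy (1) is layer-wise learning-rate decay, where lower BERT layers receive a smaller learning rate than higher ones, on the intuition that lower layers encode more general features that should change less during fine-tuning. The snippet below is a minimal sketch of that idea, not the paper's exact implementation; the decay factor 0.95, the base learning rate, and the use of bert-base-uncased are assumptions for illustration.

import torch
from transformers import BertModel

# Sketch of layer-wise learning-rate decay: lower encoder layers get smaller learning rates
model = BertModel.from_pretrained("bert-base-uncased")
base_lr, decay = 2e-5, 0.95  # assumed values for illustration

param_groups = []
num_layers = model.config.num_hidden_layers  # 12 for bert-base
for i, layer in enumerate(model.encoder.layer):
    # layer 0 (closest to the embeddings) gets the most strongly decayed rate
    lr = base_lr * (decay ** (num_layers - 1 - i))
    param_groups.append({"params": layer.parameters(), "lr": lr})
param_groups.append({"params": model.embeddings.parameters(),
                     "lr": base_lr * (decay ** num_layers)})
param_groups.append({"params": model.pooler.parameters(), "lr": base_lr})

optimizer = torch.optim.AdamW(param_groups, lr=base_lr)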

1.3 Code and Data Sources

Authors' implementation: https://github.com/xuyige/BERT4doc-Classification

Dataset: https://www.kaggle.com/shivanandmn/multilabel-classification-dataset?select=train.csv

Project page: https://www.kaggle.com/shivanandmn/multilabel-classification-dataset

The dataset has six labels (Computer Science, Physics, Mathematics, Statistics, Biology, Finance), and the task is to classify research papers based on their title and abstract. A value of 1 in a label column means the paper belongs to that label, and a single paper can have several labels set to 1, which makes this a multi-label problem.
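
As a quick look at the multi-hot label format, the sketch below loads the Kaggle train.csv and prints the per-label counts and one example row. It assumes the file has been downloaded to data/train.csv and mirrors the column handling used later in the post.

import pandas as pd

# Assumes the Kaggle train.csv has been downloaded to data/train.csv
df = pd.read_csv("data/train.csv")
df["TEXT"] = df["TITLE"] + df["ABSTRACT"]     # same concatenation used later in the post
label_columns = df.columns.tolist()[3:-1]     # the six label columns

print(df[label_columns].sum())                # number of papers carrying each label
print(df.loc[0, label_columns])               # one paper's multi-hot label vector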

2. Code Walkthrough

2.1 Import

# 2.1 Import
# For installing torch, see https://blog.csdn.net/Checkmate9949/article/details/119494673?spm=1001.2014.3001.5501
import torch
from transformers import BertTokenizerFast as BertTokenizer
from utils.plot_results import plot_results
from resources.train_val_model import train_model
from resources.get_data import get_data
from resources.build_model import BertClassifier
from resources.test_model import test_model
from resources.build_dataloader import build_dataloader

2.2 Get Data: Split the Samples

# 2.2 Get data
##################################
# get data
##################################
# get_data() is defined below
train_df, val_df, test_df = get_data()

# fixed parameters
# label columns: from the 4th column up to (but not including) the last column (TEXT)
label_columns = train_df.columns.tolist()[3:-1]
num_labels = len(label_columns)
max_token_len = 30

# BERT_MODEL_NAME = "bert-base-uncased"
# bert-base-uncased is for English; use bert-base-chinese for Chinese text
BERT_MODEL_NAME = "model/bert-base-uncased"

# tokenizer
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
# resources/get_data.py
import pandas as pd
from sklearn.model_selection import train_test_split

def get_data():
    df = pd.read_csv("data/train.csv")
    # Concatenate title and abstract into a single TEXT field
    df["TEXT"] = df["TITLE"] + df["ABSTRACT"]
    label_columns = df.columns.tolist()[3:-1]
    print(df[label_columns].sum().sort_values())
    # Split off the training set: it gets 80% of the data (the 20% slice is returned first)
    test_df, train_df = train_test_split(df, test_size=0.8, random_state=42)
    # Split the remaining 20% equally into validation and test sets (10% each)
    test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
    # Return the three splits
    return train_df, val_df, test_df
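
Note that train_test_split with test_size=0.8 returns the 20% slice first and the 80% slice second, which is why the assignment order above is test_df, train_df. The snippet below is a small sanity check on the resulting 80/10/10 proportions, not part of the original post.

# Verify the split proportions produced by get_data()
train_df, val_df, test_df = get_data()
n = len(train_df) + len(val_df) + len(test_df)
print(f"train {len(train_df)/n:.0%} | val {len(val_df)/n:.0%} | test {len(test_df)/n:.0%}")
# Expected output (approximately): train 80% | val 10% | test 10%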

2.3 Build data loaders 

##################################
# build data loaders
##################################
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training set uses a random sampler; validation and test use sequential samplers
train_dataloader = build_dataloader(
    train_df, label_columns, tokenizer, max_token_len, trainset=True
)
val_dataloader = build_dataloader(val_df, label_columns, tokenizer, max_token_len)
test_dataloader = build_dataloader(test_df, label_columns, tokenizer, max_token_len)
# resources/build_dataloader.py
import os
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

os.environ["TOKENIZERS_PARALLELISM"] = "false"

class text_dataset(Dataset):
    def __init__(self, df, label_columns, tokenizer, max_token_len):
        self.data = df
        self.label_columns = label_columns
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    # Number of samples in the dataset
    def __len__(self):
        return len(self.data)

    # Fetch one sample by index
    def __getitem__(self, index):
        data_row = self.data.iloc[index]
        text = data_row["TEXT"]
        labels = data_row[self.label_columns]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return dict(
            text=text,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.FloatTensor(labels.values.astype(float)),
        )

def build_dataloader(df, label_columns, tokenizer, max_token_len, trainset=False):
    dataset = text_dataset(df, label_columns, tokenizer, max_token_len)
    if trainset:
        # Draw samples in random order for training
        sampler = RandomSampler(dataset)
    else:
        # Draw samples sequentially for validation and test
        sampler = SequentialSampler(dataset)
    return DataLoader(dataset, batch_size=10, sampler=sampler)
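
To confirm the loaders emit what the model expects, it helps to pull a single batch and print the tensor shapes. This is a small usage sketch based on the batch_size=10 and max_token_len=30 settings above.

# Inspect one training batch
batch = next(iter(train_dataloader))
print(batch["input_ids"].shape)       # torch.Size([10, 30])
print(batch["attention_mask"].shape)  # torch.Size([10, 30])
print(batch["labels"].shape)          # torch.Size([10, 6]) for the six labels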

2.4 Build model

##################################
# build model
##################################
bert_classifier = BertClassifier(
    num_labels=num_labels, BERT_MODEL_NAME=BERT_MODEL_NAME, freeze_bert=False
)
# resources/build_model.py
import torch
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, num_labels: int, BERT_MODEL_NAME, freeze_bert=False):
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
        # Hidden size of BERT, hidden size of our classifier head, and number of labels
        D_in, H, D_out = self.bert.config.hidden_size, 50, num_labels
        # A feed-forward classifier head with one hidden layer
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(H, D_out),
        )
        # Multi-label loss: BCEWithLogitsLoss applies the sigmoid internally
        self.loss_func = nn.BCEWithLogitsLoss()
        if freeze_bert:
            print("freezing bert parameters")
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Use the last hidden state of the `[CLS]` token for classification
        last_hidden_state_cls = outputs[0][:, 0, :]
        logits = self.classifier(last_hidden_state_cls)
        if labels is not None:
            # Pass raw logits to BCEWithLogitsLoss (it applies the sigmoid itself)
            loss = self.loss_func(
                logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
            )
            return loss
        else:
            return logits
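
forward() has two paths: with labels it returns a scalar loss, without labels it returns raw logits. The dummy forward pass below illustrates the second path; the random token ids are purely for shape checking and assume bert_classifier, num_labels, and max_token_len from the steps above.

# Dummy forward pass to check output shapes (illustration only)
dummy_ids = torch.randint(0, 1000, (2, max_token_len))   # two fake token sequences
dummy_mask = torch.ones_like(dummy_ids)

logits = bert_classifier(dummy_ids, dummy_mask)           # no labels -> raw logits
print(logits.shape)                                       # torch.Size([2, num_labels])

probs = torch.sigmoid(logits)                             # per-label probabilities
preds = (probs > 0.5).int()                               # 0/1 prediction per label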

2.5 Train the Model and Select the Best Parameters via Validation

##################################
# train and validate model
##################################
trained_model, training_stats, train_loss_set = train_model(
    bert_classifier,
    train_dataloader,
    val_dataloader=val_dataloader,
    epochs=5,
    evaluation=True,
)
plot_results(training_stats, train_loss_set)
# resources/train_val_model.py
import time
import random
import numpy as np
import torch
from utils.helper_functions import format_time
from transformers import AdamW, get_linear_schedule_with_warmup


def train_model(
    model, train_dataloader, val_dataloader=None, epochs=5, evaluation=False
):
    """Train and validate the BertClassifier model."""
    training_stats = []
    train_loss_set = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer, scheduler = build_optimizer_scheduler(
        model=model, epochs=epochs, train_dataloader=train_dataloader
    )
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        print(
            f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
        )
        print("-" * 70)
        t0 = time.time()
        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids = batch["input_ids"].to(device)
            b_attention_mask = batch["attention_mask"].to(device)
            b_labels = batch["labels"].to(device)
            model.zero_grad()
            loss = model(
                input_ids=b_input_ids,
                attention_mask=b_attention_mask,
                labels=b_labels,
            )
            batch_loss += loss.item()
            total_loss += loss.item()
            train_loss_set.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch
                print(
                    f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}"
                )
                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)
        print("-" * 70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            avg_val_loss, avg_val_accuracy, validation_time = evaluate(
                model, val_dataloader
            )
            time_elapsed = time.time() - t0_epoch
            print(
                f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {avg_val_loss:^10.6f} | {avg_val_accuracy:^9.2f} | {time_elapsed:^9.2f}"
            )
            print("-" * 70)
            # Save the model whenever the validation loss improves
            if (
                len(training_stats) == 0
                or training_stats[-1]["Valid. Loss"] > avg_val_loss
            ):
                model_dir = "model/model.pt"
                torch.save(model.state_dict(), model_dir)
            training_stats.append(
                {
                    "epoch": epoch_i + 1,
                    "Training Loss": avg_train_loss,
                    "Valid. Loss": avg_val_loss,
                    "Valid. Accur.": avg_val_accuracy,
                    "Training Time": training_time,
                    "Validation Time": validation_time,
                }
            )
        print("\n")
    print("Training complete!")
    return model, training_stats, train_loss_set


def evaluate(model, val_dataloader):
    """After each training epoch, measure the model's performance on the validation set."""
    t0 = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Put the model into evaluation mode: dropout layers are disabled at test time
    model.eval()
    # Tracking variables
    avg_val_accuracy = []
    avg_val_loss = []
    # For each batch in our validation set...
    for batch in val_dataloader:
        b_input_ids = batch["input_ids"].to(device)
        b_attention_mask = batch["attention_mask"].to(device)
        b_labels = batch["labels"].to(device)
        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attention_mask)
        # Compute loss on raw logits (BCEWithLogitsLoss applies the sigmoid internally)
        loss = model.loss_func(
            logits.view(-1, model.num_labels), b_labels.view(-1, model.num_labels)
        )
        avg_val_loss.append(loss.item())
        # Get per-label 0/1 predictions via a 0.5 threshold
        predictions = torch.sigmoid(logits)
        preds = torch.round(predictions)
        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        avg_val_accuracy.append(accuracy)
    # Compute the average accuracy and loss over the validation set
    avg_val_loss = np.mean(avg_val_loss)
    avg_val_accuracy = np.mean(avg_val_accuracy)
    validation_time = format_time(time.time() - t0)
    return avg_val_loss, avg_val_accuracy, validation_time


def build_optimizer_scheduler(model, epochs, train_dataloader):
    # Custom optimization parameters for the huggingface model plus a linear warmup scheduler
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay_rate": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay_rate": 0.0,
        },
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=5e-5,  # Default learning rate
        eps=1e-8,  # Default epsilon value
    )
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value
        num_training_steps=total_steps,
    )
    return optimizer, scheduler
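
Since train_model returns training_stats as a list of per-epoch dictionaries, it can also be viewed as a table. A small usage sketch (the keys match those appended in the code above):

import pandas as pd

# Tabulate the per-epoch statistics collected during training
stats_df = pd.DataFrame(training_stats).set_index("epoch")
print(stats_df[["Training Loss", "Valid. Loss", "Valid. Accur."]])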

2.6 Test the Model

##################################
# test model
##################################
test_model(
    test_dataloader=test_dataloader,
    BERT_MODEL_NAME=BERT_MODEL_NAME,
    num_labels=num_labels,
    label_columns=label_columns,
)
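
The body of resources/test_model.py is not shown in the post. The sketch below is one plausible implementation, written to match the call above and the model/model.pt checkpoint saved during training; it is an assumption, not the original code.

import torch
from resources.build_model import BertClassifier

# Hypothetical test_model: rebuild the classifier, load the best checkpoint,
# and report per-label accuracy on the test set.
def test_model(test_dataloader, BERT_MODEL_NAME, num_labels, label_columns):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertClassifier(num_labels=num_labels, BERT_MODEL_NAME=BERT_MODEL_NAME)
    model.load_state_dict(torch.load("model/model.pt", map_location=device))
    model.to(device)
    model.eval()

    correct = torch.zeros(num_labels)
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            logits = model(input_ids, attention_mask)
            preds = (torch.sigmoid(logits) > 0.5).float()
            correct += (preds == labels).sum(dim=0).cpu()
            total += labels.size(0)

    # Per-label accuracy over the test set
    for name, acc in zip(label_columns, (correct / total).tolist()):
        print(f"{name}: {acc:.2%}")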
