36 万多条带情感标注的新浪微博,包含 4 种情感,其中喜悦约 20 万条,愤怒、厌恶、低落各约 5 万条。





2. 概述Tokenizer

先介绍下BERT Tokenizer中的max_length、padding和truncation参数的工作原理。










3. 寻找最大长度

  # simplifyweibo_4_moods: cumulative distribution of review lengths, plotted to
  # help choose the tokenizer's max_length.
  import matplotlib.pyplot as plt
  import pandas as pd

  # The 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6 and the
  # old name was removed in 3.8, so use whichever name this installation provides.
  plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

  NUM_BINS = 16  # bins 0..15; the last bin also collects every longer review

  x = range(NUM_BINS)
  y = [0] * NUM_BINS
  df = pd.read_csv('simplifyweibo_4_moods.csv')
  text = df['review']
  for line in text:
      # Length is counted in space-separated pieces. Clamp to the last bin so a
      # review with NUM_BINS or more pieces is counted instead of raising IndexError.
      y[min(len(line.split(' ')), NUM_BINS - 1)] += 1
  # Turn the per-length counts into a running (cumulative) total.
  for i in range(1, NUM_BINS):
      y[i] += y[i - 1]
  fig = plt.figure(figsize=(15, 9))
  plt.bar(x, y, label='simplifyweibo_4_moods')
  plt.legend(loc="upper left", fontsize=25)
  plt.xlabel('Length', fontsize=25)
  plt.show()


4. 训练代码

  1. import torch
  2. import numpy as np
  3. from transformers import BertTokenizer
  4. import pandas as pd
  5. from torch import nn
  6. from transformers import BertModel
  7. from torch.optim import Adam
  8. from tqdm import tqdm
  df = pd.read_csv('simplifyweibo_4_moods.csv')
  np.random.seed(112)
  # Shuffle once with a fixed seed, then cut the rows 80% / 10% / 10% into
  # training, validation and test sets.
  # Positional slicing yields the same three pieces as np.split(frame, [a, b])
  # without routing through DataFrame.swapaxes, which pandas 2.1 deprecated.
  shuffled = df.sample(frac=1, random_state=42)
  train_end, val_end = int(.8 * len(df)), int(.9 * len(df))
  df_train = shuffled.iloc[:train_end]
  df_val = shuffled.iloc[train_end:val_end]
  df_test = shuffled.iloc[val_end:]
  tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
  class Dataset(torch.utils.data.Dataset):
      """Dataset pairing each review's tokenizer output with its label.

      Uses the module-level ``tokenizer``. Every review in the frame is
      tokenized when the dataset is constructed, padded or truncated to a
      length of 16.
      """

      def __init__(self, df):
          """Store ``df['label']`` as an array and tokenize ``df['review']``."""
          self.labels = np.array(df['label'])
          encoded = []
          for review in df['review']:
              encoded.append(
                  tokenizer(
                      review,
                      padding='max_length',
                      max_length=16,
                      truncation=True,
                      return_tensors="pt",
                  )
              )
          self.texts = encoded

      def classes(self):
          """Return the array holding every label."""
          return self.labels

      def __len__(self):
          """Return the number of samples."""
          return len(self.labels)

      def get_batch_labels(self, idx):
          """Return the label(s) selected by ``idx`` as a NumPy array."""
          selected = self.labels[idx]
          return np.array(selected)

      def get_batch_texts(self, idx):
          """Return the stored tokenizer output at ``idx``."""
          return self.texts[idx]

      def __getitem__(self, idx):
          """Return the ``(tokenizer output, label)`` pair at ``idx``."""
          return self.get_batch_texts(idx), self.get_batch_labels(idx)
  # Model definition
  class BertClassifier(nn.Module):
      """Pretrained Chinese BERT followed by dropout and a 4-way linear head.

      Args:
          dropout: Dropout probability applied to BERT's pooled output.
      """

      def __init__(self, dropout=0.5):
          super().__init__()
          # num_labels is kept in sync with the 4-way head defined below.
          self.bert = BertModel.from_pretrained('bert-base-chinese', num_labels=4)
          self.dropout = nn.Dropout(dropout)
          # Size the head from the loaded config rather than a hard-coded width.
          self.linear = nn.Linear(self.bert.config.hidden_size, 4)
          self.relu = nn.ReLU()

      def forward(self, input_id, mask):
          """Return 4 class scores for each input sequence.

          Args:
              input_id: Token ids passed to BERT as ``input_ids``
                  (presumably shaped (batch, seq_len) - confirm with caller).
              mask: Attention mask passed to BERT as ``attention_mask``.
          """
          # With return_dict=False BERT returns a tuple; only the pooled output
          # (second element) feeds the classification head.
          _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
          dropout_output = self.dropout(pooled_output)
          linear_output = self.linear(dropout_output)
          # NOTE(review): ReLU zeroes negative scores before the loss sees them.
          # It is kept so the scores match what the inference script computes.
          return self.relu(linear_output)
  # Model training
  def train(model, train_data, val_data, learning_rate, epochs, batch_size):
      """Fine-tune ``model`` and print loss and accuracy after every epoch.

      Args:
          model: Module called as ``model(input_id, mask)`` that returns one row
              of class scores per sample.
          train_data: DataFrame handed to ``Dataset`` to build the training set.
          val_data: DataFrame handed to ``Dataset`` to build the validation set.
          learning_rate: Learning rate for the Adam optimizer.
          epochs: Number of passes over the training set.
          batch_size: Number of samples per batch for both data loaders.

      The model is moved to the GPU when one is available. After each epoch's
      validation pass the model is in evaluation mode.
      """
      train_set, val_set = Dataset(train_data), Dataset(val_data)
      # Shuffle only the training samples.
      train_dataloader = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True)
      val_dataloader = torch.utils.data.DataLoader(val_set, batch_size)

      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

      criterion = nn.CrossEntropyLoss().to(device)
      optimizer = Adam(model.parameters(), lr=learning_rate)
      model = model.to(device)

      for epoch_num in range(epochs):
          total_acc_train = 0
          total_loss_train = 0.0

          # Dropout must be active while optimizing ...
          model.train()
          for train_input, train_label in tqdm(train_dataloader):
              train_label = train_label.to(device).long()
              # Each sample is stored with a leading dimension of size 1, so
              # drop it from the mask as well as from the ids.
              mask = train_input['attention_mask'].squeeze(1).to(device)
              input_id = train_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)
              batch_loss = criterion(output, train_label)
              # The criterion returns the batch mean. Weight it by the batch
              # size so that dividing by the sample count below yields the
              # true per-sample mean, even with a short final batch.
              total_loss_train += batch_loss.item() * train_label.size(0)
              total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

              model.zero_grad()
              batch_loss.backward()
              optimizer.step()

          # ------ validation ------
          total_acc_val = 0
          total_loss_val = 0.0

          # ... and switched off while measuring validation metrics.
          model.eval()
          with torch.no_grad():
              for val_input, val_label in val_dataloader:
                  val_label = val_label.to(device).long()
                  mask = val_input['attention_mask'].squeeze(1).to(device)
                  input_id = val_input['input_ids'].squeeze(1).to(device)

                  output = model(input_id, mask)
                  batch_loss = criterion(output, val_label)
                  total_loss_val += batch_loss.item() * val_label.size(0)
                  total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

          print(
              f'Epochs: {epoch_num + 1}\n'
              f'| Train Loss: {total_loss_train / len(train_data): .3f}\n'
              f'| Train Accuracy: {total_acc_train / len(train_data): .3f}\n'
              f'| Val Loss: {total_loss_val / len(val_data): .3f}\n'
              f'| Val Accuracy: {total_acc_val / len(val_data): .3f}'
          )
  # Hyperparameters for the training run.
  EPOCHS = 10  # number of training epochs
  LR = 1e-6  # learning rate
  Batch_Size = 16  # pick a value that fits the available GPU memory
  model = BertClassifier()  # the classifier defined above
  train(
      model=model,
      train_data=df_train,
      val_data=df_val,
      learning_rate=LR,
      epochs=EPOCHS,
      batch_size=Batch_Size,
  )
  # Persist only the weights; the inference script rebuilds the architecture.
  torch.save(model.state_dict(), 'BERT-weibo.pt')
  # Model evaluation
  def evaluate(model, test_data, batch_size):
      """Print the accuracy of ``model`` on ``test_data``.

      Args:
          model: Module called as ``model(input_id, mask)`` that returns one row
              of class scores per sample.
          test_data: DataFrame handed to ``Dataset`` to build the test set.
          batch_size: Number of samples per batch.

      The model is moved to the GPU when one is available and switched to
      evaluation mode.
      """
      test_set = Dataset(test_data)
      test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
      model = model.to(device)
      # Without eval() dropout stays active and randomly perturbs predictions.
      model.eval()

      total_acc_test = 0
      with torch.no_grad():
          for test_input, test_label in test_dataloader:
              test_label = test_label.to(device)
              # Each sample is stored with a leading dimension of size 1, so
              # drop it from the mask as well as from the ids.
              mask = test_input['attention_mask'].squeeze(1).to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)
              output = model(input_id, mask)
              total_acc_test += (output.argmax(dim=1) == test_label).sum().item()
      print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')


  evaluate(model, df_test, Batch_Size)

5. 测试代码

  1. import torch
  2. from transformers import BertTokenizer
  3. from torch import nn
  4. from transformers import BertModel
  def get_label_string(label):
      """Translate a class index into its emotion name.

      Returns the name whose index equals ``label``, or ``None`` when no
      emotion uses that index.
      """
      index_by_name = {
          '喜悦': 0,
          '愤怒': 1,
          '厌恶': 2,
          '低落': 3,
      }
      matches = (name for name, index in index_by_name.items() if index == label)
      return next(matches, None)
  # Model definition (must mirror the architecture used for training so the
  # saved weights load).
  class BertClassifier(nn.Module):
      """Pretrained Chinese BERT followed by dropout, a 4-way linear head and ReLU.

      Args:
          dropout: Dropout probability applied to BERT's pooled output.
      """

      def __init__(self, dropout=0.5):
          super().__init__()
          self.bert = BertModel.from_pretrained('bert-base-chinese', num_labels=4)
          self.dropout = nn.Dropout(dropout)
          self.linear = nn.Linear(768, 4)
          self.relu = nn.ReLU()

      def forward(self, input_id, mask):
          """Return 4 non-negative class scores for each input sequence."""
          # Only the pooled output (second tuple element) is used.
          _sequence_output, pooled = self.bert(
              input_ids=input_id,
              attention_mask=mask,
              return_dict=False,
          )
          scores = self.linear(self.dropout(pooled))
          return self.relu(scores)
  model = BertClassifier()
  # map_location lets a checkpoint that was saved from a GPU load on a
  # CPU-only machine.
  model.load_state_dict(torch.load('BERT-weibo.pt', map_location='cpu'))
  model.eval()  # disable dropout for prediction
  tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
  text = '元旦快乐!'
  # Same padding/truncation settings as used when the training data was encoded.
  text_input = tokenizer(text, padding='max_length', max_length=16, truncation=True, return_tensors="pt")
  mask = text_input['attention_mask']
  input_id = text_input['input_ids']
  # Inference only, so skip gradient tracking.
  with torch.no_grad():
      output = model(input_id, mask)
  output = output.argmax(dim=1)
  output = output.item()
  label_string = get_label_string(output)
  print(label_string)



