In the previous post we built a classification model on the Toutiao news dataset and got good results. Today we build a text classifier on the simplifyweibo_4_moods dataset. Space is limited, so the complete code can be obtained at the end of the post.
I. The simplifyweibo_4_moods dataset
Over 360,000 Sina Weibo posts with sentiment labels, covering 4 emotions: about 200,000 posts labeled joy (喜悦) and about 50,000 each labeled anger (愤怒), disgust (厌恶), and depression (低落).
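To check the class balance yourself, a minimal sketch (assuming the CSV uses the label and review columns that the training code below relies on):

import pandas as pd

df = pd.read_csv('simplifyweibo_4_moods.csv')
print(len(df))                      # total number of posts
print(df['label'].value_counts())  # posts per class: 0=喜悦, 1=愤怒, 2=厌恶, 3=低落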
II. Code implementation
1. Download the pretrained model and dataset
bert-base-chinese: https://huggingface.co/bert-base-chinese
simplifyweibo_4_moods dataset: simplifyweibo_4_moods
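If the machine has internet access, there is no need to fetch the model files by hand: from_pretrained downloads and caches them on first use. A minimal sketch:

from transformers import BertModel, BertTokenizer

# The first call downloads the vocabulary and weights into the local Hugging Face cache
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')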
2. Tokenizer overview
First, a look at how the max_length, padding, and truncation parameters of the BERT tokenizer work.
(1) padding
Specifies the padding strategy; the options are listed below (a short sketch follows the list).
max_length: pad every sequence up to the length given by max_length (whether over-long sequences get cut down is controlled separately by the truncation parameter).
longest: pad every sequence in the batch up to the length of the longest one.
do_not_pad: do not pad at all (the default).
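A minimal sketch of the three strategies (the two sample sentences are made up for illustration):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
texts = ['今天天气真好', '生气了']

# Pad every sequence to exactly max_length tokens
out = tokenizer(texts, padding='max_length', max_length=16)
print([len(ids) for ids in out['input_ids']])  # [16, 16]

# Pad only up to the longest sequence in the batch
out = tokenizer(texts, padding='longest')
print([len(ids) for ids in out['input_ids']])  # e.g. [8, 8]

# No padding: every sequence keeps its own length
out = tokenizer(texts, padding='do_not_pad')
print([len(ids) for ids in out['input_ids']])  # e.g. [8, 5]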
(2) max_length
Specifies the maximum length of the tokenized sequence. If the input is longer than max_length, it is truncated (with truncation enabled) so the sequence does not exceed max_length; if it is shorter, it is padded (with padding='max_length') so that all sequences come out the same length.
(3) truncation
Specifies whether to truncate; the default is False. With truncation=True, an input longer than max_length is truncated. With truncation=False, over-long inputs are left uncut, which leads to an error once the sequence exceeds what the model can handle.
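Putting max_length and truncation together, a minimal sketch of how an over-long input is handled:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
long_text = '今天天气真好' * 10  # 60 characters, far more than 16 tokens

# truncation=True: the encoded sequence is cut down to max_length
out = tokenizer(long_text, max_length=16, truncation=True)
print(len(out['input_ids']))  # 16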
3. Finding the maximum length
# simplifyweibo_4_moods: cumulative distribution of text lengths
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('seaborn')
max_len = 16
x = range(max_len)
y = [0 for _ in range(max_len)]
df = pd.read_csv('simplifyweibo_4_moods.csv')
for line in df['review']:
    n = len(line.split(' '))
    if n < max_len:  # guard against texts longer than the plotted range
        y[n] += 1
for i in range(1, max_len):
    y[i] += y[i - 1]  # running total: y[i] = number of texts of length <= i
fig = plt.figure(figsize=(15, 9))
plt.bar(x, y, label='simplifyweibo_4_moods')
plt.legend(loc="upper left", fontsize=25)
plt.xlabel('Length', fontsize=25)
plt.show()
The plot shows that all texts fall within a length of 16, so we can set the maximum length to 16, which speeds up training.
4. Training code
import torch
import numpy as np
from transformers import BertTokenizer
import pandas as pd
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm

df = pd.read_csv('simplifyweibo_4_moods.csv')
np.random.seed(112)
# Shuffle and split into train/validation/test sets with an 80:10:10 ratio
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                                     [int(.8 * len(df)), int(.9 * len(df))])

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = np.array(df['label'])
        # Tokenize every review up front: pad/truncate to 16 tokens, return PyTorch tensors
        self.texts = [tokenizer(text,
                                padding='max_length',
                                max_length=16,
                                truncation=True,
                                return_tensors="pt")
                      for text in df['review']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

# Build the model
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 4)  # 4 emotion classes
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        # pooled_output is the [CLS] representation used for classification
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer


# Train the model
def train(model, train_data, val_data, learning_rate, epochs, batch_size):
    # Wrap the train and validation DataFrames in the Dataset class
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    # DataLoader batches the data; shuffle the samples during training
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size)
    # Use the GPU if one is available
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    # Training loop
    for epoch_num in range(epochs):
        # Accumulators for training accuracy and loss
        total_acc_train = 0
        total_loss_train = 0
        # tqdm draws a progress bar
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)
            # Forward pass
            output = model(input_id, mask)
            # Compute the loss
            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()
            # Compute the accuracy
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
            # Backward pass and parameter update
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
        # ------ Validation -----------
        # Accumulators for validation accuracy and loss
        total_acc_val = 0
        total_loss_val = 0
        # No gradients needed for validation
        with torch.no_grad():
            # Run the trained model over the validation set
            for val_input, val_label in val_dataloader:
                # Same device handling as in training
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)
                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item()
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        print(
            f'''Epochs: {epoch_num + 1}
            | Train Loss: {total_loss_train / len(train_data): .3f}
            | Train Accuracy: {total_acc_train / len(train_data): .3f}
            | Val Loss: {total_loss_val / len(val_data): .3f}
            | Val Accuracy: {total_acc_val / len(val_data): .3f}''')

EPOCHS = 10  # number of training epochs
model = BertClassifier()
LR = 1e-6  # learning rate
Batch_Size = 16  # set according to your GPU memory
train(model, df_train, df_val, LR, EPOCHS, Batch_Size)
torch.save(model.state_dict(), 'BERT-weibo.pt')

# Evaluate the model
def evaluate(model, test_data, batch_size):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    total_acc_test = 0
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

evaluate(model, df_test, Batch_Size)
5. Test code
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel

def get_label_string(label):
    # Map the predicted class index back to its emotion name
    labels = {'喜悦': 0,
              '愤怒': 1,
              '厌恶': 2,
              '低落': 3
              }
    for key, value in labels.items():
        if value == label:
            return key
    return None

# Build the model (same architecture as in training)
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 4)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

# Load the trained weights and switch to inference mode
model = BertClassifier()
model.load_state_dict(torch.load('BERT-weibo.pt'))
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
text = '元旦快乐!'
text_input = tokenizer(text, padding='max_length', max_length=16, truncation=True, return_tensors="pt")
mask = text_input['attention_mask']
input_id = text_input['input_ids']
output = model(input_id, mask)
output = output.argmax(dim=1)
output = output.item()
label_string = get_label_string(output)
print(label_string)
With that we have the training curves and test results, as well as the trained weights. If you like this, follow the official account and reply "BERT微博" to get the source code and the trained weight file.
Finally:
I will post related design content from time to time, including but not limited to: signal processing, communication simulation, algorithm design, MATLAB App Designer, GUI design, Simulink simulation... Hope it helps you!