
BERT + TextCNN + aclImdb Text Sentiment Analysis (Revised)

[Figure: BERT + TextCNN sentiment analysis model design diagram]

This version makes no major changes to the content; it simply removes the need to configure torchtext or to import d2l, which means any version of PyTorch can be used.

The only extra environment setup needed is pip install transformers.
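As a quick check that the environment is ready, the installed versions can be printed (a minimal sketch, assuming only torch and transformers have been installed):

import torch
import transformers

# Print the installed versions; any recent PyTorch plus transformers should work
print(torch.__version__, transformers.__version__)
# Confirm whether a GPU is visible
print(torch.cuda.is_available())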

Contents

Training the network

Imports

Loading BERT

Reading the data

Data preprocessing

Defining the network

Training the network

Saving the model

Testing with input text

Loading and testing the model


Training the network

Imports

import os
from tqdm import tqdm
import random
import time
import torch
from torch import nn
import torch.utils.data as Data
import torch.nn.functional as F
import sys
sys.path.append("..")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = "../../data"
print(torch.__version__, device)

Loading BERT

from transformers import BertModel, BertTokenizer
# Use the bert-base model; its vocabulary is lowercased
model_name = 'bert-base-uncased'
# Load the tokenizer that matches the model
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')
# Load the pretrained model
model = BertModel.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')

model.to(device)
model.eval()
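At this point a quick smoke test of the tokenizer and encoder can be useful (a sketch; the sample sentence is arbitrary):

with torch.no_grad():
    # Encode one sentence with [CLS]/[SEP] added, shape (1, seq_len)
    ids = torch.tensor([tokenizer.encode("this movie is great", add_special_tokens=True)], device=device)
    # The first element of the output is the last hidden state, shape (1, seq_len, 768)
    print(model(ids)[0].shape)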

Reading the data

def read_imdb(folder='train', data_root="/Datasets/aclImdb"):
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

train_data = read_imdb('train', data_root=os.path.join(DATA_ROOT, "aclImdb"))
test_data = read_imdb('test', data_root=os.path.join(DATA_ROOT, "aclImdb"))
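aclImdb ships with 25,000 labelled reviews in each split, so a quick size check is a reasonable sanity test (a sketch assuming the two calls above succeeded):

# Expect 25000 for each split, and a [review_text, label] pair per element
print(len(train_data), len(test_data))
print(train_data[0][1], train_data[0][0][:100])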

Data preprocessing

def pretreatment(original_data):
    i = 0
    for element in original_data:
        temporary = []
        # Tokenize and add the special [CLS]/[SEP] tokens
        original_data[i][0] = torch.tensor(tokenizer.encode(element[0], add_special_tokens=True))
        if (original_data[i][0].shape)[0] > 500:
            # Truncate to 500 tokens and put a [SEP] (token id 102) at the last position
            original_data[i][0] = original_data[i][0][:500]
            original_data[i][0][499] = 102
        elif (original_data[i][0].shape)[0] < 500:
            # Pad with zeros to length 500, again ending with [SEP]
            n = torch.zeros(500)
            n[: (original_data[i][0].shape)[0]-1] = original_data[i][0][:(original_data[i][0].shape)[0]-1]
            original_data[i][0] = n
            original_data[i][0][499] = 102
        temporary.append(element[1])
        original_data[i][1] = torch.tensor(temporary)
        i = i+1
    features = torch.cat([original_data[i][0].unsqueeze(0).long() for i in range(len(original_data))])
    labels = torch.cat([original_data[i][1] for i in range(len(original_data))], 0)
    return features, labels

train_set = Data.TensorDataset(*(pretreatment(train_data)))
test_set = Data.TensorDataset(*(pretreatment(test_data)))

batch_size = 2
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size, shuffle=True)

del(train_data)
del(test_data)
del(train_set)
del(test_set)
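Before moving on, it can be worth pulling one batch to confirm the shapes (a sketch assuming the DataLoaders above):

X, y = next(iter(train_iter))
# Expect X: (batch_size, 500) token ids, y: (batch_size,) labels
print(X.shape, X.dtype, y.shape)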

Defining the network

class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        # x shape: (batch_size, channel, seq_len)
        return F.max_pool1d(x, kernel_size=x.shape[2])  # shape: (batch_size, channel, 1)

class TextCNN(nn.Module):
    def __init__(self, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        # self.embedding = nn.Embedding(len(vocab), embed_size)
        # Embedding layer that does not take part in training
        # self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The max-over-time pooling layer has no weights, so one instance can be shared
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # create several one-dimensional convolution layers
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2*embed_size,
                                        out_channels=c,
                                        kernel_size=k))
    def forward(self, inputs):
        outputs = model(inputs)[0]  # BERT last hidden state, shape: (batch, seq_len, 768)
        embeddings = torch.cat((
            outputs,
            outputs), dim=2)  # (batch, seq_len, 2*embed_size)
        # Conv1d expects the channel (word-vector) dimension before the sequence dimension
        embeddings = embeddings.permute(0, 2, 1)
        # Each conv layer followed by max-over-time pooling yields a tensor of shape
        # (batch_size, channels, 1); drop the last dimension and concatenate along channels
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Apply dropout, then the fully connected layer to get the output
        outputs = self.decoder(self.dropout(encoding))
        return outputs

Training the network

embed_size, kernel_sizes, nums_channels = 768, [3, 4, 5], [100, 100, 100]
net = TextCNN(embed_size, kernel_sizes, nums_channels)
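A dummy forward pass is a cheap way to confirm the wiring before training starts (a sketch; the random token ids are only for a shape check):

net.to(device)
with torch.no_grad():
    # Random token ids with the same (batch, 500) shape as a real batch
    dummy = torch.randint(0, tokenizer.vocab_size, (2, 500), device=device)
    print(net(dummy).shape)  # expect torch.Size([2, 2])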
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # If no device is given, use the device of net's parameters
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()  # evaluation mode, which disables dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()  # switch back to training mode
            else:  # a custom model (not used after section 3.13 of the book); GPU not considered
                if('is_training' in net.__code__.co_varnames):  # if the model has an is_training argument
                    # call it with is_training=False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

lr, num_epochs = 0.001, 1
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

Saving the model

PATH = "./SaveModel/BertBaseEpoch1.pth"
# Note: the ./SaveModel directory must already exist, or torch.save will fail
torch.save(net, PATH)
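Saving the whole module pickles the TextCNN object, so the class definition must be available again at load time (as in the loading section below). A common alternative, shown here only as a sketch (the _state.pth filename is just an example), is to save the parameters alone:

# Alternative: save only the parameters and rebuild the model before loading them
torch.save(net.state_dict(), "./SaveModel/BertBaseEpoch1_state.pth")
# net = TextCNN(embed_size, kernel_sizes, nums_channels)
# net.load_state_dict(torch.load("./SaveModel/BertBaseEpoch1_state.pth"))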

Testing with input text

def predict_sentiment(net, sentence):
    """sentence is an English review as a string"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True), device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'

print("Please enter an English sentence reviewing a movie:")
s = input()
# Should this be padded to 500 tokens as in training?
print(predict_sentiment(net, s))
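Regarding the question in the comment above: training used fixed-length inputs of 500 tokens ending in [SEP] (token id 102), so mirroring that layout at prediction time is a reasonable option. A minimal sketch of such a padded predictor (a hypothetical helper, not part of the original post):

def predict_sentiment_padded(net, sentence, max_len=500):
    """Pad or truncate the encoded sentence to max_len, mirroring pretreatment()."""
    device = list(net.parameters())[0].device
    ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True))
    if ids.shape[0] >= max_len:
        ids = ids[:max_len]
    else:
        padded = torch.zeros(max_len, dtype=torch.long)
        padded[:ids.shape[0]-1] = ids[:-1]
        ids = padded
    ids[max_len-1] = 102  # [SEP] at the last position, as in training
    label = torch.argmax(net(ids.view(1, -1).to(device)), dim=1)
    return 'positive' if label.item() == 1 else 'negative'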

Loading and testing the model

import torch
from torch import nn
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from transformers import BertModel, BertTokenizer
# Use the bert-base model; its vocabulary is lowercased
model_name = 'bert-base-uncased'
# Load the tokenizer that matches the model
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')
# Load the pretrained model
model = BertModel.from_pretrained(model_name, cache_dir='./transformers/bert-base-uncased/')

model.to(device)
model.eval()

class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        # x shape: (batch_size, channel, seq_len)
        return F.max_pool1d(x, kernel_size=x.shape[2])  # shape: (batch_size, channel, 1)

class TextCNN(nn.Module):
    def __init__(self, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        # self.embedding = nn.Embedding(len(vocab), embed_size)
        # Embedding layer that does not take part in training
        # self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The max-over-time pooling layer has no weights, so one instance can be shared
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # create several one-dimensional convolution layers
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2*embed_size,
                                        out_channels=c,
                                        kernel_size=k))
    def forward(self, inputs):
        outputs = model(inputs)[0]  # BERT last hidden state, shape: (batch, seq_len, 768)
        embeddings = torch.cat((
            outputs,
            outputs), dim=2)  # (batch, seq_len, 2*embed_size)
        # Conv1d expects the channel (word-vector) dimension before the sequence dimension
        embeddings = embeddings.permute(0, 2, 1)
        # Each conv layer followed by max-over-time pooling yields a tensor of shape
        # (batch_size, channels, 1); drop the last dimension and concatenate along channels
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Apply dropout, then the fully connected layer to get the output
        outputs = self.decoder(self.dropout(encoding))
        return outputs

PATH = "./SaveModel/BertBaseEpoch1.pth"
# Loading a whole pickled module needs the TextCNN class defined above;
# on newer PyTorch versions you may also need torch.load(PATH, weights_only=False)
net = torch.load(PATH)
net.to(device)

def predict_sentiment(net, sentence):
    """sentence is an English review as a string"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True), device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'

print("Please enter an English sentence reviewing a movie:")
s = input()
# Should this be padded to 500 tokens as in training?
print(predict_sentiment(net, s))
