Pytorch:循环神经网络-GRU

Pytorch: GRU网络进行情感分类

Copyright: Jingmin Wei, Pattern Recognition and Intelligent Systems, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology



详细的 GRU 结构可以参考教程的上上篇文章

本文主要是采用门控循环单元网络 GRU 来进行情感分类,大家也可以尝试把模型改成上篇教程 LSTM 对比两种网络的效果。

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import time
import copy

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchtext import data
from torchtext.vocab import Vectors
# Tokenise text by splitting on whitespace.
def mytokenize(x):
    """Return the whitespace-separated tokens of the string ``x``."""
    return x.split()


# TEXT holds token sequences: batch-first, padded/truncated to 200 tokens,
# and (because include_lengths is set) returned as a (tensor, lengths) pair.
TEXT = data.Field(sequential = True, tokenize = mytokenize,
                  include_lengths = True, use_vocab = True,
                  batch_first = True, fix_length = 200)
# LABEL holds one value per example and is used without a vocabulary.
LABEL = data.Field(sequential = False, use_vocab = False,
                   pad_token = None, unk_token = None)
# Associate the dataset columns with the fields that process them.
train_test_fields = [('text', TEXT), ('label', LABEL)]
# Read the data.
traindata, testdata = data.TabularDataset.splits(
    path = './data/aclImdb', format = 'csv',
    train = 'imdb_train.csv', fields = train_test_fields,
    test = 'imdb_test.csv', skip_header = True, # skip the first line of each file
)
# Load the pre-trained word-vector file through Vectors.
vec = Vectors('glove.6B.100d.txt', './data/aclImdb')
# Build the vocabulary from the training set and attach the pre-trained embeddings.
TEXT.build_vocab(traindata, max_size = 20000, vectors = vec)
# Wrap the training and test sets in batch iterators.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): batch_size = 32 is an assumption; adjust to the available memory.
train_iter = data.BucketIterator(traindata, batch_size = 32,
                                 sort_key = lambda ex: len(ex.text),
                                 device = device)
# The test iterator keeps the file order (no shuffling, no sorting) so that
# per-example predictions can be matched against the rows of imdb_test.csv.
test_iter = data.Iterator(testdata, batch_size = 32, train = False,
                          sort = False, device = device)
print(TEXT.vocab.freqs.most_common(20)) # the 20 most frequent words
print(TEXT.vocab.itos[:10]) # the first words in vocabulary-index order
class GRUNet(nn.Module):
    """GRU text classifier: embedding -> GRU -> fully connected head.

    Args:
        vocab_size: number of entries in the vocabulary.
        embedding_dim: dimensionality of each word vector.
        hidden_dim: number of hidden units in the GRU.
        layer_dim: number of stacked GRU layers.
        output_dim: size of the output layer (number of classes).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim):
        super(GRUNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        # Map token ids to word vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # GRU + FC
        self.gru = nn.GRU(embedding_dim, hidden_dim, layer_dim, batch_first = True)
        self.fc1 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return class scores of shape (batch, output_dim) for token ids ``x``.

        ``x`` is expected to be a (batch, time_step) tensor of token indices,
        since the GRU is built with ``batch_first = True``.
        """
        embeds = self.embedding(x)
        # r_out shape (batch, time_step, hidden_size)
        # h_n shape (n_layers, batch, hidden_size)
        r_out, h_n = self.gru(embeds, None) # None: initial hidden state is zero
        # Classify from the output at the last time step.
        out = self.fc1(r_out[:, -1, :])
        return out
# Instantiate the recurrent network.
vocab_size = len(TEXT.vocab)
embedding_dim = vec.dim # dimensionality of the word vectors
hidden_dim = 128 # 128 hidden units
layer_dim = 1
output_dim = 2 # binary classification
# The training loop sends every batch and the loss to `device`, so the
# model's parameters have to live there as well.
mygru = GRUNet(vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim).to(device)
GRUNet(
  (embedding): Embedding(20002, 100)
  (gru): GRU(100, 128, batch_first=True)
  (fc1): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=2, bias=True)
  )
)
# Initialise the embedding layer from the pre-trained vectors attached to the
# vocabulary; without this copy the layer keeps its random initial weights.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
with torch.no_grad():
    mygru.embedding.weight.copy_(TEXT.vocab.vectors)
    # Zero the vectors of the special tokens '<unk>' and '<pad>'.
    mygru.embedding.weight[UNK_IDX].zero_()
    mygru.embedding.weight[PAD_IDX].zero_()
# Training / evaluation loop for the network.
def train_model(model, traindataloader, testdataloader, criterion, optimizer, num_epochs = 25):
    """Train ``model`` and evaluate it on the test loader after every epoch.

    Args:
        model: network mapping a batch of token ids to class scores.
        traindataloader: iterable of batches exposing ``text`` (a sequence
            whose first item is the token-id tensor) and ``label``.
        testdataloader: iterable with the same batch interface, used for
            evaluation only.
        criterion: loss called as ``criterion(scores, target)``.
        optimizer: optimizer updating the model's parameters.
        num_epochs: number of passes over the training data.

    Returns:
        ``(model, train_process)``. ``train_process`` is a DataFrame with one
        row per epoch and the columns ``epoch``, ``train_loss_all``,
        ``train_acc_all``, ``test_loss_all``, ``test_acc_all`` and
        ``learn_rate``. The model is left in evaluation mode.
    """
    train_loss_all = []
    train_acc_all = []
    test_loss_all = []
    test_acc_all = []
    learn_rate = []
    # Run on whichever device the model already lives on, so inputs and
    # parameters can never end up on different devices.
    device = next(model.parameters()).device
    # Step decay: every step_size epochs the learning rate is multiplied by gamma.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma = 0.1)
    for epoch in range(num_epochs):
        # Record the rate actually used by the optimizer during this epoch.
        learn_rate.append(optimizer.param_groups[0]['lr'])
        print('-' * 10)
        print('Epoch {}/{} Lr:{}'.format(epoch, num_epochs - 1, learn_rate[-1]))

        # Each epoch has two phases: training and evaluation.
        train_loss = 0.0
        train_corrects = 0
        train_num = 0
        test_loss = 0.0
        test_corrects = 0
        test_num = 0

        # Training phase.
        model.train() # enable dropout
        for step, batch in enumerate(traindataloader):
            textdata, target = batch.text[0], batch.label
            textdata, target = textdata.to(device), target.to(device)
            out = model(textdata)
            pre_lab = torch.argmax(out, 1) # predicted labels
            loss = criterion(out, target) # loss value
            optimizer.zero_grad() # clear stale gradients
            loss.backward() # back-propagate
            optimizer.step() # update the weights
            # criterion returns a batch mean; weight it by the batch size so
            # the epoch average is exact even when the last batch is smaller.
            train_loss += loss.item() * len(target)
            train_corrects += torch.sum(pre_lab == target).item()
            train_num += len(target)
        # Loss and accuracy on the training set for this epoch.
        train_loss_all.append(train_loss / train_num)
        train_acc_all.append(train_corrects / train_num)
        print('{} Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch, train_loss_all[-1], train_acc_all[-1]))
        scheduler.step() # update the learning rate

        # Evaluation phase: no dropout, no gradient tracking.
        model.eval()
        with torch.no_grad():
            for step, batch in enumerate(testdataloader):
                textdata, target = batch.text[0], batch.label
                textdata, target = textdata.to(device), target.to(device)
                out = model(textdata)
                pre_lab = torch.argmax(out, 1) # predicted labels
                loss = criterion(out, target) # loss value
                test_loss += loss.item() * len(target)
                test_corrects += torch.sum(pre_lab == target).item()
                test_num += len(target)
        # Loss and accuracy on the test set for this epoch.
        test_loss_all.append(test_loss / test_num)
        test_acc_all.append(test_corrects / test_num)
        print('{} Test Loss: {:.4f} Test Acc: {:.4f}'.format(epoch, test_loss_all[-1], test_acc_all[-1]))

    train_process = pd.DataFrame(
        data = {'epoch': range(num_epochs),
                'train_loss_all': train_loss_all,
                'train_acc_all': train_acc_all,
                'test_loss_all': test_loss_all,
                'test_acc_all': test_acc_all,
                'learn_rate': learn_rate})

    return model, train_process
# Optimizer: RMSprop over all parameters of the network.
optimizer = optim.RMSprop(params = mygru.parameters(), lr = 0.003)
# Cross-entropy loss, moved to the compute device.
loss_func = nn.CrossEntropyLoss()
loss_func = loss_func.to(device)
# Train for ten passes over the whole training set.
mygru, train_process = train_model(
    model = mygru,
    traindataloader = train_iter,
    testdataloader = test_iter,
    criterion = loss_func,
    optimizer = optimizer,
    num_epochs = 10,
)
Epoch 0/9 Lr:0.003
0 Train Loss: 0.5919 Train Acc: 0.6246
0 Test Loss: 0.3181 Test Acc: 0.8641
Epoch 1/9 Lr:0.003
1 Train Loss: 0.2620 Train Acc: 0.9001
1 Test Loss: 0.3017 Test Acc: 0.8706
Epoch 2/9 Lr:0.003
2 Train Loss: 0.1353 Train Acc: 0.9525
2 Test Loss: 0.4090 Test Acc: 0.8604
Epoch 3/9 Lr:0.003
3 Train Loss: 0.0543 Train Acc: 0.9828
3 Test Loss: 0.5829 Test Acc: 0.8574
Epoch 4/9 Lr:0.003
4 Train Loss: 0.0246 Train Acc: 0.9928
4 Test Loss: 0.9080 Test Acc: 0.8448
Epoch 5/9 Lr:3.0000000000000004e-05
5 Train Loss: 0.0060 Train Acc: 0.9984
5 Test Loss: 1.2189 Test Acc: 0.8474
Epoch 6/9 Lr:0.00030000000000000003
6 Train Loss: 0.0026 Train Acc: 0.9994
6 Test Loss: 1.6207 Test Acc: 0.8396
Epoch 7/9 Lr:0.00030000000000000003
7 Train Loss: 0.0011 Train Acc: 0.9997
7 Test Loss: 1.9210 Test Acc: 0.8419
Epoch 8/9 Lr:0.00030000000000000003
8 Train Loss: 0.0001 Train Acc: 1.0000
8 Test Loss: 2.5510 Test Acc: 0.8357
Epoch 9/9 Lr:0.00030000000000000003
9 Train Loss: 0.0000 Train Acc: 1.0000
9 Test Loss: 2.8857 Test Acc: 0.8424
# Visualise the training history: loss on the left, accuracy on the right.
plt.figure(figsize = (18, 6))

plt.subplot(1, 2, 1)
plt.plot(train_process.epoch, train_process.train_loss_all, 'r.-', label = 'Train loss')
plt.plot(train_process.epoch, train_process.test_loss_all, 'bs-', label = 'Test loss')
plt.legend() # the label= arguments are only shown once a legend is drawn
plt.xlabel('Epoch number', size = 13)
plt.ylabel('Loss value', size = 13)

plt.subplot(1, 2, 2)
plt.plot(train_process.epoch, train_process.train_acc_all, 'r.-', label = 'Train acc')
plt.plot(train_process.epoch, train_process.test_acc_all, 'bs-', label = 'Test acc')
plt.legend()
plt.xlabel('Epoch number', size = 13)
plt.ylabel('Acc', size = 13)

plt.show()

# Predict on the test set and compute the accuracy.
mygru.eval() # switch dropout off so predictions are deterministic
eval_device = next(mygru.parameters()).device # keep inputs with the model
# Seed both lists with an empty tensor so torch.cat also works when the
# iterator yields no batches.
label_parts = [torch.LongTensor().to(eval_device)]
pred_parts = [torch.LongTensor().to(eval_device)]
with torch.no_grad(): # inference only, no gradients needed
    for step, batch in enumerate(test_iter):
        textdata = batch.text[0].to(eval_device)
        target = batch.label.view(-1).to(eval_device)
        out = mygru(textdata)
        pred_parts.append(torch.argmax(out, 1))
        label_parts.append(target)
test_y_all = torch.cat(label_parts) # labels of the test set
pre_lab_all = torch.cat(pred_parts) # predicted labels of the test set
acc = accuracy_score(test_y_all.cpu(), pre_lab_all.cpu()) # sklearn needs CPU data
print('测试集的预测精度为:', acc)
tensor([1, 1, 1,  ..., 0, 1, 1], device='cuda:0')
tensor([1, 0, 1,  ..., 0, 1, 0], device='cuda:0')
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
# Load the raw test reviews and their labels.
test_cloud = pd.read_csv('./data/aclImdb/imdb_test.csv')
test_label = test_cloud['label']
test_text = test_cloud['text']
# Convert the sentences of the text column to an ndarray. to_numpy() keeps
# the strings as Python objects instead of building a fixed-width unicode
# array sized by the longest review.
test_text_pre = test_text.to_numpy()
# Tokenise every text and collect the token lists.
def split_word(datalist):
    """Tokenise each text in ``datalist`` with ``word_tokenize``.

    Args:
        datalist: iterable of strings.

    Returns:
        A 1-D object ndarray with one list of tokens per input text.
    """
    tokenized = [word_tokenize(text) for text in datalist]
    # Fill an object array element by element: np.array() on lists of
    # different lengths is rejected by recent NumPy versions, and on lists of
    # equal length it would silently build a 2-D array instead.
    datalist_pre = np.empty(len(tokenized), dtype = object)
    for idx, text_words in enumerate(tokenized):
        datalist_pre[idx] = text_words
    return datalist_pre
# Tokenise the test reviews.
test_word = split_word(datalist = test_text)
# Bring the network's predictions back to the CPU, then view them as an ndarray.
pre_lab_all = pre_lab_all.cpu()
pre_lab_all = pre_lab_all.numpy()
  • 1
array([1, 0, 1, ..., 0, 1, 0], dtype=int64)
  • 1
# One row per review: raw text, its token list, true label, predicted label.
test_columns = {}
test_columns['test_text'] = test_text
test_columns['test_word'] = test_word
test_columns['test_label'] = test_label
test_columns['pre_label'] = pre_lab_all
test_data = pd.DataFrame(data = test_columns)
  • 6
0went saw movie last night coaxed friends mine ...[went, saw, movie, last, night, coaxed, friend...11
1actor turned director bill paxton follows prom...[actor, turned, director, bill, paxton, follow...10
2recreational golfer knowledge sport history pl...[recreational, golfer, knowledge, sport, histo...11
3saw film sneak preview delightful cinematograp...[saw, film, sneak, preview, delightful, cinema...11
4bill paxton taken true story us golf open made...[bill, paxton, taken, true, story, us, golf, o...11
24995occasionally let kids watch garbage understand...[occasionally, let, kids, watch, garbage, unde...01
24996anymore pretty much reality tv shows people ma...[anymore, pretty, much, reality, tv, shows, pe...00
24997basic genre thriller intercut uncomfortable me...[basic, genre, thriller, intercut, uncomfortab...00
24998four things intrigued film firstly stars carly...[four, things, intrigued, film, firstly, stars...01
24999david bryce comments nearby exceptionally well...[david, bryce, comments, nearby, exceptionally...00

25000 rows × 4 columns

# Word clouds by ground-truth label: compare word frequencies of the two sentiments.
plt.figure(figsize = (16, 10))
for ii in np.unique(test_label):
    # Gather every word of the reviews carrying this label.
    text = np.array(test_data.test_word[test_data.test_label == ii])
    text = ' '.join(np.concatenate(text))
    plt.subplot(1, 2, ii + 1)
    # Build the word cloud.
    wordcod = WordCloud(margin = 5, width = 1800, height = 1000, max_words = 500, min_font_size = 5, background_color = 'white', max_font_size = 250)
    wordcod.generate_from_text(text)
    # Draw it; generating the cloud alone renders nothing on the axes.
    plt.imshow(wordcod)
    plt.axis('off')
    if ii == 1:
        plt.title('Label Positive')
    else:
        plt.title('Label Negative')
plt.subplots_adjust(wspace = 0.05)
plt.show()
# Word clouds by predicted label: compare word frequencies of the two sentiments.
plt.figure(figsize = (16, 10))
for ii in np.unique(test_label):
    # Gather every word of the reviews predicted as this class.
    text = np.array(test_data.test_word[test_data.pre_label == ii])
    text = ' '.join(np.concatenate(text))
    plt.subplot(1, 2, ii + 1)
    # Build the word cloud.
    wordcod = WordCloud(margin = 5, width = 1800, height = 1000, max_words = 500, min_font_size = 5, background_color = 'white', max_font_size = 250)
    wordcod.generate_from_text(text)
    # Draw it; generating the cloud alone renders nothing on the axes.
    plt.imshow(wordcod)
    plt.axis('off')
    if ii == 1:
        plt.title('Predicted Positive')
    else:
        plt.title('Predicted Negative')
plt.subplots_adjust(wspace = 0.05)
# Figure-level title: plt.title here would overwrite the last subplot's title.
plt.suptitle('Predicted Wordcloud')
plt.show()
# Combined figure: ground-truth clouds on the top row, predicted on the bottom.
plt.figure(figsize = (16, 10))
panel_titles = {
    (0, 1): 'Label Positive',
    (0, 0): 'Label Negative',
    (1, 1): 'Predicted Positive',
    (1, 0): 'Predicted Negative',
}
for i in range(2):
    for ii in np.unique(test_label):
        # Gather every word of the reviews of this class.
        if i == 0:
            # Top row: select by the true label.
            text = np.array(test_data.test_word[test_data.test_label == ii])
            plt.subplot(2, 2, ii + 1)
        else:
            # Bottom row: select by the predicted label.
            text = np.array(test_data.test_word[test_data.pre_label == ii])
            plt.subplot(2, 2, ii + 3)
        text = ' '.join(np.concatenate(text))
        # Build the word cloud.
        wordcod = WordCloud(margin = 5, width = 1800, height = 1000, max_words = 500, min_font_size = 5, background_color = 'white', max_font_size = 250)
        wordcod.generate_from_text(text)
        # Draw it; generating the cloud alone renders nothing on the axes.
        plt.imshow(wordcod)
        plt.axis('off')
        plt.title(panel_titles[(i, int(ii))])
plt.subplots_adjust(wspace = 0.05)
plt.show()
