赞
踩
本示例教程演示如何在IMDB数据集上使用RNN网络完成文本分类的任务。IMDB数据集包含对电影评论进行正向和负向标注的数据,共有25000条文本数据作为训练集,25000条文本数据作为测试集。数据集的官方地址为:IMDB Dataset
本示例基于飞桨开源框架2.0版本。
import paddle
import numpy as np
import matplotlib.pyplot as plt
import paddle.nn as nn
print(paddle.__version__)  # show the installed Paddle version
# Device selection: run on the GPU when this Paddle build was compiled with
# CUDA support, otherwise fall back to the CPU so the script still starts on
# CPU-only installs. Replace the argument with an explicit device string to
# force a particular device.
device = paddle.set_device('gpu' if paddle.is_compiled_with_cuda() else 'cpu')
2.0.1
由于IMDB是NLP领域中常见的数据集,飞桨框架将其内置,路径为 `paddle.text.datasets.Imdb`。通过 `mode` 参数可以选择加载训练集或测试集。
# Build the built-in IMDB dataset once per split; the generator is consumed
# in order, so the 'train' split is created before the 'test' split.
print('loading dataset...')
train_dataset, test_dataset = (
    paddle.text.datasets.Imdb(mode=split) for split in ('train', 'test')
)
print('loading finished')
构建了训练集与测试集后,可以通过 `word_idx` 属性获取数据集的词表。
# Vocabulary of the training split (word -> integer id).
word_dict = train_dataset.word_idx
# Add a pad token to the dict for later padding of the sequences.
word_dict['<pad>'] = len(word_dict)


def _word_to_text(word):
    """Return a vocabulary key as ``str``, decoding it when it is ``bytes``."""
    return word if isinstance(word, str) else word.decode('ASCII')


# Preview the first and last five entries. Both previews go through the same
# helper because keys may be bytes or str (the '<pad>' key added above is a
# str), so an unconditional .decode() would fail on str keys.
_vocab_keys = list(word_dict)
for k in _vocab_keys[:5]:
    print("{}:{}".format(_word_to_text(k), word_dict[k]))
print("...")
for k in _vocab_keys[-5:]:
    print("{}:{}".format(_word_to_text(k), word_dict[k]))
print("totally {} words".format(len(word_dict)))
在这里设置词表大小、embedding大小、batch_size等参数。
# Hyper-parameters and shared constants.
vocab_size = len(word_dict) + 1  # one more than the number of entries in word_dict
print(vocab_size)
emb_size = 256
seq_len = 200
batch_size = 32
epochs = 2
pad_id = word_dict['<pad>']

classes = ['negative', 'positive']


def ids_to_str(ids):
    """Turn a sequence of word ids into a space-separated sentence.

    Each id is used as a position in the key order of ``word_dict``; keys
    that are ``bytes`` are decoded as ASCII, ``str`` keys are used as-is.

    Args:
        ids: iterable of integer word ids.

    Returns:
        The looked-up words joined by single spaces.
    """
    # Materialise the key list once per call instead of once per id: the
    # vocabulary is large and rebuilding it for every word is needlessly slow.
    vocab = list(word_dict)
    words = []
    for k in ids:
        w = vocab[k]
        words.append(w if isinstance(w, str) else w.decode('ASCII'))
    return " ".join(words)
文本数据中,每一句话的长度都是不一样的,为了方便后续的神经网络的计算,通常使用padding的方式对齐数据。
def create_padded_dataset(dataset):
    """Bring every sentence of *dataset* to exactly ``seq_len`` word ids.

    Sentences longer than ``seq_len`` are truncated; shorter ones are
    right-padded with ``pad_id``. Each sample is read as ``(sentence, label)``.

    Args:
        dataset: iterable of samples whose first item is the id sequence and
            whose second item is the label.

    Returns:
        A pair ``(sentences, labels)`` of NumPy arrays; each sentence row is
        stored as int32.
    """
    padded_sents, labels = [], []
    for sample in dataset:
        sent, label = sample[0], sample[1]
        kept = sent[:seq_len]
        # A negative count yields an empty list, so long sentences get no filler.
        filler = [pad_id] * (seq_len - len(sent))
        padded_sents.append(np.concatenate([kept, filler]).astype('int32'))
        labels.append(label)
    return np.array(padded_sents), np.array(labels)


# Pad both splits.
train_sents, train_labels = create_padded_dataset(train_dataset)
test_sents, test_labels = create_padded_dataset(test_dataset)

# Show the array shapes, then a few decoded training sentences.
for arr in (train_sents, train_labels, test_sents, test_labels):
    print(arr.shape)

for sent in train_sents[:3]:
    print(ids_to_str(sent))
将前面准备好的训练集与测试集用 `Dataset` 与 `DataLoader` 封装后,完成数据的加载。
class IMDBDataset(paddle.io.Dataset):
    """Dataset wrapper pairing each padded sentence with its label.

    Args:
        sents: indexable collection of padded id sequences.
        labels: indexable collection of labels, aligned with ``sents``.
    """

    def __init__(self, sents, labels):
        self.sents = sents
        self.labels = labels

    def __getitem__(self, index):
        """Return the ``(sentence, label)`` pair stored at *index*."""
        data = self.sents[index]
        label = self.labels[index]
        return data, label

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sents)


train_dataset = IMDBDataset(train_sents, train_labels)
test_dataset = IMDBDataset(test_sents, test_labels)

# Training batches are shuffled, and the last partial batch is dropped so
# every training step sees a full batch.
train_loader = paddle.io.DataLoader(train_dataset, return_list=True,
                                    shuffle=True, batch_size=batch_size, drop_last=True)
# Evaluation must cover every test sample in a reproducible order, so the
# test loader neither shuffles nor drops the final partial batch.
test_loader = paddle.io.DataLoader(test_dataset, return_list=True,
                                   shuffle=False, batch_size=batch_size, drop_last=False)
本示例使用适合处理序列数据的RNN网络:先查找每个词对应的embedding,再将词向量序列送入两层SimpleRNN,把各层最终的隐藏状态拼接起来作为一个句子的表示。然后用Linear进行线性变换得到分类结果,同时使用Dropout防止过拟合。
class MyRNN(paddle.nn.Layer):
    """Sentiment classifier: embedding -> 2-layer forward SimpleRNN -> linear.

    The vocabulary size is read from the module-level ``vocab_size``; the
    classifier emits two logits per sentence.
    """

    def __init__(self):
        super().__init__()
        width = 256  # shared by the embedding vectors and the RNN hidden state
        self.embedding = nn.Embedding(vocab_size, width)
        self.rnn = nn.SimpleRNN(width, width, num_layers=2,
                                direction='forward', dropout=0.5)
        # forward() concatenates two final-state slices, hence 2 * width inputs.
        self.linear = nn.Linear(in_features=width * 2, out_features=2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        """Return the class logits for a batch of word-id sequences."""
        embedded = self.embedding(inputs)
        embedded = self.dropout(embedded)
        _, final_states = self.rnn(embedded)
        # Join the last two slices along the feature axis; with two forward
        # layers these are presumably the per-layer final states - TODO confirm.
        second_last = final_states[-2, :, :]
        last = final_states[-1, :, :]
        sentence_repr = paddle.concat((second_last, last), axis=1)
        sentence_repr = self.dropout(sentence_repr)
        return self.linear(sentence_repr)
def draw_process(title, color, iters, data, label):
    """Plot one curve (*data* against *iters*) and display the figure.

    Args:
        title: figure title.
        color: line colour passed to matplotlib.
        iters: x values (step numbers).
        data: y values recorded at those steps.
        label: used both as the y-axis label and the legend entry.
    """
    plt.title(title, fontsize=24)
    plt.xlabel("iter", fontsize=20)
    plt.ylabel(label, fontsize=20)
    plt.plot(iters, data, color=color, label=label)
    plt.legend()
    plt.grid()
    plt.show()


def _evaluate(model, loader):
    """Return ``(accuracy, loss)`` of *model* over *loader*, averaged per sample.

    The caller is responsible for switching the model to eval mode.
    """
    accuracies, losses, batch_sizes = [], [], []
    # No gradients are needed for evaluation; skipping them saves memory.
    with paddle.no_grad():
        for data in loader:
            sent, label = data[0], data[1]
            logits = model(sent)
            loss = paddle.nn.functional.cross_entropy(logits, label)
            acc = paddle.metric.accuracy(logits, label)
            accuracies.append(acc.numpy().item())
            losses.append(loss.numpy().item())
            batch_sizes.append(label.shape[0])
    # Weight each batch by its size so a smaller final batch does not skew
    # the result; with equal batch sizes this equals the plain mean.
    total = np.sum(batch_sizes)
    return np.dot(accuracies, batch_sizes) / total, np.dot(losses, batch_sizes) / total


def train(model):
    """Train *model* on ``train_loader`` for ``epochs`` epochs.

    After each epoch the model is evaluated on ``test_loader`` and its
    weights are saved to ``"<epoch>_model_final.pdparams"``. When training
    ends, the recorded loss and accuracy curves are plotted.

    Args:
        model: the network to optimise; it is updated in place.
    """
    model.train()
    opt = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters())
    steps = 0
    Iters, total_loss, total_acc = [], [], []

    # NOTE(review): the source formatting was flattened; running evaluation
    # and saving once per epoch is inferred from the "after one epoch" comment
    # and the epoch-numbered checkpoint name.
    for epoch in range(epochs):
        for batch_id, data in enumerate(train_loader):
            steps += 1
            sent = data[0]
            label = data[1]

            logits = model(sent)
            loss = paddle.nn.functional.cross_entropy(logits, label)
            acc = paddle.metric.accuracy(logits, label)

            if batch_id % 500 == 0:  # record and print once every 500 batches
                Iters.append(steps)
                # .item() works whether the value comes back as a one-element
                # array or as a 0-D array.
                total_loss.append(loss.numpy().item())
                total_acc.append(acc.numpy().item())
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, loss.numpy()))

            loss.backward()
            opt.step()
            opt.clear_grad()

        # Evaluate the model after each epoch.
        model.eval()
        avg_acc, avg_loss = _evaluate(model, test_loader)
        print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))
        model.train()

        # Save the weights reached at the end of this epoch.
        paddle.save(model.state_dict(), str(epoch) + "_model_final.pdparams")

    # Visualise the recorded training curves.
    draw_process("training loss", "red", Iters, total_loss, "training loss")
    draw_process("training acc", "green", Iters, total_acc, "training acc")


model = MyRNN()
train(model)
# Reload the weights saved after the last training epoch and evaluate them
# on the test loader.
model_state_dict = paddle.load('1_model_final.pdparams')
model = MyRNN()
model.set_state_dict(model_state_dict)
model.eval()

accuracies = []
losses = []
batch_sizes = []

# No gradients are needed for evaluation; skipping them saves memory.
with paddle.no_grad():
    for data in test_loader:
        sent = data[0]
        label = data[1]

        logits = model(sent)
        loss = paddle.nn.functional.cross_entropy(logits, label)
        acc = paddle.metric.accuracy(logits, label)

        # .item() works for both one-element and 0-D result arrays.
        accuracies.append(acc.numpy().item())
        losses.append(loss.numpy().item())
        batch_sizes.append(label.shape[0])

# Weight each batch by its size so a smaller final batch does not skew the
# averages; with equal batch sizes this equals the plain mean.
num_samples = np.sum(batch_sizes)
avg_acc = np.dot(accuracies, batch_sizes) / num_samples
avg_loss = np.dot(losses, batch_sizes) / num_samples
print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))
def ids_to_str(ids):
    """Turn a sequence of word ids into a space-separated sentence.

    Each id is used as a position in the key order of ``word_dict``; keys
    that are ``bytes`` are decoded as UTF-8, ``str`` keys are used as-is.

    Args:
        ids: iterable of integer word ids.

    Returns:
        The looked-up words joined by single spaces.
    """
    # Build the key list once per call rather than once per id.
    vocab = list(word_dict)
    words = []
    for k in ids:
        w = vocab[k]
        words.append(w if isinstance(w, str) else w.decode('UTF-8'))
    return " ".join(words)


label_map = {0: "negative", 1: "positive"}

# Load the saved weights.
model_state_dict = paddle.load('1_model_final.pdparams')
model = MyRNN()
model.set_state_dict(model_state_dict)
model.eval()

# Demonstrate prediction on the first sample of the first test batch only.
with paddle.no_grad():
    for data in test_loader:
        sent = data[0]
        results = model(sent)

        # Pick the highest-scoring class per row and map it to its label.
        predictions = [label_map[int(idx)] for idx in np.argmax(results.numpy(), axis=1)]

        if predictions:
            print(' 数据: {} \n 情感: {}'.format(ids_to_str(sent[0].numpy()), predictions[0]))
        break
以上是使用RNN完成IMDB电影评论情感分析的示例。通过搭建RNN网络,对文本数据进行预处理、模型训练和评估,最终实现了对电影评论情感的分类。在实际应用中,可以根据需求调整网络结构和超参数,提高模型性能。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。