
MindSpore 25-Day Learning Camp, Day 25 | RNN Sentiment Classification

Overview:

Sentiment classification is a classic task in natural language processing and a typical classification problem. This section uses MindSpore to implement an RNN-based sentiment classification model that achieves the following effect:

Input: This film is terrible
Ground-truth label: Negative
Predicted label: Negative

Input: This film is great
Ground-truth label: Positive
Predicted label: Positive

Walkthrough:

1. Imports

import os
import re
import math
import shutil
import string
import tarfile
import tempfile
import zipfile
from pathlib import Path
from typing import IO

import requests
import six
import numpy as np
from tqdm import tqdm

import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops
import mindspore.dataset as ds
from mindspore.common.initializer import Uniform, HeUniform

2. Data Download

# Save downloads under `home_path/.mindspore_examples`
cache_dir = Path.home() / '.mindspore_examples'

def http_get(url: str, temp_file: IO):
    """Download data with requests, visualizing progress with tqdm."""
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit='B', total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()

def download(file_name: str, url: str):
    """Download the data and cache it under the given file name."""
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache_path = os.path.join(cache_dir, file_name)
    cache_exist = os.path.exists(cache_path)
    if not cache_exist:
        with tempfile.NamedTemporaryFile() as temp_file:
            http_get(url, temp_file)
            temp_file.flush()
            temp_file.seek(0)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)
    return cache_path

imdb_path = download('aclImdb_v1.tar.gz', 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/aclImdb_v1.tar.gz')
imdb_path

3. Dataset Loading

class IMDBData():
    """IMDB dataset loader.

    Loads the IMDB dataset and exposes it as a Python iterable.
    """
    label_map = {
        "pos": 1,
        "neg": 0
    }

    def __init__(self, path, mode="train"):
        self.mode = mode
        self.path = path
        self.docs, self.labels = [], []
        self._load("pos")
        self._load("neg")

    def _load(self, label):
        pattern = re.compile(r"aclImdb/{}/{}/.*\.txt$".format(self.mode, label))
        # Load the matching archive members into memory
        with tarfile.open(self.path) as tarf:
            tf = tarf.next()
            while tf is not None:
                if bool(pattern.match(tf.name)):
                    # Tokenize: strip trailing newlines, drop punctuation, lowercase,
                    # then decode the raw bytes to str before splitting into tokens
                    self.docs.append(tarf.extractfile(tf).read().rstrip(six.b("\n\r"))
                                     .translate(None, six.b(string.punctuation)).lower()
                                     .decode(errors='ignore').split())
                    self.labels.append([self.label_map[label]])
                tf = tarf.next()

    def __getitem__(self, idx):
        return self.docs[idx], self.labels[idx]

    def __len__(self):
        return len(self.docs)

imdb_train = IMDBData(imdb_path, 'train')
len(imdb_train)

def load_imdb(imdb_path):
    imdb_train = ds.GeneratorDataset(IMDBData(imdb_path, "train"), column_names=["text", "label"], shuffle=True, num_samples=10000)
    imdb_test = ds.GeneratorDataset(IMDBData(imdb_path, "test"), column_names=["text", "label"], shuffle=False)
    return imdb_train, imdb_test

imdb_train, imdb_test = load_imdb(imdb_path)
imdb_train
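At this point each sample is still a raw token list paired with its 0/1 label. A quick peek at one sample (a hypothetical sanity check, not in the original post) confirms the column layout:

# Hypothetical sanity check: inspect the first raw sample
for text, label in imdb_train.create_tuple_iterator(output_numpy=True):
    print(text[:8], label)  # first few tokens and the 0/1 label
    break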

4. Loading Pre-trained Word Vectors

def load_glove(glove_path):
    glove_100d_path = os.path.join(cache_dir, 'glove.6B.100d.txt')
    if not os.path.exists(glove_100d_path):
        glove_zip = zipfile.ZipFile(glove_path)
        glove_zip.extractall(cache_dir)
    embeddings = []
    tokens = []
    with open(glove_100d_path, encoding='utf-8') as gf:
        for glove in gf:
            word, embedding = glove.split(maxsplit=1)
            tokens.append(word)
            embeddings.append(np.fromstring(embedding, dtype=np.float32, sep=' '))
    # Append embeddings for the two special placeholders <unk> and <pad>:
    # a random vector for <unk> and an all-zero vector for <pad>
    embeddings.append(np.random.rand(100))
    embeddings.append(np.zeros((100,), np.float32))
    vocab = ds.text.Vocab.from_list(tokens, special_tokens=["<unk>", "<pad>"], special_first=False)
    embeddings = np.array(embeddings).astype(np.float32)
    return vocab, embeddings

5. Downloading GloVe

glove_path = download('glove.6B.zip', 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/glove.6B.zip')
vocab, embeddings = load_glove(glove_path)
len(vocab.vocab())

idx = vocab.tokens_to_ids('the')
embedding = embeddings[idx]
idx, embedding
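Because special_first=False appends <unk> and <pad> after the GloVe words, their ids should be the last two rows of the embedding table. A small check (my assumption about the resulting layout, written as a hypothetical assertion):

# Hypothetical check: the special tokens occupy the last two embedding rows
unk_idx = vocab.tokens_to_ids('<unk>')
pad_idx = vocab.tokens_to_ids('<pad>')
assert embeddings.shape[0] == len(vocab.vocab())
assert (unk_idx, pad_idx) == (embeddings.shape[0] - 2, embeddings.shape[0] - 1)
np.testing.assert_array_equal(embeddings[pad_idx], np.zeros(100, np.float32))  # <pad> is all zeros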

6. Data Preprocessing

lookup_op = ds.text.Lookup(vocab, unknown_token='<unk>')
pad_op = ds.transforms.PadEnd([500], pad_value=vocab.tokens_to_ids('<pad>'))
type_cast_op = ds.transforms.TypeCast(ms.float32)

imdb_train = imdb_train.map(operations=[lookup_op, pad_op], input_columns=['text'])
imdb_train = imdb_train.map(operations=[type_cast_op], input_columns=['label'])
imdb_test = imdb_test.map(operations=[lookup_op, pad_op], input_columns=['text'])
imdb_test = imdb_test.map(operations=[type_cast_op], input_columns=['label'])

imdb_train, imdb_valid = imdb_train.split([0.7, 0.3])

imdb_train = imdb_train.batch(64, drop_remainder=True)
imdb_valid = imdb_valid.batch(64, drop_remainder=True)
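After these transforms, each batch should hold 500 padded token ids per review plus a float label. A quick shape check (hypothetical, assuming the batch size 64 and pad length 500 set above) makes that concrete:

# Hypothetical shape check: (batch, seq_len) ids and (batch, 1) labels
for text, label in imdb_train.create_tuple_iterator():
    print(text.shape, label.shape, label.dtype)  # expected: (64, 500) (64, 1) Float32
    break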

7. Model Construction

class RNN(nn.Cell):
    def __init__(self, embeddings, hidden_dim, output_dim, n_layers,
                 bidirectional, pad_idx):
        super().__init__()
        vocab_size, embedding_dim = embeddings.shape
        # Initialize the embedding table from the pre-trained GloVe vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim, embedding_table=ms.Tensor(embeddings), padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           batch_first=True)
        weight_init = HeUniform(math.sqrt(5))
        bias_init = Uniform(1 / math.sqrt(hidden_dim * 2))
        # hidden_dim * 2 because the forward and backward hidden states are concatenated
        self.fc = nn.Dense(hidden_dim * 2, output_dim, weight_init=weight_init, bias_init=bias_init)

    def construct(self, inputs):
        embedded = self.embedding(inputs)    # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        _, (hidden, _) = self.rnn(embedded)  # hidden: (num_layers * 2, batch, hidden_dim)
        # Concatenate the last layer's forward and backward final hidden states
        hidden = ops.concat((hidden[-2, :, :], hidden[-1, :, :]), axis=1)
        output = self.fc(hidden)             # (batch, output_dim) logits
        return output

8. Loss Function and Optimizer

hidden_size = 256
output_size = 1
num_layers = 2
bidirectional = True
lr = 0.001
pad_idx = vocab.tokens_to_ids('<pad>')

model = RNN(embeddings, hidden_size, output_size, num_layers, bidirectional, pad_idx)
loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
optimizer = nn.Adam(model.trainable_params(), learning_rate=lr)
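For reference, nn.BCEWithLogitsLoss fuses the sigmoid with binary cross-entropy, which is numerically more stable than applying the two separately. In standard notation (my addition, not from the original post), the mean-reduced loss is

$$\ell = -\frac{1}{N}\sum_{i=1}^{N}\Big[y_i\,\log\sigma(x_i) + (1 - y_i)\,\log\big(1 - \sigma(x_i)\big)\Big]$$

where $x_i$ is the logit produced by the final Dense layer, $y_i \in \{0, 1\}$ is the label, and $\sigma$ is the sigmoid function.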

9. Training Loop

def forward_fn(data, label):
    logits = model(data)
    loss = loss_fn(logits, label)
    return loss

# grad_position=None: differentiate w.r.t. the given weights rather than the inputs
grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters)

def train_step(data, label):
    loss, grads = grad_fn(data, label)
    optimizer(grads)
    return loss

def train_one_epoch(model, train_dataset, epoch=0):
    model.set_train()
    total = train_dataset.get_dataset_size()
    loss_total = 0
    step_total = 0
    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for i in train_dataset.create_tuple_iterator():
            loss = train_step(*i)
            loss_total += loss.asnumpy()
            step_total += 1
            t.set_postfix(loss=loss_total/step_total)
            t.update(1)

10. Evaluation Metrics

def binary_accuracy(preds, y):
    """Compute per-batch accuracy."""
    # Map logits to probabilities, then round to hard 0/1 predictions
    rounded_preds = np.around(ops.sigmoid(preds).asnumpy())
    correct = (rounded_preds == y).astype(np.float32)
    acc = correct.sum() / len(correct)
    return acc
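As a worked example with made-up logits: sigmoid maps them to probabilities, rounding maps those to hard 0/1 predictions, and the fraction of matches is the batch accuracy:

# Made-up logits: sigmoid -> [0.88, 0.27, 0.62], rounded -> [1, 0, 1]
preds = ms.Tensor([[2.0], [-1.0], [0.5]], ms.float32)
y = np.array([[1.0], [0.0], [0.0]], dtype=np.float32)
print(binary_accuracy(preds, y))  # 2 of 3 correct -> ~0.667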
def evaluate(model, test_dataset, criterion, epoch=0):
    total = test_dataset.get_dataset_size()
    epoch_loss = 0
    epoch_acc = 0
    step_total = 0
    model.set_train(False)
    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for i in test_dataset.create_tuple_iterator():
            predictions = model(i[0])
            loss = criterion(predictions, i[1])
            epoch_loss += loss.asnumpy()
            acc = binary_accuracy(predictions, i[1])
            epoch_acc += acc
            step_total += 1
            t.set_postfix(loss=epoch_loss/step_total, acc=epoch_acc/step_total)
            t.update(1)
    return epoch_loss / total

11. Model Training and Saving

num_epochs = 2
best_valid_loss = float('inf')
ckpt_file_name = os.path.join(cache_dir, 'sentiment-analysis.ckpt')

for epoch in range(num_epochs):
    train_one_epoch(model, imdb_train, epoch)
    valid_loss = evaluate(model, imdb_valid, loss_fn, epoch)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        ms.save_checkpoint(model, ckpt_file_name)
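The post stops at checkpointing, so here is a minimal inference sketch (my addition; predict_sentiment and score_map are names I introduce) that loads the best checkpoint and scores a sentence, reproducing the effect promised in the overview:

# Minimal inference sketch (hypothetical helper, not from the original post)
score_map = {1: "Positive", 0: "Negative"}

def predict_sentiment(model, vocab, sentence):
    model.set_train(False)
    tokenized = sentence.lower().split()
    indexed = vocab.tokens_to_ids(tokenized)
    tensor = ms.Tensor(indexed, ms.int32).expand_dims(0)  # add a batch dimension
    prediction = ops.sigmoid(model(tensor))
    return score_map[int(np.round(prediction.asnumpy()).item())]

param_dict = ms.load_checkpoint(ckpt_file_name)
ms.load_param_into_net(model, param_dict)
predict_sentiment(model, vocab, "This film is terrible")  # expected: 'Negative'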

This exercise deepened my understanding of RNN fundamentals such as unrolling across time steps, state propagation, and the vanishing/exploding gradient problems, and showed me how gating mechanisms (such as LSTM and GRU) mitigate them. MindSpore's API design is clear and intuitive, which made the move from theory to practice smooth: I was able to quickly apply these ideas to concrete models for tasks such as text generation, sentiment analysis, and time-series prediction.
