
AG_NEWS News Classification Task (with AG News dataset download)


About the news topic classification task: the code in current videos and online posts does not run end to end, so this article rewrites the whole pipeline from scratch, starting with downloading the dataset.

1. Dataset introduction

The AG_NEWS dataset contains 4 files; the ones used here are:

 

classes.txt: the class names

test.csv: test data, 7,600 rows

train.csv: training data, 120,000 rows
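Before writing the loader it helps to peek at the raw files. A minimal sketch, assuming the archive has been extracted to ./data/ag_news_csv/ (the same path used below):

import pandas as pd

# each row of train.csv / test.csv is: class index (1-4), title, description
df = pd.read_csv("./data/ag_news_csv/train.csv", header=None)
print(df.shape)    # expected: (120000, 3)
print(df.head(2))

# classes.txt lists one class name per line: World, Sports, Business, Sci/Tech
with open("./data/ag_news_csv/classes.txt") as f:
    print(f.read().splitlines())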

2. Preprocessing the dataset

Import the packages

import torch
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import time
from torch.utils.data.dataset import random_split  # used to split the training data randomly
import warnings
warnings.filterwarnings('ignore')

Read the CSV files

def load_data(csv_file):
    # header=None: the csv has no header row, so the first line is read as data
    df = pd.read_csv(csv_file, header=None)
    dataTmep = []
    # iterate row by row; _ is the row index, row is the content
    for _, row in df.iterrows():
        label = row[0]
        context = row[1] + row[2]  # concatenate title and description
        dataTmep.append((label, context))
    return dataTmep

cutlen = 64
train_dataset = load_data("./data/ag_news_csv/train.csv")
test_dataset = load_data("./data/ag_news_csv/test.csv")
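A quick sanity check (not part of the original code): the loader should produce 120,000 training records and 7,600 test records, each a (label, text) tuple.

print(len(train_dataset), len(test_dataset))  # expected: 120000 7600
print(train_dataset[0])                       # (label, title + description) of the first news item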

Wrap the loaded records into datasets the model can consume, build the vocabulary, and inspect the result

def process_datasets_by_Tokenizer(train_datasets, test_datasets, cutlen=cutlen):
    tokenizer = Tokenizer()

    train_datasets_texts = []
    train_datasets_labels = []
    test_datasets_texts = []
    test_datasets_labels = []

    # labels in the csv are 1-4, shift them to 0-3
    for index in range(len(train_datasets)):
        train_datasets_labels.append(train_datasets[index][0] - 1)
        train_datasets_texts.append(train_datasets[index][1])
    for index in range(len(test_datasets)):
        test_datasets_labels.append(test_datasets[index][0] - 1)
        test_datasets_texts.append(test_datasets[index][1])

    all_datasets_texts = train_datasets_texts + test_datasets_texts
    all_datasets_labels = train_datasets_labels + test_datasets_labels

    # build the vocabulary on all texts, then map each text to a sequence of word indices
    tokenizer.fit_on_texts(all_datasets_texts)
    train_datasets_seqs = tokenizer.texts_to_sequences(train_datasets_texts)
    test_datasets_seqs = tokenizer.texts_to_sequences(test_datasets_texts)

    # pad / truncate every sequence to the same length cutlen
    train_datasets_seqs = pad_sequences(train_datasets_seqs, cutlen)
    test_datasets_seqs = pad_sequences(test_datasets_seqs, cutlen)

    train_datasets = list(zip(train_datasets_seqs, train_datasets_labels))
    test_datasets = list(zip(test_datasets_seqs, test_datasets_labels))

    vocab_size = len(tokenizer.index_word.keys())
    num_class = len(set(all_datasets_labels))

    return train_datasets, test_datasets, vocab_size, num_class, tokenizer

train_datasets, test_datasets, vocab_size, num_class, tokenizer = process_datasets_by_Tokenizer(train_dataset, test_dataset, cutlen=cutlen)

print("Processed data:")
print("train:\n", train_datasets[:2])
print("test:\n", test_datasets[:2])
print("vocab_size = {}, num_class = {}".format(vocab_size, num_class))
print()
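If the behaviour of Tokenizer and pad_sequences is unfamiliar, this tiny standalone example (unrelated to AG_NEWS) shows what the function above does to each text: words are replaced by frequency-ranked indices, and every sequence is left-padded with 0 to the same length. Index 0 is never assigned to a word, which is why the model below is built with VOCAB_SIZE + 1 embedding rows.

toy = Tokenizer()
toy.fit_on_texts(["the cat sat", "the dog barked loudly"])
print(toy.texts_to_sequences(["the cat sat"]))                     # [[1, 2, 3]]
print(pad_sequences(toy.texts_to_sequences(["the cat sat"]), 5))   # [[0 0 1 2 3]]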

3. Build a text classification model with an Embedding layer

BATCH_SIZE = 16
VOCAB_SIZE = vocab_size  # total number of distinct words in the corpus
NUM_CLASS = num_class    # total number of classes
EMBED_DIM = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        """
        Initialization.
        :param vocab_size: total number of distinct words in the corpus
        :param embed_dim: dimension of the word embeddings
        :param num_class: number of text classes
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        """
        Forward pass.
        :param text: the numericalized text, one flattened 1-D tensor per batch
        :return: a tensor with one score per class, used to decide the text's class
        """
        embedded = self.embedding(text)                     # (BATCH_SIZE * cutlen, embed_dim)
        c = embedded.size(0) // BATCH_SIZE
        embedded = embedded[: BATCH_SIZE * c]               # drop any remainder so the length divides evenly
        embedded = embedded.transpose(1, 0).unsqueeze(0)    # (1, embed_dim, BATCH_SIZE * c)
        embedded = F.avg_pool1d(embedded, kernel_size=c)    # average each sample's c word vectors
        return self.fc(embedded[0].transpose(1, 0))         # (BATCH_SIZE, num_class)

# instantiate the model (VOCAB_SIZE + 1 because word indices start at 1 and 0 is the padding index)
model = TextSentiment(VOCAB_SIZE + 1, EMBED_DIM, NUM_CLASS).to(device)
print("Model:")
print(model)
print()
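The reshaping in forward assumes each batch arrives as a single flattened 1-D tensor of BATCH_SIZE * cutlen word indices, which is exactly what generate_batch below produces; each pooling window then covers one sample's cutlen word vectors. A quick shape check with a random dummy batch (an addition, not in the original code):

dummy = torch.randint(1, VOCAB_SIZE, (BATCH_SIZE * cutlen,)).to(device)
with torch.no_grad():
    print(model(dummy).shape)  # expected: torch.Size([16, 4]), i.e. (BATCH_SIZE, NUM_CLASS)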

4. Batch the data

def generate_batch(batch):
    """
    Build one batch of data.
    :param batch: a list of batch_size (sample, label) tuples, e.g. [(sample1, label1), (sample2, label2), ...]
    :return: the samples flattened into one tensor and the labels as a tensor,
             e.g. text = tensor([sample1, sample2, ...]), label = tensor([label1, label2, ...])
    """
    text = []
    label = []
    for item in batch:
        text.extend(item[0])   # flatten every sample into one long 1-D list
        label.append(item[1])
    return torch.tensor(text), torch.tensor(label)

# test with a hand-made batch
print("Merging one batch into tensors:")
batch = [(torch.tensor([3, 23, 2, 8]), 1), (torch.tensor([3, 45, 21, 6]), 0)]
res = generate_batch(batch)
print(res)
print()
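For this hand-made batch, generate_batch returns text = tensor([3, 23, 2, 8, 3, 45, 21, 6]), the two samples flattened into one 1-D tensor, and label = tensor([1, 0]); the model's forward pass recovers the per-sample structure from that flat tensor using BATCH_SIZE.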

5. Build the training and validation functions

Set up the loss function, optimizer, and learning-rate scheduler

criterion = torch.nn.CrossEntropyLoss().to(device)  # predefined cross-entropy loss
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)  # stochastic gradient descent
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)  # StepLR, used to decay the learning rate
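With StepLR(optimizer, 1, gamma=0.9), the learning rate is multiplied by 0.9 every time scheduler.step() is called (once per epoch in train below), so after k epochs it is 4.0 * 0.9**k, roughly 1.4 after 10 epochs; the unusually large starting value therefore shrinks steadily as training progresses.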

Define the training function

def train(train_data):
    train_loss = 0
    train_acc = 0
    # the DataLoader yields BATCH_SIZE samples at a time for batch training
    data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()
        text = text.to(device)
        cls = cls.to(device)
        output = model(text)
        loss = criterion(output, cls)
        train_loss += loss.item()  # add this batch's loss to the running total
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()  # count this batch's correct predictions
    # decay the learning rate once per epoch
    scheduler.step()
    # return the average loss and average accuracy for this epoch
    return train_loss / len(train_data), train_acc / len(train_data)

Define the validation function

def valid(test_data):
    total_loss = 0
    acc = 0
    # as in training, use a DataLoader to iterate over the validation data
    data = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, cls in data:
        with torch.no_grad():
            text = text.to(device)
            cls = cls.to(device)
            output = model(text)
            loss = criterion(output, cls)
            # accumulate the loss and the number of correct predictions
            # (a separate accumulator, so the batch loss is not overwritten)
            total_loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()
    # return the average loss and average accuracy for this validation pass
    return total_loss / len(test_data), acc / len(test_data)

6. Train and validate the model (using the functions defined above)

Define the training settings

N_EPOCHS = 20  # number of training epochs
train_len = int(len(train_datasets) * 0.95)  # use 95% of train_datasets for training
# random_split shuffles and splits the data into a training set and a validation set
sub_train_, sub_valid_ = random_split(train_datasets, [train_len, len(train_datasets) - train_len])
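With 120,000 training records this gives 114,000 samples in sub_train_ and 6,000 in sub_valid_; a quick check (an addition):

print(len(sub_train_), len(sub_valid_))  # expected: 114000 6000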

Train for N_EPOCHS epochs, printing the loss and accuracy on the training and validation splits

# run every epoch
for epoch in range(N_EPOCHS):
    start_time = time.time()  # record when this epoch starts
    # call train and valid to get the average loss and average accuracy
    train_loss, train_acc = train(sub_train_)
    valid_loss, valid_acc = valid(sub_valid_)
    # total time spent on training and validation
    secs = int(time.time() - start_time)
    # express it in minutes and seconds
    mins = secs // 60
    secs = secs % 60
    # print the elapsed time, average loss, and average accuracy
    print('Epoch: %d' % (epoch + 1), " | time in %d minutes, %d seconds" % (mins, secs))
    print(f'\t Loss: {train_loss: .4f}(train) \t | \t Acc: {train_acc * 100: .1f} % (train)')
    print(f'\t Loss: {valid_loss: .4f}(valid) \t | \t Acc: {valid_acc * 100: .1f} % (valid)')

 

7. Evaluate on the test set

valid_loss, valid_acc = valid(test_datasets)
print("Evaluation on the test set:")
print(f'\t Loss: {valid_loss: .4f}(valid) \t | \t Acc: {valid_acc * 100: .1f} % (valid)')
print()

 
