
Classifying the AG_NEWS Dataset with a Single-Layer Vanilla RNN and a Fully Connected Layer


An RNN-based implementation of a text classification task

Environment Setup

Python version: 3.9.19

import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

from model_set import RNNnet
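The script relies on torchtext.datasets.AG_NEWS, so torchtext must be a release that still ships the built-in datasets (they are deprecated in newer releases). As a quick sanity check of the environment (a minimal sketch; the article does not pin specific package versions):

import torch
import torchtext

# Print the installed versions; any pairing that still provides
# torchtext.datasets.AG_NEWS should work with the code below.
print("torch:", torch.__version__)
print("torchtext:", torchtext.__version__)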

Dataset

We use the AG_NEWS dataset that ships with torchtext.

def loaddata(config):
    # Step 1: load the dataset
    #######################################################################
    print("Step1: Loading Dataset")
    #######################################################################
    # [Dataset] AG_NEWS is a news corpus that uses only the title and description
    # fields and covers four classes: World, Sports, Business, Sci/Tech.
    # [Samples] 120,000 training samples (train.csv) and 7,600 test samples (test.csv);
    # each class has 30,000 training samples and 1,900 test samples.
    os.makedirs(config.datapath, exist_ok=True)
    train_dataset_o, test_dataset_o = AG_NEWS(root=config.datapath, split=('train', 'test'))
    classes = ['World', 'Sports', 'Business', 'Sci/Tech']
    return train_dataset_o, test_dataset_o, classes
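Each element yielded by the AG_NEWS iterators is a (label, text) tuple whose label is an integer from 1 to 4 (1=World, 2=Sports, 3=Business, 4=Sci/Tech); this is why the label pipeline later subtracts 1. A minimal sketch for peeking at one raw sample (assuming the dataset downloads to ./data):

from torchtext.datasets import AG_NEWS

train_iter = AG_NEWS(root='./data', split='train')
label, text = next(iter(train_iter))
print(label)  # an integer in 1..4
print(text)   # the raw title + description string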

Text Tokenization

tokenizer = get_tokenizer('basic_english')  # a basic English tokenizer; it splits a sentence into tokens, similar to jieba for Chinese
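basic_english lowercases the text and splits punctuation into separate tokens. A small sketch (the sentence is an arbitrary example):

from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')
print(tokenizer("Wall St. Bears Claw Back Into the Black"))
# expected output (roughly): ['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black']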

Building the Vocabulary

def bulvocab(traindata):
    # Step 2: tokenize and build the vocabulary
    #######################################################################
    print("Step2: Building VocabSet")
    #######################################################################
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese

    def yield_tokens(data_iter):  # token generator
        for _, text in data_iter:
            yield tokenizer(text)  # a generator is iterable like a list but uses far less memory

    # Build the vocabulary from the training data
    vocab = build_vocab_from_iterator(yield_tokens(traindata), specials=["<PAD>"])  # the special token also stands in for unknown words
    # The vocabulary maps each token to an index; note that the test set is not used here
    vocab.set_default_index(vocab["<PAD>"])  # default index returned for any out-of-vocabulary (OOV) word
    print(f"len vocab:{len(vocab)}")
    len_vocab = len(vocab)
    return vocab, len_vocab
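Once built, the vocabulary acts as a lookup table: indexing it with a token returns that token's id, calling it with a list of tokens returns a list of ids, and any unseen token falls back to the default index set above. A small sketch (the words are arbitrary; it assumes vocab was returned by bulvocab):

print(vocab["<PAD>"])                          # 0, the first (and only) special token
print(vocab(["wall", "st", "zzzunknownzzz"]))  # token ids; the made-up last word maps to the default index 0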

Mapping Text to Indices

The mapping itself is done inside the dataloader below; the idea is simply to replace each token with its index in the vocabulary, as sketched in the example that follows.
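In code the mapping is nothing more than tokenization followed by a vocabulary lookup; a tiny sketch of what the text_pipeline inside the dataloader does (the sentence is an arbitrary example):

tokenizer = get_tokenizer('basic_english')
text_pipeline = lambda x: vocab(tokenizer(x))         # string -> list of vocabulary indices
print(text_pipeline("Apple beats the market again"))  # e.g. five integers, one per token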

Building the DataLoader

When sentences are grouped into a batch, they must all have the same length. The snippet below (taken from collate_batch) decides the common length:

# Choose how to standardize sentence length within a batch
if config.seq_mode == "min":
    seq_len = min(len(item) for item in text_list)
elif config.seq_mode == "max":
    seq_len = max(len(item) for item in text_list)
elif config.seq_mode == "avg":
    seq_len = sum(len(item) for item in text_list) / len(text_list)
elif isinstance(config.seq_mode, int):
    seq_len = config.seq_mode
else:
    seq_len = min(len(item) for item in text_list)
seq_len = int(seq_len)
# Pad or truncate every sentence in the batch to this length
batch_seq = torch.stack(tensor_padding(text_list, seq_len))
def tensor_padding(tensor_list, seq_len):
    # Pad (or truncate) every tensor in the list to seq_len
    padded_tensors = []
    for tensor in tensor_list:
        padding = (0, seq_len - len(tensor))  # pad with 0 at the end (a negative value truncates)
        padded_tensor = torch.nn.functional.pad(tensor, padding, mode='constant', value=0)
        padded_tensors.append(padded_tensor)
    return padded_tensors


def dateset2loader(config, vocab, traindata, testdata):
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese
    # Step 3: build the dataloaders
    ##########################################################################
    print("Step3: Dataset -> Dataloader")
    ##########################################################################
    # text_pipeline converts a text string into a list of integers,
    # each being the index of the corresponding token in the vocabulary
    text_pipeline = lambda x: vocab(tokenizer(x))
    # label_pipeline converts the label to a 0-based integer
    label_pipeline = lambda x: int(x) - 1

    # Convert a batch of samples into tensors
    def collate_batch(batch):
        """
        (3, "Wall ...") -> (2, [467, ...])  # (label, raw text) -> (0-based label, token indices)
        :param batch:
        :return:
        """
        label_list, text_list = [], []
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
            text_list.append(processed_text)
        # Choose how to standardize sentence length within this batch
        if config.seq_mode == "min":
            seq_len = min(len(item) for item in text_list)
        elif config.seq_mode == "max":
            seq_len = max(len(item) for item in text_list)
        elif config.seq_mode == "avg":
            seq_len = sum(len(item) for item in text_list) / len(text_list)
        elif isinstance(config.seq_mode, int):
            seq_len = config.seq_mode
        else:
            seq_len = min(len(item) for item in text_list)
        seq_len = int(seq_len)
        # Unify lengths within the batch
        batch_seq = torch.stack(tensor_padding(text_list, seq_len))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        return label_list, batch_seq

    train_dataloader = DataLoader(traindata, batch_size=config.batchsize, shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(testdata, batch_size=config.batchsize, shuffle=True, collate_fn=collate_batch)
    return train_dataloader, test_dataloader
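Note that tensor_padding handles both directions: when seq_len is longer than a sentence, F.pad appends zeros (the <PAD> index); when it is shorter, the negative pad value truncates the sentence from the end. A quick sketch with two toy index tensors, assuming tensor_padding as defined above:

import torch

toy = [torch.tensor([5, 6, 7]), torch.tensor([8, 9, 10, 11, 12])]
print(tensor_padding(toy, 4))
# -> [tensor([5, 6, 7, 0]), tensor([ 8,  9, 10, 11])]  (short one padded, long one truncated)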

Word Embedding

self.embedding = nn.Embedding(len_vocab, embedding_size)
# len_vocab is the size of the vocabulary built above
# embedding_size is the dimension of the vector each token is mapped to
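The embedding layer is simply a trainable lookup table of shape [len_vocab, embedding_size]: given a batch of index tensors it returns the corresponding vectors. A small shape check (the vocabulary size is a placeholder; use len(vocab) from bulvocab in practice):

import torch
import torch.nn as nn

len_vocab = 20000                           # placeholder vocabulary size
embedding = nn.Embedding(len_vocab, 128)
ids = torch.randint(0, len_vocab, (4, 30))  # [batch_size, sentence_len]
print(embedding(ids).shape)                 # torch.Size([4, 30, 128])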

Network Architecture

batch_first defaults to False; if you do not set it to True, the index tensor fed to the RNN must have shape [sentence_len, batch_size] instead of [batch_size, sentence_len].

class RNNnet(nn.Module):
    def __init__(self, len_vocab, embedding_size, hidden_size, num_class, num_layers):
        super(RNNnet, self).__init__()
        self.hidden = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(len_vocab, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_class)

    def forward(self, text):
        """
        :param text: [batch_size, sentence_len]
        :return:
        """
        # embedded: [batch_size, sentence_len, embedding_size]
        embedded = self.embedding(text)
        # output: [batch_size, sentence_len, hidden_size]
        # hidden: [num_layers, batch_size, hidden_size]
        output, hidden = self.rnn(embedded)
        # Classify from the final hidden state of the last RNN layer
        return self.fc(hidden[-1])
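hidden has shape [num_layers, batch_size, hidden_size], so hidden[-1] is the final hidden state of the last (here, the only) RNN layer, and the classifier output has shape [batch_size, num_class]. A quick shape check with random indices (the sizes are illustrative only):

import torch

model = RNNnet(len_vocab=20000, embedding_size=128, hidden_size=256, num_class=4, num_layers=1)
dummy = torch.randint(0, 20000, (8, 25))  # [batch_size, sentence_len]
print(model(dummy).shape)                 # torch.Size([8, 4]) -- one score per class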

Model Training

def model_train(config, len_vocab, classes, train_dataloader, test_dataloader):
    # Build the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rnn_model = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    optimizer = torch.optim.Adam(rnn_model.parameters(), lr=config.l_r)
    loss_fn = nn.CrossEntropyLoss()
    rnn_model.train()
    rnn_model.to(device)
    # Train the model
    LOSS = []
    ACC = []
    os.makedirs(config.savepath, exist_ok=True)
    for epoch in range(config.epochs):
        loop = tqdm(train_dataloader, desc='Train')
        total_loss, total_acc, count, i = 0, 0, 0, 0
        for idx, (label, text) in enumerate(loop):
            text = text.to(device)
            label = label.to(device)
            optimizer.zero_grad()
            output = rnn_model(text)  # forward pass
            loss = loss_fn(output, label)
            loss.backward()
            optimizer.step()
            predict = torch.argmax(output, dim=1)  # compare predictions with the labels
            acc = (predict == label).sum()
            total_loss += loss.item()
            total_acc += acc.item()
            count += len(label)
            i += 1
            # Progress display
            loop.set_description(f'Epoch [{epoch + 1}/{config.epochs}]')
            loop.set_postfix(loss=round(loss.item(), 4), acc=(round(acc.item() / len(label), 4) * 100))
        print(
            f"epoch_loss:{round(total_loss / i, 4)}\nepoch_acc:{round(total_acc / count, 4) * 100}%")
        # Save the model parameters
        torch.save(rnn_model.state_dict(), f"{config.savepath}/{config.modelpath}")
        LOSS.append(round(total_loss / i, 4))
        ACC.append(round((total_acc / count) * 100, 4))
        modeltest(config, len_vocab, classes, test_dataloader)
    print(f"LOSS_array:{LOSS}")
    print(f"ACC_array:{ACC}")
    with open(config.logpath, 'w') as f:
        f.write(f"LOSS_array:{LOSS}\nACC_array:{ACC}")
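nn.CrossEntropyLoss applies log-softmax internally, so it takes the raw scores of shape [batch_size, num_class] from the fully connected layer together with 0-based integer targets; this is why label_pipeline subtracted 1 from the AG_NEWS labels. A tiny standalone illustration:

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()
logits = torch.randn(4, 4)              # [batch_size, num_class], raw scores (no softmax needed)
targets = torch.tensor([0, 2, 3, 1])    # 0-based class indices
print(loss_fn(logits, targets).item())  # a single scalar loss value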

Loss and Accuracy Curves

def plot_result(logpath, mode):
    mode_set = ["loss", 'accuracy']
    if mode not in mode_set:
        return "wrong mode"
    color = ['blue', 'red']
    with open(logpath, "r") as f:
        line = f.readlines()[mode_set.index(mode)]
        y = eval(line[line.index(':') + 1:])
        x = [i for i in range(len(y))]
        plt.figure()
        # Remove the top and right spines
        ax = plt.axes()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        plt.xlabel('epoch')
        plt.ylabel(f'{mode}')
        plt.plot(x, y, color=color[mode_set.index(mode)], linestyle="solid", label=f"train {mode}")
        plt.legend()
        plt.title(f'train {mode} curve')
        # plt.show()
        plt.savefig(f"{mode}.png")
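A minimal usage sketch, assuming training has already written log_best.txt in the two-line format produced by model_train:

plot_result('log_best.txt', 'loss')      # saves loss.png
plot_result('log_best.txt', 'accuracy')  # saves accuracy.png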

Model Testing

def modeltest(config, len_vocab, classes, test_dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Rebuild the model for testing
    rnn_model_test = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    rnn_model_test.load_state_dict(torch.load(f"{config.savepath}/{config.modelpath}"))
    rnn_model_test.eval()
    rnn_model_test.to(device)
    test_loop = tqdm(test_dataloader)
    total_acc, count = 0, 0
    for idx, (label, text) in enumerate(test_loop):
        text = text.to(device)
        label = label.to(device)
        output = rnn_model_test(text)
        predict = torch.argmax(output, dim=1)  # compare predictions with the labels
        acc = (predict == label).sum()
        total_acc += acc.item()
        count += len(label)
    print(f"Test set accuracy: {round((total_acc / count) * 100, 2)}%")

Test set accuracy: 86.53%

Testing Your Own Samples

#Fears for T N pension after talksUnions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
#The Race is On: Second Private Team Sets Launch Date for Human SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.
#Dutch Retailer Beats Apple to Local Download Market AMSTERDAM (Reuters) - Free Record Shop, a Dutch music retail chain, beat Apple Computer Inc. to market on Tuesday with the launch of a new download service in Europe's latest battleground for digital song services.
#U.S. Urges China to Push for More N.Korea Talks BEIJING (Reuters) - Secretary of State Colin Powell urged China on Monday to exert its influence over North Korea to resume stalled talks on scrapping its nuclear weapons programs and pressed Beijing to accept a Taiwan offer of talks.
# The original labels of these four samples are 3, 4, 4, and 1.
def simple(vocab, text_one):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Rebuild the model for inference
    rnn_model_test = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    rnn_model_test.load_state_dict(torch.load(f"{config.savepath}/{config.modelpath}"))
    rnn_model_test.eval()
    rnn_model_test.to(device)
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese
    text_pipeline = lambda x: vocab(tokenizer(x))
    for text in text_one:
        text_one_tensor = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_one_tensor = text_one_tensor.to(device)
        print(f"Predicted label: {torch.argmax(rnn_model_test(text_one_tensor)).item() + 1}")
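The printed label is the 1-based class id. To show the class name instead, index into the classes list with the 0-based prediction; a small sketch of that mapping (not part of the original script):

classes = ['World', 'Sports', 'Business', 'Sci/Tech']
pred = 3                                                 # 0-based index from torch.argmax(...)
print(f"Predicted label: {pred + 1} ({classes[pred]})")  # -> Predicted label: 4 (Sci/Tech)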

Prediction results:

Complete Code

model_set.py contains the RNNnet class exactly as listed in the Network Architecture section above. The main script is as follows:

import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

from model_set import RNNnet


def loaddata(config):
    # Step 1: load the dataset
    #######################################################################
    print("Step1: Loading Dataset")
    #######################################################################
    # [Dataset] AG_NEWS is a news corpus that uses only the title and description
    # fields and covers four classes: World, Sports, Business, Sci/Tech.
    # [Samples] 120,000 training samples (train.csv) and 7,600 test samples (test.csv);
    # each class has 30,000 training samples and 1,900 test samples.
    os.makedirs(config.datapath, exist_ok=True)
    train_dataset_o, test_dataset_o = AG_NEWS(root=config.datapath, split=('train', 'test'))
    classes = ['World', 'Sports', 'Business', 'Sci/Tech']
    # for t in test_dataset_o:
    #     print(t)
    #     break
    return train_dataset_o, test_dataset_o, classes


def bulvocab(traindata):
    # Step 2: tokenize and build the vocabulary
    #######################################################################
    print("Step2: Building VocabSet")
    #######################################################################
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese

    def yield_tokens(data_iter):  # token generator
        for _, text in data_iter:
            yield tokenizer(text)  # a generator is iterable like a list but uses far less memory

    # Build the vocabulary from the training data
    vocab = build_vocab_from_iterator(yield_tokens(traindata), specials=["<PAD>"])  # the special token also stands in for unknown words
    # The vocabulary maps each token to an index; the test set is not used here
    vocab.set_default_index(vocab["<PAD>"])  # default index returned for any out-of-vocabulary (OOV) word
    print(f"len vocab:{len(vocab)}")
    len_vocab = len(vocab)
    return vocab, len_vocab


def tensor_padding(tensor_list, seq_len):
    # Pad (or truncate) every tensor in the list to seq_len
    padded_tensors = []
    for tensor in tensor_list:
        padding = (0, seq_len - len(tensor))  # pad with 0 at the end (a negative value truncates)
        padded_tensor = torch.nn.functional.pad(tensor, padding, mode='constant', value=0)
        padded_tensors.append(padded_tensor)
    return padded_tensors


def dateset2loader(config, vocab, traindata, testdata):
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese
    # Step 3: build the dataloaders
    ##########################################################################
    print("Step3: Dataset -> Dataloader")
    ##########################################################################
    # text_pipeline converts a text string into a list of integers,
    # each being the index of the corresponding token in the vocabulary
    text_pipeline = lambda x: vocab(tokenizer(x))
    # label_pipeline converts the label to a 0-based integer
    label_pipeline = lambda x: int(x) - 1

    # Convert a batch of samples into tensors
    def collate_batch(batch):
        """
        (3, "Wall ...") -> (2, [467, ...])  # (label, raw text) -> (0-based label, token indices)
        :param batch:
        :return:
        """
        label_list, text_list = [], []
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
            text_list.append(processed_text)
        # Choose how to standardize sentence length within this batch
        if config.seq_mode == "min":
            seq_len = min(len(item) for item in text_list)
        elif config.seq_mode == "max":
            seq_len = max(len(item) for item in text_list)
        elif config.seq_mode == "avg":
            seq_len = sum(len(item) for item in text_list) / len(text_list)
        elif isinstance(config.seq_mode, int):
            seq_len = config.seq_mode
        else:
            seq_len = min(len(item) for item in text_list)
        seq_len = int(seq_len)
        # Unify lengths within the batch
        batch_seq = torch.stack(tensor_padding(text_list, seq_len))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        return label_list, batch_seq

    train_dataloader = DataLoader(traindata, batch_size=config.batchsize, shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(testdata, batch_size=config.batchsize, shuffle=True, collate_fn=collate_batch)
    return train_dataloader, test_dataloader


def model_train(config, len_vocab, classes, train_dataloader, test_dataloader):
    # Build the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rnn_model = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    optimizer = torch.optim.Adam(rnn_model.parameters(), lr=config.l_r)
    loss_fn = nn.CrossEntropyLoss()
    rnn_model.train()
    rnn_model.to(device)
    # Train the model
    LOSS = []
    ACC = []
    os.makedirs(config.savepath, exist_ok=True)
    best_acc = 0
    for epoch in range(config.epochs):
        loop = tqdm(train_dataloader, desc='Train')
        total_loss, total_acc, count, i = 0, 0, 0, 0
        rnn_model.train()
        for idx, (label, text) in enumerate(loop):
            text = text.to(device)
            label = label.to(device)
            optimizer.zero_grad()
            output = rnn_model(text)  # forward pass
            loss = loss_fn(output, label)
            loss.backward()
            optimizer.step()
            predict = torch.argmax(output, dim=1)  # compare predictions with the labels
            acc = (predict == label).sum()
            total_loss += loss.item()
            total_acc += acc.item()
            count += len(label)
            i += 1
            # Progress display
            loop.set_description(f'Epoch [{epoch + 1}/{config.epochs}]')
            loop.set_postfix(loss=round(loss.item(), 4), acc=(round(acc.item() / len(label), 4) * 100))
        print(
            f"epoch_loss:{round(total_loss / i, 4)}\nepoch_acc:{round(total_acc / count, 4) * 100}%")
        # Record the training metrics
        LOSS.append(round(total_loss / i, 4))
        ACC.append(round((total_acc / count) * 100, 4))
        # Evaluate on the test set after each epoch
        rnn_model.eval()
        test_loop = tqdm(test_dataloader)
        total_acc, count = 0, 0
        for idx, (label, text) in enumerate(test_loop):
            text = text.to(device)
            label = label.to(device)
            output = rnn_model(text)
            predict = torch.argmax(output, dim=1)  # compare predictions with the labels
            acc = (predict == label).sum()
            total_acc += acc.item()
            count += len(label)
        print(f"Test set accuracy: {round((total_acc / count) * 100, 2)}%")
        # Save the parameters of the best model so far
        temp_acc = round((total_acc / count) * 100, 2)
        if temp_acc > best_acc:
            best_acc = temp_acc
            torch.save(rnn_model.state_dict(), f"{config.savepath}/{config.modelpath}")
    print(f"LOSS_array:{LOSS}")
    print(f"ACC_array:{ACC}")
    with open(config.logpath, 'w') as f:
        f.write(f"LOSS_array:{LOSS}\nACC_array:{ACC}")


def modeltest(config, len_vocab, classes, test_dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Rebuild the model for testing
    rnn_model_test = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    rnn_model_test.load_state_dict(torch.load(f"{config.savepath}/{config.modelpath}"))
    rnn_model_test.eval()
    rnn_model_test.to(device)
    test_loop = tqdm(test_dataloader)
    total_acc, count = 0, 0
    for idx, (label, text) in enumerate(test_loop):
        text = text.to(device)
        label = label.to(device)
        output = rnn_model_test(text)
        predict = torch.argmax(output, dim=1)  # compare predictions with the labels
        acc = (predict == label).sum()
        total_acc += acc.item()
        count += len(label)
    print(f"Test set accuracy: {round((total_acc / count) * 100, 2)}%")


def plot_result(logpath, mode):
    mode_set = ["loss", 'accuracy']
    if mode not in mode_set:
        return "wrong mode"
    color = ['blue', 'red']
    with open(logpath, "r") as f:
        line = f.readlines()[mode_set.index(mode)]
        y = eval(line[line.index(':') + 1:])
        x = [i for i in range(len(y))]
        plt.figure()
        # Remove the top and right spines
        ax = plt.axes()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        plt.xlabel('epoch')
        plt.ylabel(f'{mode}')
        plt.plot(x, y, color=color[mode_set.index(mode)], linestyle="solid", label=f"train {mode}")
        plt.legend()
        plt.title(f'train {mode} curve')
        # plt.show()
        plt.savefig(f"{mode}.png")


class Config:
    def __init__(self):
        #######################################
        # The dataset is AG_NEWS; the task is text classification
        #######################################
        self.datapath = './data'
        self.savepath = './save_model'
        self.modelpath = 'rnn_model.pt'
        self.logpath = 'log_best.txt'
        self.embedding_size = 128
        self.hidden_size = 256
        self.num_layers = 1  # number of RNN layers
        self.l_r = 1e-3
        self.epochs = 50
        self.batchsize = 1024
        self.plotloss = False
        self.plotacc = False
        self.train = True
        self.test = False
        self.seq_mode = "avg"  # seq_mode: "min", "max", "avg", or an integer for a custom fixed length
        self.test_one = False
        self.test_self = "./test_self.txt"

    def parm(self):
        print(
            f"datapath={self.datapath}\n"
            f"savepath={self.savepath}\n"
            f"modelpath={self.modelpath}\n"
            f"logpath={self.logpath}\n"
            f"embedding_size={self.embedding_size}\n"
            f"hidden_size={self.hidden_size}\n"
            f"num_layers={self.num_layers}\n"
            f"l_r={self.l_r}\n"
            f"epochs={self.epochs}\n"
            f"batchsize={self.batchsize}\n"
            f"plotloss={self.plotloss}\n"
            f"plotacc={self.plotacc}\n"
            f"train={self.train}\n"
            f"test={self.test}\n"
            f"seq_mode={self.seq_mode}\n"
            f"test_one={self.test_one}\n"
            f"test_self={self.test_self}\n"
        )


def simple(vocab, text_one):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Rebuild the model for inference
    rnn_model_test = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    rnn_model_test.load_state_dict(torch.load(f"{config.savepath}/{config.modelpath}"))
    rnn_model_test.eval()
    rnn_model_test.to(device)
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese
    text_pipeline = lambda x: vocab(tokenizer(x))
    for text in text_one:
        text_one_tensor = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_one_tensor = text_one_tensor.to(device)
        print(f"Predicted label: {torch.argmax(rnn_model_test(text_one_tensor)).item() + 1}")


if __name__ == "__main__":
    config = Config()
    config.parm()
    if config.train:
        train_dataset_o, test_dataset_o, classes = loaddata(config)
        vocab, len_vocab = bulvocab(train_dataset_o)
        train_dataloader, test_dataloader = dateset2loader(config, vocab, train_dataset_o, test_dataset_o)
        model_train(config, len_vocab, classes, train_dataloader, test_dataloader)
    elif config.test:
        train_dataset_o, test_dataset_o, classes = loaddata(config)
        vocab, len_vocab = bulvocab(train_dataset_o)
        train_dataloader, test_dataloader = dateset2loader(config, vocab, train_dataset_o, test_dataset_o)
        modeltest(config, len_vocab, classes, test_dataloader)
    elif config.plotloss:
        plot_result(config.logpath, 'loss')
    elif config.plotacc:
        plot_result(config.logpath, 'accuracy')
    elif config.test_one:
        train_dataset_o, test_dataset_o, classes = loaddata(config)
        vocab, len_vocab = bulvocab(train_dataset_o)
        with open(config.test_self, "r") as f:
            simple(vocab, f.readlines())

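A second copy of the main script follows. It differs only in a few places: model_train here saves the parameters after every epoch and calls modeltest (as in the walkthrough above), and the Config is switched to train=False and test_one=True, with seq_mode="min" and modelpath='rnn_model_best.pt', so running it performs inference on the samples in test_self.txt using the saved model instead of training.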
import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

from model_set import RNNnet


def loaddata(config):
    # Step 1: load the dataset
    #######################################################################
    print("Step1: Loading Dataset")
    #######################################################################
    # [Dataset] AG_NEWS is a news corpus that uses only the title and description
    # fields and covers four classes: World, Sports, Business, Sci/Tech.
    # [Samples] 120,000 training samples (train.csv) and 7,600 test samples (test.csv);
    # each class has 30,000 training samples and 1,900 test samples.
    os.makedirs(config.datapath, exist_ok=True)
    train_dataset_o, test_dataset_o = AG_NEWS(root=config.datapath, split=('train', 'test'))
    classes = ['World', 'Sports', 'Business', 'Sci/Tech']
    # for t in test_dataset_o:
    #     print(t)
    #     break
    return train_dataset_o, test_dataset_o, classes


def bulvocab(traindata):
    # Step 2: tokenize and build the vocabulary
    #######################################################################
    print("Step2: Building VocabSet")
    #######################################################################
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese

    def yield_tokens(data_iter):  # token generator
        for _, text in data_iter:
            yield tokenizer(text)  # a generator is iterable like a list but uses far less memory

    # Build the vocabulary from the training data
    vocab = build_vocab_from_iterator(yield_tokens(traindata), specials=["<PAD>"])  # the special token also stands in for unknown words
    # The vocabulary maps each token to an index; the test set is not used here
    vocab.set_default_index(vocab["<PAD>"])  # default index returned for any out-of-vocabulary (OOV) word
    print(f"len vocab:{len(vocab)}")
    len_vocab = len(vocab)
    return vocab, len_vocab


def tensor_padding(tensor_list, seq_len):
    # Pad (or truncate) every tensor in the list to seq_len
    padded_tensors = []
    for tensor in tensor_list:
        padding = (0, seq_len - len(tensor))  # pad with 0 at the end (a negative value truncates)
        padded_tensor = torch.nn.functional.pad(tensor, padding, mode='constant', value=0)
        padded_tensors.append(padded_tensor)
    return padded_tensors


def dateset2loader(config, vocab, traindata, testdata):
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese
    # Step 3: build the dataloaders
    ##########################################################################
    print("Step3: Dataset -> Dataloader")
    ##########################################################################
    # text_pipeline converts a text string into a list of integers,
    # each being the index of the corresponding token in the vocabulary
    text_pipeline = lambda x: vocab(tokenizer(x))
    # label_pipeline converts the label to a 0-based integer
    label_pipeline = lambda x: int(x) - 1

    # Convert a batch of samples into tensors
    def collate_batch(batch):
        """
        (3, "Wall ...") -> (2, [467, ...])  # (label, raw text) -> (0-based label, token indices)
        :param batch:
        :return:
        """
        label_list, text_list = [], []
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
            text_list.append(processed_text)
        # Choose how to standardize sentence length within this batch
        if config.seq_mode == "min":
            seq_len = min(len(item) for item in text_list)
        elif config.seq_mode == "max":
            seq_len = max(len(item) for item in text_list)
        elif config.seq_mode == "avg":
            seq_len = sum(len(item) for item in text_list) / len(text_list)
        elif isinstance(config.seq_mode, int):
            seq_len = config.seq_mode
        else:
            seq_len = min(len(item) for item in text_list)
        seq_len = int(seq_len)
        # Unify lengths within the batch
        batch_seq = torch.stack(tensor_padding(text_list, seq_len))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        return label_list, batch_seq

    train_dataloader = DataLoader(traindata, batch_size=config.batchsize, shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(testdata, batch_size=config.batchsize, shuffle=True, collate_fn=collate_batch)
    return train_dataloader, test_dataloader


def model_train(config, len_vocab, classes, train_dataloader, test_dataloader):
    # Build the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rnn_model = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    optimizer = torch.optim.Adam(rnn_model.parameters(), lr=config.l_r)
    loss_fn = nn.CrossEntropyLoss()
    rnn_model.train()
    rnn_model.to(device)
    # Train the model
    LOSS = []
    ACC = []
    os.makedirs(config.savepath, exist_ok=True)
    for epoch in range(config.epochs):
        loop = tqdm(train_dataloader, desc='Train')
        total_loss, total_acc, count, i = 0, 0, 0, 0
        for idx, (label, text) in enumerate(loop):
            text = text.to(device)
            label = label.to(device)
            optimizer.zero_grad()
            output = rnn_model(text)  # forward pass
            loss = loss_fn(output, label)
            loss.backward()
            optimizer.step()
            predict = torch.argmax(output, dim=1)  # compare predictions with the labels
            acc = (predict == label).sum()
            total_loss += loss.item()
            total_acc += acc.item()
            count += len(label)
            i += 1
            # Progress display
            loop.set_description(f'Epoch [{epoch + 1}/{config.epochs}]')
            loop.set_postfix(loss=round(loss.item(), 4), acc=(round(acc.item() / len(label), 4) * 100))
        print(
            f"epoch_loss:{round(total_loss / i, 4)}\nepoch_acc:{round(total_acc / count, 4) * 100}%")
        # Save the model parameters
        torch.save(rnn_model.state_dict(), f"{config.savepath}/{config.modelpath}")
        LOSS.append(round(total_loss / i, 4))
        ACC.append(round((total_acc / count) * 100, 4))
        modeltest(config, len_vocab, classes, test_dataloader)
    print(f"LOSS_array:{LOSS}")
    print(f"ACC_array:{ACC}")
    with open(config.logpath, 'w') as f:
        f.write(f"LOSS_array:{LOSS}\nACC_array:{ACC}")


def modeltest(config, len_vocab, classes, test_dataloader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Rebuild the model for testing
    rnn_model_test = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    rnn_model_test.load_state_dict(torch.load(f"{config.savepath}/{config.modelpath}"))
    rnn_model_test.eval()
    rnn_model_test.to(device)
    test_loop = tqdm(test_dataloader)
    total_acc, count = 0, 0
    for idx, (label, text) in enumerate(test_loop):
        text = text.to(device)
        label = label.to(device)
        output = rnn_model_test(text)
        predict = torch.argmax(output, dim=1)  # compare predictions with the labels
        acc = (predict == label).sum()
        total_acc += acc.item()
        count += len(label)
    print(f"Test set accuracy: {round((total_acc / count) * 100, 2)}%")


def plot_result(logpath, mode):
    mode_set = ["loss", 'accuracy']
    if mode not in mode_set:
        return "wrong mode"
    color = ['blue', 'red']
    with open(logpath, "r") as f:
        line = f.readlines()[mode_set.index(mode)]
        y = eval(line[line.index(':') + 1:])
        x = [i for i in range(len(y))]
        plt.figure()
        # Remove the top and right spines
        ax = plt.axes()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        plt.xlabel('epoch')
        plt.ylabel(f'{mode}')
        plt.plot(x, y, color=color[mode_set.index(mode)], linestyle="solid", label=f"train {mode}")
        plt.legend()
        plt.title(f'train {mode} curve')
        # plt.show()
        plt.savefig(f"{mode}.png")


class Config:
    def __init__(self):
        #######################################
        # The dataset is AG_NEWS; the task is text classification
        #######################################
        self.datapath = './data'
        self.savepath = './save_model'
        self.modelpath = 'rnn_model_best.pt'
        self.logpath = 'log_best.txt'
        self.embedding_size = 128
        self.hidden_size = 256
        self.num_layers = 1  # number of RNN layers
        self.l_r = 1e-3
        self.epochs = 50
        self.batchsize = 1024
        self.plotloss = False
        self.plotacc = False
        self.train = False
        self.test = False
        self.seq_mode = "min"  # seq_mode: "min", "max", "avg", or an integer for a custom fixed length
        self.test_one = True
        self.test_self = "./test_self.txt"

    def parm(self):
        print(
            f"datapath={self.datapath}\n"
            f"savepath={self.savepath}\n"
            f"modelpath={self.modelpath}\n"
            f"logpath={self.logpath}\n"
            f"embedding_size={self.embedding_size}\n"
            f"hidden_size={self.hidden_size}\n"
            f"num_layers={self.num_layers}\n"
            f"l_r={self.l_r}\n"
            f"epochs={self.epochs}\n"
            f"batchsize={self.batchsize}\n"
            f"plotloss={self.plotloss}\n"
            f"plotacc={self.plotacc}\n"
            f"train={self.train}\n"
            f"test={self.test}\n"
            f"seq_mode={self.seq_mode}\n"
            f"test_one={self.test_one}\n"
            f"test_self={self.test_self}\n"
        )


def simple(vocab, text_one):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Rebuild the model for inference
    rnn_model_test = RNNnet(
        len_vocab=len_vocab,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_class=len(classes),
        num_layers=config.num_layers
    )
    rnn_model_test.load_state_dict(torch.load(f"{config.savepath}/{config.modelpath}"))
    rnn_model_test.eval()
    rnn_model_test.to(device)
    tokenizer = get_tokenizer('basic_english')  # basic English tokenizer, similar to jieba for Chinese
    text_pipeline = lambda x: vocab(tokenizer(x))
    for text in text_one:
        text_one_tensor = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_one_tensor = text_one_tensor.to(device)
        print(f"Predicted label: {torch.argmax(rnn_model_test(text_one_tensor)).item() + 1}")


if __name__ == "__main__":
    config = Config()
    config.parm()
    if config.train:
        train_dataset_o, test_dataset_o, classes = loaddata(config)
        vocab, len_vocab = bulvocab(train_dataset_o)
        train_dataloader, test_dataloader = dateset2loader(config, vocab, train_dataset_o, test_dataset_o)
        model_train(config, len_vocab, classes, train_dataloader, test_dataloader)
    elif config.test:
        train_dataset_o, test_dataset_o, classes = loaddata(config)
        vocab, len_vocab = bulvocab(train_dataset_o)
        train_dataloader, test_dataloader = dateset2loader(config, vocab, train_dataset_o, test_dataset_o)
        modeltest(config, len_vocab, classes, test_dataloader)
    elif config.plotloss:
        plot_result(config.logpath, 'loss')
    elif config.plotacc:
        plot_result(config.logpath, 'accuracy')
    elif config.test_one:
        train_dataset_o, test_dataset_o, classes = loaddata(config)
        vocab, len_vocab = bulvocab(train_dataset_o)
        with open(config.test_self, "r") as f:
            simple(vocab, f.readlines())

Additional notes:

The key point is that sentence lengths are unified locally within each batch; the details are in the dateset2loader function. Also, make the batch size as large as is practical without overdoing it: with a batch size of 1024, as used here, training time is short and the results are good. A learning rate of 1e-3 is sufficient, and good results typically appear after twenty-odd epochs.

 
