当前位置:   article > 正文

中文文本分类_1(pytorch 实现)

中文文本分类_1(pytorch 实现)
  1. import torch
  2. import torch.nn as nn
  3. import torchvision
  4. from torchvision import transforms, datasets
  5. import os, PIL, pathlib, warnings
  6. warnings.filterwarnings("ignore") # 忽略警告信息
  7. # win10系统
  8. device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  9. print(device)
train.csv 链接:https://pan.baidu.com/s/1Vnyvo5T5eSuzb0VwTsznqA?pwd=fqok 提取码:fqok 
  1. import pandas as pd
  2. # 加载自定义中文数据集
  3. train_data = pd.read_csv('D:/train.csv', sep='\t', header=None)
  4. train_data.head()
  5. # 构建数据集迭代器
  def coustom_data_iter(texts, labels):
      """Yield ``(text, label)`` pairs by walking both sequences in lockstep.

      Iteration stops at the shorter of the two inputs (``zip`` semantics).
      """
      # NOTE: the misspelled name is kept because the later listings call it.
      yield from zip(texts, labels)
  9. train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])

1.构建词典:

  1. from torchtext.data.utils import get_tokenizer
  2. from torchtext.vocab import build_vocab_from_iterator
  3. import jieba
  4. # 中文分词方法
  5. tokenizer = jieba.lcut
  def yield_tokens(data_iter):
      """Yield the token list of every text produced by ``data_iter``.

      ``data_iter`` yields ``(text, label)`` pairs; the label is ignored
      because only the texts are needed to build the vocabulary.
      """
      # Unpack into two names: ``for text, in ...`` expects 1-tuples and
      # raises ValueError on the 2-tuples the data iterator yields.
      for text, _ in data_iter:
          yield tokenizer(text)
  9. vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
  10. vocab.set_default_index(vocab["<unk>"])

 调用vocab(词汇表)对一个中文句子进行索引转换,这个句子被分词后得到的词汇列表会被转换成它们在词汇表中的索引。

print(vocab(['我', '想', '看', '书', '和', '你', '一起', '看', '电影', '的', '新款', '视频']))

生成一个标签列表,用于查看在数据集中所有可能的标签类型。 

  # Collect the distinct labels in sorted order: iterating a bare set of
  # strings can give a different order on every run, which would silently
  # change the label -> index mapping built from this list.
  label_name = sorted(set(train_data[1].values))
  print(label_name)

创建了两个lambda函数,一个用于将文本转换成词汇索引,另一个用于将标签文本转换成它们在label_name列表中的索引。

  1. text_pipeline = lambda x: vocab(tokenizer(x))
  2. label_pipeline = lambda x: label_name.index(x)
  3. print(text_pipeline('我想看新闻或者上网站看最新的游戏视频'))
  4. print(label_pipeline('Video-Play'))

2.生成数据批次和迭代器

  1. from torch.utils.data import DataLoader
  def collate_batch(batch):
      """Pack a list of ``(text, label)`` samples into EmbeddingBag inputs.

      Returns a 3-tuple, each tensor moved to ``device``:
        * the token ids of all texts concatenated into one 1-D int64 tensor,
        * the int64 label indices, one per sample,
        * the start position of each text inside the concatenated tensor.
      """
      label_list, text_list, offsets = [], [], [0]
      for _text, _label in batch:
          label_list.append(label_pipeline(_label))
          processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
          text_list.append(processed_text)
          # Store the length for now; lengths are turned into offsets below.
          offsets.append(processed_text.size(0))
      label_list = torch.tensor(label_list, dtype=torch.int64)
      text_list = torch.cat(text_list)
      # Dropping the last length and taking the running sum turns
      # [0, len0, len1, ...] into the start index of every text.
      offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
      return text_list.to(device), label_list.to(device), offsets.to(device)
  # Example loader. A map-style DataLoader indexes its dataset, but the
  # ``train_iter`` generator supports neither ``len()`` nor indexing and was
  # already consumed while building the vocabulary, so the pairs are
  # materialised into a list first.
  dataloader = DataLoader(
      list(coustom_data_iter(train_data[0].values, train_data[1].values)),
      batch_size=8,
      shuffle=False,
      collate_fn=collate_batch,
  )

collate_batch函数用于处理数据加载器中的批次。它接收一个批次的数据,处理它,并返回适合模型训练的数据格式。
在这个函数内部,它遍历批次中的每个文本和标签对,将标签添加到label_list,将文本通过text_pipeline函数处理后转换为tensor,并添加到text_list。
offsets列表以0开头,并依次记录每个文本的长度;这些长度随后会被转换为每个文本在拼接后的大tensor中的起始位置,nn.EmbeddingBag正是依靠这些起始位置来区分各个文本。
text_list用torch.cat进行拼接,形成一个连续的tensor。
去掉offsets列表的最后一个元素(最后一个文本的长度不对应任何起始位置),然后使用cumsum函数在第0维计算累积和,得到每个文本在拼接tensor中的起始偏移量。

3.搭建模型与初始化

  1. from torch import nn
  class TextClassificationModel(nn.Module):
      """Bag-of-embeddings classifier: an EmbeddingBag followed by one Linear layer."""

      def __init__(self, vocab_size, embed_dim, num_class):
          super().__init__()
          # EmbeddingBag pools the embeddings of each bag (mean by default).
          self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
          self.fc = nn.Linear(embed_dim, num_class)
          self.init_weights()

      def init_weights(self):
          """Draw embedding and linear weights from U(-0.5, 0.5); zero the bias."""
          initrange = 0.5
          self.embedding.weight.data.uniform_(-initrange, initrange)
          self.fc.weight.data.uniform_(-initrange, initrange)
          self.fc.bias.data.zero_()

      def forward(self, text, offsets):
          """Return class scores, one row per bag delimited by ``offsets``."""
          embedded = self.embedding(text, offsets)
          return self.fc(embedded)
  16. num_class = len(label_name) # 类别数,根据label_name的长度确定
  17. vocab_size = len(vocab) # 词汇表的大小,根据vocab的长度确定
  18. em_size = 64 # 嵌入向量的维度设置为64
  19. model = TextClassificationModel(vocab_size, em_size, num_class).to(device) # 创建模型实例并移动到计算设备

4.模型训练及评估函数

train 和 evaluate分别用于训练和评估文本分类模型。

训练函数 train 的工作流程如下:

将模型设置为训练模式。
初始化总准确率、训练损失和总计数变量。
记录训练开始的时间。
遍历数据加载器,对每个批次:
进行预测。
清零优化器的梯度。
计算损失(使用一个损失函数,例如交叉熵)。
反向传播计算梯度。
通过梯度裁剪防止梯度爆炸。
执行一步优化器更新模型权重。
更新总准确率和总损失。
每隔一定间隔,打印训练进度和统计信息。
评估函数 evaluate 的工作流程如下:

将模型设置为评估模式。
初始化总准确率和总损失。
不计算梯度(为了节省内存和计算资源)。
遍历数据加载器,对每个批次:
进行预测。
计算损失。
更新总准确率和总损失。
返回整体的准确率和平均损失。
代码实现:

  1. import time
  def train(dataloader):
      """Run one training epoch over ``dataloader`` and log progress.

      Relies on the module-level ``model``, ``optimizer``, ``criterion`` and
      ``epoch`` (the latter only for the log line). Every ``log_interval``
      batches the running accuracy and loss are printed and then reset.
      """
      model.train()
      total_acc, train_loss, total_count = 0, 0, 0
      log_interval = 50

      for idx, (text, label, offsets) in enumerate(dataloader):
          predicted_label = model(text, offsets)

          optimizer.zero_grad()
          loss = criterion(predicted_label, label)
          loss.backward()
          # Clip the global gradient norm to 0.1 before the update.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
          optimizer.step()

          total_acc += (predicted_label.argmax(1) == label).sum().item()
          train_loss += loss.item()
          total_count += label.size(0)

          if idx % log_interval == 0 and idx > 0:
              # The printed loss is the sum of per-batch losses divided by the
              # number of samples seen since the previous log line.
              print('| epoch {:3d} | {:5d}/{:5d} batches '
                    '| accuracy {:8.3f} | loss {:8.5f}'.format(
                        epoch, idx, len(dataloader),
                        total_acc / total_count, train_loss / total_count))
              total_acc, train_loss, total_count = 0, 0, 0
  def evaluate(dataloader):
      """Score the module-level ``model`` on ``dataloader`` without gradients.

      Returns ``(accuracy, loss)``. The loss is accumulated and normalised
      the same way ``train`` logs it: sum of per-batch losses divided by the
      number of samples. The second element used to be the sample count,
      which the training loop then printed as the validation loss.
      """
      model.eval()
      total_acc, total_loss, total_count = 0, 0, 0
      with torch.no_grad():
          for text, label, offsets in dataloader:
              predicted_label = model(text, offsets)
              loss = criterion(predicted_label, label)
              total_acc += (predicted_label.argmax(1) == label).sum().item()
              total_loss += loss.item()
              total_count += label.size(0)
      return total_acc / total_count, total_loss / total_count

5.模型训练
设置训练的轮数、学习率和批次大小。
定义交叉熵损失函数、随机梯度下降优化器和学习率调度器。
将训练数据转换为一个map样式的数据集,并将其分成训练集和验证集。
创建训练和验证的数据加载器。
开始训练循环,每个epoch都会训练模型并在验证集上评估模型的准确率和损失。
如果验证准确率没有提高,则按计划降低学习率。
打印每个epoch结束时的统计信息,包括时间、准确率、损失和学习率。

  1. from torch.utils.data.dataset import random_split
  2. from torchtext.data.functional import to_map_style_dataset
  # Hyper-parameters
  EPOCHS = 10      # number of epochs
  LR = 5           # learning rate
  BATCH_SIZE = 64  # batch size used for training and validation

  # Loss function, optimizer and learning-rate scheduler
  criterion = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(model.parameters(), lr=LR)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
  total_accu = None  # best validation accuracy seen so far

  # Build a map-style dataset and split it 80/20 into train/validation.
  train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])
  train_dataset = to_map_style_dataset(train_iter)
  # Derive the validation size by subtraction: two independently truncated
  # int(...) values need not add up to len(train_dataset), and random_split
  # rejects lengths that do not sum to the dataset size.
  num_train = int(len(train_dataset) * 0.8)
  num_valid = len(train_dataset) - num_train
  split_train_, split_valid_ = random_split(train_dataset, [num_train, num_valid])

  train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                shuffle=True, collate_fn=collate_batch)
  valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                shuffle=True, collate_fn=collate_batch)

  # Training loop
  for epoch in range(1, EPOCHS + 1):
      epoch_start_time = time.time()
      train(train_dataloader)
      val_acc, val_loss = evaluate(valid_dataloader)

      # Read the learning rate used during this epoch before it may change.
      lr = optimizer.state_dict()['param_groups'][0]['lr']
      # Step the scheduler only when validation accuracy fell below the best
      # value so far; otherwise remember the new best.
      if total_accu is not None and total_accu > val_acc:
          scheduler.step()
      else:
          total_accu = val_acc
      print('-' * 69)
      print('| end of epoch {:3d} | time: {:4.2f}s | '
            'valid accuracy {:4.3f} | valid loss {:4.3f} | lr {:4.6f}'.format(
                epoch, time.time() - epoch_start_time, val_acc, val_loss, lr))
  print('-' * 69)

运行结果:

  1. | epoch 1 | 50/ 152 batches | accuracy 0.423 | loss 0.03079
  2. | epoch 1 | 100/ 152 batches | accuracy 0.700 | loss 0.01912
  3. | epoch 1 | 150/ 152 batches | accuracy 0.776 | loss 0.01347
  4. ---------------------------------------------------------------------
  5. | end of epoch 1 | time: 1.53s | valid accuracy 0.777 | valid loss 2420.000 | lr 5.000000
  6. | epoch 2 | 50/ 152 batches | accuracy 0.812 | loss 0.01056
  7. | epoch 2 | 100/ 152 batches | accuracy 0.843 | loss 0.00871
  8. | epoch 2 | 150/ 152 batches | accuracy 0.844 | loss 0.00846
  9. ---------------------------------------------------------------------
  10. | end of epoch 2 | time: 1.45s | valid accuracy 0.842 | valid loss 2420.000 | lr 5.000000
  11. | epoch 3 | 50/ 152 batches | accuracy 0.883 | loss 0.00653
  12. | epoch 3 | 100/ 152 batches | accuracy 0.879 | loss 0.00634
  13. | epoch 3 | 150/ 152 batches | accuracy 0.883 | loss 0.00627
  14. ---------------------------------------------------------------------
  15. | end of epoch 3 | time: 1.44s | valid accuracy 0.865 | valid loss 2420.000 | lr 5.000000
  16. | epoch 4 | 50/ 152 batches | accuracy 0.912 | loss 0.00498
  17. | epoch 4 | 100/ 152 batches | accuracy 0.906 | loss 0.00495
  18. | epoch 4 | 150/ 152 batches | accuracy 0.915 | loss 0.00461
  19. ---------------------------------------------------------------------
  20. | end of epoch 4 | time: 1.50s | valid accuracy 0.876 | valid loss 2420.000 | lr 5.000000
  21. | epoch 5 | 50/ 152 batches | accuracy 0.935 | loss 0.00386
  22. | epoch 5 | 100/ 152 batches | accuracy 0.934 | loss 0.00390
  23. | epoch 5 | 150/ 152 batches | accuracy 0.932 | loss 0.00362
  24. ---------------------------------------------------------------------
  25. | end of epoch 5 | time: 1.59s | valid accuracy 0.881 | valid loss 2420.000 | lr 5.000000
  26. | epoch 6 | 50/ 152 batches | accuracy 0.947 | loss 0.00313
  27. | epoch 6 | 100/ 152 batches | accuracy 0.949 | loss 0.00307
  28. | epoch 6 | 150/ 152 batches | accuracy 0.949 | loss 0.00286
  29. ---------------------------------------------------------------------
  30. | end of epoch 6 | time: 1.68s | valid accuracy 0.891 | valid loss 2420.000 | lr 5.000000
  31. | epoch 7 | 50/ 152 batches | accuracy 0.960 | loss 0.00243
  32. | epoch 7 | 100/ 152 batches | accuracy 0.963 | loss 0.00224
  33. | epoch 7 | 150/ 152 batches | accuracy 0.959 | loss 0.00252
  34. ---------------------------------------------------------------------
  35. | end of epoch 7 | time: 1.53s | valid accuracy 0.892 | valid loss 2420.000 | lr 5.000000
  36. | epoch 8 | 50/ 152 batches | accuracy 0.972 | loss 0.00186
  37. | epoch 8 | 100/ 152 batches | accuracy 0.974 | loss 0.00184
  38. | epoch 8 | 150/ 152 batches | accuracy 0.967 | loss 0.00201
  39. ---------------------------------------------------------------------
  40. | end of epoch 8 | time: 1.43s | valid accuracy 0.895 | valid loss 2420.000 | lr 5.000000
  41. | epoch 9 | 50/ 152 batches | accuracy 0.981 | loss 0.00138
  42. | epoch 9 | 100/ 152 batches | accuracy 0.977 | loss 0.00165
  43. | epoch 9 | 150/ 152 batches | accuracy 0.980 | loss 0.00147
  44. ---------------------------------------------------------------------
  45. | end of epoch 9 | time: 1.48s | valid accuracy 0.900 | valid loss 2420.000 | lr 5.000000
  46. | epoch 10 | 50/ 152 batches | accuracy 0.987 | loss 0.00117
  47. | epoch 10 | 100/ 152 batches | accuracy 0.985 | loss 0.00121
  48. | epoch 10 | 150/ 152 batches | accuracy 0.984 | loss 0.00121
  49. ---------------------------------------------------------------------
  50. | end of epoch 10 | time: 1.45s | valid accuracy 0.902 | valid loss 2420.000 | lr 5.000000
  51. ---------------------------------------------------------------------

6.模型评估

  1. test_acc, test_loss = evaluate(valid_dataloader)
  2. print('模型的准确率: {:5.4f}'.format(test_acc))

7.模型测试

  def predict(text, text_pipeline):
      """Return the predicted class index for a single raw ``text``.

      ``text_pipeline`` maps the string to a list of token ids. The ids are
      fed to the module-level ``model`` as one bag starting at offset 0.
      """
      with torch.no_grad():
          # Explicit int64 keeps the index tensor integer-typed even when the
          # pipeline returns an empty list (which would otherwise be float).
          text = torch.tensor(text_pipeline(text), dtype=torch.int64)
          output = model(text, torch.tensor([0]))
          return output.argmax(1).item()
  6. # 示例文本字符串
  7. # ex_text_str = "例句输入——这是一个待预测类别的示例句子"
  8. ex_text_str = "这不仅影响到我们的方案是否可行13号的"
  9. model = model.to("cpu")
  10. print("该文本的类别是: %s" % label_name[predict(ex_text_str, text_pipeline)])

8.全部代码(部分修改):

  1. import torch
  2. import torch.nn as nn
  3. import torchvision
  4. from torchvision import transforms, datasets
  5. import os, PIL, pathlib, warnings
  6. warnings.filterwarnings("ignore") # 忽略警告信息
  7. # win10系统
  8. device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  9. print(device)
  10. import pandas as pd
  11. # 加载自定义中文数据集
  12. train_data = pd.read_csv('D:/train.csv', sep='\t', header=None)
  13. train_data.head()
  14. # 构建数据集迭代器
  def custom_data_iter(texts, labels):
      """Yield ``(text, label)`` pairs by walking both sequences in lockstep.

      Iteration stops at the shorter of the two inputs (``zip`` semantics).
      """
      yield from zip(texts, labels)
  18. train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
  19. from torchtext.data.utils import get_tokenizer
  20. from torchtext.vocab import build_vocab_from_iterator
  21. import jieba
  22. # 中文分词方法
  23. tokenizer = jieba.lcut
  def yield_tokens(data_iter):
      """Yield the token list of every text produced by ``data_iter``.

      ``data_iter`` yields ``(text, label)`` pairs; the label is ignored
      because only the texts are needed to build the vocabulary.
      """
      for text, _ in data_iter:
          yield tokenizer(text)
  27. vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
  28. vocab.set_default_index(vocab["<unk>"])
  29. print(vocab(['我', '想', '看', '书', '和', '你', '一起', '看', '电影', '的', '新款', '视频']))
  # Collect the distinct labels in sorted order: iterating a bare set of
  # strings can give a different order on every run, which would silently
  # change the label -> index mapping built from this list.
  label_name = sorted(set(train_data[1].values))
  print(label_name)
  32. text_pipeline = lambda x: vocab(tokenizer(x))
  33. label_pipeline = lambda x: label_name.index(x)
  34. print(text_pipeline('我想看新闻或者上网站看最新的游戏视频'))
  35. print(label_pipeline('Video-Play'))
  36. from torch.utils.data import DataLoader
  def collate_batch(batch):
      """Pack a list of ``(text, label)`` samples into EmbeddingBag inputs.

      Returns a 3-tuple, each tensor moved to ``device``:
        * the token ids of all texts concatenated into one 1-D int64 tensor,
        * the int64 label indices, one per sample,
        * the start position of each text inside the concatenated tensor.
      """
      label_list, text_list, offsets = [], [], [0]
      for _text, _label in batch:
          label_list.append(label_pipeline(_label))
          processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
          text_list.append(processed_text)
          # Store the length for now; lengths are turned into offsets below.
          offsets.append(processed_text.size(0))
      label_list = torch.tensor(label_list, dtype=torch.int64)
      text_list = torch.cat(text_list)
      # Dropping the last length and taking the running sum turns
      # [0, len0, len1, ...] into the start index of every text.
      offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
      return text_list.to(device), label_list.to(device), offsets.to(device)
  # Example loader. A map-style DataLoader indexes its dataset, but the
  # ``train_iter`` generator supports neither ``len()`` nor indexing and was
  # already consumed while building the vocabulary, so the pairs are
  # materialised into a list first.
  dataloader = DataLoader(
      list(custom_data_iter(train_data[0].values, train_data[1].values)),
      batch_size=8,
      shuffle=False,
      collate_fn=collate_batch,
  )
  56. from torch import nn
  class TextClassificationModel(nn.Module):
      """Bag-of-embeddings classifier: an EmbeddingBag followed by one Linear layer."""

      def __init__(self, vocab_size, embed_dim, num_class):
          super().__init__()
          # EmbeddingBag pools the embeddings of each bag (mean by default).
          self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
          self.fc = nn.Linear(embed_dim, num_class)
          self.init_weights()

      def init_weights(self):
          """Draw embedding and linear weights from U(-0.5, 0.5); zero the bias."""
          initrange = 0.5
          self.embedding.weight.data.uniform_(-initrange, initrange)
          self.fc.weight.data.uniform_(-initrange, initrange)
          self.fc.bias.data.zero_()

      def forward(self, text, offsets):
          """Return class scores, one row per bag delimited by ``offsets``."""
          embedded = self.embedding(text, offsets)
          return self.fc(embedded)
  71. num_class = len(label_name)
  72. vocab_size = len(vocab)
  73. em_size = 64
  74. model = TextClassificationModel(vocab_size, em_size, num_class).to(device)
  75. import time
  def train(dataloader):
      """Run one training epoch over ``dataloader`` and log progress.

      Relies on the module-level ``model``, ``optimizer``, ``criterion`` and
      ``epoch`` (the latter only for the log line). Every ``log_interval``
      batches the running accuracy and loss are printed and then reset.
      """
      model.train()
      total_acc, train_loss, total_count = 0, 0, 0
      log_interval = 50

      for idx, (text, label, offsets) in enumerate(dataloader):
          predicted_label = model(text, offsets)

          optimizer.zero_grad()
          loss = criterion(predicted_label, label)
          loss.backward()
          # Clip the global gradient norm to 0.1 before the update.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
          optimizer.step()

          total_acc += (predicted_label.argmax(1) == label).sum().item()
          train_loss += loss.item()
          total_count += label.size(0)

          if idx % log_interval == 0 and idx > 0:
              # The printed loss is the sum of per-batch losses divided by the
              # number of samples seen since the previous log line.
              print('| epoch {:3d} | {:5d}/{:5d} batches '
                    '| accuracy {:8.3f} | loss {:8.5f}'.format(
                        epoch, idx, len(dataloader),
                        total_acc / total_count, train_loss / total_count))
              total_acc, train_loss, total_count = 0, 0, 0
  def evaluate(dataloader):
      """Score the module-level ``model`` on ``dataloader`` without gradients.

      Returns ``(accuracy, loss)``. The loss is accumulated and normalised
      the same way ``train`` logs it: sum of per-batch losses divided by the
      number of samples. The second element used to be the sample count,
      which the training loop then printed as the validation loss.
      """
      model.eval()
      total_acc, total_loss, total_count = 0, 0, 0
      with torch.no_grad():
          for text, label, offsets in dataloader:
              predicted_label = model(text, offsets)
              loss = criterion(predicted_label, label)
              total_acc += (predicted_label.argmax(1) == label).sum().item()
              total_loss += loss.item()
              total_count += label.size(0)
      return total_acc / total_count, total_loss / total_count
  110. from torch.utils.data.dataset import random_split
  111. from torchtext.data.functional import to_map_style_dataset
  # Hyper-parameters
  EPOCHS = 10      # number of epochs
  LR = 5           # learning rate
  BATCH_SIZE = 64  # batch size used for training and validation

  # Loss function, optimizer and learning-rate scheduler
  criterion = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(model.parameters(), lr=LR)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
  total_accu = None  # best validation accuracy seen so far

  # Build a map-style dataset and split it 80/20 into train/validation.
  train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
  train_dataset = to_map_style_dataset(train_iter)
  # Derive the validation size by subtraction: two independently truncated
  # int(...) values need not add up to len(train_dataset), and random_split
  # rejects lengths that do not sum to the dataset size.
  num_train = int(len(train_dataset) * 0.8)
  num_valid = len(train_dataset) - num_train
  split_train_, split_valid_ = random_split(train_dataset, [num_train, num_valid])

  train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                shuffle=True, collate_fn=collate_batch)
  valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                shuffle=True, collate_fn=collate_batch)

  # Training loop
  for epoch in range(1, EPOCHS + 1):
      epoch_start_time = time.time()
      train(train_dataloader)
      val_acc, val_loss = evaluate(valid_dataloader)

      # Read the learning rate used during this epoch before it may change.
      lr = optimizer.state_dict()['param_groups'][0]['lr']
      # Step the scheduler only when validation accuracy fell below the best
      # value so far; otherwise remember the new best.
      if total_accu is not None and total_accu > val_acc:
          scheduler.step()
      else:
          total_accu = val_acc
      print('-' * 69)
      print('| end of epoch {:3d} | time: {:4.2f}s | '
            'valid accuracy {:4.3f} | valid loss {:4.3f} | lr {:4.6f}'.format(
                epoch, time.time() - epoch_start_time, val_acc, val_loss, lr))
  print('-' * 69)
  146. test_acc, test_loss = evaluate(valid_dataloader)
  147. print('模型的准确率: {:5.4f}'.format(test_acc))
  def predict(text, text_pipeline):
      """Return the predicted class index for a single raw ``text``.

      ``text_pipeline`` maps the string to a list of token ids. The ids are
      fed to the module-level ``model`` as one bag starting at offset 0.
      """
      with torch.no_grad():
          # Explicit int64 keeps the index tensor integer-typed even when the
          # pipeline returns an empty list (which would otherwise be float).
          text = torch.tensor(text_pipeline(text), dtype=torch.int64)
          output = model(text, torch.tensor([0]))
          return output.argmax(1).item()
  153. # 示例文本字符串
  154. # ex_text_str = "例句输入——这是一个待预测类别的示例句子"
  155. ex_text_str = "这不仅影响到我们的方案是否可行13号的"
  156. model = model.to("cpu")
  157. print("该文本的类别是: %s" % label_name[predict(ex_text_str, text_pipeline)])

9.代码改进及优化

9.1优化器: 尝试用不同的优化算法(如Adam、RMSprop)替换原来的SGD优化器。注意:Adam、RMSprop等自适应优化器的学习率通常应远小于SGD(例如1e-3),不宜直接沿用为SGD设置的学习率。
9.1.1使用Adam优化器:
  1. import torch
  2. import torch.nn as nn
  3. import torchvision
  4. from torchvision import transforms, datasets
  5. import os, PIL, pathlib, warnings
  6. warnings.filterwarnings("ignore") # 忽略警告信息
  7. # win10系统
  8. device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  9. print(device)
  10. import pandas as pd
  11. # 加载自定义中文数据集
  12. train_data = pd.read_csv('D:/train.csv', sep='\t', header=None)
  13. train_data.head()
  14. # 构建数据集迭代器
  def custom_data_iter(texts, labels):
      """Yield ``(text, label)`` pairs by walking both sequences in lockstep.

      Iteration stops at the shorter of the two inputs (``zip`` semantics).
      """
      yield from zip(texts, labels)
  18. train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
  19. from torchtext.data.utils import get_tokenizer
  20. from torchtext.vocab import build_vocab_from_iterator
  21. import jieba
  22. # 中文分词方法
  23. tokenizer = jieba.lcut
  def yield_tokens(data_iter):
      """Yield the token list of every text produced by ``data_iter``.

      ``data_iter`` yields ``(text, label)`` pairs; the label is ignored
      because only the texts are needed to build the vocabulary.
      """
      for text, _ in data_iter:
          yield tokenizer(text)
  27. vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
  28. vocab.set_default_index(vocab["<unk>"])
  29. print(vocab(['我', '想', '看', '书', '和', '你', '一起', '看', '电影', '的', '新款', '视频']))
  # Collect the distinct labels in sorted order: iterating a bare set of
  # strings can give a different order on every run, which would silently
  # change the label -> index mapping built from this list.
  label_name = sorted(set(train_data[1].values))
  print(label_name)
  32. text_pipeline = lambda x: vocab(tokenizer(x))
  33. label_pipeline = lambda x: label_name.index(x)
  34. print(text_pipeline('我想看新闻或者上网站看最新的游戏视频'))
  35. print(label_pipeline('Video-Play'))
  36. from torch.utils.data import DataLoader
  def collate_batch(batch):
      """Pack a list of ``(text, label)`` samples into EmbeddingBag inputs.

      Returns a 3-tuple, each tensor moved to ``device``:
        * the token ids of all texts concatenated into one 1-D int64 tensor,
        * the int64 label indices, one per sample,
        * the start position of each text inside the concatenated tensor.
      """
      label_list, text_list, offsets = [], [], [0]
      for _text, _label in batch:
          label_list.append(label_pipeline(_label))
          processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
          text_list.append(processed_text)
          # Store the length for now; lengths are turned into offsets below.
          offsets.append(processed_text.size(0))
      label_list = torch.tensor(label_list, dtype=torch.int64)
      text_list = torch.cat(text_list)
      # Dropping the last length and taking the running sum turns
      # [0, len0, len1, ...] into the start index of every text.
      offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
      return text_list.to(device), label_list.to(device), offsets.to(device)
  # Example loader. A map-style DataLoader indexes its dataset, but the
  # ``train_iter`` generator supports neither ``len()`` nor indexing and was
  # already consumed while building the vocabulary, so the pairs are
  # materialised into a list first.
  dataloader = DataLoader(
      list(custom_data_iter(train_data[0].values, train_data[1].values)),
      batch_size=8,
      shuffle=False,
      collate_fn=collate_batch,
  )
  56. from torch import nn
  class TextClassificationModel(nn.Module):
      """Bag-of-embeddings classifier: an EmbeddingBag followed by one Linear layer."""

      def __init__(self, vocab_size, embed_dim, num_class):
          super().__init__()
          # EmbeddingBag pools the embeddings of each bag (mean by default).
          self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
          self.fc = nn.Linear(embed_dim, num_class)
          self.init_weights()

      def init_weights(self):
          """Draw embedding and linear weights from U(-0.5, 0.5); zero the bias."""
          initrange = 0.5
          self.embedding.weight.data.uniform_(-initrange, initrange)
          self.fc.weight.data.uniform_(-initrange, initrange)
          self.fc.bias.data.zero_()

      def forward(self, text, offsets):
          """Return class scores, one row per bag delimited by ``offsets``."""
          embedded = self.embedding(text, offsets)
          return self.fc(embedded)
  71. num_class = len(label_name)
  72. vocab_size = len(vocab)
  73. em_size = 64
  74. model = TextClassificationModel(vocab_size, em_size, num_class).to(device)
  75. import time
  def train(dataloader):
      """Run one training epoch over ``dataloader`` and log progress.

      Relies on the module-level ``model``, ``optimizer``, ``criterion`` and
      ``epoch`` (the latter only for the log line). Every ``log_interval``
      batches the running accuracy and loss are printed and then reset.
      """
      model.train()
      total_acc, train_loss, total_count = 0, 0, 0
      log_interval = 50

      for idx, (text, label, offsets) in enumerate(dataloader):
          predicted_label = model(text, offsets)

          optimizer.zero_grad()
          loss = criterion(predicted_label, label)
          loss.backward()
          # Clip the global gradient norm to 0.1 before the update.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
          optimizer.step()

          total_acc += (predicted_label.argmax(1) == label).sum().item()
          train_loss += loss.item()
          total_count += label.size(0)

          if idx % log_interval == 0 and idx > 0:
              # The printed loss is the sum of per-batch losses divided by the
              # number of samples seen since the previous log line.
              print('| epoch {:3d} | {:5d}/{:5d} batches '
                    '| accuracy {:8.3f} | loss {:8.5f}'.format(
                        epoch, idx, len(dataloader),
                        total_acc / total_count, train_loss / total_count))
              total_acc, train_loss, total_count = 0, 0, 0
  def evaluate(dataloader):
      """Score the module-level ``model`` on ``dataloader`` without gradients.

      Returns ``(accuracy, loss)``. The loss is accumulated and normalised
      the same way ``train`` logs it: sum of per-batch losses divided by the
      number of samples. The second element used to be the sample count,
      which the training loop then printed as the validation loss.
      """
      model.eval()
      total_acc, total_loss, total_count = 0, 0, 0
      with torch.no_grad():
          for text, label, offsets in dataloader:
              predicted_label = model(text, offsets)
              loss = criterion(predicted_label, label)
              total_acc += (predicted_label.argmax(1) == label).sum().item()
              total_loss += loss.item()
              total_count += label.size(0)
      return total_acc / total_count, total_loss / total_count
  110. from torch.utils.data.dataset import random_split
  111. from torchtext.data.functional import to_map_style_dataset
  # Hyper-parameters
  EPOCHS = 10      # number of epochs
  # Adam rescales every update by its running gradient statistics, so the
  # SGD value of 5 is far too large here; 1e-3 is Adam's documented default.
  LR = 1e-3        # learning rate
  BATCH_SIZE = 64  # batch size used for training and validation

  # Loss function, optimizer and learning-rate scheduler
  criterion = torch.nn.CrossEntropyLoss()
  # The earlier listings used: torch.optim.SGD(model.parameters(), lr=LR)
  optimizer = torch.optim.Adam(model.parameters(), lr=LR)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
  total_accu = None  # best validation accuracy seen so far

  # Build a map-style dataset and split it 80/20 into train/validation.
  train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
  train_dataset = to_map_style_dataset(train_iter)
  # Derive the validation size by subtraction: two independently truncated
  # int(...) values need not add up to len(train_dataset), and random_split
  # rejects lengths that do not sum to the dataset size.
  num_train = int(len(train_dataset) * 0.8)
  num_valid = len(train_dataset) - num_train
  split_train_, split_valid_ = random_split(train_dataset, [num_train, num_valid])

  train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                shuffle=True, collate_fn=collate_batch)
  valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                shuffle=True, collate_fn=collate_batch)

  # Training loop
  for epoch in range(1, EPOCHS + 1):
      epoch_start_time = time.time()
      train(train_dataloader)
      val_acc, val_loss = evaluate(valid_dataloader)

      # Read the learning rate used during this epoch before it may change.
      lr = optimizer.state_dict()['param_groups'][0]['lr']
      # Step the scheduler only when validation accuracy fell below the best
      # value so far; otherwise remember the new best.
      if total_accu is not None and total_accu > val_acc:
          scheduler.step()
      else:
          total_accu = val_acc
      print('-' * 69)
      print('| end of epoch {:3d} | time: {:4.2f}s | '
            'valid accuracy {:4.3f} | valid loss {:4.3f} | lr {:4.6f}'.format(
                epoch, time.time() - epoch_start_time, val_acc, val_loss, lr))
  print('-' * 69)
  147. test_acc, test_loss = evaluate(valid_dataloader)
  148. print('模型的准确率: {:5.4f}'.format(test_acc))
  def predict(text, text_pipeline):
      """Return the predicted class index for a single raw ``text``.

      ``text_pipeline`` maps the string to a list of token ids. The ids are
      fed to the module-level ``model`` as one bag starting at offset 0.
      """
      with torch.no_grad():
          # Explicit int64 keeps the index tensor integer-typed even when the
          # pipeline returns an empty list (which would otherwise be float).
          text = torch.tensor(text_pipeline(text), dtype=torch.int64)
          output = model(text, torch.tensor([0]))
          return output.argmax(1).item()
  154. # 示例文本字符串
  155. # ex_text_str = "例句输入——这是一个待预测类别的示例句子"
  156. ex_text_str = "这不仅影响到我们的方案是否可行13号的"
  157. model = model.to("cpu")
  158. print("该文本的类别是: %s" % label_name[predict(ex_text_str, text_pipeline)])

需要下载的库

pip install jieba -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install torchtext -i https://pypi.tuna.tsinghua.edu.cn/simple


 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/花生_TL007/article/detail/193992
推荐阅读
相关标签
  

闽ICP备14008679号