Brief description: AG_NEWS was built by selecting the 4 largest classes from the original corpus. Each class contains 30,000 training samples and 1,900 test samples, for a total of 120,000 training samples and 7,600 test samples. https://github.com/mhjabreel/CharCNN/tree/master/data
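Before building the full pipeline below, it can help to peek at one raw sample. A minimal sketch (assuming torchtext's built-in AG_NEWS dataset, which yields (label, text) tuples with labels 1..4):

from torchtext.datasets import AG_NEWS

# AG_NEWS(split='train') is an iterable of (label, text) pairs
peek_iter = AG_NEWS(split='train')
label, text = next(iter(peek_iter))
print(label)   # an integer class label in 1..4
print(text)    # the raw news title + description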
from torchtext.vocab import build_vocab_from_iterator

# A toy list standing in for the tokens of a text
word_list = ["apple", "banana", "crange", "dear", "grape", "kiwi", "strawberry", "<unk>", "watermelon"]

# Build the vocabulary object
vocab = build_vocab_from_iterator(iter([word_list]))
# Set the default index to the index of "<unk>"; lookups of unknown tokens fall back to it
vocab.set_default_index(vocab["<unk>"])

print(vocab(['apple', 'banana']))

# Convert the word list to a list of indices. All tokens here appear once, so ties are ordered
# alphabetically, which is why "<unk>" ends up at index 0 in this example.
index_list = [vocab[token] for token in word_list]
print(index_list)

vocab_size = len(vocab)  # vocabulary size
print(vocab_size)
Output:
from torchtext.datasets import AG_NEWS
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_iter = AG_NEWS(split='train')

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')  # returns a tokenizer function; see the note on get_tokenizer below

# Iterate over data_iter, where each element is a (label, text) tuple. Only the text is needed,
# so the first element (the label) is ignored with _. The tokenizer splits each text into tokens,
# and yield hands the token lists back one by one as a generator.
def yield_tokens(data_iter):
    for _, text in data_iter:   # each element is (label, text)
        yield tokenizer(text)   # tokenize the text and yield the token list

# Preview the tokenization of the first three samples
token_gen = yield_tokens(train_iter)
for i in range(3):
    print(next(token_gen))

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # default index, used whenever a token is not in the vocabulary

print(vocab(['here', 'is', 'an', 'example']))  # prints the index of each token

text_pipeline = lambda x: vocab(tokenizer(x))  # text -> list of token indices
label_pipeline = lambda x: int(x) - 1          # shift labels from 1..4 to 0..3

print(text_pipeline('here is the an example'))
PS: get_tokenizer() is a function provided by an NLP library for obtaining an appropriate tokenizer. Tokenization is a key step in natural language processing: it splits a piece of text into meaningful units called "tokens", which are the basis for downstream NLP tasks.
Different NLP libraries ship different tokenizers, and get_tokenizer() returns the matching one. Given its argument (usually a language, or a language plus options), it returns the corresponding tokenizer object.
For example, with PyTorch's torchtext library, an English tokenizer can be obtained as follows:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer("Hello world, I'm an AI assistant.")
print(tokens)
Output:
['hello', 'world', ',', 'i', "'m", 'an', 'ai', 'assistant', '.']
from torch.utils.data import DataLoader

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        # label list
        label_list.append(label_pipeline(_label))
        # text list
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        # offsets: the token count of each sample
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)  # cumulative sum gives the start position of each sample
    return label_list.to(device), text_list.to(device), offsets.to(device)

# Data loader
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
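To make the offsets clearer, here is a tiny hand-worked sketch (the token indices are made up purely for illustration) showing how the per-sample lengths turn into the flat text tensor and start offsets that EmbeddingBag expects:

import torch

# Hypothetical batch of three tokenized samples (made-up indices)
samples = [torch.tensor([5, 2, 9]),        # length 3
           torch.tensor([7, 1]),           # length 2
           torch.tensor([4, 4, 8, 3])]     # length 4

lengths = [0] + [t.size(0) for t in samples]        # [0, 3, 2, 4]
offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)  # tensor([0, 3, 5]) -> start index of each sample
flat_text = torch.cat(samples)                      # tensor([5, 2, 9, 7, 1, 4, 4, 8, 3])
print(offsets, flat_text)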
from torch import nn

# The model takes the vocabulary size, the embedding dimension and the number of classes.
# Its output is a tensor of size num_class, holding one score per class.
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size,  # vocabulary size
                                         embed_dim,   # embedding dimension
                                         sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)
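As a quick sanity check, a minimal sketch of a single forward pass through this model (the sizes and token indices below are assumptions chosen only to show the shapes):

import torch

# Hypothetical sizes for illustration only
toy_model = TextClassificationModel(vocab_size=100, embed_dim=64, num_class=4)

text = torch.tensor([5, 2, 9, 7, 1, 4, 4, 8, 3])  # three samples flattened together
offsets = torch.tensor([0, 3, 5])                 # start index of each sample
logits = toy_model(text, offsets)
print(logits.shape)  # torch.Size([3, 4]) -> one score vector per sample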
num_class = len(set([label for (label, text) in train_iter]))  # use a set to count the distinct labels
print(num_class)
vocab_size = len(vocab)  # vocabulary size
em_size = 64             # embedding dimension
model = TextClassificationModel(vocab_size, em_size, num_class).to(device)

import time
import torch.optim as optim

def train(dataloader):
    model.train()  # switch to training mode
    total_acc, train_loss, total_count = 0, 0, 0
    log_interval = 100
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        predicted_label = model(text, offsets)

        optimizer.zero_grad()                     # reset gradients
        loss = criterion(predicted_label, label)  # loss between network output and ground-truth label
        loss.backward()                           # backpropagation
        optimizer.step()                          # update parameters

        # record acc and loss
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        train_loss += loss.item()
        total_count += label.size(0)

        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:1d} | {:4d}/{:4d} batches '
                  '| train_acc {:4.3f} train_loss {:4.5f}'.format(epoch, idx, len(dataloader),
                                                                  total_acc / total_count,
                                                                  train_loss / total_count))
            total_acc, train_loss, total_count = 0, 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()  # switch to evaluation mode
    total_acc, train_loss, total_count = 0, 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)  # compute the loss
            # record evaluation statistics
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            train_loss += loss.item()
            total_count += label.size(0)

    return total_acc / total_count, train_loss / total_count

from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 100      # number of epochs
LR = 3            # learning rate
BATCH_SIZE = 512  # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_iter, test_iter = AG_NEWS()  # load the data
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset,
                                          [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    val_acc, val_loss = evaluate(valid_dataloader)

    if total_accu is not None and total_accu > val_acc:
        scheduler.step()
    else:
        total_accu = val_acc

    print('-' * 69)
    print('| epoch {:1d} | time: {:4.2f}s | '
          'valid_acc {:4.3f} valid_loss {:4.3f}'.format(epoch,
                                                        time.time() - epoch_start_time,
                                                        val_acc, val_loss))
    print('-' * 69)
| epoch 1 |  100/ 223 batches | train_acc 0.449 train_loss 0.00251
| epoch 1 |  200/ 223 batches | train_acc 0.646 train_loss 0.00195
---------------------------------------------------------------------
| epoch 1 | time: 9.22s | valid_acc 0.727 valid_loss 0.002
---------------------------------------------------------------------
| epoch 2 |  100/ 223 batches | train_acc 0.767 train_loss 0.00136
| epoch 2 |  200/ 223 batches | train_acc 0.815 train_loss 0.00111
---------------------------------------------------------------------
| epoch 2 | time: 9.06s | valid_acc 0.821 valid_loss 0.001
---------------------------------------------------------------------
| epoch 3 |  100/ 223 batches | train_acc 0.842 train_loss 0.00094
| epoch 3 |  200/ 223 batches | train_acc 0.855 train_loss 0.00086
---------------------------------------------------------------------
| epoch 3 | time: 9.06s | valid_acc 0.854 valid_loss 0.001
---------------------------------------------------------------------
| epoch 4 |  100/ 223 batches | train_acc 0.867 train_loss 0.00080
| epoch 4 |  200/ 223 batches | train_acc 0.876 train_loss 0.00075
---------------------------------------------------------------------
| epoch 4 | time: 9.29s | valid_acc 0.868 valid_loss 0.001
......
| epoch 98 |  100/ 223 batches | train_acc 0.913 train_loss 0.00052
| epoch 98 |  200/ 223 batches | train_acc 0.912 train_loss 0.00052
---------------------------------------------------------------------
| epoch 98 | time: 9.41s | valid_acc 0.896 valid_loss 0.001
---------------------------------------------------------------------
| epoch 99 |  100/ 223 batches | train_acc 0.914 train_loss 0.00052
| epoch 99 |  200/ 223 batches | train_acc 0.913 train_loss 0.00053
---------------------------------------------------------------------
| epoch 99 | time: 9.74s | valid_acc 0.896 valid_loss 0.001
---------------------------------------------------------------------
| epoch 100 |  100/ 223 batches | train_acc 0.913 train_loss 0.00052
| epoch 100 |  200/ 223 batches | train_acc 0.912 train_loss 0.00053
---------------------------------------------------------------------
| epoch 100 | time: 9.76s | valid_acc 0.896 valid_loss 0.001
---------------------------------------------------------------------

Process finished with exit code 0
As the log shows, this simple training setup already reaches roughly 90% validation accuracy; further gains can come from tuning the hyperparameters or adjusting the network architecture.
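To try the trained model on new text, here is a hedged inference sketch. The ag_news_label mapping and the example sentence are assumptions added for illustration (the mapping follows the commonly documented AG_NEWS class order) and are not defined anywhere above:

# Hypothetical label mapping: AG_NEWS classes 1..4 are commonly documented as these topics
ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text), dtype=torch.int64).to(device)
        offsets = torch.tensor([0]).to(device)   # a single sample starts at offset 0
        output = model(text, offsets)
        return output.argmax(1).item() + 1       # undo the label_pipeline shift (0..3 -> 1..4)

ex_text = "The stock market rallied today as tech shares surged."
print("This is %s news" % ag_news_label[predict(ex_text, text_pipeline)])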