class torch.nn.LSTM(*args, **kwargs)
# Input arguments in PyTorch
input_size – feature dimension of the data (when an embedding layer is used, this is the embedding dim)
hidden_size – dimension of the hidden state
num_layers – number of LSTM layers; layer i uses the output of layer i-1 as its input. Default: 1
bias – whether to use a bias in the affine transformations. Default: True
batch_first – position of the batch dimension: when True the input should have shape (batch, seq, feature), otherwise (seq, batch, feature). Default: False
dropout – adds a dropout layer after the output of every LSTM layer except the last one; the dropout ratio defaults to 0 (it only takes effect when num_layers > 1)
bidirectional – whether the LSTM is bidirectional: a bidirectional LSTM encodes the sentence both forwards from the start and backwards from the end, and the hidden states of the two directions at the same time step are concatenated as the output. Default: False
proj_size – adds a projection on the hidden state h_t with the given dimension; the default 0 means no projection.
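A minimal sketch of how these arguments fit together (the sizes below are arbitrary, chosen only to show the resulting shapes):

import torch
import torch.nn as nn

# toy sizes, for illustration only
lstm = nn.LSTM(input_size=100, hidden_size=256, num_layers=2,
               bidirectional=True, dropout=0.5)
x = torch.randn(35, 64, 100)   # (seq, batch, feature) because batch_first=False
output, (h_n, c_n) = lstm(x)
print(output.shape)            # torch.Size([35, 64, 512]) = [seq, batch, hidden_size * num directions]
print(h_n.shape)               # torch.Size([4, 64, 256])  = [num layers * num directions, batch, hidden_size]
print(c_n.shape)               # torch.Size([4, 64, 256])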
Before the embedding layer, sentences first have to be converted into tokens. For English the most common approach is to split on whitespace, but the result is not good enough; in torchtext the usual choice is to tokenize with the spaCy framework.
# Define how the text and the labels are split into tokens: the text field is tokenized with spaCy,
# and (with include_lengths = True) the length of each tokenized sentence is returned as well, which is convenient later on.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)
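A quick comparison on a made-up sentence shows why spaCy is preferred over a plain whitespace split: it separates punctuation and contractions into their own tokens.

import spacy

nlp = spacy.load('en_core_web_sm')
sentence = "This film isn't good, is it?"
print(sentence.split())
# ['This', 'film', "isn't", 'good,', 'is', 'it?']
print([tok.text for tok in nlp.tokenizer(sentence)])
# ['This', 'film', 'is', "n't", 'good', ',', 'is', 'it', '?']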
The embedding layer turns tokens into vectors that the following network can compute with. Word vectors are usually trained with one of two models, Skip-Gram or CBOW; intuitively, CBOW predicts the center word from its surrounding context, while Skip-Gram predicts the context words from the center word.
To speed up training there are two techniques, negative sampling and hierarchical softmax. In practice, pre-trained word vectors are usually used directly; here we pick GloVe vectors (GloVe stands for Global Vectors for Word Representation).
# When building the vocabulary in torchtext, the pre-trained word vectors can be specified.
# max_size sets the vocabulary size (the total is max_size + 2, because <pad> and <unk> are added as well)
# vectors selects which pre-trained word vectors to use
# unk_init determines how the vectors of unknown tokens are initialized
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
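As a quick check (assuming the fields and training data defined in the full code at the end), the vocabulary and the loaded vectors can be inspected like this:

print(len(TEXT.vocab))            # 25002 = MAX_VOCAB_SIZE + 2 (<unk> and <pad>)
print(TEXT.vocab.vectors.shape)   # torch.Size([25002, 100]): one 100-d GloVe vector per token
print(TEXT.vocab.itos[:5])        # the two special tokens followed by the most frequent words
print(TEXT.vocab.stoi['film'])    # integer index later fed to the embedding layer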
Because every sentence has a different length, the seq dimension varies across the data. To make computation easier, the usual approach is to make the seq dimension the same within each batch: sentences in a batch are padded to a common length, and BucketIterator additionally groups sentences of similar length into the same batch so that less padding is needed.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)
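With include_lengths = True, each batch.text is a tuple of the padded index tensor and the true lengths; pulling one batch out (the shapes shown are just an example) makes the layout concrete:

batch = next(iter(train_iterator))
text, text_lengths = batch.text   # (padded token-id tensor, length of each sentence)
print(text.shape)                 # e.g. torch.Size([132, 64]) = [sent len, batch size]
print(text_lengths[:5])           # true (unpadded) lengths, sorted within the batch for packing
print(batch.label.shape)          # torch.Size([64])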
# In the model, the embedding layer maps token indices to vectors; padding_idx initializes
# the <pad> vector to zero and keeps it from receiving gradient updates
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
# text shape: [sent len, batch size]
embedded = self.dropout(self.embedding(text))
# embedded shape: [sent len, batch size, emb dim]

# pack the sequence
# lengths need to be on the CPU!
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
packed_output, (hidden, cell) = self.rnn(packed_embedded)
# hidden shape: [num layers * num directions, batch size, hid dim]

# unpack the sequence
output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
# output shape: [sent len, batch size, hid dim * num directions]
# The outputs at padding positions are zero tensors: packing removes those time steps
# before the LSTM sees them, and pad_packed_sequence fills them back in with zeros.

# concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden states
# and apply dropout
hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
# hidden shape: [batch size, hid dim * num directions]
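The effect of packing and unpacking is easiest to see on a tiny hand-made example, independent of the model above (toy dimensions only):

import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=4, hidden_size=3)
padded = torch.randn(5, 2, 4)        # two sequences padded to length 5
lengths = torch.tensor([5, 2])       # the second sequence really has length 2

packed = nn.utils.rnn.pack_padded_sequence(padded, lengths)
packed_out, (h, c) = rnn(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out)
print(out.shape)      # torch.Size([5, 2, 3])
print(out[2:, 1])     # all zeros: the padded steps were never fed to the LSTM,
                      # and pad_packed_sequence fills them with 0
print(out_lengths)    # tensor([5, 2])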
After the model has been built, the weights of the embedding layer need to be replaced with the GloVe word vectors loaded above.
# Replace the weights from the model's random initialization with the pre-trained embedding vectors
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
The word vectors of <pad> and <unk> also need to be set to zero, because both were given randomly initialized vectors earlier: unk_init = torch.Tensor.normal_ is applied to every token that is not found in the pre-trained GloVe vectors.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
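A quick sanity check (purely illustrative) that the two special rows are now zero while ordinary words keep their GloVe values:

print(model.embedding.weight.data[UNK_IDX].abs().sum())          # tensor(0.)
print(model.embedding.weight.data[PAD_IDX].abs().sum())          # tensor(0.)
print(model.embedding.weight.data[TEXT.vocab.stoi['film']][:5])  # a few GloVe values for 'film'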
import torch
import torch.nn as nn
from torchtext.legacy import data
from torchtext.legacy import datasets
import torch.optim as optim
import spacy
import random
import time

# Set the random seed
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define the torchtext fields
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

# Download the IMDB data and split it into train/validation/test
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

# Build the vocabulary
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

# Create the data iterators
BATCH_SIZE = 64
# Train on the GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

# Build the bidirectional LSTM model
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        # Embedding layer (word vectors)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        # Bidirectional LSTM
        self.rnn = nn.LSTM(embedding_dim,                 # input_size
                           hidden_dim,                    # hidden_size
                           num_layers=n_layers,           # number of layers
                           bidirectional=bidirectional,   # bidirectional or not
                           dropout=dropout)               # dropout between stacked LSTM layers
        # Linear output layer
        # The forward and backward hidden states are concatenated, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        # text shape: [sent len, batch size]
        embedded = self.dropout(self.embedding(text))
        # embedded shape: [sent len, batch size, emb dim]

        # pack sequence
        # lengths need to be on the CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        # output shape: [sent len, batch size, hid dim * num directions]
        # hidden shape: [num layers * num directions, batch size, hid dim]

        # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden states
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        # hidden shape: [batch size, hid dim * num directions]
        return self.fc(hidden)

# Instantiate the model
INPUT_DIM = len(TEXT.vocab)  # 25002: the 25000 most frequent words plus the pad and unknown tokens
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # index of the pad token, so the model can ignore padding

model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

# Replace the weights from the model's random initialization with the pre-trained embedding vectors
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Set the unknown and padding token vectors to zero
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

# Binary classification accuracy
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

# Training for one epoch
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()  # clear gradients
        text, text_lengths = batch.text  # batch.text is a tuple (numericalized tensor, length of each sentence)
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Evaluation
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text  # batch.text is a tuple (numericalized tensor, length of each sentence)
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Timing helper
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Run the training
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Keep the parameters of the best (lowest validation loss) model; they are loaded later for prediction
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

# Load the best model parameters
model.load_state_dict(torch.load('tut2-model.pt'))

# Evaluate on the test set
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

# Sentiment prediction for a single sentence
def predict_sentiment(model, sentence):
    nlp = spacy.load('en_core_web_sm')
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()
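With the full script above in place, the prediction function can be called on made-up reviews; assuming 'pos' was mapped to label 1 when the label vocabulary was built, values close to 1 mean positive and values close to 0 mean negative:

print(predict_sentiment(model, "This film is terrible"))  # expected to be close to 0
print(predict_sentiment(model, "This film is great"))     # expected to be close to 1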