赞
踩
该项目是我在谷歌的Colab平台上完成的。首先查看数据:
import os
import pandas as pd

# Work from the Google Drive folder (mounted in Colab) that holds the
# Rotten Tomatoes TSV files, so the relative file names below resolve.
os.chdir("drive/Colab Notebooks/NLP/Rotten Tomatoes movie review")

# Both files are tab-separated; load them in the same order as before
# (train first, then test).
train, test = (
    pd.read_csv(file_name, sep='\t') for file_name in ("train.tsv", "test.tsv")
)

# Peek at the first rows to check the columns.
print(train.head(5))
import re
from functools import lru_cache

import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Resources needed by the stop-word filter and by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def review_to_words(raw_review):
    """Clean a single raw review phrase.

    Non-letter characters are replaced by spaces, the text is lower-cased
    and split on whitespace, and English stop words are removed.

    Args:
        raw_review: The raw phrase text.

    Returns:
        The remaining words joined by single spaces (may be an empty string).
    """
    letters_only = _NON_LETTERS.sub(' ', raw_review)  # keep letters only
    words = letters_only.lower().split()              # lower-case and split
    stops = _english_stopwords()
    return " ".join(w for w in words if w not in stops)


# Clean every training phrase, reporting progress every 10000 reviews.
clean_train_reviews = []
num_reviews = train["Phrase"].size
for i, phrase in enumerate(train["Phrase"], start=1):
    if i % 10000 == 0:
        print("Review %d of %d\n" % (i, num_reviews))
    clean_train_reviews.append(review_to_words(phrase))

# Inspect the first cleaned review.
print(clean_train_reviews[0])
# 'series escapades demonstrating adage good goose also good gander occasionally amuses none amounts much story'

# Tokenize each cleaned review once; the per-review token lists keep the
# shape of the original data (one list per review).
tokenizer = nltk.word_tokenize
tokenized_reviews = [tokenizer(review) for review in clean_train_reviews]

# Flat list of every token, and the vocabulary built from it.
word = [w for tokens in tokenized_reviews for w in tokens]
# Sorted so that the word -> index mapping is reproducible between runs.
word_set = sorted(set(word))
# len(word_set) = 14992

# Map every word to an integer index (not a one-hot vector). Indices start
# at 1 because 0 is the padding value used in `features` below; the model
# is built with vocab_size = len(word_set) + 1, which leaves room for it.
word_to_index = {w: idx for idx, w in enumerate(word_set, start=1)}
word_df = [[word_to_index[w] for w in tokens] for tokens in tokenized_reviews]

# Find the longest sequence: the LSTM input needs equal-length sequences,
# so every review is padded to this length.
len_list = [len(seq) for seq in word_df]
max_len = max(len_list)

# Right-pad every sequence with zeros to obtain the final feature matrix.
features = np.zeros((len(word_df), max_len), dtype=int)
for row, seq in zip(features, word_df):
    if seq:
        row[:len(seq)] = seq

# Labels for the training data.
train_labels = np.array(train['Sentiment'])

# Split into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    features, train_labels, test_size=0.3, random_state=42)

print('\t\t\tFeature Shape:')
print('Train Set:\t\t{}'.format(X_train.shape),
      '\nVal Set:\t\t{}'.format(X_val.shape))
# Feature Shape:
# Train Set: (109242, 30)
# Val Set: (46818, 30)
下面构造LSTM模型,我之前的博客中对LSTM做过分析,这里我再次做一遍复习。
首先,我之前的博客中列出了LSTM的公式。公式中有4组权重,对应的还有4组偏置(公式中没有体现),这些就是我们要训练的LSTM的参数。注意到每组权重中,各有一个权重与输入相乘,另一个权重与隐藏状态相乘,这就导致了权重维度的不同。下面通过一组代码来查看一下LSTM的权重。
import torch.nn as nn

# Build a 2-layer LSTM and print the shape of every trainable parameter.
rnn = nn.LSTM(input_size=400, hidden_size=256, num_layers=2)
for p in rnn.parameters():
    print(p.size())

# Output (per layer: input-hidden weight, hidden-hidden weight, then the
# two bias vectors; the leading 1024 is 4 * hidden_size):
# Layer 1
# torch.Size([1024, 400])
# torch.Size([1024, 256])
# torch.Size([1024])
# torch.Size([1024])
# Layer 2
# torch.Size([1024, 256])
# torch.Size([1024, 256])
# torch.Size([1024])
# torch.Size([1024])
首先看第一层,torch.Size([1024, 400])代表的是与input相乘的权重。单个门的权重大小是(256, 400),由于4个门的权重最终沿第0维(行方向)拼接在一起,所以大小是(1024, 400)。对应的公式如下,其中o代表4个公式中与输入相乘这一项的输出:
$$o_{\text{hidden-size}\times\text{batch-size}} = W_{\text{hidden-size}\times\text{input-size}}\,X_{\text{input-size}\times\text{batch-size}}$$
而与hidden相乘的权重的大小是(256, 256),具体推导过程和上式相似。
然后第二层的输入是第一层的输出,即第一层在每个时间步输出的隐藏状态,其大小是256;第二层的隐藏层节点个数也是256,所以第二层两个权重的大小都是(1024, 256),如上所示。
下面具体分析LSTM的模型搭建过程,
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Use the GPU when one is available; the rest of the script branches on this.
train_on_gpu = torch.cuda.is_available()

batch_size = 54

# nn.Embedding and nn.CrossEntropyLoss expect int64 tensors; .long() is a
# no-op when the NumPy arrays are already int64.
train_data = TensorDataset(torch.from_numpy(X_train).long(),
                           torch.from_numpy(y_train).long())
val_data = TensorDataset(torch.from_numpy(X_val).long(),
                         torch.from_numpy(y_val).long())

# drop_last=True: the hidden state carried between batches is allocated for
# exactly `batch_size` rows, so a smaller final batch must be skipped.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                          num_workers=4, drop_last=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True,
                        num_workers=4, drop_last=True)
print(len(train_loader))
print(len(val_loader))


class SentimentLSTM(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_size: Number of output classes.
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability between LSTM layers.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim,
                 n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        """Run one batch through the network.

        With the settings used in this script:
            embedded input: (batch=54, time_step=30, input_size=400)
            hidden (h, c):  (n_layers=2, batch=54, hidden_size=256) each
            LSTM output:    (batch=54, time_step=30, hidden_size=256)

        Args:
            x: Integer word indices, one row per review.
            hidden: Tuple (h_0, c_0) of initial LSTM states.

        Returns:
            A tuple (out, hidden): the class scores for each review, and the
            final (h_n, c_n) states.
        """
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)  # hidden is (h_n, c_n)
        # Output at the last time step; equal to h_n[-1, :, :].
        last_step = lstm_out[:, -1, :]
        out = self.dropout(last_step)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled (h_0, c_0) states for `batch_size` sequences.

        The tensors are created with the dtype and device of the model's
        parameters, so they follow the model onto the GPU.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))


vocab_size = len(word_set) + 1
output_size = 5
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim,
                    n_layers)

lr = 0.003
# CrossEntropyLoss takes integer class labels directly.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

epoch = 5
counter = 0
print_every = 100
clip = 5  # maximum gradient norm

if train_on_gpu:
    net.cuda()

net.train()
# Training loop.
for e in range(epoch):
    h = net.init_hidden(batch_size)

    for inputs, labels in train_loader:
        counter += 1

        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        # Detach the carried state so gradients do not flow into
        # previous batches.
        h = tuple(each.detach() for each in h)

        net.zero_grad()
        output, h = net(inputs, h)
        loss = criterion(output, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)  # gradient clipping
        optimizer.step()

        if counter % print_every == 0:
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            # No gradients are needed while evaluating.
            with torch.no_grad():
                for val_inputs, val_labels in val_loader:
                    if train_on_gpu:
                        val_inputs = val_inputs.cuda()
                        val_labels = val_labels.cuda()
                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output, val_labels)
                    val_losses.append(val_loss.item())
            net.train()

            print("Epoch: {}/{}...".format(e+1, epoch),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。