
NLP Learning 5: Getting Started with Kaggle Competitions via Rotten Tomatoes Movie Review Sentiment Analysis

Rotten Tomatoes Movie Review Sentiment Analysis

I completed this project on Google Colab. The first step is to take a look at the data.

import os 
os.chdir("drive/Colab Notebooks/NLP/Rotten Tomatoes movie review")
import pandas as pd
train = pd.read_csv("train.tsv", sep='\t')
test = pd.read_csv("test.tsv", sep='\t')
print(train.head(5))

[Screenshot: output of train.head(5), showing the PhraseId, SentenceId, Phrase, and Sentiment columns]
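Before cleaning, it is also worth a quick look at the size of the data and the label distribution. A minimal sketch (it only relies on the Phrase and Sentiment columns used below):

# How many phrases are there, and how are the five sentiment labels distributed?
print(train.shape)
print(train['Sentiment'].value_counts())  # labels run from 0 (negative) to 4 (positive)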

import re
from nltk.corpus import stopwords
# Define the cleaning function
def review_to_words(raw_review):
	letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)  # keep letters only
	words = letters_only.lower().split()  # lowercase and split into words
	stops = set(stopwords.words('english'))  # load the English stopword list
	meaningful_words = [w for w in words if w not in stops]  # remove stopwords
	return " ".join(meaningful_words)
# Clean the training data
import nltk
nltk.download('stopwords')
clean_train_reviews = []
num_reviews = train["Phrase"].size
for i in range(0, num_reviews):
	if (i+1) % 10000 == 0:
		print("Review %d of %d\n" % (i+1, num_reviews))
	clean_train_reviews.append(review_to_words(train["Phrase"][i]))
# Inspect the cleaned data
print(clean_train_reviews[0])
# 'series escapades demonstrating adage good goose also good gander occasionally amuses none amounts much story'
# Build the tokenizer
nltk.download('punkt')
tokenizer = nltk.word_tokenize
word = []
for i in range(len(clean_train_reviews)):
	word += tokenizer(clean_train_reviews[i])
# Create the vocabulary
word_set = list(set(word))  # len(word_set) = 14992
# The loop above flattened every token into one list, but the training data must keep its original shape (one list of tokens per review), so tokenize again review by review
word_df = []
for i in range(len(clean_train_reviews)):
	word_df.append(tokenizer(clean_train_reviews[i]))
# The key step: map every word to its integer index in the vocabulary (the embedding layer later turns these indices into dense vectors).
# Indices start at 1 so that 0 stays reserved for padding, matching vocab_size = len(word_set) + 1 below; a dict lookup replaces the slow list.index() calls.
word_to_idx = {w: idx + 1 for idx, w in enumerate(word_set)}
for i in range(len(word_df)):
	word_df[i] = [word_to_idx[w] for w in word_df[i]]
# Find the longest sequence; all sequences are padded to this length below, because the LSTM expects every sequence in a batch to have the same length
len_list = []
for i in range(len(word_df)):  
	len_list.append(len(word_df[i]))
max_len = max(len_list)
# Pad the sequences with zeros to get the final training features
import numpy as np
features = np.zeros((len(word_df), max_len), dtype=int)
for i in range(len(word_df)):  
	for j in range(len(word_df[i])):    
		features[i][j] = word_df[i][j]
train_labels = np.array([l for l in train['Sentiment']])  # training labels
# Split into training and validation sets
from sklearn.model_selection import train_test_split	
X_train, X_val, y_train, y_val = train_test_split(features, train_labels, test_size=0.3, random_state=42)
print('\t\t\tFeature Shape:')
print('Train Set:\t\t{}'.format(X_train.shape), '\nVal Set:\t\t{}'.format(X_val.shape))
#			Feature Shape:
# Train Set:		(109242, 30) 
# Val Set:		(46818, 30)			
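The test set loaded at the beginning can be prepared the same way. A sketch of the idea; mapping words that never appear in the training vocabulary to the padding index 0 is one simple choice among several, and str() guards against any empty phrases:

# Clean, tokenize, index, and pad the test phrases exactly like the training data
clean_test_reviews = [review_to_words(str(p)) for p in test["Phrase"]]
test_df = [tokenizer(r) for r in clean_test_reviews]
word_to_idx = {w: idx + 1 for idx, w in enumerate(word_set)}  # same mapping as for the training data; 0 is reserved for padding
test_features = np.zeros((len(test_df), max_len), dtype=int)
for i, tokens in enumerate(test_df):
	for j, w in enumerate(tokens[:max_len]):  # truncate if a test phrase is longer than max_len
		test_features[i][j] = word_to_idx.get(w, 0)  # unknown words fall back to the padding index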

Next we build the LSTM model. I analyzed the LSTM in an earlier post, so here is a quick review.
My earlier post listed the LSTM formulas, which contain four sets of weights along with four corresponding biases (not shown explicitly in the formulas); these are the parameters the LSTM learns. Note that in each of the four gate formulas one weight matrix multiplies the input and another multiplies the previous hidden state, which is why the two matrices have different shapes. Let's verify this by printing the LSTM's weights.
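For reference, these are the standard LSTM gate equations in PyTorch's convention, where $\sigma$ is the sigmoid and $\odot$ the element-wise product:

$i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi})$
$f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf})$
$g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg})$
$o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho})$
$c_t = f_t \odot c_{t-1} + i_t \odot g_t$
$h_t = o_t \odot \tanh(c_t)$

Each gate has an input weight $W_{i*}$ of size (hidden_size, input_size) and a hidden weight $W_{h*}$ of size (hidden_size, hidden_size), which is exactly the pair of shapes printed below.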

import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=400, hidden_size=256, num_layers=2)
for p in rnn.parameters():
	print(p.size())
# Layer 1
# torch.Size([1024, 400])
# torch.Size([1024, 256])
# torch.Size([1024])
# torch.Size([1024])
# Layer 2
# torch.Size([1024, 256])
# torch.Size([1024, 256])
# torch.Size([1024])
# torch.Size([1024])

Look at the first layer. torch.Size([1024, 400]) is the weight that multiplies the input: each gate's weight has size (256, 400), and because the four gate weights are concatenated along the row dimension the combined size is (1024, 400). The corresponding formula, where $o$ stands for the output of each of the four gate equations, is
$o_{hidden\_size \times batch\_size} = W_{hidden\_size \times input\_size} \, X_{input\_size \times batch\_size}$
The weight that multiplies the hidden state has size (256, 256), and the derivation is analogous. The input to the second layer is the output of the first layer, i.e. the first layer's hidden state at each time step, which has size 256; since the hidden size is also 256, the second layer's weight shapes come out as printed above.
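The same shapes can be inspected by name: PyTorch stores each layer's input-to-hidden weights as weight_ih_l{k} and hidden-to-hidden weights as weight_hh_l{k}, with the four gate matrices stacked along the rows. A small sketch reusing the rnn defined above:

# Print the LSTM parameters together with their names
for name, p in rnn.named_parameters():
	print(name, tuple(p.shape))
# weight_ih_l0 (1024, 400)   4 gates x 256 rows, input size 400
# weight_hh_l0 (1024, 256)   4 gates x 256 rows, hidden size 256
# bias_ih_l0 (1024,)
# bias_hh_l0 (1024,)
# weight_ih_l1 (1024, 256)   layer 2's input is layer 1's 256-dim hidden state
# weight_hh_l1 (1024, 256)
# bias_ih_l1 (1024,)
# bias_hh_l1 (1024,)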
Now let's walk through building the LSTM model step by step.

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
train_on_gpu = torch.cuda.is_available()  # decides below whether tensors and the model are moved to the GPU
batch_size = 54  # 54 divides both splits evenly (109242 and 46818), so every batch is full size
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True, num_workers=4)
print(len(train_loader))
print(len(val_loader))

class SentimentLSTM(nn.Module):  
	def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):    
		super(SentimentLSTM, self).__init__()    
		self.output_size = output_size    
		self.n_layers = n_layers    
		self.hidden_dim = hidden_dim    
		self.embedding = nn.Embedding(vocab_size, embedding_dim)    
		self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)    
		self.dropout = nn.Dropout(0.3)    
		self.fc = nn.Linear(hidden_dim, output_size)
	def forward(self, x, hidden):
		# x: (batch=54, time_step=30) integer word indices
		# embeds: (batch=54, time_step=30, embedding_dim=400)
		# hidden: a tuple (h, c), each of shape (n_layers=2, batch=54, hidden_size=256)
		# lstm_out: (batch=54, time_step=30, hidden_size=256)
		batch_size = x.size(0)    
		embeds = self.embedding(x)    
		lstm_out, hidden = self.lstm(embeds, hidden)  # hidden is the tuple (h_n, c_n)
		lstm_out = lstm_out.transpose(0, 1)
		lstm_out = lstm_out[-1]  # hidden state at the last time step, equal to h_n[-1, :, :]
		out = self.dropout(lstm_out)    
		out = self.fc(out)    
		return out, hidden
	def init_hidden(self, batch_size):    
		weight = next(self.parameters()).data    
		if (train_on_gpu):      
			hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())    
		else:   
			hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())            
		return hidden

vocab_size = len(word_set) + 1
output_size = 5
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
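As a quick sanity check (not in the original walkthrough), the number of trainable parameters can be printed; most of them sit in the vocab_size x embedding_dim embedding table:

# Total number of trainable parameters
print(sum(p.numel() for p in net.parameters() if p.requires_grad))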

lr = 0.003
criterion = nn.CrossEntropyLoss()  # takes raw class indices as targets, so no one-hot conversion is needed
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
epoch = 5
counter = 0
print_every = 100
clip = 5  # maximum gradient norm for clipping
if train_on_gpu:
	net.cuda()
net.train()
# Start training
for e in range(epoch):  
	h = net.init_hidden(batch_size)  
	for inputs, labels in train_loader:    
		counter += 1    
		if train_on_gpu:      
			inputs, labels = inputs.cuda(), labels.cuda()    
		h = tuple([each.data for each in h])  # detach the hidden state from the previous batch's graph
		net.zero_grad()
		output, h = net(inputs, h)
		loss = criterion(output, labels)
		loss.backward()
		nn.utils.clip_grad_norm_(net.parameters(), clip)  # gradient clipping
		optimizer.step()
		if counter % print_every == 0:
			val_h = net.init_hidden(batch_size)      
			val_losses = []      
			net.eval()      
			for inputs, labels in val_loader:        
				val_h = tuple([each.data for each in val_h])        
				if train_on_gpu:          
					inputs, labels = inputs.cuda(), labels.cuda()        
				output, val_h = net(inputs, val_h)        
				val_h = tuple([each.data for each in val_h])        
				val_loss = criterion(output, labels)        
				val_losses.append(val_loss.item())      
			net.train()      
			print("Epoch: {}/{}...".format(e+1, epoch),            
				"Step: {}...".format(counter),            
				"Loss: {:.6f}...".format(loss.item()),            
				"Val Loss: {:.6f}".format(np.mean(val_losses)))		