NLP学习5——kaggle比赛入门之烂番茄电影评论情感分析

作者：IT小白 | 2024-04-18 03:33:58

踩

烂番茄电影评论情感分析

该项目是我在谷歌的Colab平台上完成的。首先查看数据：

import os

import pandas as pd

# Switch into the project folder so the relative file names below resolve.
os.chdir("drive/Colab Notebooks/NLP/Rotten Tomatoes movie review")

# Both data files are tab-separated. The training frame is later read through
# its "Phrase" and "Sentiment" columns.
train = pd.read_csv("train.tsv", sep='\t')
test = pd.read_csv("test.tsv", sep='\t')

# Quick look at the first rows of the training data.
print(train.head(5))
2
3
4
5

在这里插入图片描述

import re
from nltk.corpus import stopwords
# 定义清洗函数
# Lazily filled cache of the English stop-word set, so the NLTK corpus is
# read once instead of once per review.
_ENGLISH_STOPS = None


def review_to_words(raw_review, stops=None):
    """Return a cleaned, space-joined version of a raw review string.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and stop words are dropped.

    Args:
        raw_review: The review text to clean.
        stops: Optional collection of stop words to remove. When omitted, the
            NLTK English stop-word list is used (loaded on first use and then
            cached).

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    global _ENGLISH_STOPS
    if stops is None:
        if _ENGLISH_STOPS is None:
            # The first call pays for the corpus read; later calls reuse it.
            _ENGLISH_STOPS = frozenset(stopwords.words('english'))
        stops = _ENGLISH_STOPS
    letters_only = re.sub(r'[^a-zA-Z]', ' ', raw_review)  # keep letters only
    words = letters_only.lower().split()
    return " ".join(w for w in words if w not in stops)
# Data cleaning and feature construction for the training set.
import nltk
import numpy as np
from sklearn.model_selection import train_test_split

nltk.download('stopwords')  # stop-word corpus used by review_to_words

clean_train_reviews = []
num_reviews = train["Phrase"].size
for i, phrase in enumerate(train["Phrase"], start=1):
    if i % 10000 == 0:
        print("Review %d of %d\n" % (i, num_reviews))
    clean_train_reviews.append(review_to_words(phrase))

# Inspect the first cleaned review.
print(clean_train_reviews[0])
# Recorded output of the original run:
# 'series escapades demonstrating adage good goose also good gander occasionally amuses none amounts much story'

# Tokenize each cleaned review once, keeping one token list per review so the
# data keeps the shape of the original rows.
nltk.download('punkt')
tokenizer = nltk.word_tokenize
word_df = [tokenizer(review) for review in clean_train_reviews]

# Flat list of every token, and the vocabulary built from it.
# Sorting makes the word ids reproducible between runs (plain set order is not).
word = [w for tokens in word_df for w in tokens]
word_set = sorted(set(word))  # original run: len(word_set) = 14992

# Map each word to an integer id (not a one-hot vector). Ids start at 1 so
# that 0 stays reserved for padding; a dict lookup also replaces the O(V)
# list searches.
word_to_idx = {w: idx for idx, w in enumerate(word_set, start=1)}
word_df = [[word_to_idx[w] for w in tokens] for tokens in word_df]

# Longest sequence: every sequence is padded to this length because the LSTM
# is fed fixed-length batches.
len_list = [len(tokens) for tokens in word_df]
max_len = max(len_list)

# Right-pad with zeros to obtain the final feature matrix.
features = np.zeros((len(word_df), max_len), dtype=np.int64)
for i, tokens in enumerate(word_df):
    if tokens:  # reviews that became empty after cleaning stay all-padding
        features[i, :len(tokens)] = tokens
train_labels = np.array(train['Sentiment'])  # labels for the training data

# Split into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(features, train_labels, test_size=0.3, random_state=42)
print('\t\t\tFeature Shape:')
print('Train Set:\t\t{}'.format(X_train.shape), '\nVal Set:\t\t{}'.format(X_val.shape))
# Recorded output of the original run:
#			Feature Shape:
# Train Set:		(109242, 30)
# Val Set:		(46818, 30)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

下面构造LSTM模型，我之前的博客中对LSTM做过分析，这里我再次做一遍复习。
首先，我之前的博客中列出了LSTM的公式：公式中有4组权重，对应还有4组偏置（公式中没有体现），它们就是LSTM需要训练的参数。注意到每组权重都包含两个矩阵：一个与输入相乘，另一个与隐藏状态相乘，因此二者的维度不同。下面通过一段代码查看LSTM各参数的形状。

import torch.nn as nn

# Build a 2-layer LSTM and print the shape of every trainable parameter.
rnn = nn.LSTM(input_size=400, hidden_size=256, num_layers=2)
for p in rnn.parameters():
    print(p.size())
# Printed output (4 gates x hidden_size 256 = 1024 rows per tensor):
# Layer 1
#   torch.Size([1024, 400])   # weight_ih_l0: input-to-hidden weights
#   torch.Size([1024, 256])   # weight_hh_l0: hidden-to-hidden weights
#   torch.Size([1024])        # bias_ih_l0
#   torch.Size([1024])        # bias_hh_l0
# Layer 2
#   torch.Size([1024, 256])   # weight_ih_l1: its input is layer 1's hidden state
#   torch.Size([1024, 256])   # weight_hh_l1
#   torch.Size([1024])        # bias_ih_l1
#   torch.Size([1024])        # bias_hh_l1
2
3
4
5
6
7
8
9
10
11
12

首先看第一层。torch.Size([1024, 400])是与输入相乘的权重：每个门对应的权重大小是(256, 400)，4个门的权重沿第0维（行方向）拼接在一起，所以总大小是(1024, 400)。对应的公式如下，其中$o$代表4个公式各自的输出：
$o_{\text{hidden\_size}\times\text{batch\_size}}=W_{\text{hidden\_size}\times\text{input\_size}}\,X_{\text{input\_size}\times\text{batch\_size}}$
而与隐藏状态相乘的权重大小是(256, 256)，推导过程与上式类似；4个门拼接后即为torch.Size([1024, 256])。
第二层的输入是第一层在每个时间步输出的隐藏状态，其维度是256，而隐藏层节点个数也是256，所以第二层的两个权重大小都是(1024, 256)，如上所示。
下面具体分析LSTM模型的搭建过程：

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

batch_size = 54

# Wrap the NumPy feature/label arrays as tensor datasets.
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))

# drop_last=True: the LSTM hidden state is allocated for exactly `batch_size`
# rows, so a shorter final batch would fail with a shape mismatch. With the
# recorded split sizes (109242 and 46818 rows, both multiples of 54) nothing
# is dropped.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
# Validation only measures the loss, so shuffling is unnecessary.
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=4, drop_last=True)

# Number of batches per pass over each loader.
print(len(train_loader))
print(len(val_loader))

class SentimentLSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> dropout -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_size: Number of output scores produced per sample.
        embedding_dim: Size of each word embedding (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout applied between the stacked LSTM layers.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        """Score a batch of index sequences.

        Args:
            x: Integer word ids of shape (batch, time_step).
            hidden: Tuple (h_0, c_0), each of shape
                (n_layers, batch, hidden_dim).

        Returns:
            A tuple (out, hidden): `out` has shape (batch, output_size) and
            `hidden` is the (h_n, c_n) tuple returned by the LSTM.
        """
        embeds = self.embedding(x)                    # (batch, time_step, embedding_dim)
        lstm_out, hidden = self.lstm(embeds, hidden)  # (batch, time_step, hidden_dim)
        # Output of the last time step; for the top layer this equals h_n[-1].
        # NOTE(review): with zero right-padded inputs the last step is often a
        # padding position - confirm this is intended.
        last_step = lstm_out[:, -1, :]
        out = self.fc(self.dropout(last_step))
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed (h_0, c_0) for `batch_size` sequences.

        The tensors are created with the dtype and device of the model's
        parameters, so they follow the model to the GPU without relying on
        any global flag.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

# Model hyper-parameters.
vocab_size = len(word_set) + 1  # one extra embedding row beyond the vocabulary
output_size = 5                 # number of scores per sample (presumably 5 sentiment classes - confirm)
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

# Optimisation settings.
lr = 0.003
# CrossEntropyLoss takes raw scores and integer class labels directly, so the
# labels need no manual one-hot encoding.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
epoch = 5
counter = 0
print_every = 100
clip = 5  # maximum gradient norm

# Use the GPU when one is available.
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    net.cuda()
net.train()

# Training loop.
for e in range(epoch):
    for inputs, labels in train_loader:
        counter += 1
        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()
        # Fresh zero state per batch: batches are shuffled, independent
        # reviews, so carrying state between them mixes unrelated samples.
        # Sizing it from the batch also tolerates a short final batch.
        h = net.init_hidden(inputs.size(0))
        optimizer.zero_grad()
        output, _ = net(inputs, h)
        loss = criterion(output, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)  # gradient clipping
        optimizer.step()

        if counter % print_every == 0:
            # Periodic validation pass: no gradients needed, dropout disabled.
            val_losses = []
            net.eval()
            with torch.no_grad():
                for val_inputs, val_labels in val_loader:
                    if train_on_gpu:
                        val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()
                    val_h = net.init_hidden(val_inputs.size(0))
                    val_output, _ = net(val_inputs, val_h)
                    val_loss = criterion(val_output, val_labels)
                    val_losses.append(val_loss.item())
            net.train()
            print("Epoch: {}/{}...".format(e+1, epoch),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/IT小白/article/detail/443822