赞
踩
使用Pytorch模型和TorchText做情感分类(检测一段文字的情感是正面情绪,还是负面情绪)
IMDb 数据集,即电影评论。
TorchText中的一个重要的概念是Field,Field决定你的数据将会被怎样处理
在情感分类任务中,每一个样本包含文本内容与label(‘pos’ or ‘neg’)
Field的参数制订了数据将会被怎样处理
我们使用Text Field来处理电影的评论
使用Label Field处理label
我们使用Text Field带有的参数 tokenize='spacy',这表示我们使用spaCy的tokenizer来tokenize英文句子;如果我们不特别
声明tokenize这个参数,那么默认的分词方法是使用空格。
安装 spacy
import torch
from torchtext import data
SEED = 5200
torch.manual_seed(SEED)  # seed the CPU RNG for reproducibility
torch.cuda.manual_seed(SEED)  # seed the GPU RNG as well
torch.backends.cudnn.deterministic = True  # make cuDNN pick deterministic kernels so runs are repeatable
# NOTE(review): the auto-tuner mode mentioned in the original text is
# torch.backends.cudnn.benchmark = True; deterministic=True is the
# reproducibility flag, not a speed optimisation.
# Create the two Field objects that describe how the raw review text and the
# labels are tokenized / numericalized by torchtext.
TEXT = data.Field(tokenize='spacy')
# torchtext.data.Field defines the processing pipeline for a field (here, the review text)
LABEL = data.LabelField(dtype=torch.float)
# LabelField is a Field subclass specialised for label columns
TorchText支持很多常见的自然语言处理数据集
下面的代码会自动下载IMDB数据集,然后分成train/test两个torchtext.datasets类别
数据被前面的Fields处理,IMDB数据集一共有50000电影评论,每一个评论样本都有标注(pos/neg)
from torchtext import datasets
# Download (if needed) and load IMDB; examples are processed by the TEXT/LABEL
# fields defined above. 50k reviews in total, each labelled pos/neg.
train_data,test_data = datasets.IMDB.splits(TEXT,LABEL)
print("训练集样本数:",len(train_data))
print("测试集样本数:",len(test_data))
训练集样本数: 25000
测试集样本数: 25000
# Inspect one tokenized example: a dict with 'text' (token list) and 'label'.
print(vars(train_data.examples[0]))
{'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High', '.', 'A', 'classic', 'line', ':', 'INSPECTOR', ':', 'I', "'m", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', '.', 'STUDENT', ':', 'Welcome', 'to', 'Bromwell', 'High', '.', 'I', 'expect', 'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'Bromwell', 'High', 'is', 'far', 'fetched', '.', 'What', 'a', 'pity', 'that', 'it', 'is', "n't", '!'], 'label': 'pos'}
import random
# Split the original training set 70/30 into train / validation subsets.
# (random.seed(SEED) returns None; reproducibility comes from seeding the global RNG.)
train_data,val_data = train_data.split(random_state= random.seed(SEED),split_ratio=0.7)
print("训练集样本数:",len(train_data))
print("测试集样本数:",len(test_data))
# BUGFIX: the original printed the validation count under the label "测试集样本数"
# (test-set count); this line reports the validation set.
print("验证集样本数:",len(val_data))
训练集样本数: 17500
测试集样本数: 25000
测试集样本数: 7500
# Build the vocabulary from the training split only, capped at 25k words;
# out-of-vocabulary words get vectors drawn from a normal distribution.
TEXT.build_vocab(train_data,max_size = 25000,vectors = "glove.6B.100d",unk_init = torch.Tensor.normal_)
# The pretrained 100-d GloVe vectors (trained on a large corpus) are gathered
# for exactly the words of this corpus's vocabulary.
# Our movie-review corpus is small, so GloVe serves as a good initialisation
# that is then fine-tuned during training.
LABEL.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2
# The 20 most frequent tokens in the training corpus.
print(TEXT.vocab.freqs.most_common(20))
[('the', 204445), (',', 194169), ('.', 166818), ('a', 110203), ('and', 110084), ('of', 101748), ('to', 94276), ('is', 76826), ('in', 61696), ('I', 54333), ('it', 53667), ('that', 49587), ('"', 44463), ("'s", 43727), ('this', 42351), ('-', 37223), ('/><br', 36060), ('was', 35327), ('as', 30951), ('with', 30018)]
使用stoi(string to int )或者itos(int to string)来查看单词表
print(TEXT.vocab.itos[:10])
print("------"*10)
print(list(LABEL.vocab.stoi.items())) # only two label values
print("------"*10)
print(list(TEXT.vocab.stoi.items())[:20])
# More frequent words get smaller indices; indices 0 and 1 are reserved for
# <unk> and <pad> by default.
print("------"*10)
print(TEXT.vocab.freqs.most_common(20))
# Note that <unk> and <pad> do not appear in the frequency counts.
['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']
------------------------------------------------------------
[('pos', 0), ('neg', 1)]
------------------------------------------------------------
[('<unk>', 0), ('<pad>', 1), ('the', 2), (',', 3), ('.', 4), ('a', 5), ('and', 6), ('of', 7), ('to', 8), ('is', 9), ('in', 10), ('I', 11), ('it', 12), ('that', 13), ('"', 14), ("'s", 15), ('this', 16), ('-', 17), ('/><br', 18), ('was', 19)]
------------------------------------------------------------
[('the', 204445), (',', 194169), ('.', 166818), ('a', 110203), ('and', 110084), ('of', 101748), ('to', 94276), ('is', 76826), ('in', 61696), ('I', 54333), ('it', 53667), ('that', 49587), ('"', 44463), ("'s", 43727), ('this', 42351), ('-', 37223), ('/><br', 36060), ('was', 35327), ('as', 30951), ('with', 30018)]
# Label-to-index mapping (pos -> 0, neg -> 1 per the output below).
print(LABEL.vocab.stoi)
defaultdict(None, {'pos': 0, 'neg': 1})
# Batch the data. BucketIterator groups reviews of similar length into the
# same batch so that little padding is needed within each batch.
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)  # each batch.text tensor is seq_len * batch_size
# Iterator: the plain iterator.
# BPTTIterator: back-propagation-through-time iterator, mostly for language models.
# BucketIterator: batches examples of similar length together; each batch is
# padded only up to its own longest sequence, which is more efficient when
# lengths vary a lot. Field's fix_length argument can alternatively be used to
# truncate/pad every example to a fixed length.
# Pull one batch from the validation iterator and inspect it.
batch = next(iter(val_iterator))
print(batch.text.shape)
# Decode the first column (one review) back into words.
print([TEXT.vocab.itos[i] for i in batch.text[:, 0]])
# batch
torch.Size([38, 16])
['Please', 'avoid', 'this', 'movie', 'at', 'all', 'costs', '.', 'This', 'is', 'without', 'a', 'doubt', ',', 'the', 'worst', 'movie', 'I', "'ve", 'ever', 'seen', '.', 'Most', 'movies', 'have', 'at', 'least', 'one', 'redeeming', 'value', '.', 'This', 'has', 'none', '.', 'Totally', 'horrible', '!']
import torch
import torch.nn as nn
import torch.nn.functional as F
class WordAVGModel(nn.Module):
    """Word-averaging classifier: mean-pool the word embeddings, then a linear layer.

    Args:
        vocab_size: vocabulary size (25002 here).
        embedding_size: dimension of each word embedding (100 here).
        output_size: output dimension (1 logit for binary sentiment).
        pad_idx: embedding index of the <pad> token; its vector stays zero.
    """

    def __init__(self, vocab_size, embedding_size, output_size, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.linear = nn.Linear(embedding_size, output_size)

    def forward(self, text):
        # text: [seq_len, batch_size]
        embedded = self.embedding(text)       # [seq_len, batch_size, embedding_size]
        embedded = embedded.permute(1, 0, 2)  # [batch_size, seq_len, embedding_size]
        # Average over the sequence dimension with a (seq_len, 1) pooling window.
        # BUGFIX: squeeze(1) instead of a bare squeeze() — squeeze() would also
        # drop the batch dimension whenever batch_size == 1.
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        # pooled: [batch_size, embedding_size]
        return self.linear(pooled)            # [batch_size, output_size]
VOCAB_SIZE = len(TEXT.vocab)# 25002
EMBEDDING_SIZE = 100
OUTPUT_SIZE = 1 # single logit: above the threshold = positive, below = negative
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
# Instantiate the word-averaging model with the sizes derived above.
model = WordAVGModel(vocab_size=VOCAB_SIZE,
embedding_size = EMBEDDING_SIZE,
output_size=OUTPUT_SIZE,
pad_idx=PAD_IDX)
def count_parameters(model):
    """Return the number of trainable parameters in *model*."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
count_parameters(model)  # 25002*100 embedding + 100 weights + 1 bias = 2,500,301
2500301
# Initialise the embedding layer with the pretrained GloVe vectors.
pretrained_embeddings = TEXT.vocab.vectors # GloVe vectors gathered by build_vocab
model.embedding.weight.data.copy_(pretrained_embeddings) # trailing underscore = in-place copy
# The 25002 x 100 matrix comes from the "glove.6B.100d" vectors loaded above.
tensor([[-0.5037, 1.0386, 0.3613, ..., 0.7303, -0.7817, -0.1195],
[-1.0005, 0.5805, 1.1986, ..., 0.0986, -2.3706, -0.2373],
[-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],
...,
[ 0.0955, -0.3806, 0.8517, ..., -1.3654, 1.2483, -0.4644],
[ 0.1091, -1.5083, 0.1929, ..., 0.5623, 0.7806, -0.7162],
[ 0.3174, 0.4156, 0.9727, ..., -0.6083, -0.3764, 1.4375]])
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] # UNK_IDX=0
# Zero-initialise the two special tokens so they carry no pretrained signal.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
# Of the 25002 vocabulary rows, <unk> and <pad> also need EMBEDDING_SIZE-d vectors.
optimizer = torch.optim.Adam(model.parameters())# optimiser over all trainable params
crit = nn.BCEWithLogitsLoss()# binary cross-entropy with a built-in sigmoid
# BCEWithLogitsLoss combines sigmoid + BCE in one numerically stable op.
model = model.to(device) # move the model to the GPU if available
crit = crit.to(device)# move the loss module as well
def binary_acc(preds, y):
    """Fraction of predictions whose thresholded sigmoid matches the 0/1 label."""
    predicted_labels = torch.round(torch.sigmoid(preds))
    hits = (predicted_labels == y).float()
    return hits.sum() / len(hits)
def train(model, iterator, optimizer, crit):
    """Run one training epoch; return (mean loss, mean accuracy) over all samples."""
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0.
    model.train()  # enable training-mode behaviour (e.g. dropout)
    for batch in iterator:
        # Model output is [batch_size, 1]; squeeze dim 1 to match batch.label.
        # BUGFIX: squeeze(1) instead of a bare squeeze(), which would also drop
        # the batch dimension when the last batch has a single example.
        preds = model(batch.text).squeeze(1)
        loss = crit(preds, batch.label)
        acc = binary_acc(preds, batch.label)  # per-batch accuracy
        optimizer.zero_grad()  # clear accumulated gradients
        loss.backward()
        optimizer.step()
        # The loss/accuracy are batch means, so re-weight by the batch size to
        # accumulate totals over the whole epoch.
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
    return epoch_loss / total_len, epoch_acc / total_len
def evaluate(model, iterator, crit):
    """Evaluate the model on *iterator*; return (mean loss, mean accuracy).

    Improvement: runs under torch.no_grad() so no autograd graph is built
    during evaluation (less memory, faster).
    """
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0.
    model.eval()  # evaluation mode: disables dropout etc.
    with torch.no_grad():
        for batch in iterator:
            preds = model(batch.text).squeeze(1)  # [batch_size]
            loss = crit(preds, batch.label)
            acc = binary_acc(preds, batch.label)
            # loss/acc are batch means; re-weight by batch size
            epoch_loss += loss.item() * len(batch.label)
            epoch_acc += acc.item() * len(batch.label)
            total_len += len(batch.label)
    model.train()  # restore training mode for the caller, as the original did
    return epoch_loss / total_len, epoch_acc / total_len
N_EPOCHS = 10
best_val_acc = 0
for epoch in range(N_EPOCHS):
    # One pass over the training data, then measure on the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, crit)
    val_loss, val_acc = evaluate(model, val_iterator, crit)
    # Checkpoint whenever validation accuracy improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "WordAvg_model.pth")
    print("Epoch", epoch + 1, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch + 1, "Val Loss", val_loss, "Val Acc", val_acc)
Epoch 1 Train Loss 0.6450334824562073 Train Acc 0.7002857143129622
Epoch 1 Val Loss 0.4862055212974548 Val Acc 0.7748000000635783
.........
Epoch 10 Train Loss 0.13541345774754882 Train Acc 0.9627428571428571
Epoch 10 Val Loss 0.538427003543719 Val Acc 0.8973333333333333
# Restore the best checkpoint saved during training.
model.load_state_dict(torch.load("WordAvg_model.pth"))
IncompatibleKeys(missing_keys=[], unexpected_keys=[])
import spacy
nlp = spacy.load("en")  # NOTE(review): modern spaCy names this model "en_core_web_sm"
def predict_sentiment(sentence):
    """Classify *sentence* with the trained model; return a verdict string."""
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    token_ids = [TEXT.vocab.stoi[tok] for tok in tokens]  # map words to vocab indices
    tensor = torch.LongTensor(token_ids).to(device)       # [seq_len]
    tensor = tensor.unsqueeze(1)                          # add batch dim: [seq_len, 1]
    pred = torch.sigmoid(model(tensor))                   # probability in (0, 1)
    if pred.item() <= 0.5:
        return "Negative!!!"
    elif pred.item() > 0.5:
        return "Positive"
predict_sentiment("This film is terrific!")
'Negative!!!'
RNN模型经常会被用来编码一个sequence
$h_t = \text{RNN}(x_t, h_{t-1})$
使用最后一个隐藏层来表示整个句子
将隐藏层使用一个线性变换 $f$,然后用来预测句子的情感
双向RNN网络
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-LxZHfptg-1577278388518)(attachment:image.png)]
class RNNmodel(nn.Module):
    """Two-layer bidirectional LSTM sentiment classifier.

    Args:
        vocab_size: vocabulary size (25002 here).
        embedding_size: word-embedding dimension.
        output_size: output dimension (1 logit).
        pad_idx: index of the <pad> token.
        hidden_size: LSTM hidden-state dimension.
        dropout: dropout probability applied to embeddings and final state.
    """

    def __init__(self, vocab_size, embedding_size, output_size, pad_idx,
                 hidden_size, dropout):
        super(RNNmodel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            bidirectional=True, num_layers=2)
        # *2 because the forward and backward final states are concatenated;
        # unrelated to num_layers.
        self.linear = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: [seq_len, batch_size]
        embedded = self.dropout(self.embed(text))  # [seq_len, batch, embedding_size]
        output, (hidden, cell) = self.lstm(embedded)
        # output: [seq_len, batch, hidden_size * num_directions]
        # hidden/cell: [num_layers * num_directions, batch, hidden_size];
        # the last two entries are the top layer's two directions.
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        hidden = self.dropout(hidden.squeeze())
        # hidden: [batch_size, hidden_size * num_directions] — only the final
        # time-step state is used, so seq_len no longer matters here.
        return self.linear(hidden)  # [batch_size, output_size]
# Instantiate the bidirectional-LSTM classifier.
model = RNNmodel(vocab_size=VOCAB_SIZE,
embedding_size=EMBEDDING_SIZE,
output_size=OUTPUT_SIZE,
pad_idx=PAD_IDX,
hidden_size=100,
dropout=0.5)
# Sanity check: the pretrained matrix must match the embedding layer's shape.
TEXT.vocab.vectors.shape
torch.Size([25002, 100])
# Copy the pretrained GloVe vectors into the LSTM model's embedding layer.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)
tensor([[-0.5037, 1.0386, 0.3613, ..., 0.7303, -0.7817, -0.1195],
[-1.0005, 0.5805, 1.1986, ..., 0.0986, -2.3706, -0.2373],
[-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],
...,
[ 0.0955, -0.3806, 0.8517, ..., -1.3654, 1.2483, -0.4644],
[ 0.1091, -1.5083, 0.1929, ..., 0.5623, 0.7806, -0.7162],
[ 0.3174, 0.4156, 0.9727, ..., -0.6083, -0.3764, 1.4375]])
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# BUGFIX: the original zeroed PAD_IDX twice and never zeroed UNK_IDX;
# zero both special-token rows, matching the WordAVG section earlier.
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
def count_parameters(model):
    """Number of trainable parameters in *model*."""
    trainable = filter(lambda p: p.requires_grad, model.parameters())
    return sum(p.numel() for p in trainable)
count_parameters(model)  # trainable-parameter count of the LSTM model
2903601
optimizer = torch.optim.Adam(model.parameters())  # Adam over all trainable params
crit = nn.BCEWithLogitsLoss()  # sigmoid + binary cross-entropy in one stable op
model = model.to(device)  # move model to GPU if available
crit = crit.to(device)
N_EPOCHS = 10
best_val_acc = 0
for epoch in range(N_EPOCHS):
    # Train one epoch, then evaluate on the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, crit)
    val_loss, val_acc = evaluate(model, val_iterator, crit)
    # Checkpoint whenever validation accuracy improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "lstm_model.path")
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Val Loss", val_loss, "Val Acc", val_acc)
# Inspect the raw LSTM outputs for one batch.
outputs, (hidden, cell) = model.lstm(model.embed(batch.text))
outputs.shape # (seq_len, batch, num_directions * hidden_size)
# Restore the best LSTM checkpoint saved during training.
model.load_state_dict(torch.load("lstm_model.path"))
import spacy
nlp = spacy.load("en")  # NOTE(review): modern spaCy names this model "en_core_web_sm"
def predict_sentiment(sentence):
    """Return the model's positive-sentiment probability for *sentence*."""
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    token_ids = [TEXT.vocab.stoi[tok] for tok in tokens]
    tensor = torch.LongTensor(token_ids).to(device)  # [seq_len]
    tensor = tensor.unsqueeze(1)                     # [seq_len, 1] — batch of one
    pred = torch.sigmoid(model(tensor))
    return pred.item()
class CNN(nn.Module):
    """Single-filter-width text CNN (Kim-2014 style) for sentiment classification.

    Args:
        vocab_size: vocabulary size.
        embedding_size: word-embedding dimension.
        output_size: output dimension (1 logit).
        pad_idx: index of the <pad> token.
        num_filters: number of convolutional filters.
        filter_sizes: the (single) filter width, e.g. 3.
        dropout: dropout probability on the pooled features.
    """

    def __init__(self, vocab_size, embedding_size, output_size, pad_idx,
                 num_filters, filter_sizes, dropout):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.conv = nn.Conv2d(in_channels=1, out_channels=num_filters,
                              kernel_size=(filter_sizes, embedding_size))
        # BUGFIX: the classifier receives `num_filters` pooled features, not
        # `embedding_size` ones — the original only worked because both were 100.
        self.linear = nn.Linear(num_filters, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        text = text.permute(1, 0)             # [batch, seq_len]
        embedded = self.embed(text)           # [batch, seq_len, embedding_size]
        embedded = embedded.unsqueeze(1)      # [batch, 1, seq_len, embedding_size]
        conved = F.relu(self.conv(embedded))  # [batch, num_filters, seq_len-fs+1, 1]
        conved = conved.squeeze(3)            # [batch, num_filters, seq_len-fs+1]
        # Max-over-time pooling: keep the strongest activation per filter.
        pooled = F.max_pool1d(conved, conved.shape[2]).squeeze(2)  # [batch, num_filters]
        pooled = self.dropout(pooled)
        return self.linear(pooled)            # [batch, output_size]
# Instantiate the single-filter-width CNN (filter width 3, 100 filters).
model = CNN(vocab_size = VOCAB_SIZE,
embedding_size = EMBEDDING_SIZE,
output_size = OUTPUT_SIZE,
pad_idx = PAD_IDX,
num_filters = 100,
filter_sizes = 3,
dropout = 0.5)
# Initialise the CNN's embedding layer from the pretrained GloVe vectors.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)
tensor([[-0.5037, 1.0386, 0.3613, ..., 0.7303, -0.7817, -0.1195],
[-1.0005, 0.5805, 1.1986, ..., 0.0986, -2.3706, -0.2373],
[-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],
...,
[ 0.0955, -0.3806, 0.8517, ..., -1.3654, 1.2483, -0.4644],
[ 0.1091, -1.5083, 0.1929, ..., 0.5623, 0.7806, -0.7162],
[ 0.3174, 0.4156, 0.9727, ..., -0.6083, -0.3764, 1.4375]])
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# BUGFIX: the original zeroed PAD_IDX twice and never zeroed UNK_IDX;
# zero both special-token rows, matching the WordAVG section earlier.
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
def count_parameters(model):
    """Total size of all trainable parameter tensors in *model*."""
    sizes = [p.numel() for p in model.parameters() if p.requires_grad]
    return sum(sizes)
count_parameters(model)  # trainable-parameter count of the CNN model
2530401
optimizer = torch.optim.Adam(model.parameters())  # Adam over all trainable params
crit = nn.BCEWithLogitsLoss()  # sigmoid + binary cross-entropy in one stable op
model = model.to(device)  # move model to GPU if available
crit = crit.to(device)
N_EPOCHS = 10
best_val_acc = 0
for epoch in range(N_EPOCHS):
    # Train one epoch, then evaluate on the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, crit)
    val_loss, val_acc = evaluate(model, val_iterator, crit)
    # Checkpoint whenever validation accuracy improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "CNN_Single_Conv_model.path")
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Val Loss", val_loss, "Val Acc", val_acc)
Epoch 0 Train Loss 0.0840941589936614 Train Acc 0.9708
Epoch 0 Val Loss 0.5070471989204486 Val Acc 0.8509333333333333
.........
Epoch 9 Train Loss 0.01795802642274804 Train Acc 0.994
Epoch 9 Val Loss 1.0368120583824503 Val Acc 0.8441333333333333
# Restore the best CNN checkpoint saved during training.
model.load_state_dict(torch.load("CNN_Single_Conv_model.path"))
IncompatibleKeys(missing_keys=[], unexpected_keys=[])
import spacy
nlp = spacy.load("en")  # NOTE(review): modern spaCy names this model "en_core_web_sm"
def predict_sentiment(sentence):
    """Classify *sentence*; return "Positive!!!" or "Negative!!!"."""
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    token_ids = [TEXT.vocab.stoi[tok] for tok in tokens]
    tensor = torch.LongTensor(token_ids).to(device).unsqueeze(1)  # [seq_len, 1]
    prob = torch.sigmoid(model(tensor)).item()  # probability in (0, 1)
    if prob <= 0.5:
        return "Negative!!!"
    elif prob > 0.5:
        return "Positive!!!"
predict_sentiment("This film is terrific!")  # clearly positive input — sanity check
'Negative!!!'
predict_sentiment("This film is happy!")  # another positive phrasing
'Negative!!!'# 判断错误
class CNN(nn.Module):
    """Text CNN with several parallel filter widths (e.g. 3, 4 and 5)."""

    def __init__(self, vocab_size, embedding_size, output_size, pad_idx,
                 num_filters, filter_sizes, dropout):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        # One Conv2d per filter width; each slides an (fs x embedding_size)
        # window down the sentence matrix.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters,
                      kernel_size=(fs, embedding_size))
            for fs in filter_sizes
        ])
        self.linear = nn.Linear(num_filters * len(filter_sizes), output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        batch_first = text.permute(1, 0)                 # [batch, seq_len]
        embedded = self.embed(batch_first).unsqueeze(1)  # [batch, 1, seq_len, emb]
        feature_maps = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-over-time pooling per filter width, then concatenate.
        pooled = [F.max_pool1d(fm, fm.shape[2]).squeeze(2) for fm in feature_maps]
        features = torch.cat(pooled, dim=1)              # [batch, len(filter_sizes)*num_filters]
        features = self.dropout(features)
        return self.linear(features)
class CNN(nn.Module):
    """Multi-width text CNN (a duplicate of the class above, kept as in the original)."""

    def __init__(self, vocab_size, embedding_size, output_size, pad_idx,
                 num_filters, filter_sizes, dropout):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.convs = nn.ModuleList(
            nn.Conv2d(in_channels=1, out_channels=num_filters,
                      kernel_size=(width, embedding_size))
            for width in filter_sizes
        )
        self.linear = nn.Linear(num_filters * len(filter_sizes), output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # Convert to batch-first, embed, and add a channel dimension.
        embedded = self.embed(text.permute(1, 0)).unsqueeze(1)  # [batch, 1, seq, emb]
        activations = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-over-time pooling keeps the strongest response of each filter.
        pooled = [F.max_pool1d(a, a.shape[2]).squeeze(2) for a in activations]
        combined = torch.cat(pooled, dim=1)  # [batch, num_filters * len(filter_sizes)]
        return self.linear(self.dropout(combined))
# Instantiate the multi-width CNN: 100 filters each of widths 3, 4 and 5.
model = CNN(vocab_size=VOCAB_SIZE,
embedding_size=EMBEDDING_SIZE,
output_size=OUTPUT_SIZE,
pad_idx=PAD_IDX,
num_filters=100,
filter_sizes=[3,4,5],
dropout=0.5)
# Initialise the embedding layer from the pretrained GloVe vectors.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)
tensor([[-0.5037, 1.0386, 0.3613, ..., 0.7303, -0.7817, -0.1195],
[-1.0005, 0.5805, 1.1986, ..., 0.0986, -2.3706, -0.2373],
[-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],
...,
[ 0.0955, -0.3806, 0.8517, ..., -1.3654, 1.2483, -0.4644],
[ 0.1091, -1.5083, 0.1929, ..., 0.5623, 0.7806, -0.7162],
[ 0.3174, 0.4156, 0.9727, ..., -0.6083, -0.3764, 1.4375]])
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# BUGFIX: the original zeroed PAD_IDX twice and never zeroed UNK_IDX;
# zero both special-token rows, matching the WordAVG section earlier.
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
def count_parameters(model):
    """Sum of numel() over every parameter that requires gradients."""
    return sum(param.numel()
               for param in model.parameters()
               if param.requires_grad)
count_parameters(model)  # trainable-parameter count of the multi-width CNN
2620801
optimizer = torch.optim.Adam(model.parameters())  # Adam over all trainable params
crit = nn.BCEWithLogitsLoss()  # sigmoid + binary cross-entropy in one stable op
model = model.to(device)  # move model to GPU if available
crit = crit.to(device)
N_EPOCHS = 10
best_val_acc = 0
for epoch in range(N_EPOCHS):
    # Train one epoch, then evaluate on the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, crit)
    val_loss, val_acc = evaluate(model, val_iterator, crit)
    # Checkpoint whenever validation accuracy improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "CNN_B_model.path")
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Val Loss", val_loss, "Val Acc", val_acc)
Epoch 0 Train Loss 0.5881891923359462 Train Acc 0.6718285714558193
Epoch 0 Val Loss 0.3878168618520101 Val Acc 0.8290666666666666
.........
Epoch 9 Train Loss 0.02244742302199008 Train Acc 0.9921714285714286
Epoch 9 Val Loss 0.7518003792511765 Val Acc 0.8646666666666667
随机隐去一些单词
import torch
import torchtext
from torchtext import data
SEED = 5200
torch.manual_seed(SEED)  # seed CPU RNG
torch.cuda.manual_seed((SEED))  # seed GPU RNG
torch.backends.cudnn.deterministic = True  # reproducible cuDNN kernels
# Rebuild the fields for the masked-average experiment. LABEL has no
# dtype=torch.float here, so labels are converted with .float() explicitly in
# the train/evaluate functions further down.
TEXT = data.Field(tokenize="spacy")
LABEL = data.LabelField()
TorchText支持很多常见的自然语言处理数据集
下面的代码会自动下载IMDB数据集,然后分成train/test两个torchtext.datasets类别,数据被前面的Fields处理,
IMDB数据集一共有50000电影评论,每一个评论都被标注为pos/neg
from torchtext import datasets
# Download/load IMDB again, processed by the freshly defined fields.
train_data,test_data = datasets.IMDB.splits(TEXT,LABEL)
创建一个val_data数据集,对train数据集按照train:val = 8:2进行切分
import random
# 80/20 train/validation split.
# NOTE(review): random.seed(SEED) returns None, so split() receives
# random_state=None; reproducibility comes from seeding the global RNG.
train_data,val_data = train_data.split(random_state = random.seed(SEED),split_ratio=0.8)
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(val_data)}')
print(f'Number of testing examples: {len(test_data)}')
Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000
创建vocabulary,将每一个单词映射成一个数字
使用最常见的25k个单词来构建词汇表,将max_size设置成25000,所有的其他的单词都使用unk来表示
# Vocabulary capped at 25k words, initialised from GloVe; out-of-vocabulary
# words get normally-distributed vectors.
TEXT.build_vocab(train_data,max_size = 25000,vectors = "glove.6B.100d",unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2
每个iterator都会返回一个batch的examples
使用BucketIterator,将长度差不多的句子放到一个batch中,确保每个句子当中不会出现太多的padding
之前的代码将pad当做了模型的输入进行训练,更好的做法是在模型中将由产生的输出给消除掉
BATCH_SIZE = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    repeat=False
)
# Grab one batch and build a padding mask for it (True where text == <pad>).
batch = next(iter(train_iterator))
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
mask = batch.text == PAD_IDX
import torch.nn as nn
import torch.nn.functional as F


class WordAVGModel(nn.Module):
    """Masked word-averaging classifier (batch-first variant).

    The mask zeroes out <pad> positions so that padding never contributes to
    the sentence average.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text, mask):
        # text: [batch_size, seq_len]; mask: same shape, 1 = real token, 0 = <pad>
        embedded = self.embedding(text)       # [batch_size, seq_len, emb_dim]
        masked = embedded * mask.unsqueeze(2)
        # Sum the real tokens and divide by each sentence's true length.
        sent_embed = torch.sum(masked, 1) / mask.sum(1).unsqueeze(1)
        return self.fc(sent_embed)            # [batch_size, output_dim]
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
# Instantiate the masked word-averaging model defined above.
model = WordAVGModel(INPUT_DIM,EMBEDDING_DIM,OUTPUT_DIM,PAD_IDX)
def count_parameters(model):
    """Count the trainable parameters of *model*."""
    count = 0
    for weight in model.parameters():
        if weight.requires_grad:
            count += weight.numel()
    return count
# Report the trainable-parameter count (embeddings dominate: 25002*100).
print(f'The model has {count_parameters(model):,} trainable parameters')
The model has 2,500,301 trainable parameters
# Load GloVe vectors into the embedding layer and zero the special tokens' rows.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
import torch.optim as optim
optimizer = optim.Adam(model.parameters())  # Adam over all trainable params
criterion = nn.BCEWithLogitsLoss()  # sigmoid + binary cross-entropy, numerically stable
model = model.to(device)
criterion = criterion.to(device)
def binary_accuracy(preds, y):
    """Accuracy of thresholded sigmoid predictions against 0/1 labels *y*."""
    hard_preds = torch.round(torch.sigmoid(preds))
    matches = (hard_preds == y).float()
    return matches.sum() / len(matches)
def train(model, iterator, optimizer, criterion):
    """Run one training epoch; return the per-batch mean loss and accuracy."""
    running_loss, running_acc = 0, 0
    model.train()
    for step, batch in enumerate(iterator):
        optimizer.zero_grad()
        text = batch.text.permute(1, 0)        # [batch_size, seq_len]
        # mask is 1 for real tokens and 0 for <pad> positions
        mask = 1. - (text == PAD_IDX).float()
        predictions = model(text, mask).squeeze(1)
        loss = criterion(predictions, batch.label.float())
        acc = binary_accuracy(predictions, batch.label.float())
        loss.backward()
        optimizer.step()
        if step % 100 == 0:
            print("batch {}, loss {}".format(step, loss.item()))
        running_loss += loss.item()
        running_acc += acc.item()
    return running_loss / len(iterator), running_acc / len(iterator)
def evaluate(model, iterator, criterion):
    """Evaluate on *iterator* without gradient tracking; return mean loss/accuracy."""
    total_loss, total_acc = 0, 0
    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(iterator):
            text = batch.text.permute(1, 0)        # [batch_size, seq_len]
            mask = 1. - (text == PAD_IDX).float()  # zero out <pad> positions
            predictions = model(text, mask).squeeze(1)
            loss = criterion(predictions, batch.label.float())
            acc = binary_accuracy(predictions, batch.label.float())
            if step % 100 == 0:
                print("batch {}, loss {}".format(step, loss.item()))
            total_loss += loss.item()
            total_acc += acc.item()
    return total_loss / len(iterator), total_acc / len(iterator)
import time
def epoch_time(start_time, end_time):
    """Split an elapsed wall-clock interval into whole minutes and seconds."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds
N_EPOCHS = 10
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Keep the checkpoint with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'wordavg-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
batch 0, loss 0.7043583393096924 batch 100, loss 0.6792033314704895 batch 200, loss 0.6611101627349854 batch 300, loss 0.6489359140396118 batch 0, loss 0.5432980060577393 Epoch: 01 | Epoch Time: 0m 6s Train Loss: 0.658 | Train Acc: 69.58% Val. Loss: 0.603 | Val. Acc: 76.94% batch 0, loss 0.5841051340103149 batch 100, loss 0.5488921403884888 batch 200, loss 0.4851570129394531 batch 300, loss 0.41645267605781555 batch 0, loss 0.34798464179039 Epoch: 02 | Epoch Time: 0m 6s Train Loss: 0.519 | Train Acc: 81.97% Val. Loss: 0.451 | Val. Acc: 83.96% batch 0, loss 0.3906698524951935 batch 100, loss 0.4007859230041504
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。