import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import spacy
from spacy.lang.en import English
import random
import torch.nn as nn
import torch.nn.functional as F

SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
nlp = English()  # choose the tokenizer
TEXT = data.Field(tokenize=nlp)
# TEXT = data.Field(lower= True)
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)  # load the IMDB dataset
train_data, valid_data = train_data.split(split_ratio=0.7, random_state=random.seed(SEED))  # split a validation set off the training set
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000
print(vars(train_data[0]),'\n',len(vars(train_data[0])['text']))
print(vars(train_data[0])['text'][0],'\n',type((vars(train_data[0])['text'][0])))
{'text': This movie has got to be one of the worst I have ever seen make it to DVD!!! The story line might have clicked if the film had more funding and writers that would have cut the nonsense and sickly scenes that I highly caution parents on.... But the story line is like a loose cannon. If there was such a thing as a drive thru movie maker-this one would have sprung from that.It reminded me a lot of the quickie films that were put out in the 1960's, poor script writing and filming. <br /><br />The only sensible characters in the whole movie was the bartender and beaver. The rest of the film, could have easily been made by middle school children. I give this film a rating of 1 as it is truly awful and left my entire family with a sense of being cheated. My advice-Don't Watch It!!!, 'label': 'neg'}
173
This
<class 'spacy.tokens.token.Token'>
# As shown above, tokenizing with English() yields spacy Token objects rather than
# plain strings, so convert every token in each split to str.
for ds in [train_data, valid_data, test_data]:
    for i in range(len(ds)):
        a = ds[i]
        a.text = [str(j) for j in a.text]
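An alternative (a minimal sketch, not what this notebook does) is to hand the Field a tokenizer function that already returns strings, so no post-hoc conversion is needed:

# Hypothetical alternative: tokenize straight to plain strings.
def tokenize_en(sentence):
    return [tok.text for tok in nlp(sentence)]

# TEXT = data.Field(tokenize=tokenize_en)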
# build the vocabulary
TEXT.build_vocab(train_data, max_size=25000,vectors='glove.6B.100d',unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
print(len(TEXT.vocab),len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(20))
25002 2
[('the', 203566), (',', 192495), ('.', 165539), ('and', 109443), ('a', 109116), ('of', 100702), ('to', 93766), ('is', 76328), ('in', 61255), ('I', 54004), ('it', 53508), ('that', 49187), ('"', 44285), ("'s", 43329), ('this', 42445), ('-', 37165), ('/><br', 35752), ('was', 35034), ('as', 30384), ('with', 29774)]
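The vocab maps tokens to indices and back; a quick look in both directions (assuming the vocab built above):

print(TEXT.vocab.itos[:10])      # first 10 entries; index 0 is <unk>, index 1 is <pad>
print(TEXT.vocab.stoi['the'])    # index of a frequent word
print(TEXT.vocab.stoi['<pad>'])  # index of the padding token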
# build the iterators (essentially dataloaders)
batch_size = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    device=device)
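Each batch yields a text tensor of shape (seq_length, batch_size) plus a label vector; a quick shape check (assuming the iterators above) might look like:

# Peek at one batch to confirm the (seq_length, batch_size) layout.
batch = next(iter(train_iterator))
print(batch.text.shape)    # e.g. torch.Size([some_seq_length, 64])
print(batch.label.shape)   # torch.Size([64])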
class WordAvgModel(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, pad_idx):
        super(WordAvgModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.linear = nn.Linear(embed_size, output_size)

    def forward(self, text):
        # text has size (seq_length, batch_size), i.e. each column is one sentence
        # (sentences shorter than seq_length are filled with the pad token)
        embedded = self.embed(text)           # (seq_length, batch_size, embed_size)
        embedded = embedded.permute(1, 0, 2)  # swap the first two dims -> (batch_size, seq_length, embed_size)
        # avg_pool2d with a (seq_length, 1) kernel averages over the sequence dimension,
        # giving (batch_size, 1, embed_size); squeezing drops the 1 -> (batch_size, embed_size)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        return self.linear(pooled)
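To see the pooling shape arithmetic concretely, here is a standalone check with a dummy tensor (random data, not the IMDB embeddings):

import torch
import torch.nn.functional as F

# Dummy "embedded" batch: 4 sentences of length 7 with 100-dim embeddings.
dummy = torch.randn(4, 7, 100)                     # (batch_size, seq_length, embed_size)
pooled = F.avg_pool2d(dummy, (dummy.shape[1], 1))  # kernel (seq_length, 1) averages over the sequence
print(pooled.shape)                                # torch.Size([4, 1, 100])
print(pooled.squeeze(1).shape)                     # torch.Size([4, 100])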
vocab_size = len(TEXT.vocab)
embed_size = 100
output_size = 1
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
avg_model = WordAvgModel(vocab_size=vocab_size,embed_size=embed_size,
output_size=output_size,pad_idx=pad_idx)
avg_model.to(device)
WordAvgModel(
(embed): Embedding(25002, 100, padding_idx=1)
(linear): Linear(in_features=100, out_features=1, bias=True)
)
num_parameters = sum(p.numel() for p in avg_model.parameters() if p.requires_grad)
print(num_parameters)  # number of trainable parameters
2500301
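The count checks out by hand: the embedding contributes 25002 x 100 = 2,500,200 weights and the linear layer 100 + 1 = 101, giving 2,500,301 in total.

# Sanity check of the parameter count (uses the sizes defined above):
embedding_params = 25002 * 100           # vocab_size x embed_size
linear_params = 100 * 1 + 1              # weights + bias
print(embedding_params + linear_params)  # 2500301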
pretrained_embed = TEXT.vocab.vectors
# pretrained_embed has size (25002, 100): 25002 is the number of entries in the vocab,
# and 100 comes from setting vectors='glove.6B.100d' (100-dimensional) when building the vocab
avg_model.embed.weight.data.copy_(pretrained_embed)
tensor([[-0.6946, 0.0269, 0.0063, ..., 1.2692, -1.3969, -0.4796],
[-2.2822, 0.1412, -1.3277, ..., -0.0465, -1.0185, -0.1024],
[-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],
...,
[-0.3617, 0.6201, 0.1105, ..., 0.2994, -0.5920, 1.0949],
[-0.3312, 0.9364, -0.1638, ..., 0.9859, -1.0950, -1.1516],
[-0.1954, 0.5692, -0.0671, ..., 0.2170, 0.7001, -0.1479]],
device='cuda:0')
avg_model.embed.weight.data.size()
# embed.weight has size (25002, 100); each row represents one word in the vocab.
# The first entry is <unk> and the second is <pad>;
# these two rows are usually initialised to zero.
torch.Size([25002, 100])
avg_model.embed.weight.data[pad_idx] = torch.zeros(embed_size)
avg_model.embed.weight.data[unk_idx] = torch.zeros(embed_size)
# avg_model.embed.weight.data[pad_idx] is exactly the row that represents <pad>
def train(model, dataset, optimizer, loss_fn):
    epoch_loss, epoch_count, epoch_acc_count = 0., 0., 0.
    model.train()
    for batch in dataset:
        preds = model(batch.text).squeeze()   # model output is (batch_size, 1); squeeze to (batch_size,)
        loss = loss_fn(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * len(batch.label)       # accumulate the loss over the whole epoch
        epoch_count += len(batch.label)                    # number of samples seen in the epoch
        epoch_acc_count += acc.item() * len(batch.label)   # number of correct predictions in the epoch
    return epoch_loss / epoch_count, epoch_acc_count / epoch_count

def evaluate(model, dataset, loss_fn):
    epoch_loss, epoch_count, epoch_acc_count = 0., 0., 0.
    model.eval()
    with torch.no_grad():
        for batch in dataset:
            preds = model(batch.text).squeeze()   # (batch_size, 1) -> (batch_size,)
            loss = loss_fn(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)
            epoch_loss += loss.item() * len(batch.label)
            epoch_count += len(batch.label)
            epoch_acc_count += acc.item() * len(batch.label)
    return epoch_loss / epoch_count, epoch_acc_count / epoch_count

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    num_correct = (rounded_preds == y).float()
    acc = num_correct.sum() / len(y)
    return acc
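A quick standalone check of binary_accuracy on made-up logits, just to illustrate the rounding (assumes the function defined above):

# Two of the three rounded predictions match the labels -> accuracy 2/3.
preds = torch.tensor([2.0, -1.5, 0.3])   # raw logits
labels = torch.tensor([1.0, 0.0, 0.0])
print(binary_accuracy(preds, labels))    # tensor(0.6667)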
optimizer = torch.optim.Adam(avg_model.parameters(), lr=0.005)
loss_fn = nn.BCEWithLogitsLoss()
epochs = 10
best_valid_acc = 0.
for epoch in range(epochs):
    train_loss, train_acc = train(avg_model, train_iterator, optimizer, loss_fn)
    valid_loss, valid_acc = evaluate(avg_model, valid_iterator, loss_fn)
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_epoch = epoch
        torch.save(avg_model.state_dict(), './wordavg_model.txt')
        # print('Validation accuracy improved to {}; model saved'.format(valid_acc))
    print("Epoch:", epoch, "Train_Loss:", train_loss, "Train_Acc:", train_acc,
          "Valid_Loss", valid_loss, "Valid_Acc", valid_acc)
print("training has finished,the best epoch is {},the best valid_acc is {}".format(best_epoch, best_valid_acc))
Epoch: 0 Train_Loss: 0.5956396281242371 Train_Acc: 0.694228571496691 Valid_Loss 0.4051449816385905 Valid_Acc 0.8417333333969116
Epoch: 1 Train_Loss: 0.3593949766363416 Train_Acc: 0.8733142857960292 Valid_Loss 0.46664917748769125 Valid_Acc 0.8840000000317891
Epoch: 2 Train_Loss: 0.2551696341242109 Train_Acc: 0.913428571510315 Valid_Loss 0.5249438627560934 Valid_Acc 0.8950666667302449
Epoch: 3 Train_Loss: 0.196742424092974 Train_Acc: 0.9325142858232771 Valid_Loss 0.6135396106402079 Valid_Acc 0.8957333333969116
Epoch: 4 Train_Loss: 0.15810192627225603 Train_Acc: 0.9501142857687814 Valid_Loss 0.6637696914672852 Valid_Acc 0.9009333333969116
Epoch: 5 Train_Loss: 0.1267459169966834 Train_Acc: 0.9622285714830671 Valid_Loss 0.7350258693695069 Valid_Acc 0.9008000000635783
Epoch: 6 Train_Loss: 0.10385001053469521 Train_Acc: 0.9716 Valid_Loss 0.835720943514506 Valid_Acc 0.8982666667302449
Epoch: 7 Train_Loss: 0.08529832897612026 Train_Acc: 0.9776 Valid_Loss 0.8945791959762573 Valid_Acc 0.8969333333969116
Epoch: 8 Train_Loss: 0.0711212798680578 Train_Acc: 0.9828571428843907 Valid_Loss 0.9895696968078613 Valid_Acc 0.8968000000635783
Epoch: 9 Train_Loss: 0.05655052126603467 Train_Acc: 0.9883428571428572 Valid_Loss 1.065309889539083 Valid_Acc 0.8962666667302449
training has finished,the best epoch is 4,the best valid_acc is 0.9009333333969116
best_model = WordAvgModel(vocab_size=vocab_size,embed_size=embed_size,
output_size=output_size,pad_idx=pad_idx)
best_model.load_state_dict(torch.load('./wordavg_model.txt'))
best_model.to(device)
WordAvgModel(
(embed): Embedding(25002, 100, padding_idx=1)
(linear): Linear(in_features=100, out_features=1, bias=True)
)
def predict_sentiment(sentence):
    tokenized = [str(tok) for tok in TEXT.tokenize(sentence)]
    print(tokenized)
    indexed = torch.LongTensor([TEXT.vocab.stoi[t] for t in tokenized]).to(device).unsqueeze(1)
    pred = torch.sigmoid(best_model(indexed))
    return pred.item()
sentence = input('please input the sentence you want to predict(in English):')
print('Probability that the input expresses positive sentiment: {}'.format(predict_sentiment(sentence)))
please input the sentence you want to predict(in English): this is a good movie
['this', 'is', 'a', 'good', 'movie']
Probability that the input expresses positive sentiment: 1.0
sentence = input('please input the sentence you want to predict(in English):')
print('Probability that the input expresses positive sentiment: {}'.format(predict_sentiment(sentence)))
please input the sentence you want to predict(in English): the film is great while the stars are awful
['the', 'film', 'is', 'great', 'while', 'the', 'stars', 'are', 'awful']
Probability that the input expresses positive sentiment: 3.232804579589299e-10
sentence = input('please input the sentence you want to predict(in English):')
print('Probability that the input expresses positive sentiment: {}'.format(predict_sentiment(sentence)))
please input the sentence you want to predict(in English): the film is great and the stars are good
[' ', 'the', 'film', 'is', 'great', 'and', 'the', 'stars', 'are', 'good']
Probability that the input expresses positive sentiment: 1.0
class LstmModel(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, pad_idx, hidden_size, dropout_ratio):
        super(LstmModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, num_layers=1)
        self.linear = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, text):
        embedded = self.dropout(self.embed(text))
        output, (hidden, cell) = self.lstm(embedded)
        # output size: (seq_length, batch_size, num_directions * hidden_size)
        # hidden and cell size: (num_layers * num_directions, batch_size, hidden_size)
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        # hidden[-1] and hidden[-2] each have size (batch_size, hidden_size);
        # after the cat, hidden has size (batch_size, hidden_size * 2)
        # print(hidden.size())
        hidden = self.dropout(hidden.squeeze())
        return self.linear(hidden)
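A standalone shape check for a bidirectional, single-layer nn.LSTM (dummy data; the sizes match the hyperparameters used below):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=100, hidden_size=100, bidirectional=True, num_layers=1)
dummy = torch.randn(7, 4, 100)   # (seq_length=7, batch_size=4, embed_size=100)
output, (hidden, cell) = lstm(dummy)
print(output.shape)              # torch.Size([7, 4, 200]) -> num_directions * hidden_size last
print(hidden.shape)              # torch.Size([2, 4, 100]) -> num_layers * num_directions first
print(torch.cat([hidden[-1], hidden[-2]], dim=1).shape)   # torch.Size([4, 200])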
vocab_size = len(TEXT.vocab)
embed_size = 100
output_size = 1
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
hidden_size = 100
dropout_ratio= 0.5
lstm_model = LstmModel(vocab_size,embed_size,output_size,pad_idx,hidden_size,dropout_ratio).to(device)
num_parameters = sum(p.numel() for p in lstm_model.parameters() if p.requires_grad)
print(num_parameters)
2662001
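This figure can also be reproduced by hand: 2,500,200 embedding weights, 161,600 LSTM weights and biases (two directions, four gates each), and 201 for the linear layer.

# Hand count of the LSTM model's parameters (same hyperparameters as above):
embedding = 25002 * 100                              # vocab_size x embed_size
lstm = 2 * 4 * (100 * 100 + 100 * 100 + 100 + 100)   # 2 directions, 4 gates: W_ih, W_hh, b_ih, b_hh
linear = 2 * 100 * 1 + 1                             # bidirectional hidden -> 1 output, plus bias
print(embedding + lstm + linear)                     # 2662001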
# lstm_model.to(device)
pretrained_embed = TEXT.vocab.vectors
lstm_model.embed.weight.data.copy_(pretrained_embed)
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
lstm_model.embed.weight.data[pad_idx] = torch.zeros(embed_size)
lstm_model.embed.weight.data[unk_idx] = torch.zeros(embed_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(lstm_model.parameters(),lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()
# lstm_model.to(device)
# loss_fn.to(device)
epochs = 2
best_valid_acc = 0.
for epoch in range(epochs):
    train_loss, train_acc = train(lstm_model, train_iterator, optimizer, loss_fn)
    valid_loss, valid_acc = evaluate(lstm_model, valid_iterator, loss_fn)
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_epoch = epoch
        torch.save(lstm_model.state_dict(), './lstm_model.txt')
        # print('Validation accuracy improved to {}; model saved'.format(valid_acc))
    print("Epoch:", epoch, "Train_Loss:", train_loss, "Train_Acc:", train_acc,
          "Valid_Loss", valid_loss, "Valid_Acc", valid_acc)
print("training has finished,the best epoch is {},the best valid_acc is {}".format(best_epoch, best_valid_acc))
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, pad_idx, num_filters, filter_size, dropout_ratio):
        super(CNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.conv = nn.Conv2d(in_channels=1, out_channels=num_filters,
                              kernel_size=(filter_size, embedding_size))
        self.dropout = nn.Dropout(dropout_ratio)
        self.linear = nn.Linear(num_filters, output_size)

    def forward(self, text):
        text = text.permute(1, 0)        # put batch_size first
        embedded = self.embed(text)      # (batch_size, seq_length, embed_size)
        # Conv2d expects (batch_size, c_in, h_in, w_in); c_in is the number of input
        # channels (1 for a grayscale image, 3 for RGB), so add a channel dimension.
        embedded = embedded.unsqueeze(1)         # (batch_size, 1, seq_length, embed_size)
        conved = F.relu(self.conv(embedded))     # (batch_size, num_filters, seq_length - filter_size + 1, 1)
        conved = conved.squeeze(3)               # drop the trailing 1 -> (batch_size, num_filters, seq_length - filter_size + 1)
        # max_pool1d with kernel size seq_length - filter_size + 1 keeps the maximum of each
        # filter's activations for every sample, giving (batch_size, num_filters, 1)
        pooled = F.max_pool1d(conved, conved.shape[2])
        pooled = pooled.squeeze(2)               # -> (batch_size, num_filters)
        pooled = self.dropout(pooled)
        return self.linear(pooled)
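The notebook stops before the CNN training cell; a minimal sketch of how the class could be instantiated and trained with the helpers above (the num_filters and filter_size values are assumptions, not taken from the original):

# Hypothetical hyperparameters; adjust as needed.
cnn_model = CNNModel(vocab_size=vocab_size, embedding_size=embed_size, output_size=output_size,
                     pad_idx=pad_idx, num_filters=100, filter_size=3, dropout_ratio=0.5).to(device)
cnn_model.embed.weight.data.copy_(pretrained_embed)
cnn_model.embed.weight.data[pad_idx] = torch.zeros(embed_size)
cnn_model.embed.weight.data[unk_idx] = torch.zeros(embed_size)

optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()
train_loss, train_acc = train(cnn_model, train_iterator, optimizer, loss_fn)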