# -*- coding:utf-8 -*-
import random
from collections import defaultdict, Counter
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(6688)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data_path = ''
Build the vocabulary
from collections import Counter
def build_vocab(sents, max_words=50000):
    word_counts = Counter()
    for word in sents:
        word_counts[word] += 1
    itos = [w for w, c in word_counts.most_common(max_words)]
    itos = itos + ["UNK"]
    stoi = {w: i for i, w in enumerate(itos)}
    return itos, stoi
tokenize = lambda x: x.split()
text = open(data_path +'senti.train.tsv').read()
vob = tokenize(text.lower())
itos,stoi = build_vocab(vob)
A quick sanity check on the vocabulary
itos[0:5]
['1', '0', 'the', ',', 'a']
stoi['the']
2
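Out-of-vocabulary words are also worth a quick check: anything outside the 50,000 most frequent tokens should fall back to "UNK". A minimal sketch, reusing the itos/stoi built above ("zzzzzz" is just a made-up token that is unlikely to be in the vocabulary):

# a hypothetical out-of-vocabulary token should map to the UNK index
unk_index = stoi["UNK"]
print(stoi.get("zzzzzz", unk_index))  # expected: len(itos) - 1, the UNK index
print(itos[unk_index])                # 'UNK'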
Build the dataset
class Corpus:
    def __init__(self, data_path, sort_by_len=False):
        self.vocab = vob
        self.sort_by_len = sort_by_len
        self.train_data, self.train_label = self.tokenize(data_path + 'train.tsv')
        self.valid_data, self.valid_label = self.tokenize(data_path + 'dev.tsv')
        self.test_data, self.test_label = self.tokenize(data_path + 'test.tsv')

    def tokenize(self, text_path):
        with open(text_path) as f:
            index_data = []  # indexed data: each sample is a list of word indices
            labels = []
            for line in f.readlines():
                sentence, label = line.split('\t')
                index_data.append(self.sentence_to_index(sentence.lower()))
                labels.append(int(label[0]))
        if self.sort_by_len:
            # sorting samples by length reduces padding and can speed up training
            index_data = sorted(index_data, key=lambda x: len(x), reverse=True)
        return index_data, labels

    def sentence_to_index(self, s):
        a = []
        for w in s.split():
            if w in stoi.keys():
                a.append(stoi[w])
            else:
                a.append(stoi["UNK"])
        return a

    def index_to_sentence(self, x):
        return ' '.join([itos[i] for i in x])

corpus = Corpus(data_path, sort_by_len=False)
A quick check of the indexed training data and labels
corpus.train_data[0:5]
[[4459, 94, 12436, 37, 2, 7263, 9002],
[2905, 62, 327, 3, 90, 1970, 549],
[11, 1770, 18, 55, 5, 5990, 97, 185, 267, 35, 179, 622],
[593, 673, 5991, 8, 1971, 2, 290, 696],
[25, 2, 255, 5127, 261, 2, 360, 118, 4753, 54]]
corpus.train_label[0:5]
[0, 0, 1, 0, 0]
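The index_to_sentence helper can map an indexed sample back to text, which is a quick way to confirm the round trip works; a small check reusing the corpus built above:

# recover the words of the first training sample from its indices
print(corpus.index_to_sentence(corpus.train_data[0]))
print(corpus.train_label[0])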
Build the minibatches
def get_minibatches(text_idx, labels, batch_size=64, sort=True):
    if sort:
        text_idx_and_labels = sorted(list(zip(text_idx, labels)), key=lambda x: len(x[0]))
    else:
        text_idx_and_labels = list(zip(text_idx, labels))
    text_idx_batches = []
    label_batches = []
    for i in range(0, len(text_idx), batch_size):
        text_batch = [t for t, l in text_idx_and_labels[i:i + batch_size]]
        label_batch = [l for t, l in text_idx_and_labels[i:i + batch_size]]
        text_idx_batches.append(text_batch)
        label_batches.append(label_batch)
    return text_idx_batches, label_batches

BATCH_SIZE = 64
train_batches, train_label_batches = get_minibatches(corpus.train_data, corpus.train_label, BATCH_SIZE)
dev_batches, dev_label_batches = get_minibatches(corpus.valid_data, corpus.valid_label, BATCH_SIZE)
test_batches, test_label_batches = get_minibatches(corpus.test_data, corpus.test_label, BATCH_SIZE)
A quick test of train_batches and train_label_batches
print(train_batches[0][0:5])  # samples are sorted by length in ascending order, so the first five sentences of batch 0 each contain a single word
print(train_label_batches[0][0:5])
[[2782], [5996], [4464], [2907], [3796]]
[1, 1, 0, 1, 0]
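Before defining the model, it may help to see what padding and the corresponding mask look like for a few samples; this is only an illustration of the same pad_sequence / mask logic used later in the training function, applied here to the last (longest) batch:

# pad three of the longest samples and build a 0/1 mask, as the training loop will do
sample = [torch.tensor(x).long() for x in train_batches[-1][0:3]]
padded = nn.utils.rnn.pad_sequence(sample, batch_first=True)  # [3, max_len], padded with 0
mask = (padded != 0).float()                                  # 1 for real words, 0 for padding
print(padded.shape, mask.sum(1))                              # mask.sum(1) counts the non-zero positions per sentence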
class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, dropout_p=0.5):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        initrange = 0.1
        self.embed.weight.data.uniform_(-initrange, initrange)
        self.u = nn.Parameter(torch.rand(embedding_size))
        self.linear = nn.Linear(embedding_size, output_size)
        self.dropout = nn.Dropout(dropout_p)  # the dropout probability is a hyperparameter that is often tuned

    def forward(self, text, mask):
        # text: [batch_size, max_seq_len]
        # mask: [batch_size, max_seq_len]
        embedded = self.embed(text)  # [batch_size, max_seq_len, embedding_size]
        embedded = self.dropout(embedded)
        # attention-style weight per word: exp(cosine similarity between u and the word embedding)
        a = torch.exp(torch.cosine_similarity(self.u.view(1, -1).to(device),
                                              embedded.view(-1, embedded.shape[2]),
                                              dim=1)).view(embedded.shape[0], embedded.shape[1])
        # re-weight every word embedding by its attention weight
        embedded = (a.unsqueeze(2).repeat(1, 1, embedded.size(2))) * embedded  # [batch_size, max_seq_len, embedding_size]
        embedded = self.dropout(embedded)
        # in the mask, 1 marks a real word and 0 marks padding
        mask = mask.float().unsqueeze(2)  # [batch_size, seq_len, 1]
        embedded = embedded * mask        # [batch_size, seq_len, embedding_size]
        # sum the weighted word vectors into a sentence embedding
        # (summing instead of averaging also avoids dividing by a mask.sum() that could be zero)
        sent_embed = embedded.sum(1)
        return self.linear(sent_embed)

    def attention(self, text):
        embedded = self.embed(text)  # [seq_len, embedding_size]
        embedded = self.dropout(embedded)
        a = torch.cosine_similarity(self.u.view(1, -1), embedded)
        a_weight = F.softmax(a, dim=-1)
        return a_weight.cpu().numpy()
Initialize the model
VOCAB_SIZE = len(itos)
EMBEDDING_SIZE = 200
OUTPUT_SIZE = 1
model = WordAVGModel(vocab_size=VOCAB_SIZE,
                     embedding_size=EMBEDDING_SIZE,
                     output_size=OUTPUT_SIZE,
                     dropout_p=0.5)
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()
model = model.to(device)
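A quick shape check on the freshly initialized model can catch dimension mistakes before training. This is just a sketch with a random toy batch (2 sentences of length 6 and an all-ones mask), not part of the original pipeline:

# toy forward pass: random word indices, all positions treated as real words
toy_text = torch.randint(1, VOCAB_SIZE, (2, 6)).to(device)
toy_mask = torch.ones(2, 6).to(device)
with torch.no_grad():
    print(model(toy_text, toy_mask).shape)  # expected: torch.Size([2, 1])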
Define the accuracy function
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc
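A tiny sanity check with hand-made logits (not taken from the dataset) shows the rounding behaviour: positive logits round to 1 after the sigmoid, negative ones to 0.

# three of the four toy predictions match the labels, so accuracy should be 0.75
toy_preds = torch.tensor([2.0, -1.0, 0.5, -3.0])  # raw logits
toy_labels = torch.tensor([1., 0., 0., 0.])
print(binary_accuracy(toy_preds, toy_labels))     # tensor(0.7500)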
Define the training function
def train(model, text_idxs, labels, optimizer, crit):
    epoch_loss, epoch_acc = 0., 0.
    model.train()
    total_len = 0.
    for text, label in zip(text_idxs, labels):
        text = [torch.tensor(x).long().to(device) for x in text]
        label = [torch.tensor(label).long().to(device)]
        lengths = torch.tensor([len(x) for x in text]).long().to(device)
        text = nn.utils.rnn.pad_sequence(text, batch_first=True)
        # later models will use pack_padded_sequence, so sort the batch by length in descending order here
        lengths, perm_index = lengths.sort(descending=True)
        text = text[perm_index]
        label = label[0][perm_index]
        # build the mask after the permutation so it lines up with the reordered text
        mask = (text != 0).float().to(device)
        preds = model(text, mask).squeeze()  # [batch_size]
        loss = crit(preds, label.float())
        acc = binary_accuracy(preds, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * len(label)
        epoch_acc += acc.item() * len(label)
        total_len += len(label)
    return epoch_loss / total_len, epoch_acc / total_len
Define the evaluation function
def evaluate(model, text_idxs, labels, crit):
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    total_len = 0.
    for text, label in zip(text_idxs, labels):
        text = [torch.tensor(x).long().to(device) for x in text]
        label = [torch.tensor(label).long().to(device)]
        lengths = torch.tensor([len(x) for x in text]).long().to(device)
        text = nn.utils.rnn.pad_sequence(text, batch_first=True)
        # sort by length in descending order, mirroring the training function
        lengths, perm_index = lengths.sort(descending=True)
        text = text[perm_index]
        label = label[0][perm_index]
        mask = (text != 0).float().to(device)
        with torch.no_grad():
            preds = model(text, mask).squeeze()  # [batch_size]
        loss = crit(preds, label.float())
        acc = binary_accuracy(preds, label)
        epoch_loss += loss.item() * len(label)
        epoch_acc += acc.item() * len(label)
        total_len += len(label)
    model.train()  # restore training mode
    return epoch_loss / total_len, epoch_acc / total_len
Train the model
N_EPOCHS = 5
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_batches, train_label_batches, optimizer, crit)
    valid_loss, valid_acc = evaluate(model, dev_batches, dev_label_batches, crit)
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), "wordavg-model-Adam.pth")
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Valid Loss", valid_loss, "Valid Acc", valid_acc)
Epoch 0 Train Loss 0.421612024540009 Train Acc 0.7994625013027427
Epoch 0 Valid Loss 0.46337967454840284 Valid Acc 0.8096330269761042
Epoch 1 Train Loss 0.26165380607506367 Train Acc 0.8996718585323913
Epoch 1 Valid Loss 0.5158094452061784 Valid Acc 0.8130733950422444
Epoch 2 Train Loss 0.21914521883101462 Train Acc 0.9156928833416801
Epoch 2 Valid Loss 0.5733908010185311 Valid Acc 0.8130733950422444
Epoch 3 Train Loss 0.19722330785677017 Train Acc 0.9246610937094599
Epoch 3 Valid Loss 0.6202372706264531 Valid Acc 0.8130733939485812
Epoch 4 Train Loss 0.18302929015256092 Train Acc 0.9310160507217569
Epoch 4 Valid Loss 0.6619091307351349 Valid Acc 0.807339450088116
Load the checkpoint with the best validation accuracy and evaluate it on the test set
model.load_state_dict(torch.load("wordavg-model-Adam.pth"))
test_loss, test_acc = evaluate(model, test_batches, test_label_batches, crit)
print("Test Loss", test_loss, "Test Acc", test_acc)
Test Loss 0.45551196396972765 Test Acc 0.8220757820133584
First, strip punctuation and the label tokens from the vocabulary
import re
text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", " ",text)
vob = tokenize(text.lower())
vob = set(vob)
vob.discard('0')  # drop the label tokens, which are not real words
vob.discard('1')
vob = [stoi[e] for e in vob if e in itos]
len(vob)
14690
model.load_state_dict(torch.load("wordavg-model-Adam.pth"))
# embed every word index in the filtered vocabulary
embedded = torch.tensor(list(vob)).to(device)
# exponentiated cosine similarity between u and each word embedding
Cosin_List = torch.exp(torch.cosine_similarity(model.u.view(1, -1).to(device), model.embed(embedded), dim=1))
# positions of the 15 largest cosine similarities
Biggest15_nu = Cosin_List.sort(descending=True)[1][0:15]
# map the indices back to words
Biggest15 = [itos[e] for e in Biggest15_nu]
# positions of the 15 smallest cosine similarities
Smallest15_nu = Cosin_List.sort(descending=False)[1][0:15]
Smallest15 = [itos[e] for e in Smallest15_nu]
print("15 words with the largest cosine similarity:", Biggest15)
print("15 words with the smallest cosine similarity:", Smallest15)
# Words that occur frequently during training tend to have the smallest cosine similarity,
# while words that occur rarely tend to have the largest.
15 words with the largest cosine similarity: ['march', 'finely', 'hilary', 'enthusiasm', 'nonbelievers', 'again', 'blowing', 'expressionistic', 'cannibal', 'direction', 'therapeutic', 'painterly', 'stunning', 'hardy', 'cineasts']
15 words with the smallest cosine similarity: ['chapter', 'discipline', 'profane', 'deblois', 'ear-splitting', 'rating', 'entree', 'unfulfilling', 'under-inspired', 'swipe', 'period-piece', 'distinctions', 'drama/character', 'freezers', 'instructive']
from collections import Counter
from itertools import chain
train_data = list(chain.from_iterable(corpus.train_data))
train_data = dict(Counter(train_data))
count = 0
for i, e in enumerate(Smallest15_nu, 1):
    if e.item() in train_data:
        count += 1
        print(i, itos[e.item()], train_data[e.item()])
print(100 * count / 15, "% of the 15 lowest-similarity words appear in the training set")
1 chapter 3
2 discipline 17
3 profane 21
4 deblois 3
5 ear-splitting 2
6 rating 28
7 entree 5
8 unfulfilling 29
9 under-inspired 12
10 swipe 8
11 period-piece 5
12 distinctions 7
13 drama/character 2
14 freezers 7
15 instructive 10
100.0 % of the 15 lowest-similarity words appear in the training set
count = 0
for i, e in enumerate(Biggest15_nu, 1):
    if e.item() in train_data:
        count += 1
        print(i, itos[e.item()], train_data[e.item()])
print(100 * count / 15, "% of the 15 highest-similarity words appear in the training set")
1 march 3
2 finely 32
3 hilary 9
4 enthusiasm 44
5 nonbelievers 3
6 again 247
7 blowing 9
8 expressionistic 7
9 cannibal 8
10 direction 293
11 therapeutic 4
12 painterly 11
13 stunning 66
14 hardy 7
15 cineasts 1
100.0 % of the 15 highest-similarity words appear in the training set
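The comment after the similarity code claims that frequent words end up with small cosine similarity to u and rare words with large similarity. One hedged way to check this across the whole filtered vocabulary is a rank correlation between training frequency and similarity; this sketch reuses vob, Cosin_List and the train_data counter from above, and scipy is an extra dependency not used elsewhere in this post:

from scipy.stats import spearmanr

# training-set frequency of every word in the filtered vocabulary (0 if unseen)
freqs = [train_data.get(w, 0) for w in list(vob)]
sims = Cosin_List.detach().cpu().numpy()
# a clearly negative coefficient would support "more frequent -> smaller similarity"
print(spearmanr(freqs, sims))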
from collections import Counter
word_attention_list = []
word_index_list = []
with torch.no_grad():
    for text in corpus.train_data:
        word_index_list.extend(text)
        text = torch.tensor(text).long().to(device)
        a_weight = model.attention(text)
        word_attention_list.extend(a_weight)
len(word_attention_list)
633698
from collections import defaultdict
import numpy as np
word_att_dict = defaultdict(list)
for i in range(len(word_index_list)):
    word = word_index_list[i]
    att = word_attention_list[i]
    word_att_dict[word].append(att)
# mean and standard deviation of the attention weight for every word that appears more than 100 times
word_att_processed = [((itos[k], np.mean(v)), np.std(v))
                      for k, v in word_att_dict.items() if len(v) > 100]
print('len(word_att_processed) : ',len(word_att_processed))
len(word_att_processed) : 660
The 30 words with the largest attention standard deviation
max_std_topK = 30
# each element is ((word, mean), std)
word_att_processed = sorted(word_att_processed,key=lambda x:x[1],reverse=True)[:max_std_topK]
print("\tmean\t\tstd\tword\t")
print("-"*60)
for i, (word_mean, std) in enumerate(word_att_processed):
    print("{}\t{}\t\t{}\t{}".format(i + 1, '%.2f' % word_mean[1], '%.2f' % std, word_mean[0]))
        mean    std     word
------------------------------------------------------------
1       0.24    0.20    stupid
2       0.23    0.19    terrific
3       0.24    0.19    unfunny
4       0.21    0.19    watchable
5       0.19    0.19    tedious
6       0.20    0.19    painful
7       0.20    0.19    bland
8       0.23    0.19    awful
9       0.21    0.18    inventive
10      0.25    0.18    mess
11      0.20    0.18    flat
12      0.19    0.18    gorgeous
13      0.18    0.18    worse
14      0.20    0.18    excellent
15      0.17    0.18    worthy
16      0.20    0.18    appealing
17      0.20    0.18    remarkable
18      0.21    0.17    beautifully
19      0.21    0.17    waste
20      0.20    0.17    intriguing
21      0.21    0.17    provocative
22      0.16    0.17    cool
23      0.17    0.17    hackneyed
24      0.19    0.17    slow
25      0.18    0.17    lacking
26      0.19    0.17    decent
27      0.18    0.17    boring
28      0.20    0.17    wonderful
29      0.22    0.17    engrossing
30      0.17    0.17    perfectly
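As a closing illustration, the attention helper can be applied to a single training sentence to see which words the exp-cosine weighting emphasises; a small sketch reusing corpus, model and itos from above (dropout is switched off first so the weights are stable):

model.eval()  # disable dropout for a deterministic readout
sent = corpus.train_data[2]
with torch.no_grad():
    weights = model.attention(torch.tensor(sent).long().to(device))
for w, a in zip([itos[i] for i in sent], weights):
    print('%-15s %.3f' % (w, a))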