The dataset consists of English text, with Label holding the sentiment tag. The positive and negative classes have 12,500 samples each, for a total of 25,000 samples.
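The snippets below refer to sentence_list and labels, which this excerpt never defines. A minimal loading sketch follows; the file name and column names are assumptions, not from the original:

import pandas as pd

# Hypothetical layout: one review per row, columns "review" and "label" (0 = negative, 1 = positive).
df = pd.read_csv("imdb_reviews.csv")
sentence_list = df["review"].tolist()
labels = df["label"].values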
Obtain a stop-word list, used to filter out irrelevant words.
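The stop list itself is not shown in the excerpt; a common choice is NLTK's built-in English list, used here as an assumption (the download calls also fetch the tokenizer and lemmatizer data the next step needs):

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # stop-word list
nltk.download('punkt')      # models used by nltk.word_tokenize
nltk.download('wordnet')    # data used by WordNetLemmatizer
stop_list = set(stopwords.words('english'))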
Tokenize the data, then apply stemming, lemmatization, stop-word filtering, and related operations:
import nltk
from nltk.stem import PorterStemmer

lemmatizer = nltk.WordNetLemmatizer()
stemmer = PorterStemmer()

def is_contain_number(word):
    # True if the token contains any digit 0-9.
    for i in range(10):
        if str(i) in word:
            return True
    return False

def preprocess_sentence(sentence):
    # Tokenize
    processed_tokens = nltk.word_tokenize(sentence)
    # Stemming (lowercase first)
    processed_tokens = [stemmer.stem(w.lower()) for w in processed_tokens]
    # Remove stop words and tokens containing digits
    processed_tokens = [w for w in processed_tokens
                        if (w not in stop_list) and (not is_contain_number(w))]
    # Lemmatization
    processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens]
    return processed_tokens

def tokenize(string):
    if not isinstance(string, str):
        return []
    return preprocess_sentence(string)
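A quick sanity check of the pipeline on one sentence (the exact tokens returned depend on the stop list in use):

# Tokens containing digits are dropped, stop words are filtered out,
# and the remaining tokens are stemmed and lemmatized.
print(tokenize("These movies were surprisingly good in 2003!"))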
The pretrained GloVe vectors (glove.6B.100d) can be downloaded from the official GloVe site, although the download can be slow. gensim's conversion script turns the GloVe file into word2vec format:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.word2vec.txt'
(count, dimensions) = glove2word2vec(glove_input_file, word2vec_output_file)
print(count, '\n', dimensions)
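Later snippets use glove_model and word2idx, which the excerpt does not define. A sketch of how they could be built with gensim (>= 4.0 API; the '<PAD>' entry matches the padding row that the model appends below):

from gensim.models import KeyedVectors

# Load the converted vectors; glove_model.vectors is the embedding matrix.
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Map each word to its row index; '<PAD>' gets the extra row index that
# Bi_Lstm.add_embedding appends to the embedding matrix later.
word2idx = glove_model.key_to_index.copy()
word2idx['<PAD>'] = len(word2idx)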
Filter the data against the pretrained model's vocabulary and make all sequences the same length: sequences that are too long are truncated, and short ones are padded with the '<PAD>' token.
from tqdm import tqdm

max_seq_len = 0
for i, sentence in enumerate(tqdm(sentence_list)):
    sent_list = sentence.split()
    # Keep only words that appear in the pretrained vocabulary.
    sent_list = [x for x in sent_list if x in word2idx.keys()]
    max_seq_len = max(max_seq_len, len(sent_list))
    sentence_list[i] = " ".join(sent_list)
print(max_seq_len)
max_seq_len = min(max_seq_len, 200)  # cap the sequence length at 200
print(max_seq_len)
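The loop above only computes the length cap; the truncate/pad pass described in the text is not shown in the excerpt. A minimal sketch, using the '<PAD>' filler named above:

for i, sentence in enumerate(sentence_list):
    sent_list = sentence.split()[:max_seq_len]                # truncate long sequences
    sent_list += ['<PAD>'] * (max_seq_len - len(sent_list))   # pad short ones
    sentence_list[i] = " ".join(sent_list)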
The padded sentences are then converted into index sequences, as shown below:
import numpy as np
import torch
import torch.nn as nn

def make_data(sentences, labels):
    # Map every word to its index in the GloVe vocabulary.
    inputs = []
    for sen in tqdm(sentences):
        inputs.append([word2idx[n] for n in sen.split()])
    targets = labels.tolist()
    return np.array(inputs), np.array(targets)

input_batch, target_batch = make_data(sentence_list, labels)
class Bi_Lstm(nn.Module):
    def __init__(self):
        super(Bi_Lstm, self).__init__()
        self.embeddings = torch.tensor(glove_model.vectors)
        self.add_embedding('<PAD>')  # append a randomly initialized row for the padding token
        self.embedding = nn.Embedding(self.embeddings.shape[0], self.embeddings.shape[1],
                                      padding_idx=word2idx['<PAD>'])
        self.embedding.weight.data.copy_(self.embeddings)
        self.embedding.weight.requires_grad = False  # freeze the pretrained vectors
        # Bidirectional, so the number of output features doubles (2 * hidden_size).
        self.lstm = nn.LSTM(input_size=self.embeddings.shape[1], hidden_size=100,
                            num_layers=2, bidirectional=True, batch_first=True, dropout=0.3)
        # = 2 * hidden_size here, since the embedding dim and hidden size are both 100.
        self.l1 = nn.BatchNorm1d(self.embeddings.shape[1] * 2)
        self.l2 = nn.ReLU()
        self.dense = nn.Sequential(
            nn.Linear(self.embeddings.shape[1] * 2, 128),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.Linear(64, n_class),
            nn.BatchNorm1d(n_class)
        )

    def add_embedding(self, word):
        vector = torch.empty(1, self.embeddings.shape[1])  # create an empty vector
        torch.nn.init.uniform_(vector)                     # initialize it randomly
        self.embeddings = torch.cat([self.embeddings, vector], 0)  # append it to the embedding matrix

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        # Take the output at the last time step.
        out = self.l1(out[:, -1, :])
        out = self.l2(out)
        out = self.dense(out)
        return out
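A quick shape check on a dummy batch (n_class is never set in the excerpt; 2 is the natural choice for this binary task):

n_class = 2
model = Bi_Lstm()
model.eval()  # use running BatchNorm stats so a tiny batch works deterministically
dummy = torch.randint(0, len(word2idx), (4, max_seq_len))  # batch of 4 index sequences
print(model(dummy).shape)  # expected: torch.Size([4, 2])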
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")

training_step = 50      # number of epochs
batch_size = 512        # mini-batch size
kf = KFold(n_splits=5)  # 5-fold cross-validation

# Not defined in the excerpt; presumably the arrays produced by make_data are used:
train, targets = input_batch, target_batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def res2array(xx):
    data = []
    [data.extend(x.tolist()) for x in xx]
    return np.array(data)

for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
    max_acc = 0.0
    print('-' * 15, '>', f'Fold {fold + 1}', '<', '-' * 15)
    x_train, x_val = train[train_idx], train[test_idx]
    y_train, y_val = targets[train_idx], targets[test_idx]
    M_train = len(x_train) - 1
    x_train = torch.from_numpy(x_train).to(torch.long).to(device)
    x_val = torch.from_numpy(x_val).to(torch.long).to(device)
    y_train = torch.from_numpy(y_train).to(torch.long).to(device)
    y_val = torch.from_numpy(y_val).to(torch.long).to(device)
    model = Bi_Lstm()
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_func = nn.CrossEntropyLoss()  # cross-entropy for the classification task
    model.train()  # required: the model contains BatchNorm and Dropout layers
    # Training loop
    for step in range(training_step):
        ytrain_pre_epoch = []
        ytrain_true_epoch = []
        yval_pre_epoch = []
        yval_true_epoch = []
        train_acc_per_epoch = 0.0
        val_acc_per_epoch = 0.0
        train_loss_per_epoch = 0.0
        val_loss_per_epoch = 0.0
        all_step = 0
        with tqdm(np.arange(0, M_train, batch_size), desc='Training...') as tbar:
            for index in tbar:
                L = index
                R = min(M_train, index + batch_size)
                if (R - L) <= 1:
                    continue
                # ---------------- training step -----------------
                model.train()
                optimizer.zero_grad()            # clear gradients left over from the previous step
                train_pre = model(x_train[L:R])  # forward pass on the mini-batch
                train_loss = loss_func(train_pre, y_train[L:R])
                # ---------------- backward pass -----------------
                train_loss.backward()            # back-propagate the training loss
                optimizer.step()                 # apply the parameter updates
                # ---------------- compute accuracy --------------
                ytrain_pre = np.argmax(np.array(train_pre.data.cpu()), axis=1)
                train_true = np.array(y_train[L:R].data.cpu())
                ytrain_pre_epoch.append(ytrain_pre)
                ytrain_true_epoch.append(train_true)
                train_acc = np.sum(ytrain_pre == train_true) / (R - L)
                # ---------------- show on the progress bar ------
                tbar.set_postfix(train_loss=float(train_loss.data.cpu()), train_acc=train_acc)
                tbar.update()  # default n=1: each update advances the bar by n
                train_acc_per_epoch += train_acc
                train_loss_per_epoch += train_loss.data.cpu()
                all_step += 1
        # ---------- validate the model once per epoch ----------
        with torch.no_grad():
            M_val = len(x_val) - 1
            all_step = 0
            with tqdm(np.arange(0, M_val, batch_size), desc='Validation...') as tbar:
                for index in tbar:
                    L_val = index
                    R_val = min(M_val, L_val + batch_size)
                    if (R_val - L_val) <= 1:
                        continue
                    # ---------------- validation ----------------
                    model.eval()
                    val_pre = model(x_val[L_val:R_val])  # batch the validation set too, or memory blows up
                    val_loss = loss_func(val_pre, y_val[L_val:R_val])
                    yval_pre = np.argmax(np.array(val_pre.data.cpu()), axis=1)
                    val_true = np.array(y_val[L_val:R_val].data.cpu())
                    yval_pre_epoch.append(yval_pre)
                    yval_true_epoch.append(val_true)
                    val_acc = np.sum(yval_pre == val_true) / (R_val - L_val)
                    # ------------ show on the progress bar ------
                    tbar.set_postfix(val_loss=float(val_loss.data.cpu()), val_acc=val_acc)
                    tbar.update()
                    val_acc_per_epoch += val_acc
                    val_loss_per_epoch += val_loss.data.cpu()
                    all_step += 1
        res = classification_report(res2array(ytrain_pre_epoch), res2array(ytrain_true_epoch),
                                    digits=3, output_dict=True)
        train_stat = res["weighted avg"]
        train_stat["accuracy"] = res["accuracy"]
        res = classification_report(res2array(yval_pre_epoch), res2array(yval_true_epoch),
                                    digits=3, output_dict=True)
        val_stat = res["weighted avg"]
        val_stat["accuracy"] = res["accuracy"]
        print("Epoch %d/%d: training acc: %.3f validation acc: %.3f"
              % (step, training_step, train_stat["accuracy"], val_stat["accuracy"]))
        model_path = "model_fold_" + str(fold) + ".pt"
        if res["accuracy"] > max_acc:  # keep the best checkpoint per fold
            max_acc = res["accuracy"]
            torch.save(model, model_path)
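After training, the best checkpoint of a fold can be reloaded for inference; a sketch (torch.save stored the whole module, so torch.load restores it as long as the Bi_Lstm class is importable):

model = torch.load("model_fold_0.pt", map_location=device)
model.eval()
with torch.no_grad():
    preds = model(x_val[:32]).argmax(dim=1)  # predicted class per review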