赞
踩
本菜鸟在学习NLP过程中,入门任务中有这么一个任务:用RNN实现文本分类
有如下几个知识点:
1.CNN/RNN
2.pytorch
3.词嵌入
4.Dropout
在这里我就不细说RNN了,毕竟我也不是很熟悉啊哈哈哈,给出一个讲的比较好的博文链接:
RNN认识,RNN如何训练的
这里使用的数据集是:Classify the sentiment of sentences from the Rotten Tomatoes dataset
这是一个影评语料库,本次训练的目的就是对语料库中的影评进行训练,得出观众的情感(类似你平时点外卖给外卖小哥的评价是1星还是5星,在1星的时候你就会吐槽这个外卖小哥送餐很慢)
在获得语料后开始进行预处理,一开始我是使用了one-hot,然后用softmax实现预测,但是这样会忽略词与词之间的影响,例如:Lack of good taste,good service,good 、、、这里面有很多个good如果不考虑到lack的影响的话,会导致预测的结果为正面的,但事实是负面的。所以我在这里采用了GloVe预训练的embedding来进行初始化:
# -*- coding: utf-8 -*-
"""
Created on Sun Feb  9 19:52:26 2020

@author: Frierice
Convert a raw GloVe vector file into word2vec text format and load it with
gensim.  word2vec text format requires a leading "<vocab_size> <dim>" header
line, which GloVe files lack.
"""
import os
import shutil
from sys import platform


def getFileLineNums(filename):
    """Return the number of lines in *filename* (= number of words)."""
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(filename, 'r', encoding='UTF-8') as f:
        return sum(1 for _ in f)


def prepend_line(infile, outfile, line):
    """Copy *infile* to *outfile* with *line* prepended (fast bulk copy)."""
    with open(infile, 'r') as old, open(outfile, 'w') as new:
        new.write(str(line) + "\n")
        shutil.copyfileobj(old, new)


def prepend_slow(infile, outfile, line):
    """Copy *infile* to *outfile* line by line with *line* prepended.

    Slower than prepend_line, but reads and writes explicitly as UTF-8,
    which is safer on Windows where the locale default encoding may differ.
    """
    with open(infile, 'r', encoding='UTF-8') as fin, \
         open(outfile, 'w', encoding='UTF-8') as fout:
        fout.write(line + "\n")
        for row in fin:
            fout.write(row)


def load(filename, dim=300,
         gensim_file='D:/spyderProject/Task2/gloveModel/glove_model.txt'):
    """Prepend the word2vec header to a GloVe file and load it with gensim.

    filename    -- path of the raw GloVe text file
    dim         -- vector dimensionality (default 300, matching glove.6B.300d)
    gensim_file -- path where the converted word2vec-format file is written

    Returns the loaded gensim KeyedVectors model.
    """
    # Imported locally so the pure-file helpers above remain usable even
    # when gensim is not installed.
    import gensim

    num_lines = getFileLineNums(filename)
    # word2vec text format header: "<num_words> <dim>".
    gensim_first_line = "{} {}".format(num_lines, dim)
    if platform in ("linux", "linux2"):
        prepend_line(filename, gensim_file, gensim_first_line)
    else:
        prepend_slow(filename, gensim_file, gensim_first_line)
    return gensim.models.KeyedVectors.load_word2vec_format(gensim_file)


if __name__ == '__main__':
    # Guarded so merely importing this module does not trigger the
    # heavyweight conversion + load (the original ran it on import).
    load('D:/spyderProject/Task2/gloveModel/glove.6B.300d.txt')
上面的代码是在数据第一行加上400000 300使得能够直接用word2vec
接下来是将上面的模型的字典和本次训练数据的字典交集(这个可有可无,原本以为嵌入需要些时间,结果发现时间都在加载模型上,所以这个就可有可无了)
def getDicWordEmbedding():
    """Intersect the full GloVe vocabulary with the training-set vocabulary
    and dump the shared vectors to a small word2vec-format file.

    Later runs then load only the words the task actually uses instead of
    the full 400k-word GloVe model, which is where most load time goes.
    """
    gensim_file = 'D:/spyderProject/Task2/gloveModel/glove_model.txt'
    model = gensim.models.KeyedVectors.load_word2vec_format(gensim_file)
    sentences, label = getTrainData()

    # Collect the distinct lower-cased tokens of all training sentences.
    # A set gives O(1) membership tests; the original `if key not in dic`
    # scan over a list was accidentally O(n^2).
    vocab = set()
    for words in sentences:
        vocab.update(words.lower().split(' '))

    # Keep only the words GloVe knows.  'unk' is the fallback vector used
    # for out-of-vocabulary tokens at embedding time, so always include it.
    Dic = {word: model.get_vector(word) for word in vocab if word in model}
    Dic['unk'] = model.get_vector('unk')

    path = 'D:/spyderProject/Task2/gloveModel/task_model.txt'  # output file
    # 'with' closes the file even if a write fails.
    with open(path, 'w', encoding='utf-8') as f:
        # word2vec text format header: "<num_words> <dim>".
        f.write("{} {}".format(len(Dic), 300) + '\n')
        for word, vector in Dic.items():
            f.write(word + ' ' + ' '.join(str(vec) for vec in vector) + '\n')
句子的转化为向量表示:
def embeddingSeq(model, sentences):
    """Map each sentence to a list of word vectors.

    Tokens are lower-cased and split on single spaces.  A token missing
    from *model* falls back to the pretrained vector for 'unk' (an all-zero
    vector was considered but felt inappropriate).
    """
    embedded = []
    for sentence in sentences:
        tokens = sentence.lower().split(' ')
        vectors = [
            model.get_vector(tok if tok in model else 'unk')
            for tok in tokens
        ]
        embedded.append(vectors)
    return embedded
模型的实现主要依赖了pytorch中封装好的,原本想手码的,结果发现公式推导推着推着就不会了。
这里Dropout设置成了0.5,参考发现在0.5的时候有比较好的效果
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 11 13:31:57 2020

@author: Frierice
RNN sentence-sentiment classifier (5 classes) over pretrained embeddings.
"""
import torch
from torch import nn
import random
import datahandle
import gensim
import math
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn >= 0.23;
# prefer `import joblib` directly when upgrading.
from sklearn.externals import joblib
import os


class SeqRNN(nn.Module):
    """Single-layer RNN followed by a linear classifier.

    vocab_size  -- dimensionality of the input word vectors (300 for GloVe)
    hidden_size -- number of hidden units (determines RNN output width)
    output_size -- number of classes (5 sentiment levels)
    """

    def __init__(self, vocab_size, hidden_size, output_size):
        super(SeqRNN, self).__init__()
        self.vocab_size = vocab_size    # word-vector dimensionality
        self.hidden_size = hidden_size  # hidden units
        self.output_size = output_size  # number of classes
        # BUG FIX: nn.RNN applies dropout only *between* stacked layers, so
        # dropout=0.5 on this single-layer RNN did nothing except raise a
        # warning.  Removing it leaves behaviour unchanged.
        self.rnn = nn.RNN(self.vocab_size, self.hidden_size, batch_first=True)
        self.linear = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input):
        """Return class logits of shape (batch, output_size)."""
        # One layer, zero initial hidden state for a batch of one sentence.
        h0 = torch.zeros(1, 1, self.hidden_size)
        output, hidden = self.rnn(input, h0)
        output = output[:, -1, :]  # keep only the last time step
        # BUG FIX: return raw logits.  The original applied softmax here and
        # then fed the result to nn.CrossEntropyLoss, which itself contains
        # log-softmax -- the double softmax flattens gradients and stalls
        # training.  argmax over logits equals argmax over softmax, so
        # predictions from trained weights are unaffected.
        return self.linear(output)


class RNNClassificationModel:
    """Trains a SeqRNN on embedded sentences and predicts classes 0-4."""

    def __init__(self, epoches=100):
        # 300-dim GloVe input, 128 hidden units, 5 sentiment classes.
        self.model = SeqRNN(300, 128, 5)
        self.epoches = epoches
        # CrossEntropyLoss expects unnormalised logits (see SeqRNN.forward).
        self.loss_func = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.RMSprop(self.model.parameters(),
                                             lr=0.0003)

    def fit(self, trainSet, labels):
        """Train for self.epoches epochs of 200 randomly sampled sentences.

        Sentences have different lengths, so each is trained as a batch of
        one rather than padded into a larger batch.
        """
        for epoch in range(self.epoches):
            for i in range(200):
                index = random.randint(0, len(labels) - 1)
                sentence = trainSet[index]  # original's [:][:] was a no-op copy
                label = labels[index]
                sentence_tensor = torch.tensor([sentence], dtype=torch.float)
                # int(label) so string labels (as read from CSV) also work,
                # matching the int() coercion already used in scores().
                label_tensor = torch.tensor([int(label)], dtype=torch.long)
                self.optimizer.zero_grad()
                pred = self.model(sentence_tensor)
                loss = self.loss_func(pred, label_tensor)
                loss.backward()
                self.optimizer.step()

    def predict_single(self, sentence):
        """Return the predicted class (int 0-4) for one embedded sentence."""
        sentence_tensor = torch.tensor([sentence], dtype=torch.float)
        with torch.no_grad():
            out = self.model(sentence_tensor)
        return torch.argmax(out).item()

    def predict(self, sentences):
        """Return a list of predicted classes, one per embedded sentence."""
        return [self.predict_single(sentence) for sentence in sentences]

    def scores(self, train, label):
        """Return the accuracy of the model over (train, label)."""
        results = self.predict(train)
        correct = sum(1 for i in range(len(label))
                      if int(label[i]) == int(results[i]))
        return correct / len(label)
if __name__ == '__main__':
    # Load the task-specific (vocabulary-intersected) GloVe model.
    gensim_file = 'D:/spyderProject/Task2/gloveModel/task_model.txt'
    model = gensim.models.KeyedVectors.load_word2vec_format(gensim_file)

    sentences, labels = datahandle.getTrainData()
    rnn = RNNClassificationModel(100)
    sentences = list(sentences)
    length = len(sentences)
    N = 20

    # Split the data into N consecutive slices and train on each in turn,
    # so the whole corpus is never embedded in memory at once.
    dataset = []
    labelset = []
    for part in range(N):
        lo = math.floor(part / N * length)
        hi = math.floor((part + 1) / N * length)
        dataset.append(sentences[lo:hi])
        labelset.append(labels[lo:hi])

    for part in range(N):
        print('第' + str(part) + '次')
        train = datahandle.embeddingSeq(model, dataset[part])
        rnn.fit(train, labelset[part])

    # Persist the trained classifier for the separate test script.
    dirs = 'D:/spyderProject/Task2/data/testModel'
    if not os.path.exists(dirs):
        os.makedirs(dirs)
    joblib.dump(rnn, dirs + '/RNN.pkl')
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn >= 0.23;
# prefer `import joblib` directly when upgrading.
from sklearn.externals import joblib
import os
import datahandle
import gensim
import math
import pandas as pd


def test():
    """Load the saved RNN model, classify the test set in 10 slices, and
    write the predicted classes to a CSV file."""
    dirs = 'D:/spyderProject/Task2/data/testModel'
    rnn = joblib.load(dirs + '/RNN.pkl')
    gensim_file = 'D:/spyderProject/Task2/gloveModel/task_model.txt'
    model = gensim.models.KeyedVectors.load_word2vec_format(gensim_file)

    testData = datahandle.getTestData()
    lenOftest = len(testData)

    # Embed and predict in 10 consecutive slices to bound peak memory use.
    testResult = []
    for i in range(10):
        lo = math.floor(i / 10 * lenOftest)
        hi = math.floor((i + 1) / 10 * lenOftest)
        batch = datahandle.embeddingSeq(model, testData[lo:hi])
        # extend() appends in place instead of rebuilding the list each slice.
        testResult.extend(rnn.predict(batch))

    df = pd.DataFrame(testResult)
    df.to_csv("D:/spyderProject/Task2/data/testResult3.csv")
    print('ok')


if __name__ == '__main__':
    # Guarded so importing this module does not kick off a prediction run
    # (the original called test() unconditionally on import).
    test()
提交结果:
正确率为0.59720,比之前使用softmax和one-hot训练的0.52提高了些许
本次实验主要熟悉pytorch的使用和熟悉RNN,第一次写有点不熟悉,有错望指出,共同学习~
本文部分参考和借鉴了以下文章,感谢这些大佬们的分享:
Gensim加载:
https://blog.csdn.net/zhujiyao/article/details/81112545
Glove模型理解:
https://blog.csdn.net/u014665013/article/details/79642083
Glove模型载入:
https://blog.csdn.net/sscssz/article/details/53333225
如何训练RNN和LSTM的:
https://blog.csdn.net/zhaojc1995/article/details/80572098
Pytorch实现RNN代码进行字符级分析
https://blog.csdn.net/zzulp/article/details/84971395
word embedding 的方式初始化
Word embedding 和 word2Vec:
https://blog.csdn.net/baimafujinji/article/details/77836142
参考
https://pytorch.org/
Convolutional Neural Networks for Sentence Classification https://arxiv.org/abs/1408.5882
https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。