These are study notes on the sentiment classification techniques in Chapter 8 of 《Python自然语言处理实战:核心技术与算法》 (Python Natural Language Processing in Action: Core Technologies and Algorithms) by Tu Ming et al.
"""
First, load two data files: a 400,000-word vocabulary list and the corresponding 400,000 x 50 word embedding matrix.
"""
import numpy as np
wordsList = np.load('./wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist()  # originally loaded as a numpy array
wordsList = [word.decode('UTF-8') for word in wordsList]  # decode bytes into UTF-8 strings
wordVectors = np.load('./wordVectors.npy')
print ('Loaded the word vectors!')
print(len(wordsList))
print(wordVectors.shape)
Loaded the word list!
Loaded the word vectors!
400000
(400000, 50)
# We can look up a word in the vocabulary, e.g. "baseball", and then read its vector out of the embedding matrix:
baseballIndex = wordsList.index('baseball')
wordVectors[baseballIndex]
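To make such lookups reusable, here is a minimal helper of my own (not from the book); it falls back to index 399999, which this walkthrough later uses as the unknown-word slot:
def getWordVector(word):
    # hypothetical helper: return the 50-d vector for `word`,
    # or the unknown-word vector (row 399999) if it is out of vocabulary
    try:
        return wordVectors[wordsList.index(word)]
    except ValueError:
        return wordVectors[399999]

print(getWordVector('baseball').shape)  # (50,)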
The first step in building the model is to take an input sentence and construct its vector representation. Suppose the input sentence is "I thought the movie was incredible and inspiring".
import tensorflow as tf
maxSeqLength = 10  # maximum length of the sentence
numDimensions = 50  # dimensions of each word vector, matching the 400000x50 matrix loaded above
firstSentence = np.zeros((maxSeqLength), dtype='int32')
firstSentence[0] = wordsList.index("i")
firstSentence[1] = wordsList.index("thought")
firstSentence[2] = wordsList.index("the")
firstSentence[3] = wordsList.index("movie")
firstSentence[4] = wordsList.index("was")
firstSentence[5] = wordsList.index("incredible")
firstSentence[6] = wordsList.index("and")
firstSentence[7] = wordsList.index("inspiring")
# firstSentence[8] and firstSentence[9] stay 0 (padding)
print(firstSentence.shape)
print(firstSentence)  # shows the row index for each word
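As a quick sanity check (a sketch assuming TensorFlow 1.x, as used throughout these notes), running the embedding lookup on this index vector should yield one 50-dimensional vector per position:
with tf.Session() as sess:
    # each of the 10 word indices is mapped to its 50-d embedding
    print(tf.nn.embedding_lookup(wordVectors, firstSentence).eval().shape)
# expected: (10, 50)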
"""
Before building the index matrix for the whole training set, we first visualize and analyze the data to choose a good maximum sequence length.
The training set is the IMDB dataset, which contains 25,000 reviews: 12,500 positive and 12,500 negative.
Each review is stored as a separate text file, so these files need to be parsed first. The preprocessing steps are as follows:
"""
from os import listdir
from os.path import isfile, join

positiveFiles = ['pos/' + f for f in listdir('pos/') if isfile(join('pos/', f))]
negativeFiles = ['neg/' + f for f in listdir('neg/') if isfile(join('neg/', f))]
numWords = []
for pf in positiveFiles:
    with open(pf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        numWords.append(counter)
print('Positive files finished')

for nf in negativeFiles:
    with open(nf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        numWords.append(counter)
print('Negative files finished')

numFiles = len(numWords)
print('The total number of files is', numFiles)
print('The total number of words in the files is', sum(numWords))
print('The average number of words in the files is', sum(numWords)/len(numWords))

import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(numWords, 50)
plt.xlabel('Sequence Length')
plt.ylabel('Frequency')
plt.axis([0, 1200, 0, 8000])
plt.show()
maxSeqLength = 250  # the histogram shows most reviews contain fewer than 250 words
fname = positiveFiles[3]  # can use any valid index (not just 3)
with open(fname) as f:
    for lines in f:
        print(lines)
# Remove punctuation, parentheses, question marks, etc., keeping only alphanumeric characters
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string)

firstFile = np.zeros((maxSeqLength), dtype='int32')
with open(fname) as f:
    indexCounter = 0
    line = f.readline()
    cleanedLine = cleanSentences(line)
    split = cleanedLine.split()
    for word in split:
        try:
            firstFile[indexCounter] = wordsList.index(word)
        except ValueError:
            firstFile[indexCounter] = 399999  # index for unknown words
        indexCounter = indexCounter + 1
firstFile
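The next step would be to apply the same conversion to all 25,000 reviews. Below is a sketch of how such an ids matrix could be built (variable names are my own); in practice this is very slow because wordsList.index() is a linear scan over 400,000 entries, which is why a precomputed matrix is loaded next:
numFiles = len(positiveFiles) + len(negativeFiles)
ids = np.zeros((numFiles, maxSeqLength), dtype='int32')
fileCounter = 0
for reviewFile in positiveFiles + negativeFiles:
    with open(reviewFile, "r", encoding='utf-8') as f:
        cleanedLine = cleanSentences(f.readline())
        for indexCounter, word in enumerate(cleanedLine.split()):
            if indexCounter >= maxSeqLength:
                break  # truncate reviews longer than maxSeqLength
            try:
                ids[fileCounter][indexCounter] = wordsList.index(word)
            except ValueError:
                ids[fileCounter][indexCounter] = 399999  # unknown word
    fileCounter += 1
np.save('idsMatrix', ids)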
# To save time, load the precomputed ids matrix, in which every review has already been converted to word indices
ids = np.load('./idsMatrix.npy')
Helper functions
from random import randint

def getTrainBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        if (i % 2 == 0):
            # rows 1-11499 of the ids matrix are positive training reviews
            num = randint(1, 11499)
            labels.append([1, 0])
        else:
            # rows 13499-24999 are negative training reviews
            num = randint(13499, 24999)
            labels.append([0, 1])
        arr[i] = ids[num-1:num]
    return arr, labels

def getTestBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        # rows 11499-13499 are held out for testing
        num = randint(11499, 13499)
        if (num <= 12499):
            labels.append([1, 0])
        else:
            labels.append([0, 1])
        arr[i] = ids[num-1:num]
    return arr, labels
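A quick shape check of these helpers (my own sketch, not from the book); they read the globals batchSize and maxSeqLength when called, so set them first with the same values used in the next section:
batchSize = 24
maxSeqLength = 250
arr, batchLabels = getTrainBatch()
print(np.array(arr).shape)          # (24, 250)
print(np.array(batchLabels).shape)  # (24, 2)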
RNN Model
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 50000
import tensorflow as tf  # TensorFlow 1.4 was used here
tf.reset_default_graph()
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
# The embedding lookup maps each word index to its 50-d vector,
# producing a tensor of shape [batchSize, maxSeqLength, numDimensions]
data = tf.nn.embedding_lookup(wordVectors, input_data)
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# dynamic_rnn returns outputs of shape [batchSize, maxSeqLength, lstmUnits];
# transpose to [maxSeqLength, batchSize, lstmUnits] so the last time step is easy to grab
value = tf.transpose(value, [1, 0, 2])
# take the output at the final time step
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    # next batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch()
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})

    if (i % 1000 == 0 and i != 0):
        loss_ = sess.run(loss, {input_data: nextBatch, labels: nextBatchLabels})
        accuracy_ = sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})
        print("iteration {}/{}...".format(i+1, iterations),
              "loss {}...".format(loss_),
              "accuracy {}...".format(accuracy_))

    # save the network every 10,000 training iterations
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
# Restore the most recently saved checkpoint
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models'))
Test results
iterations = 10
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch()
    print("Accuracy for this batch:",
          (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)
Accuracy for this batch: 91.66666865348816
Accuracy for this batch: 79.16666865348816
Accuracy for this batch: 75.0
Accuracy for this batch: 87.5
Accuracy for this batch: 87.5
Accuracy for this batch: 83.33333134651184
Accuracy for this batch: 66.66666865348816
Accuracy for this batch: 83.33333134651184
Accuracy for this batch: 83.33333134651184
Accuracy for this batch: 87.5
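Per-batch accuracy varies quite a bit; to get a steadier estimate one can average over the batches (a small sketch of my own, not from the book):
accuracies = []
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch()
    accuracies.append(sess.run(accuracy, {input_data: nextBatch,
                                          labels: nextBatchLabels}))
print("Mean accuracy over {} batches: {:.2f}%".format(
    iterations, 100 * sum(accuracies) / len(accuracies)))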