·多对一:输入数据是一个序列,但输出数据不是序列而是固定的向量。例如,情感分析的输入基于文本,而输出是分类标签。
# Many-to-one sentiment classification of movie reviews with an LSTM.
#
# Pipeline: read the review CSV, tokenize and map every word to an integer,
# left-pad / truncate each review to a fixed length, then train and evaluate
# a (optionally multi-layer) LSTM built with the TensorFlow 1.x graph API.

import os
import re
from collections import Counter
from string import punctuation

import numpy as np
import pandas as pd
import pyprind
import tensorflow.compat.v1 as tf

# The model below uses placeholders, sessions and tf.layers, i.e. the TF1
# graph-mode API, so v2 behavior is switched off on the compat module.
tf.disable_v2_behavior()

# Raw string: the original literal relied on '\m' not being a recognized
# escape sequence; the resulting path value is unchanged.
df = pd.read_csv(r'xxx\movie_data.csv', encoding='utf-8')
print(df.head(3))

## Preprocessing: separate words and count each unique word's occurrences
counts = Counter()
pbar = pyprind.ProgBar(len(df['review']),
                       title='Counting words occurences')
cleaned_reviews = []
for review in df['review']:
    # Surround every punctuation character with spaces so that a plain
    # str.split() later yields words and punctuation marks as separate tokens.
    text = ''.join(
        [c if c not in punctuation else ' ' + c + ' ' for c in review]
    ).lower()
    cleaned_reviews.append(text)
    pbar.update()
    # Accumulate the frequency of every token seen in the corpus.
    counts.update(text.split())
# Write the cleaned texts back in one assignment instead of one slow
# df.loc[...] write per row.
df['review'] = cleaned_reviews

## Create a mapping:
## Map each unique word to an integer
# Sort the unique words by descending frequency; indices start at 1 because
# 0 is reserved for padding.
word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}

# Convert every review into a list of integers.
mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']),
                       title='Map reviews to ints')
for review in df['review']:
    mapped_reviews.append([word_to_int[word] for word in review.split()])
    pbar.update()

## Define same-length sequences so the data fits the RNN input:
## shorter reviews are left-padded with zeros, longer ones keep only
## their last `sequence_length` tokens.
sequence_length = 200  ## sequence length (or T in our formulas)
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in enumerate(mapped_reviews):
    if not row:
        # An empty review stays all-zero. Without this guard the slice
        # `-0:` would select the whole row and the assignment would fail.
        continue
    tail = np.array(row[-sequence_length:])
    sequences[i, -len(tail):] = tail

# Per the original author's note the dataset is already shuffled, so the
# first half is used for training and the second half for testing.
# Positional slicing keeps X and y the same length; the previous
# `df.loc[:25000, ...]` was label-inclusive and returned 25001 labels.
n_train = 25000
X_train = sequences[:n_train, :]
y_train = df['sentiment'].values[:n_train]
X_test = sequences[n_train:, :]
y_test = df['sentiment'].values[n_train:]

np.random.seed(123)  # for reproducibility


## Function to generate minibatches:
def create_batch_generator(x, y=None, batch_size=64):
    """Yield consecutive mini-batches of ``x`` (and ``y`` when given).

    Only full batches are produced: trailing samples that do not fill a
    whole batch are dropped.

    Args:
        x: Sliceable sequence/array of inputs.
        y: Optional sliceable sequence/array of targets aligned with ``x``.
        batch_size: Number of samples per batch.

    Yields:
        ``(x_batch, y_batch)`` tuples when ``y`` is given, otherwise
        ``x_batch`` alone.
    """
    # Floor division: the number of complete batches that fit into x.
    n_batches = len(x) // batch_size
    x = x[:n_batches * batch_size]
    if y is not None:
        y = y[:n_batches * batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii:ii + batch_size], y[ii:ii + batch_size]
        else:
            yield x[ii:ii + batch_size]


## Build the RNN model
class SentimentRNN(object):
    """LSTM-based binary sentiment classifier (TensorFlow 1.x graph API).

    The graph is built once in the constructor; ``train`` and ``predict``
    each open their own session on that graph and address tensors by name.

    Args:
        n_words: Vocabulary size, including index 0 used for padding.
        seq_len: Length of every input sequence.
        lstm_size: Number of hidden units per LSTM layer.
        num_layers: Number of stacked LSTM layers.
        batch_size: Fixed batch size baked into the placeholders.
        learning_rate: Learning rate passed to the Adam optimizer.
        embed_size: Dimensionality of the word embedding.
    """

    def __init__(self, n_words, seq_len=200,
                 lstm_size=256, num_layers=1, batch_size=64,
                 learning_rate=0.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  ## number of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size

        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self):
        """Create placeholders, embedding, LSTM stack, loss and optimizer.

        Must be called with ``self.g`` as the default graph. Ops are given
        explicit names because ``train`` and ``predict`` fetch them by name.
        """
        ## Define the placeholders
        tf_x = tf.placeholder(tf.int32,
                              shape=(self.batch_size, self.seq_len),
                              name='tf_x')
        tf_y = tf.placeholder(tf.float32,
                              shape=(self.batch_size,),
                              name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                                     name='tf_keepprob')

        ## Create the embedding layer
        # A trainable [n_words x embed_size] matrix initialized with
        # uniform random floats in [-1, 1].
        embedding = tf.Variable(
            tf.random_uniform(
                (self.n_words, self.embed_size),
                minval=-1, maxval=1),
            name='embedding')
        # Look up the embedding row of every word index in tf_x.
        embed_x = tf.nn.embedding_lookup(
            embedding, tf_x,
            name='embeded_x')

        ## Define LSTM cell and stack them together
        # Each layer is a BasicLSTMCell wrapped in a DropoutWrapper (dropout
        # on the cell output); MultiRNNCell stacks the layers.
        cells = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(self.lstm_size),
                output_keep_prob=tf_keepprob)
             for i in range(self.num_layers)])

        ## Define the initial state (all zeros) of the RNN cells.
        # With batch_size=100 and lstm_size=128 both c and h of each layer
        # have shape (100, 128).
        self.initial_state = cells.zero_state(
            self.batch_size, tf.float32)
        print(' << initial state >> ', self.initial_state)

        # Combine the embedded input, the cells and their initial state.
        # final_state is exposed so that callers can feed it back in as the
        # initial state of the next mini-batch.
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cells, embed_x,
            initial_state=self.initial_state)
        ## Note: lstm_outputs shape:
        ##  [batch_size, max_time, cells.output_size]
        print('\n << lstm_output >> ', lstm_outputs)
        print('\n << final state >> ', self.final_state)

        ## Apply a FC layer after on top of RNN output:
        # Only the output of the last time step feeds the classifier.
        logits = tf.layers.dense(
            inputs=lstm_outputs[:, -1],
            units=1, activation=None,
            name='logits')

        # Drop the size-1 unit axis: (batch_size, 1) -> (batch_size,)
        logits = tf.squeeze(logits, name='logits_squeezed')
        print('\n << logits >> ', logits)

        # The sigmoid squashes the logits into the range (0, 1).
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        # Rounding the probability gives the predicted 0/1 class label.
        predictions = {
            'probabilities': y_proba,
            'labels': tf.cast(tf.round(y_proba), tf.int32,
                              name='labels')
        }
        print('\n << predictions >> ', predictions)

        ## Define the cost function
        # Mean sigmoid cross-entropy over the batch.
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf_y, logits=logits),
            name='cost')

        ## Define the optimizer
        # Adam: a gradient-based optimizer with adaptive per-parameter
        # step sizes.
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X_train, y_train, num_epochs):
        """Train the model and checkpoint it every 10 epochs.

        Args:
            X_train: Integer array of shape (n_samples, seq_len).
            y_train: Array of 0/1 labels aligned with ``X_train``.
            num_epochs: Number of passes over the training data.
        """
        # Ensure the checkpoint directory exists before the first save;
        # depending on the TF version, saving into a missing directory
        # may fail.
        os.makedirs('model', exist_ok=True)
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                # Reset the recurrent state at the start of every epoch.
                state = sess.run(self.initial_state)

                for batch_x, batch_y in create_batch_generator(
                        X_train, y_train, self.batch_size):
                    # Dropout keep probability 0.5 while training; the
                    # previous batch's final state is the new initial state.
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op',
                         self.final_state],
                        feed_dict=feed)

                    if iteration % 20 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (
                                  epoch + 1, num_epochs,
                                  iteration, loss))

                    iteration += 1
                if (epoch + 1) % 10 == 0:
                    self.saver.save(sess,
                                    "model/sentiment-%d.ckpt" % epoch)

    def predict(self, X_data, return_proba=False):
        """Predict labels or probabilities using the latest checkpoint.

        As in ``train``, the recurrent state is carried from batch to batch.
        Only full batches are evaluated, so the result may be shorter than
        ``X_data``.

        Args:
            X_data: Integer array of shape (n_samples, seq_len).
            return_proba: If True return sigmoid probabilities, otherwise
                return 0/1 class labels.

        Returns:
            1-D numpy array with one entry per evaluated sample.
        """
        preds = []
        with tf.Session(graph=self.g) as sess:
            # latest_checkpoint() locates the most recent checkpoint saved
            # under 'model/'.
            self.saver.restore(
                sess, tf.train.latest_checkpoint('model/'))
            test_state = sess.run(self.initial_state)
            for ii, batch_x in enumerate(
                    create_batch_generator(
                        X_data, None,
                        batch_size=self.batch_size), 1):
                # Keep probability 1.0: dropout is disabled at inference.
                feed = {'tf_x:0': batch_x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: test_state}
                if return_proba:
                    pred, test_state = sess.run(
                        ['probabilities:0', self.final_state],
                        feed_dict=feed)
                else:
                    pred, test_state = sess.run(
                        ['labels:0', self.final_state],
                        feed_dict=feed)

                preds.append(pred)

        return np.concatenate(preds)


## Train:
# n_words is the number of unique words + 1; the extra slot is index 0,
# which pads sequences shorter than sequence_length.
n_words = max(word_to_int.values()) + 1

rnn = SentimentRNN(n_words=n_words,
                   seq_len=sequence_length,
                   embed_size=256,
                   lstm_size=128,   # hidden units in each RNN layer
                   num_layers=1,    # single-layer RNN
                   batch_size=100,
                   learning_rate=0.001)

# Train the model for 40 epochs.
rnn.train(X_train, y_train, num_epochs=40)

## Test:
preds = rnn.predict(X_test)
# predict() drops the trailing partial batch, so trim the labels to match.
y_true = y_test[:len(preds)]
print('Test Acc.: %.3f' % (
      np.sum(preds == y_true) / len(y_true)))

## Get probabilities:
proba = rnn.predict(X_test, return_proba=True)
review sentiment
0 In 1974, the teenager Martha Moxley (Maggie Gr… 1
1 OK… so… I really like Kris Kristofferson a… 0
2 SPOILER Do not read this, if you think a… 0
Counting words occurences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:35
['the', '.', ',', 'and', 'a']
Map reviews to ints
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02
<< initial state >> (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)
<< lstm_output >> Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)
<< final state >> (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)
<< logits >> Tensor("logits_squeezed:0", shape=(100,), dtype=float32)
<< predictions >> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
Epoch: 1/40 Iteration: 20 | Train loss: 0.70443
Epoch: 1/40 Iteration: 40 | Train loss: 0.58803
Epoch: 1/40 Iteration: 60 | Train loss: 0.64603
Epoch: 1/40 Iteration: 80 | Train loss: 0.55665
Epoch: 1/40 Iteration: 100 | Train loss: 0.53824
Epoch: 1/40 Iteration: 120 | Train loss: 0.54014
Epoch: 1/40 Iteration: 140 | Train loss: 0.60872
Epoch: 1/40 Iteration: 160 | Train loss: 0.54468
Epoch: 1/40 Iteration: 180 | Train loss: 0.56579
Epoch: 1/40 Iteration: 200 | Train loss: 0.46205
Epoch: 1/40 Iteration: 220 | Train loss: 0.36559
Epoch: 1/40 Iteration: 240 | Train loss: 0.46353
Epoch: 40/40 Iteration: 9760 | Train loss: 0.00081
Epoch: 40/40 Iteration: 9780 | Train loss: 0.00019
Epoch: 40/40 Iteration: 9800 | Train loss: 0.00124
Epoch: 40/40 Iteration: 9820 | Train loss: 0.00006
Epoch: 40/40 Iteration: 9840 | Train loss: 0.00017
Epoch: 40/40 Iteration: 9860 | Train loss: 0.00005
Epoch: 40/40 Iteration: 9880 | Train loss: 0.00036
Epoch: 40/40 Iteration: 9900 | Train loss: 0.00004
Epoch: 40/40 Iteration: 9920 | Train loss: 0.00032
Epoch: 40/40 Iteration: 9940 | Train loss: 0.00011
Epoch: 40/40 Iteration: 9960 | Train loss: 0.00173
Epoch: 40/40 Iteration: 9980 | Train loss: 0.00032
Epoch: 40/40 Iteration: 10000 | Train loss: 0.00009
Test Acc.: 0.852
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。