赞
踩
循环神经网络对长度可变的序列数据有较强的处理能力,在NLP领域独领风骚。
10.1 循环神经网络简介
1.循环神经网络的前向传播程序设计
import numpy as np

# Minimal forward pass of a vanilla RNN, unrolled over a 2-step sequence.
x = [0.8, 0.1]              # scalar input at t=1 and t=2
init_state = [0.3, 0.6]     # initial hidden state h_0

# Recurrent (hidden-to-hidden) weights, input weights, hidden bias.
W = np.asarray([[0.2, 0.4], [0.7, 0.3]])
U = np.asarray([0.8, 0.1])
b_h = np.asarray([0.2, 0.1])

# Output projection (hidden-to-output) and its bias.
V = np.asarray([[0.5], [0.5]])
b_o = 0.1

for i in range(len(x)):
    # h_t = tanh(h_{t-1} . W + x_t * U + b_h)
    before_activation = np.dot(init_state, W) + x[i] * U + b_h
    state = np.tanh(before_activation)
    init_state = state  # carry the hidden state to the next step
    # o_t = h_t . V + b_o
    final_output = np.dot(state, V) + b_o
    print("t%s state : %s" % (i + 1, state))
    print("t%s output: %s\n" % (i + 1, final_output))
2.循环神经网络的梯度
3.循环神经网络的不同设计模式
(1)每个时刻都有输出,并且在隐藏层之间引入定向循环。
(2)每个时刻都有输出,且在该时刻的输出到下一时刻的隐藏层之间有循环连接。相对于前一种一般效果较差,因为o比h而言缺少对过去的重要信息。
(3)隐藏层之间存在着循环连接,但是输出在若干时刻后,而不是每一个时刻都有输出。
10.2 自然语言建模和词向量
Word2Vec 是Google推出的一款计算词向量的工具。
1.统计学语言模型
2.Word2Vec
VSM(向量空间模型),通过统计语义假说(Statistical Semantics Hypothesis,语言的统计特征隐藏着语义的信息)。比较流行的两个派生版本为:Bag of Words Hypothesis(统计一篇文章的词频,使用较高频次出现的词代表文档的主题) 和Distributional Hypothesis(上下文环境相似的两个词语义也相近)。
向量空间模型可大致分为两类:一类是计数模型(Latent Semantic Analysis):统计在语料库中相邻出现的词的频率,再把这些计数的结果转化为小而稠密的矩阵。另一类是预测模型(Neural Probabilistic Language Models):根据某个词相邻的词推测出这个词及其空间向量。
3.使用TensorFlow实现Word2Vec
import numpy as np
import tensorflow as tf
import collections
import random
import zipfile

vocabulary_size = 50000
file = "./Word2vec/text8.zip"


def read_data(file):
    """Read the first member of the text8 zip archive and split it into words."""
    with zipfile.ZipFile(file=file) as f:
        original_data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return original_data


original_words = read_data(file)
print("len of original word:", len(original_words))


def build_vocabulary(original_words):
    """Map the corpus to integer ids over the 50000 most common words.

    Returns:
        data: the corpus encoded as word ids (0 = out-of-vocabulary).
        count: [word, frequency] pairs; count[0] is the unknown bucket.
        dictionary: word -> id.
        reverse_dictionary: id -> word.
    """
    count = [["unknown", -1]]
    count.extend(collections.Counter(original_words).most_common(vocabulary_size - 1))
    print(count)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unknown_count = 0
    for word in original_words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # id 0 is reserved for words outside the vocabulary
            unknown_count += 1
        data.append(index)
    count[0][1] = unknown_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary


# BUG FIX: the original called build_vocabulary twice in a row; once is enough.
data, count, dictionary, reverse_dictionary = build_vocabulary(original_words)
print("Most common words (+unknown)", count[:5])
print("Sample data", data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0


def generate_batch(batch_size, num_of_samples, skip_distance):
    """Produce one skip-gram batch: (center word id, one context word id) pairs.

    num_of_samples context words are drawn (without repetition) from a window
    of skip_distance words on each side of the center word.
    """
    global data_index
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    num_of_sample_words = 2 * skip_distance + 1
    # BUG FIX: the original wrote `maxlen==num_of_sample_words` — a comparison
    # against an undefined name (NameError) — instead of the keyword `maxlen=`.
    buffer = collections.deque(maxlen=num_of_sample_words)
    for _ in range(num_of_sample_words):
        buffer.append(data[data_index])
        # BUG FIX: wrap around so repeated calls never index past the corpus end.
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_of_samples):
        target = skip_distance           # buffer center = the input word
        targets_to_avoid = [skip_distance]
        for j in range(num_of_samples):
            while target in targets_to_avoid:
                target = random.randint(0, num_of_sample_words - 1)
            targets_to_avoid.append(target)
            batch[i * num_of_samples + j] = buffer[skip_distance]
            labels[i * num_of_samples + j, 0] = buffer[target]
        buffer.append(data[data_index])  # slide the window one word forward
        data_index = (data_index + 1) % len(data)
    return batch, labels


batch, labels = generate_batch(batch_size=8, num_of_samples=2, skip_distance=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], "->",
          labels[i, 0], reverse_dictionary[labels[i, 0]])
import numpy as np
import tensorflow.compat.v1 as tf
import math
import vocabulary

max_steps = 10000
batch_size = 128
embedding_size = 128      # dimensionality of each word vector
skip_distance = 1
num_of_samples = 2
vocabulary_size = 50000
# 16 validation word ids sampled from the 100 most frequent words.
valid_sample = np.random.choice(100, 16, replace=False)
num_sampled = 64          # number of negative samples for NCE loss

with tf.Graph().as_default():
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    nce_weight = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Noise-contrastive estimation loss (was misspelled `nec_loss`).
    nce_loss = tf.nn.nce_loss(weights=nce_weight, biases=nce_biases,
                              labels=train_labels, inputs=embed,
                              num_sampled=num_sampled,
                              num_classes=vocabulary_size)
    loss = tf.reduce_mean(nce_loss)
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # BUG FIX: the L2 norm is the sqrt of the SUM of squares; the original used
    # reduce_mean, which mis-scales every row by sqrt(embedding_size).
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normal_embeddings = embeddings / norm
    valid_inputs = tf.constant(valid_sample, dtype=tf.int32)
    valid_embeddings = tf.nn.embedding_lookup(normal_embeddings, valid_inputs)
    # Cosine similarity of validation words against the whole vocabulary.
    similarity = tf.matmul(valid_embeddings, normal_embeddings, transpose_b=True)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        total_loss = 0
        average_loss = 0
        for step in range(max_steps + 1):
            batch_inputs, batch_labels = vocabulary.generate_batch(
                batch_size, num_of_samples, skip_distance)
            loss_val, _ = sess.run([loss, optimizer],
                                   feed_dict={train_inputs: batch_inputs,
                                              train_labels: batch_labels})
            total_loss += loss_val
            if step > 0 and step % 1000 == 0:
                average_loss = total_loss / 1000
                print("Average loss at %d step is : %f " % (step, average_loss))
                average_loss = 0
                total_loss = 0
            if step > 0 and step % 5000 == 0:
                similar = similarity.eval()
                for i in range(16):
                    # Skip index 0 (the word itself), keep the next 8 neighbors.
                    nearest = (-similar[i, :]).argsort()[1:8 + 1]
                    valid_word = vocabulary.reverse_dictionary[valid_sample[i]]
                    nearest_information = "Nearest to %s is : " % valid_word
                    for j in range(8):
                        close_word = vocabulary.reverse_dictionary[nearest[j]]
                        # BUG FIX: the original dropped the `%` operator, making
                        # this a call on a string (TypeError).
                        nearest_information = "%s %s" % (nearest_information,
                                                         close_word)
                    print("valid_word is: %s" % valid_word)
                    print(nearest_information)
        # BUG FIX: the variable is `normal_embeddings`; `normalize_embeddings`
        # was never defined (NameError).
        final_embeddings = normal_embeddings.eval()
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import vocabulary
import Word2Vec_skip

# Project the learned embeddings down to 2-D for plotting.
# BUG FIX: the keyword was misspelled `n_comonents` (TypeError).
tsne = TSNE(perplexity=30, n_components=2, init="pca", n_iter=5000)
plot_only = 100  # visualize only the 100 most frequent words
low_dim_embbs = tsne.fit_transform(Word2Vec_skip.final_embeddings[:plot_only, :])

labels = list()
for i in range(plot_only):
    labels.append(vocabulary.reverse_dictionary[i])

plt.figure(figsize=(20, 20))
for j, label in enumerate(labels):
    x, y = low_dim_embbs[j, :]
    plt.scatter(x, y)
    # Label each point slightly offset from its marker.
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
                 textcoords="offset points", ha="right", va="bottom")
plt.savefig(fname="after_tsne.png")
10.3 LSTM实现自然语言建模
# Load MNIST; each 28x28 image is later fed to the RNN as a sequence of 28 rows.
import tensorflow as tf
(train_images,train_labels),(test_images,test_labels)=tf.keras.datasets.mnist.load_data()
# Scale pixel values from [0, 255] down to [0, 1].
train_images,test_images=train_images/255.0,test_images/255.0
# Keep one labeled example around for inspection.
sample,sample_label=train_images[0],train_labels[0]
def build_model():
    """Build a Sequential classifier: an LSTM read over the 28 image rows,
    batch normalization, then a 10-way softmax output layer."""
    rnn_layer = tf.keras.layers.RNN(tf.keras.layers.LSTMCell(units=64),
                                    input_shape=(None, 28))
    # BUG FIX: the original wrote `tf.keras.layers,Dense(...)` — a comma typo
    # that calls a bare `Dense` name instead of tf.keras.layers.Dense.
    model = tf.keras.models.Sequential([
        rnn_layer,
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(units=10, activation='softmax'),
    ])
    return model
model=build_model()
# Integer class-id labels with softmax outputs -> sparse categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',optimizer='SGD',metrics=['accuracy'])
# Validate on the test split after each of the 20 epochs; batches of 100 images.
model.fit(train_images,train_labels,validation_data=(test_images,test_labels),batch_size=100,epochs=20)
model.summary()
LSTM在自然语言建模中的应用
http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
git clone https://github.com/tensorflow/models.git
使用Models库中/tutorials/rnn/ptb路径下的reader.py文件来操作PTB数据集的内容。
import numpy as np
import time
import tensorflow.compat.v1 as tf
import reader


class Config(object):
    """Hyper-parameters for the PTB LSTM language model."""
    init_scale = 0.1       # uniform range for weight initialization
    learning_rate = 1.0
    max_grad_norm = 5      # global-norm gradient clipping threshold
    num_layers = 2
    num_steps = 20         # truncated-BPTT sequence length
    word_dimension = 200   # embedding size == LSTM hidden size
    max_epoch = 4          # epochs trained at the full learning rate
    total_epoch = 13
    keep_prob = 1.0
    lr_decay = 0.5
    batch_size = 20
    vocab_size = 10000


class PTBModel(object):
    """Multi-layer LSTM language model over the PTB corpus."""

    def __init__(self, is_training, config, data, name=None):
        self.batch_size = config.batch_size
        self.num_steps = config.num_steps
        self.epoch_size = ((len(data) // self.batch_size) - 1) // self.num_steps
        self.input_data, self.targets = reader.ptb_producer(
            data, self.batch_size, self.num_steps, name=name)
        self.keep_prob = config.keep_prob
        self.word_dimension = config.word_dimension
        # BUG FIX: `tf.nn.rnn_cell,BasicLSTMCell` (comma typo) in the original.
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(
            self.word_dimension, forget_bias=0.0, state_is_tuple=True)
        if is_training and config.keep_prob < 1:
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=self.keep_prob)
        # BUG FIX: the original read config.num_steps here, stacking 20 layers
        # instead of the intended config.num_layers (2).
        self.num_layers = config.num_layers
        cell_layer = tf.nn.rnn_cell.MultiRNNCell(
            [lstm_cell for _ in range(self.num_layers)], state_is_tuple=True)
        self.initial_state = cell_layer.zero_state(self.batch_size, tf.float32)
        self.vocab_size = config.vocab_size
        embedding = tf.get_variable(
            "embedding", [self.vocab_size, self.word_dimension], dtype=tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # Unroll the LSTM num_steps times, reusing the same weights each step.
        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(self.num_steps):
                if time_step > 0:
                    # BUG FIX: the method is reuse_variables(), plural.
                    tf.get_variable_scope().reuse_variables()
                cell_output, state = cell_layer(inputs[:, time_step, :], state)
                outputs.append(cell_output)
        # BUG FIX: tf.rehape -> tf.reshape (twice in the original).
        output = tf.reshape(tf.concat(outputs, 1), [-1, self.word_dimension])
        weight = tf.get_variable(
            "softmax_w", [self.word_dimension, self.vocab_size], dtype=tf.float32)
        bias = tf.get_variable("softmax_b", [self.vocab_size], dtype=tf.float32)
        logits = tf.matmul(output, weight) + bias
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([self.batch_size * self.num_steps], dtype=tf.float32)])
        self.cost = tf.reduce_sum(loss) / self.batch_size
        self.final_state = state
        if not is_training:
            return
        # BUG FIX: `tf.Variables(.0, ...)` -> tf.Variable(0.0, ...).
        self.learning_rate = tf.Variable(0.0, trainable=False)
        gradients = tf.gradients(self.cost, tf.trainable_variables())
        clipped_grads, _ = tf.clip_by_global_norm(gradients, config.max_grad_norm)
        SGDOptimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        self.train_op = SGDOptimizer.apply_gradients(
            zip(clipped_grads, tf.trainable_variables()),
            global_step=tf.train.get_or_create_global_step())
        self.new_learning_rate = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self.learning_rate_update = tf.assign(self.learning_rate,
                                              self.new_learning_rate)

    def assign_lr(self, session, lr_value):
        """Set the model's learning rate (BUG FIX: parameter was `selfself`)."""
        session.run(self.learning_rate_update,
                    feed_dict={self.new_learning_rate: lr_value})


def run_epoch(session, model, train_op=None, output_log=False):
    """Run one full epoch over the model's data; return the perplexity."""
    start_time = time.time()
    costs = 0
    iters = 0
    state = session.run(model.initial_state)
    # BUG FIX: fetch keys must match the keys read from `result` below
    # (original used "costs"/"final_stat" but read "cost"/"final_state").
    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }
    if train_op is not None:
        fetches["train_op"] = train_op
    for step in range(model.epoch_size):
        # Feed the previous step's final LSTM state back in as the initial state.
        feed_dict = {}
        for i, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[i].c
            feed_dict[h] = state[i].h
        result = session.run(fetches, feed_dict)
        cost = result["cost"]
        state = result["final_state"]
        # BUG FIX: the original wrote `cost += cost`, so `costs` stayed 0.
        costs += cost
        iters += model.num_steps
        if output_log and step % (model.epoch_size // 10) == 10:
            print("step%.3f perplexity: %.3f speed: %.0f words/sec"
                  % (step, np.exp(costs / iters),
                     iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)


train_data, valid_data, test_data, _ = reader.ptb_raw_data(
    "../PTB/simplep-examples/data/")
train_config = Config()
valid_config = Config()
test_config = Config()
# Evaluate the test set one word at a time.
test_config.batch_size = 1
test_config.num_steps = 1

with tf.Graph().as_default():
    # BUG FIX: `config` was undefined at this point; use train_config.
    initializer = tf.random_uniform_initializer(-train_config.init_scale,
                                                train_config.init_scale)
    with tf.name_scope("Train"):
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            Model_train = PTBModel(is_training=True, config=train_config,
                                   data=train_data, name="TrainModel")
    with tf.name_scope("Valid"):
        # BUG FIX: the original rebound Model_train here and used reuse=None,
        # which would fail on duplicate variables; the valid/test models must
        # reuse the training variables and be bound to their own names.
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            Model_valid = PTBModel(is_training=False, config=valid_config,
                                   data=valid_data, name="ValidModel")
    with tf.name_scope("Test"):
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            Model_test = PTBModel(is_training=False, config=test_config,
                                  data=test_data, name="TestModel")

    sv = tf.train.Supervisor()
    with sv.managed_session() as session:
        for i in range(train_config.total_epoch):
            # Keep the full learning rate for max_epoch epochs, then decay.
            lr_decay = train_config.lr_decay ** max(i + 1 - train_config.max_epoch, 0.0)
            Model_train.assign_lr(session, train_config.learning_rate * lr_decay)
            print("Epoch: %d Learning rate : %.3f"
                  % (i + 1, session.run(Model_train.learning_rate)))
            train_perplexity = run_epoch(session, Model_train,
                                         train_op=Model_train.train_op,
                                         output_log=True)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, Model_valid)
            print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
        test_perplexity = run_epoch(session, Model_test)
        print("Test Perplexity : %.3f" % test_perplexity)
Dropout
import tensorflow.compat.v1 as tf

# Wrap an LSTM cell with dropout on its outputs, then stack it into layers.
lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
# BUG FIX: the module is tf.nn.rnn_cell — the original misspelled it `run_cell`
# in both of the following lines (AttributeError).
dropout_lstm = tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=0.5)
stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([dropout_lstm] * number_of_layers)
10.4 循环神经网络的变种
深层循环神经网络
import tensorflow.compat.v1 as tf

# Sketch of a deep (stacked) LSTM unrolled over num_steps time steps.
lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
# BUG FIX: module misspelled `run_cell`, and the variable was defined as
# `statcked_lstm` but called below as `stacked_lstm` (NameError).
stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * number_of_layers)
for i in range(num_steps):
    if i > 0:
        # Reuse the same cell weights at every time step after the first.
        tf.get_variable_scope().reuse_variables()
    stacked_lstm_output, state = stacked_lstm(current_input, state)
    final_output = fc(stacked_lstm_output)
    # BUG FIX: `expexted_output` typo.
    loss += calculate_loss(final_output, expected_output)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。