赞
踩
Seq2Seq技术,全称Sequence to Sequence。该技术突破了传统的固定大小输入问题框架,开创了将经典深度神经网络模型(DNNs)运用于翻译、文本自动摘要、机器人自动问答以及一些回归预测任务的先河,并被证实在英语-法语翻译、英语-德语翻译以及人机短问快答的应用中有着不俗的表现。
Seq2Seq解决问题的主要思路是通过深度神经网络模型(常用的是LSTM,即长短期记忆网络,一种循环神经网络,参见 http://dataxujing.coding.me/深度学习之RNN/ ),将一个作为输入的序列映射为一个作为输出的序列。这一过程由编码(Encoder)输入与解码(Decoder)输出两个环节组成:前者负责把输入序列编码成一个固定长度的向量,这个向量作为输入传给后者,再由后者输出可变长度的序列。
(1)训练代码
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

# Character-level seq2seq training script: reads tab-separated sentence pairs,
# one-hot encodes them, trains an LSTM encoder/decoder and saves the model.

# Training configuration.
batch_size = 64  # number of samples per gradient update
epochs = 30  # number of passes over the training data
latent_dim = 128  # dimensionality of the LSTM state vectors
num_samples = 10000  # upper bound on the number of lines read from the file
data_path = '../dataset/fra-eng/fra.txt'

# ---------------------------------------------------------------------------
# Prepare data
# ---------------------------------------------------------------------------
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
# `len(lines) - 1` drops the last element of the split (empty when the file
# ends with a newline).
for line in lines[: min(num_samples, len(lines) - 1)]:
    # Only the first two tab-separated fields are used; any further fields
    # are ignored, so both two- and three-column files are accepted.
    # A line with fewer than two fields still raises ValueError.
    input_text, target_text = line.split("\t")[:2]
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character; no membership pre-check is needed.
    input_characters.update(input_text)
    target_characters.update(target_text)

input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

# Character -> integer index lookup tables.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# One-hot tensors of shape (samples, max sequence length, vocabulary size).
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype="float32",
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype="float32",
)

# The encoder input is one sentence and the decoder output is one sentence,
# i.e. the outputs returned by predict correspond to one sentence.
# NOTE: padding uses the space character, so " " must be present in both
# vocabularies; the dictionary lookups below raise KeyError otherwise.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    # Pad the rest of the sentence with spaces. Indexing by len() instead of
    # the leaked loop variable keeps this correct for an empty input text.
    encoder_input_data[i, len(input_text):, input_token_index[" "]] = 1.0

    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    # Pad the decoder input after the last character, and the (shifted)
    # decoder target starting one step earlier.
    decoder_input_data[i, len(target_text):, target_token_index[" "]] = 1.0
    decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

# ---------------------------------------------------------------------------
# Build model
# ---------------------------------------------------------------------------
# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
print('in', encoder_inputs.shape)
encoder = layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# With return_sequences=True the LSTM would output a matrix (one vector per
# timestep). The default is False, so the output is the single vector h_n,
# i.e. only the last one is kept. Attention models (seen later) use the matrix
# output because all previous states h_1, h_2 ... h_n must be remembered and
# compared against.
print(encoder_outputs.shape)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
print(decoder_outputs.shape)
decoder_dense = layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# ---------------------------------------------------------------------------
# Train
# ---------------------------------------------------------------------------
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
# Save model
# NOTE(review): some Keras releases require a ".keras"/".h5" extension on the
# save path - confirm against the installed version. The path is left as-is
# because the prediction script loads the same name.
model.save("s2s")
(2)预测代码
# Inference script: restores the trained model, rebuilds separate encoder and
# decoder models from its layers, and decodes a few training sentences.
model = keras.models.load_model("s2s")
model.summary()

# --- Encoder: maps an input sequence to its final LSTM states --------------
encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
# The output of this model is encoder_states, matching what is fed to the
# decoder below: the decoder only needs to receive the state coming from the
# encoder.
encoder_model = keras.Model(encoder_inputs, encoder_states)

# --- Decoder: one step at a time, with the states passed in explicitly -----
decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_4")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target string.

    Args:
        input_seq: one-hot encoded input holding a single sequence (the
            sampling loop assumes a batch of size 1); the demo below passes
            ``encoder_input_data[k:k + 1]``.

    Returns:
        The decoded characters joined into one string. Decoding stops after
        the end character ``"\\n"`` is produced (it is included in the
        result) or once the result is longer than ``max_decoder_seq_length``.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 and populate its first
    # character with the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Collect characters in a list and join once at the end instead of
    # growing a string with repeated `+=`.
    decoded_chars = []
    while True:
        # `[target_seq] + states_value` concatenates the two lists into the
        # decoder model's input list.
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token: pick the most probable character of the last step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_chars.append(sampled_char)

        # Exit condition: either hit max length or find stop character.
        # Every element is a single character, so the list length equals the
        # length of the decoded string.
        if sampled_char == "\n" or len(decoded_chars) > max_decoder_seq_length:
            break

        # Update the target sequence (of length 1) with the sampled token.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return "".join(decoded_chars)


# Decode up to 40 sequences; bounded by the number of loaded samples so a
# small dataset does not run past the end of the arrays.
for seq_index in range(min(40, len(input_texts))):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。