I didn't train the model myself (it takes too long), I just read through the source code. The author of the code provides pretrained models, so if you want to use one, go give it a try.
My goal was mainly to get the code running end to end, and at least it runs without errors. Watch your memory while running it: you can tune the hyperparameters yourself to avoid running out of memory, and you will get an error if there isn't enough. My own GPU doesn't have enough memory, so I had to force it to run on the CPU.
PS: The original code is slow to run. If we just want to get it running, or inspect a particular node, we can trim down the original train file (the imports are not shown here because I renamed the model file on my side and didn't want to mislead you). Something like the snippet below runs fast and makes the code much easier to understand:
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
save_hparams(hp, hp.logdir)

# returns tf.data datasets
train_batches, num_train_batches, num_train_samples = get_batch(hp.train1, hp.train2,
                                                                hp.maxlen1, hp.maxlen2,
                                                                hp.vocab, hp.batch_size,
                                                                shuffle=True)
eval_batches, num_eval_batches, num_eval_samples = get_batch(hp.eval1, hp.eval2,
                                                             100000, 100000,
                                                             hp.vocab, hp.batch_size,
                                                             shuffle=False)

# create an iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_batches.output_types, train_batches.output_shapes)
xs, ys = iter.get_next()

train_init_op = iter.make_initializer(train_batches)
eval_init_op = iter.make_initializer(eval_batches)

logging.info("# Load model")
m = Transformer(hp)
loss, train_op, global_step = m.train(xs, ys)

# force CPU (my GPU memory is not enough)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

with tf.Session() as sess:
    # initialize the data iterator and the variables
    sess.run(train_init_op)
    sess.run(tf.global_variables_initializer())
    _loss = sess.run(loss)  # train loss
    print(_loss)
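For completeness, here is a plausible set of imports that the stripped-down script above assumes. The module names follow the original repo's layout (hparams.py, data_load.py, model.py, utils.py), so adjust them to whatever you named your own files:

import os
import logging

import tensorflow as tf

from hparams import Hparams      # hyper-parameter definitions (hparams.py in the repo)
from data_load import get_batch  # the batching helpers walked through below
from model import Transformer    # the Transformer graph
from utils import save_hparams   # dumps the hyper-parameters into the log dir

logging.basicConfig(level=logging.INFO)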
The result after preprocessing:
The result after segmenting with SentencePiece:
This file contains all the functions for loading and batching the data.
Loading the data:
def load_data(fpath1, fpath2, maxlen1, maxlen2):
    '''Loads source and target data and filters out too lengthy samples.
    fpath1: source file path. string.
    fpath2: target file path. string.
    maxlen1: source sent maximum length. scalar.
    maxlen2: target sent maximum length. scalar.

    Returns
    sents1: list of source sents
    sents2: list of target sents
    '''
    sents1, sents2 = [], []
    with open(fpath1, 'r') as f1, open(fpath2, 'r') as f2:
        for sent1, sent2 in zip(f1, f2):
            if len(sent1.split()) + 1 > maxlen1: continue  # 1: </s>
            if len(sent2.split()) + 1 > maxlen2: continue  # 1: </s>
            sents1.append(sent1.strip())
            sents2.append(sent2.strip())
    return sents1, sents2
def encode(inp, type, dict):
    '''Converts string to number. Used for `generator_fn`.
    inp: 1d byte array.
    type: "x" (source side) or "y" (target side)
    dict: token2idx dictionary

    Returns
    list of numbers
    '''
    inp_str = inp
    if type == "x": tokens = inp_str.split() + ["</s>"]
    else: tokens = ["<s>"] + inp_str.split() + ["</s>"]

    x = [dict.get(t, dict["<unk>"]) for t in tokens]
    return x
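To see what encode returns, here is a quick toy run (the vocabulary below is made up purely for illustration, it is not the repo's real vocab file; it assumes encode is called with plain Python strings, as in the listing above):

# toy vocabulary, purely for illustration
token2idx = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "ich": 4, "bin": 5}

print(encode("ich bin", "x", token2idx))   # [4, 5, 3]    -> source side gets only </s>
print(encode("ich bin", "y", token2idx))   # [2, 4, 5, 3] -> target side gets <s> ... </s>
print(encode("ich war", "y", token2idx))   # [2, 4, 1, 3] -> unknown token maps to <unk>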
def generator_fn(sents1, sents2, vocab_fpath):
    token2idx, _ = load_vocab(vocab_fpath)
    for sent1, sent2 in zip(sents1, sents2):
        x = encode(sent1, "x", token2idx)
        y = encode(sent2, "y", token2idx)
        decoder_input, y = y[:-1], y[1:]

        x_seqlen, y_seqlen = len(x), len(y)
        yield (x, x_seqlen, sent1), (decoder_input, y, y_seqlen, sent2)
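The line decoder_input, y = y[:-1], y[1:] is the usual teacher-forcing shift: the decoder reads <s> w1 w2 ... and is trained to predict w1 w2 ... </s>. Continuing with the toy vocabulary from the example above:

y_full = encode("ich bin", "y", token2idx)   # [2, 4, 5, 3] = <s> ich bin </s>
decoder_input, y = y_full[:-1], y_full[1:]
print(decoder_input)  # [2, 4, 5]  -> <s> ich bin   (what the decoder sees)
print(y)              # [4, 5, 3]  -> ich bin </s>  (what it must predict)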
Now we get to the most important function:
def input_fn(sents1, sents2, vocab_fpath, batch_size, shuffle=False):
    '''Batchify data
    sents1: list of source sents
    sents2: list of target sents
    vocab_fpath: string. vocabulary file path.
    batch_size: scalar
    shuffle: boolean

    Returns
    xs: tuple of
        x: int32 tensor. (N, T1)
        x_seqlens: int32 tensor. (N,)
        sents1: str tensor. (N,)
    ys: tuple of
        decoder_input: int32 tensor. (N, T2)
        y: int32 tensor. (N, T2)
        y_seqlen: int32 tensor. (N, )
        sents2: str tensor. (N,)
    '''
    shapes = (([None], (), ()),
              ([None], [None], (), ()))
    types = ((tf.int32, tf.int32, tf.string),
             (tf.int32, tf.int32, tf.int32, tf.string))
    paddings = ((0, 0, ''),
                (0, 0, 0, ''))

    dataset = tf.data.Dataset.from_generator(
        lambda: generator_fn(sents1, sents2, vocab_fpath),
        output_shapes=shapes,
        output_types=types)

    # see [5] for how the shuffle buffer size works
    if shuffle:  # for training
        dataset = dataset.shuffle(128 * batch_size)

    # multiple epochs
    dataset = dataset.repeat()  # iterate forever
    # pad to a fixed length; the shape is not fixed here -- as verified below, it defaults to
    # the length of the longest sentence within each batch
    dataset = dataset.padded_batch(batch_size, shapes, paddings).prefetch(1)

    return dataset
The three ways TensorFlow can read data (method 2) is what the code above uses; a minimal sketch of 1) and 3) follows this list):
1) Feeding: Python code supplies the data at each step while the graph runs.
2) Reading from files: an input pipeline reads the data from files at the start of the TensorFlow graph.
3) Preloaded data: constants or variables in the TensorFlow graph hold all of the data (only for small datasets).
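As a quick illustration of methods 1) and 3) (a throwaway TF 1.x sketch, not part of the repo; method 2) is essentially what input_fn above and get_batch below implement):

import tensorflow as tf

# 1) Feeding: the graph declares a placeholder, Python supplies the value at each step
x = tf.placeholder(tf.int32, shape=[None])
doubled = x * 2
with tf.Session() as sess:
    print(sess.run(doubled, feed_dict={x: [1, 2, 3]}))   # [2 4 6]

# 3) Preloading: small data can simply live inside the graph as a constant
small_data = tf.constant([[1, 2], [3, 4]])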
def get_batch(fpath1, fpath2, maxlen1, maxlen2, vocab_fpath, batch_size, shuffle=False):
    '''Gets training / evaluation mini-batches
    fpath1: source file path. string.
    fpath2: target file path. string.
    maxlen1: source sent maximum length. scalar.
    maxlen2: target sent maximum length. scalar.
    vocab_fpath: string. vocabulary file path.
    batch_size: scalar
    shuffle: boolean

    Returns
    batches
    num_batches: number of mini-batches
    num_samples
    '''
    # ...so this still loads everything into memory at once
    sents1, sents2 = load_data(fpath1, fpath2, maxlen1, maxlen2)
    batches = input_fn(sents1, sents2, vocab_fpath, batch_size, shuffle=shuffle)
    # number of batches
    num_batches = calc_num_batches(len(sents1), batch_size)
    return batches, num_batches, len(sents1)
self.embeddings = get_token_embeddings(self.hp.vocab_size, self.hp.d_model, zero_pad=True)
def get_token_embeddings(vocab_size, num_units, zero_pad=True):
    '''Constructs token embedding matrix.
    Note that the row of index 0 is set to zeros.
    vocab_size: scalar. V.
    num_units: embedding dimensionality. E.
    zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero
    To apply query/key masks easily, zero pad is turned on.

    Returns
    weight variable: (V, E)
    '''
    with tf.variable_scope("shared_weight_matrix"):
        embeddings = tf.get_variable('weight_mat',
                                     dtype=tf.float32,
                                     shape=(vocab_size, num_units),
                                     initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            embeddings = tf.concat((tf.zeros(shape=[1, num_units]),
                                    embeddings[1:, :]), 0)
    return embeddings
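The zero_pad branch simply overwrites row 0 (the id of the padding token) with zeros, so looking up a padded position always returns an all-zero vector. In NumPy terms, roughly:

import numpy as np

V, E = 6, 4
weight = np.random.randn(V, E).astype(np.float32)   # the learned matrix
weight_zero_pad = np.concatenate([np.zeros((1, E), np.float32), weight[1:]], axis=0)
print(weight_zero_pad[0])   # [0. 0. 0. 0.] -> <pad> (id 0) always embeds to zeros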
def positional_encoding(inputs, maxlen, masking=True, scope="positional_encoding"):
    '''Sinusoidal Positional_Encoding. See 3.5
    inputs: 3d tensor. (N, T, E)
    maxlen: scalar. Must be >= T
    masking: Boolean. If True, padding positions are set to zeros.
    scope: Optional scope for `variable_scope`.

    returns
    3d tensor that has the same shape as inputs.
    '''
    # same last dimension as the embedding, so the two can simply be added
    E = inputs.get_shape().as_list()[-1]  # static
    N, T = tf.shape(inputs)[0], tf.shape(inputs)[1]  # dynamic
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # position indices; T: max length in this batch, N: batch_size (tiled down the batch dimension)
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])  # (N, T)

        # First part of the PE function: sin and cos argument
        position_enc = np.array([
            [pos / np.power(10000, (i - i % 2) / E) for i in range(E)]
            for pos in range(maxlen)])

        # Second part, apply the cosine to even columns and sin to odds.
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
        position_enc = tf.convert_to_tensor(position_enc, tf.float32)  # (maxlen, E)

        # lookup
        # position_enc encodes every position up to maxlen; here we pick out the ones we need
        outputs = tf.nn.embedding_lookup(position_enc, position_ind)

        # masks
        # positions that were 0 (padding) stay 0
        if masking:
            outputs = tf.where(tf.equal(inputs, 0), inputs, outputs)

        return tf.to_float(outputs)
It is really just the positional encoding computed from the formula, but the shape juggling in the middle dazzled me. Luckily I know the mighty art of print().
position_ind has shape [N, T], and both N and T are dynamic, so what exactly is it?
It is basically [[0, 1, 2, ...], [0, 1, 2, ...], ...]; its only real job is to serve as an index, nothing more.
You can think of position_enc as an embedding of positions, computed from the formula above. The key point is that its shape is [maxlen, E]; note that maxlen is not equal to T. The function that actually puts this information to use is embedding_lookup, which I had underestimated: I had only ever used it with 1-D indices. What does a 2-D index do? See below:
You can think of it like this: the first dimension of the index is the number of sentences (the batch), and the second is the number of words in each sentence (T), so it pulls out the positional encodings for the whole batch in one shot.
So what is the final shape?
[N, T, E]
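Here is a tiny stand-alone check of what a 2-D index does in tf.nn.embedding_lookup (toy sizes, chosen only to make the shapes easy to read):

import numpy as np
import tensorflow as tf

maxlen, E = 7, 4
position_enc = tf.constant(np.arange(maxlen * E).reshape(maxlen, E).astype(np.float32))  # (maxlen, E)

N, T = 2, 3   # pretend batch size and batch-local max length
position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])   # (N, T): [[0,1,2],[0,1,2]]
outputs = tf.nn.embedding_lookup(position_enc, position_ind)     # (N, T, E)

print(outputs.shape)   # (2, 3, 4): one E-dim positional vector per (sentence, position)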
But one thing kept puzzling me: the data has already been padded (in input_fn's padded_batch), so why is T1 still a variable length here? My take is that padded_batch is given a non-fixed shape, so by default it pads to the longest sentence within each batch rather than padding everything to a single global length. I'm not very familiar with this function, so for now that's just a guess. Encoding the padded positions here also does no harm, because we have the masks to mark them.
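A minimal stand-alone check of padded_batch (independent of the repo's code) supports this guess: each batch is padded to its own longest element, not to a global maximum.

import tensorflow as tf

# four "sentences" of different lengths, fed through the same from_generator / padded_batch path
ds = tf.data.Dataset.from_generator(lambda: ([1], [1, 2], [1, 2, 3], [1, 2, 3, 4, 5]),
                                    output_types=tf.int32, output_shapes=[None])
ds = ds.padded_batch(2, padded_shapes=[None])
batch = ds.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    print(sess.run(batch))   # [[1 0] [1 2]]             -> padded to 2, the longest in THIS batch
    print(sess.run(batch))   # [[1 2 3 0 0] [1 2 3 4 5]] -> padded to 5, not to a global maximum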
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        # tf.equal compares the two tensors element-wise and returns True where they are equal,
        # False otherwise; the result has the same shape as x
        # i.e. it marks the positions in the sentence that were padded with 0
        # (N, T1)
        src_masks = tf.equal(x, 0)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        # scale to compensate for the dimensionality
        enc *= self.hp.d_model**0.5  # scale

        # add positional information to the sentence
        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

        # final output: (N, T1, d_model)
        memory = enc
    return memory, sents1, src_masks
def multihead_attention(queries, keys, values, key_masks,
                        num_heads=8,
                        dropout_rate=0,
                        training=True,
                        causality=False,
                        scope="multihead_attention"):
    '''Applies multihead attention. See 3.2.2
    queries: A 3d tensor with shape of [N, T_q, d_model].
    keys: A 3d tensor with shape of [N, T_k, d_model].
    values: A 3d tensor with shape of [N, T_k, d_model].
    key_masks: A 2d tensor with shape of [N, key_seqlen]
    num_heads: An int. Number of heads.
    dropout_rate: A floating point number.
    training: Boolean. Controller of mechanism for dropout.
    causality: Boolean. If true, units that reference the future are masked.
    scope: Optional scope for `variable_scope`.

    Returns
    A 3d tensor with shape of (N, T_q, C)
    '''
    d_model = queries.get_shape().as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Linear projections
        # forward pass: compute Q, K, V
        Q = tf.layers.dense(queries, d_model, use_bias=True)  # (N, T_q, d_model)
        K = tf.layers.dense(keys, d_model, use_bias=True)     # (N, T_k, d_model)
        V = tf.layers.dense(values, d_model, use_bias=True)   # (N, T_k, d_model)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, d_model/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)

        # compute attention over Q_, K_, V_
        outputs = scaled_dot_product_attention(Q_, K_, V_, key_masks, causality, dropout_rate, training)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, d_model)

        # Residual connection
        outputs += queries

        # Normalize (N, T_q, d_model)
        outputs = ln(outputs)

    return outputs
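The split-and-concat pair is just a cheap way to reshape (N, T, d_model) into h head-sized slices stacked along the batch axis, and the restore step undoes it. The shapes can be checked in isolation:

import tensorflow as tf

N, T, d_model, h = 2, 5, 8, 4
Q = tf.zeros([N, T, d_model])
Q_ = tf.concat(tf.split(Q, h, axis=2), axis=0)          # (h*N, T, d_model/h)
restored = tf.concat(tf.split(Q_, h, axis=0), axis=2)   # back to (N, T, d_model)
print(Q_.shape, restored.shape)                         # (8, 5, 2) (2, 5, 8)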
Computing Q, K, V.
Computing the similarity scores + softmax.
def scaled_dot_product_attention(Q, K, V, key_masks,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="scaled_dot_product_attention"):
    '''See 3.2.1.
    Q: Packed queries. 3d tensor. [N, T_q, d_k].
    K: Packed keys. 3d tensor. [N, T_k, d_k].
    V: Packed values. 3d tensor. [N, T_k, d_v].
    key_masks: A 2d tensor with shape of [N, key_seqlen]
    causality: If True, applies masking for future blinding
    dropout_rate: A floating point number of [0, 1].
    training: boolean for controlling dropout
    scope: Optional scope for `variable_scope`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        d_k = Q.get_shape().as_list()[-1]

        # compute the similarity between Q and K.
        # Q: (h*N, T_q, d_model/h), K: (h*N, T_k, d_model/h) (T_q == T_k for self-attention)
        # tf.transpose permutes the dimensions, giving (h*N, d_model/h, T_k)
        # tf.matmul multiplies over the last two dimensions, so the result is
        # (h*N, T_q, T_k)
        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))

        # scale; I wasn't sure why we scale here -- see the note after this function
        outputs /= d_k ** 0.5

        # key_masks: [N, key_seqlen]
        # the shape of outputs does not change
        outputs = mask(outputs, key_masks=key_masks, type="key")

        # causality or future blinding masking
        if causality:
            outputs = mask(outputs, type="future")

        # softmax turns the scores into probabilities
        outputs = tf.nn.softmax(outputs)

        # (h*N, T_k, T_q); this transpose is only for the visualization below
        attention = tf.transpose(outputs, [0, 2, 1])
        # logged to TensorBoard as an attention visualization; note that the image summary
        # shown in TensorBoard is always the one from the last global step
        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))

        # dropout
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=training)

        # weighted sum (context vectors)
        outputs = tf.matmul(outputs, V)  # (N, T_q, d_v)

    return outputs
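On the "not sure why we scale" comment: section 3.2.1 of the paper explains it. For d_k-dimensional vectors whose components have unit variance, the dot product has a variance of roughly d_k, which pushes the softmax into regions with vanishing gradients; dividing by sqrt(d_k) brings the variance back to about 1. A quick NumPy check:

import numpy as np

d_k = 64
q = np.random.randn(10000, d_k)
k = np.random.randn(10000, d_k)
scores = (q * k).sum(axis=1)            # 10000 raw dot products

print(np.var(scores))                   # ~64, grows with d_k
print(np.var(scores / d_k ** 0.5))      # ~1 after scaling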
def mask(inputs, key_masks=None, type=None):
    """Masks paddings on keys or queries to inputs
    inputs: 3d tensor. (h*N, T_q, T_k)
    key_masks: 2d tensor. (N, T_k)
    type: string. "key" | "future"

    e.g.,
    >> inputs = tf.zeros([2, 2, 3], dtype=tf.float32)
    >> key_masks = tf.constant([[0., 0., 1.],
                                [0., 1., 1.]])
    >> mask(inputs, key_masks=key_masks, type="key")
    array([[[ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09],
            [ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09]],

           [[ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09],
            [ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09]],

           [[ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09],
            [ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09]],

           [[ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09],
            [ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09]]], dtype=float32)
    """
    padding_num = -2 ** 32 + 1
    # padding mask
    if type in ("k", "key", "keys"):
        # key_masks: [N, T1]
        # convert the boolean mask to float
        key_masks = tf.to_float(key_masks)
        # tf.tile(key_masks, [h, 1]): how many copies to make along each dimension;
        # the point is to match the h attention heads stacked along the batch axis.
        # Output: (h*N, T1)
        key_masks = tf.tile(key_masks, [tf.shape(inputs)[0] // tf.shape(key_masks)[0], 1])
        # expand to (h*N, 1, T1) so it broadcasts across rows;
        # the target to be masked is (h*N, T_q, T_k)
        key_masks = tf.expand_dims(key_masks, 1)
        # end result as in the example above: every position that needs masking
        # becomes a very large negative number
        outputs = inputs + key_masks * padding_num
    # blind out future information
    elif type in ("f", "future", "right"):
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        # lower-triangular matrix: the upper-right triangle is all zeros
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
        # tile the same mask across the whole batch
        future_masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)

        paddings = tf.ones_like(future_masks) * padding_num
        # very elegant
        outputs = tf.where(tf.equal(future_masks, 0), paddings, inputs)
    else:
        print("Check if you entered type correctly!")

    return outputs
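For the "future" branch, a toy run (assuming the mask function above is in scope, with deliberately tiny shapes) shows the upper triangle being replaced by the large negative constant, so after the softmax each query position can only attend to itself and earlier positions:

import tensorflow as tf

inputs = tf.ones([1, 3, 3])               # (N=1, T_q=3, T_k=3)
masked = mask(inputs, type="future")

with tf.Session() as sess:
    print(sess.run(masked)[0])
# [[ 1.0000000e+00 -4.2949673e+09 -4.2949673e+09]
#  [ 1.0000000e+00  1.0000000e+00 -4.2949673e+09]
#  [ 1.0000000e+00  1.0000000e+00  1.0000000e+00]]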
def ln(inputs, epsilon=1e-8, scope="ln"):
    '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
    inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
    epsilon: A floating number. A very small number for preventing ZeroDivision Error.
    scope: Optional scope for `variable_scope`.

    Returns:
    A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        # mean and variance are taken over the last dimension of (N, T_q, d_model),
        # i.e. each position's output vector is normalized on its own
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta

    return outputs
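ln normalizes each position's d_model-dimensional vector to zero mean and unit variance and then rescales it with the learned gamma and beta. The normalization step written out in NumPy (ignoring the learned parameters):

import numpy as np

x = np.random.randn(2, 3, 8)                    # (N, T, d_model)
mean = x.mean(axis=-1, keepdims=True)
var = x.var(axis=-1, keepdims=True)
normalized = (x - mean) / np.sqrt(var + 1e-8)

print(normalized.mean(axis=-1).round(6))        # ~0 for every (batch, position)
print(normalized.var(axis=-1).round(6))         # ~1 for every (batch, position)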
def ff(inputs, num_units, scope="positionwise_feedforward"):
    '''position-wise feed forward net. See 3.3
    inputs: A 3d tensor with shape of [N, T, C].
    num_units: A list of two integers.
    scope: Optional scope for `variable_scope`.

    Returns:
    A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Inner layer
        # num_units: [self.hp.d_ff, self.hp.d_model], [2048, 512] by default
        # outputs: [N, T, d_ff]
        outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)

        # Outer layer
        # [N, T, d_model]
        outputs = tf.layers.dense(outputs, num_units[1])

        # Residual connection
        outputs += inputs

        # Normalize
        outputs = ln(outputs)

    return outputs
def decode(self, ys, memory, src_masks, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    src_masks: (N, T1)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # tgt_masks
        tgt_masks = tf.equal(decoder_inputs, 0)  # (N, T2)

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec,
                                          keys=dec,
                                          values=dec,
                                          key_masks=tgt_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=True,
                                          scope="self_attention")

                # Vanilla attention (the middle, encoder-decoder layer)
                dec = multihead_attention(queries=dec,
                                          keys=memory,
                                          values=memory,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        # also a matrix multiplication, between a 3-D and a 2-D tensor
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        # (N, T2)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
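The einsum line is just a batched matrix multiplication between a 3-D and a 2-D tensor. The same contraction in NumPy:

import numpy as np

dec = np.random.randn(2, 5, 8)       # (N, T2, d_model)
weights = np.random.randn(8, 11)     # (d_model, vocab_size)
logits = np.einsum('ntd,dk->ntk', dec, weights)

print(logits.shape)                          # (2, 5, 11)
print(np.allclose(logits, dec @ weights))    # True: same as a plain matmul broadcast over N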
As I said above, I was puzzled about what the padded data actually looks like: is everything padded to the global maximum length, or only to the longest sentence within each batch? So I did a bit of black-magic exploration.
32 is my batch_size and 44 is the sentence length, but my maximum sentence length is 100. Oh-ho, look at that: my guess was right!
Let's print it twice:
After confirming that guess, another question popped up: if the sentence dimension of the output changes dynamically, does my model really accept inputs whose shape changes dynamically too? That can't be right, can it? (Probably because I'm so used to always padding everything to one fixed size.)
Next, let's explore the shapes inside encode.
Q = tf.layers.dense(queries, d_model, use_bias=True) # (N, T_q, d_model)
enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
# Inner layer
outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
# Outer layer
outputs = tf.layers.dense(outputs, num_units[1])
# Residual connection
outputs += inputs
# Normalize
outputs = ln(outputs)
num_units is set by hand, so here again the changing sentence length has no effect on these weight parameters.
And after that there are no other parameters in encode at all... ???
So it's fine that my sentence length keeps changing, even though it goes against my intuition (it gave me the illusion that the weight parameters would have to keep changing along with it!!!).
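The reason a varying sentence length is harmless: a dense layer applied to an (N, T, d_model) tensor only owns a (d_model, units) kernel and a (units,) bias, so T never shows up in any weight shape. A quick check (throwaway code, not from the repo):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, None, 512])   # both batch size and sentence length dynamic
y = tf.layers.dense(x, 2048, name="probe")

print([(v.name, v.shape.as_list()) for v in tf.trainable_variables()])
# [('probe/kernel:0', [512, 2048]), ('probe/bias:0', [2048])] -> nothing depends on T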
Here is a picture from my own notes:
That feeling of not thinking things through and then paying the price for it...
There's one more thing I want to look at: in the decoder there is one attention layer that is special because it takes in two sources of information. That's the one below:
The three arrows above correspond to:
queries=dec,
keys=memory,
values=memory
def label_smoothing(inputs, epsilon=0.1):
    '''Applies label smoothing. See 5.4 and https://arxiv.org/abs/1512.00567.
    inputs: 3d tensor. [N, T, V], where V is the number of vocabulary.
    epsilon: Smoothing rate.

    For example,

    ```
    import tensorflow as tf
    inputs = tf.convert_to_tensor([[[0, 0, 1],
                                    [0, 1, 0],
                                    [1, 0, 0]],
                                   [[1, 0, 0],
                                    [1, 0, 0],
                                    [0, 1, 0]]], tf.float32)

    outputs = label_smoothing(inputs)

    with tf.Session() as sess:
        print(sess.run([outputs]))

    >>
    [array([[[ 0.03333334,  0.03333334,  0.93333334],
             [ 0.03333334,  0.93333334,  0.03333334],
             [ 0.93333334,  0.03333334,  0.03333334]],
            [[ 0.93333334,  0.03333334,  0.03333334],
             [ 0.93333334,  0.03333334,  0.03333334],
             [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
    ```
    '''
    V = inputs.get_shape().as_list()[-1]  # number of channels
    return ((1 - epsilon) * inputs) + (epsilon / V)
That's the end of this article!
[1] The Annotated Transformer
[2] A TensorFlow Implementation of the Transformer: Attention Is All You Need
[4] Using the tf.data.Dataset API in TensorFlow
[5] Understanding the buffer_size argument of tf.data.Dataset.shuffle(buffer_size)
[6] Multi-tensor computation in TensorFlow
[7] Basic usage of einsum
[8] A brief introduction to the BLEU score