Input Embedding and Output Embedding
This part encodes the words fed into the encoder and the decoder using the torch.nn.Embedding() module. Because each language has a very large vocabulary, a traditional one-hot encoding would produce extremely long vectors, which increases the amount of computation, and most elements of the resulting encoding matrix would be zero.
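To illustrate the difference, here is a minimal sketch (with made-up vocabulary and embedding sizes, not taken from the article's code): a one-hot vector has one entry per vocabulary word, while nn.Embedding simply looks up a dense, learned vector for each index.

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, d_model = 10000, 512          # hypothetical sizes
tokens = torch.tensor([[3, 17, 256, 9]])  # one sentence of 4 word indices

# one-hot encoding: shape (1, 4, 10000), almost entirely zeros
one_hot = F.one_hot(tokens, num_classes=vocab_size).float()

# nn.Embedding: a learned lookup table returning dense vectors of shape (1, 4, 512)
embed = nn.Embedding(vocab_size, d_model)
dense = embed(tokens)
print(one_hot.shape, dense.shape)  # torch.Size([1, 4, 10000]) torch.Size([1, 4, 512])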
Positional Encoding
Positional encoding. A sentence is a sequence, and the order in which its words appear matters. To better capture this word-order information, the positions themselves are also encoded and then added to the word embeddings. Here, too, the ready-made torch.nn.Embedding() module can be used, i.e. a learned positional encoding (the original paper instead uses fixed sinusoidal encodings).
Overall
The word embeddings and the positional embeddings are added together. (Presumably other ways of combining them would also work, e.g. a weighted combination; the code below simply uses an element-wise sum.) The summed matrix is then copied three times and fed into the Multi-Head Attention module described below.
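A minimal sketch of this sum (with made-up shapes; the full model below does the same thing through its own embedding class):

import torch
import torch.nn as nn

vocab_size, max_len, d_model = 10000, 10, 512   # hypothetical sizes
word_embed = nn.Embedding(vocab_size, d_model)  # word embedding table
pos_embed = nn.Embedding(max_len, d_model)      # learned positional embedding table

tokens = torch.randint(0, vocab_size, (2, max_len))            # (batch, seq_len) word indices
positions = torch.arange(max_len).unsqueeze(0).expand(2, -1)   # positions 0..seq_len-1 for every sentence

x = word_embed(tokens) + pos_embed(positions)   # simple element-wise sum, shape (2, 10, 512)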
Multi-Head Attention
This part builds the multi-head attention mechanism. Inside this module, the output of the previous layer is copied three times and fed into three fully connected layers (which amounts to multiplying by three matrices) to produce the query, key and value. The query is then dot-multiplied with the key and passed through a softmax, yielding an attention weight over each word's value; finally the values are weighted by these attention weights and summed.
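The core of a single attention head can be sketched as follows (a simplified stand-alone version; the article's multiple_head_attention class below additionally splits the tensors into several heads, applies dropout, and adds a residual connection and normalization):

import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v):
    # q, k, v: (batch, seq_len, d_k)
    d_k = k.size(-1)
    scores = torch.bmm(q, k.transpose(1, 2)) / (d_k ** 0.5)  # (batch, seq_len, seq_len)
    weights = F.softmax(scores, dim=-1)                      # attention weight of every word over every other word
    return torch.bmm(weights, v)                             # weighted sum of the values

q = k = v = torch.randn(2, 10, 64)
out = scaled_dot_product_attention(q, k, v)  # (2, 10, 64)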
Feed Forward
This part passes the normalized output through two fully connected layers.
Add & Norm
This part is the residual connection plus normalization: the input and the output of the multi-head attention / feed-forward layers (the two modules above) are added together through a residual connection and then passed through a layer-normalization layer.
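The pattern can be written compactly as below (a sketch that uses the built-in torch.nn.LayerNorm for brevity; the article's code instead defines its own layer_normalization class and performs the residual addition inside each sub-module):

import torch
import torch.nn as nn

d_model = 512
norm = nn.LayerNorm(d_model)
sublayer = nn.Linear(d_model, d_model)  # stand-in for the attention or feed-forward sub-module

x = torch.randn(2, 10, d_model)
out = norm(x + sublayer(x))  # residual connection followed by layer normalization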
Overall
Note the (N×) on the left of the figure: it means the whole Encoder block, in the order (Multi-Head Attention -> Add & Norm -> Feed Forward -> Add & Norm), is repeated N times in total; in the original paper it is repeated 6 times.
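For illustration, such a stack can also be expressed with nn.ModuleList instead of the __setattr__ / __getattr__ approach used in the full model below (a sketch with a hypothetical EncoderBlock built from PyTorch's own nn.MultiheadAttention; batch_first=True assumes a reasonably recent PyTorch):

import torch
import torch.nn as nn

class EncoderBlock(nn.Module):
    # hypothetical block: self-attention and feed-forward, each followed by Add & Norm
    def __init__(self, d_model=512, num_heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        self.ff = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.ReLU(),
                                nn.Linear(4 * d_model, d_model))
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        x = self.norm1(x + self.attn(x, x, x)[0])  # Multi-Head Attention -> Add & Norm
        x = self.norm2(x + self.ff(x))             # Feed Forward -> Add & Norm
        return x

blocks = nn.ModuleList([EncoderBlock() for _ in range(6)])  # N = 6, as in the paper
x = torch.randn(2, 10, 512)
for block in blocks:  # the same block structure applied N times in sequence
    x = block(x)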
Multi-Head Attention
This builds the same multi-head attention as in the Encoder block; the only difference is that its key and value come from the Encoder's output (while the query comes from the decoder's previous layer).
Masked Multi-Head Attention
A mask is added on top of the multi-head attention. When the ground-truth target sentence is fed in, each position must not attend to the positions that come after it, so a mask is applied that hides the later part of the target sequence.
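A sketch of how such a mask can be built with torch.tril (the multiple_head_attention code below constructs it the same way, except that it fills the masked positions with a large negative constant instead of -inf):

import torch

seq_len = 5
scores = torch.randn(seq_len, seq_len)           # raw attention scores for one head
mask = torch.tril(torch.ones(seq_len, seq_len))  # lower triangle: 1 = may attend, 0 = future position

# future positions get a huge negative score, so softmax gives them (almost) zero weight
masked_scores = scores.masked_fill(mask == 0, float('-inf'))
weights = torch.softmax(masked_scores, dim=-1)
print(weights)  # each row only attends to itself and earlier positions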
Add & Norm
Same as in the Encoder block.
Feed Forward
Same as in the Encoder block.
Linear & Softmax
This part outputs, for every position, a probability for each word in the vocabulary.
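In code this is just a linear projection to the vocabulary size followed by a softmax (a sketch with made-up sizes; the full model below does this with its logits_layer):

import torch
import torch.nn as nn
import torch.nn.functional as F

d_model, vocab_size = 512, 10000   # hypothetical sizes
proj = nn.Linear(d_model, vocab_size)

dec_out = torch.randn(2, 10, d_model)       # decoder output: (batch, seq_len, d_model)
probs = F.softmax(proj(dec_out), dim=-1)    # (batch, seq_len, vocab_size); each row sums to 1
pred = probs.argmax(dim=-1)                 # most probable word index at each position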
## Embedding class
class embedding(nn.Module):
    def __init__(self, input_dim, output_dim, padding_is=True):
        """
        :param input_dim: input dimension (vocabulary size)
        :param output_dim: output dimension (embedding size)
        :param padding_is: whether padding is used
        """
        super(embedding, self).__init__()
        self.padding_is = padding_is   # keep the constructor argument instead of hard-coding True
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embed = torch.nn.Embedding(self.input_dim, self.output_dim)

    def forward(self, x):
        ### map the input indices to vectors of the output dimension
        output = self.embed(x)
        return output
### Multi-head attention
class multiple_head_attention(nn.Module):
    def __init__(self, num_units, num_heads=8, dropout_rate=0, masked=False):
        """
        :param num_units: model dimension
        :param num_heads: number of attention heads
        :param dropout_rate: dropout rate
        :param masked: whether this is the masked attention of the decoder
        """
        super(multiple_head_attention, self).__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate
        self.masked = masked
        ## Q, K, V layers: three fully connected layers, input num_units, output num_units
        self.Q = nn.Sequential(nn.Linear(self.num_units, self.num_units), nn.ReLU())
        self.K = nn.Sequential(nn.Linear(self.num_units, self.num_units), nn.ReLU())
        self.V = nn.Sequential(nn.Linear(self.num_units, self.num_units), nn.ReLU())
        self.output_dropout = nn.Dropout(p=self.dropout_rate)     # dropout layer
        self.normalization = layer_normalization(self.num_units)  ### normalization: the layer_normalization class defined below

    def forward(self, queries, keys, values):
        ## generate q, k, v through the three fully connected layers
        q = self.Q(queries)
        k = self.K(keys)
        v = self.V(values)

        ## reshape for the multiple heads
        ## ----------------------------------------------------------------
        ## split q, k, v into num_heads chunks along dim=2, then concatenate them along dim=0
        ## [512, 10, 512] -> [512, 10, 64] * 8 -> [4096, 10, 64]
        q_ = torch.cat(torch.chunk(q, self.num_heads, dim=2), dim=0)
        k_ = torch.cat(torch.chunk(k, self.num_heads, dim=2), dim=0)
        v_ = torch.cat(torch.chunk(v, self.num_heads, dim=2), dim=0)

        ### transpose k_ on dimensions (1, 2), then compute the attention scores of q against k
        outputs = torch.bmm(q_, k_.permute(0, 2, 1))
        ## scale the scores, as in the original paper
        outputs = outputs / (k_.size()[-1] ** 0.5)

        ## apply the mask if this is the masked attention of the decoder
        if self.masked:
            diag_vals = torch.ones(*outputs[0, :, :].size()).cuda()  ## a 2-D matrix with the last two sizes of outputs
            tril = torch.tril(diag_vals, diagonal=0)                 ## lower-triangular matrix of ones
            masks = Variable(torch.unsqueeze(tril, 0).repeat(outputs.size()[0], 1, 1))  ## repeat along the first dimension of outputs
            padding = Variable(torch.ones(*masks.size()).cuda() * (-2 ** 32 + 1))       ## same shape as masks, very large negative value
            condition = masks.eq(0.).float()                         ## 1 where the mask blocks a position
            outputs = padding * condition + outputs * (1. - condition)  ## masked scores become a huge negative number

        # Activation: softmax over the last dimension
        outputs = F.softmax(outputs, dim=-1)    # (h*N, T_q, T_k)
        # Dropout
        outputs = self.output_dropout(outputs)  # (h*N, T_q, T_k)
        ## weight the values with the attention weights
        outputs = torch.bmm(outputs, v_)
        ## undo the head split: [4096, 10, 64] -> [512, 10, 64] * 8 -> [512, 10, 512]
        ## ----------------------------------------------------------------
        outputs = torch.cat(torch.chunk(outputs, self.num_heads, dim=0), dim=2)
        ## residual connection
        outputs += queries
        # Normalize
        outputs = self.normalization(outputs)  # (N, T_q, C)
        return outputs
### Feed-forward layers
class feedforward(nn.Module):
    def __init__(self, inputs_channels, num_units=[2048, 512]):
        """
        :param inputs_channels: input dimension
        :param num_units: sizes of the two linear layers
        """
        super(feedforward, self).__init__()
        self.inputs_channels = inputs_channels
        self.num_units = num_units
        ## the fully connected operation could be implemented either with convolutions or with linear layers;
        ## linear layers are used here
        self.layer1 = nn.Sequential(nn.Linear(self.inputs_channels, self.num_units[0]), nn.ReLU())
        self.layer2 = nn.Linear(self.num_units[0], self.num_units[1])
        self.normalization = layer_normalization(self.inputs_channels)  ## normalization

    def forward(self, inputs):
        outputs = self.layer1(inputs)
        outputs = self.layer2(outputs)
        outputs += inputs                      ## residual connection
        outputs = self.normalization(outputs)
        return outputs
### Layer normalization
class layer_normalization(nn.Module):
    def __init__(self, features, epsilon=1e-8):
        super(layer_normalization, self).__init__()
        self.epsilon = epsilon
        self.gamma = nn.Parameter(torch.ones(features))  # learnable scale
        self.beta = nn.Parameter(torch.zeros(features))  # learnable shift

    def forward(self, x):
        ## normalize over the last dimension, then rescale and shift
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.gamma * (x - mean) / (std + self.epsilon) + self.beta
## Full model
class scl_models(nn.Module):
    def __init__(self, enc_voc, dec_voc):
        super(scl_models, self).__init__()
        self.enc_voc = enc_voc
        self.dec_voc = dec_voc

        ### encoder part
        self.enc_embedding = embedding(enc_voc, hp.hidden_units)        ## word embedding
        self.enc_pos_embedding = embedding(hp.maxlen, hp.hidden_units)  ## positional embedding
        self.enc_dropout = nn.Dropout(hp.dropout_rate)                  ## dropout
        for i in range(hp.num_blocks):  ## how many attention + feed-forward blocks are stacked
            self.__setattr__('enc_attention_%d' % i,
                             multiple_head_attention(num_units=hp.hidden_units,
                                                     num_heads=hp.num_heads,
                                                     dropout_rate=hp.dropout_rate,
                                                     masked=False))
            self.__setattr__('enc_feed_forward_%d' % i,
                             feedforward(hp.hidden_units,
                                         [4 * hp.hidden_units, hp.hidden_units]))

        #### decoder part
        self.dec_embedding = embedding(dec_voc, hp.hidden_units)        ## word embedding (built with dec_voc, the target vocabulary size)
        self.dec_pos_embedding = embedding(hp.maxlen, hp.hidden_units)  ## positional embedding
        self.dec_dropout = nn.Dropout(hp.dropout_rate)                  ## dropout
        for i in range(hp.num_blocks):  ## how many attention + feed-forward blocks are stacked
            self.__setattr__('dec_attention_%d' % i,
                             multiple_head_attention(num_units=hp.hidden_units,
                                                     num_heads=hp.num_heads,
                                                     dropout_rate=hp.dropout_rate,
                                                     masked=True))
            self.__setattr__('dec_attention2_%d' % i,
                             multiple_head_attention(num_units=hp.hidden_units,
                                                     num_heads=hp.num_heads,
                                                     dropout_rate=hp.dropout_rate,
                                                     masked=False))
            self.__setattr__('dec_feed_forward_%d' % i,
                             feedforward(hp.hidden_units,
                                         [4 * hp.hidden_units, hp.hidden_units]))

        ## output linear layer: self.dec_voc is the number of target words
        self.logits_layer = nn.Linear(hp.hidden_units, self.dec_voc)
        self.label_smoothing = label_smoothing()

    def forward(self, x, y):
        ## prepend a start symbol (index 2) to every target sentence and drop its last token
        self.decoder_inputs = torch.cat([Variable(torch.ones(y[:, :1].size()).cuda() * 2).long(), y[:, :-1]], dim=-1)
        ## embed the source words
        self.enc = self.enc_embedding(x)
        ## embed the positions and add them to the word embeddings
        self.enc += self.enc_pos_embedding(Variable(torch.unsqueeze(torch.arange(0, x.size()[1]), 0).repeat(x.size(0), 1).long().cuda()))

        #### encoder blocks (multi-head attention and feed-forward layers)
        for i in range(hp.num_blocks):
            self.enc = self.__getattr__('enc_attention_%d' % i)(self.enc, self.enc, self.enc)
            self.enc = self.__getattr__('enc_feed_forward_%d' % i)(self.enc)

        ## decoder part
        self.dec = self.dec_embedding(self.decoder_inputs)
        self.dec += self.dec_pos_embedding(Variable(torch.unsqueeze(torch.arange(0, self.decoder_inputs.size()[1]), 0).repeat(self.decoder_inputs.size(0), 1).long().cuda()))
        self.dec = self.dec_dropout(self.dec)
        for i in range(hp.num_blocks):
            self.dec = self.__getattr__('dec_attention_%d' % i)(self.dec, self.dec, self.dec)
            self.dec = self.__getattr__('dec_attention2_%d' % i)(self.dec, self.enc, self.enc)
            self.dec = self.__getattr__('dec_feed_forward_%d' % i)(self.dec)

        self.logits = self.logits_layer(self.dec)
        ## softmax over the vocabulary, then flatten to 2-D: one probability row per output token
        self.probably = F.softmax(self.logits, dim=-1).view(-1, self.dec_voc)
        _, self.preds = torch.max(self.logits, -1)  ## predicted word index at every position
        ## 1 where the target position holds a real word (not padding), flattened to 1-D
        self.istarget = (1. - y.eq(0.).float()).view(-1)
        ## accuracy over the non-padding positions
        self.acc = torch.sum(self.preds.eq(y).float().view(-1) * self.istarget) / torch.sum(self.istarget)

        # Loss
        self.y_onehot = torch.zeros(self.logits.size()[0] * self.logits.size()[1], self.dec_voc).cuda()
        self.y_onehot = Variable(self.y_onehot.scatter_(1, y.view(-1, 1).data, 1))
        self.y_smoothed = self.label_smoothing(self.y_onehot)
        self.loss = - torch.sum(self.y_smoothed * torch.log(self.probably), dim=-1)
        self.mean_loss = torch.sum(self.loss * self.istarget) / torch.sum(self.istarget)
        return self.mean_loss, self.preds, self.acc
de2idx, idx2de = load_de_vocab()  ## German word <-> index mappings (ctrl + left-click the function to see its docstring)
en2idx, idx2en = load_en_vocab()  ## English word <-> index mappings
enc_voc = len(de2idx)  ### source vocabulary size
dec_voc = len(en2idx)  ### target vocabulary size

writer = SummaryWriter()
X, Y = load_train_data()              ## preprocessed 2-D data, every word already mapped to its index
num_batch = len(X) // hp.batch_size   ### batch_size is the size of one batch; num_batch is how many batches the data is split into

model = scl_models(enc_voc, dec_voc)  ### build the custom model
model.train()  ## set the model to training mode
model.cuda()   ## move the model to the GPU

## check whether the model directory exists
## if not, create it
if not os.path.exists(hp.model_dir):
    os.makedirs(hp.model_dir)

## check whether a model has been trained before
if hp.preload is not None and os.path.exists(hp.model_dir + '/history.pkl'):
    with open(hp.model_dir + '/history.pkl', 'rb') as history_file:  ## pickle files must be opened in binary mode
        history = pickle.load(history_file)
else:
    history = {'current_batches': 0}
current_batches = history['current_batches']  ## number of batches already processed

optimizer = optim.Adam(model.parameters(), lr=hp.lr, betas=[0.9, 0.99], eps=1e-8)  ## optimizer

#### if saved parameters exist, load them into the optimizer and the model
model_pth_path = os.path.join(hp.model_dir, 'optimizer.pth')
if hp.preload is not None and os.path.exists(model_pth_path):
    optimizer.load_state_dict(torch.load(model_pth_path))
if hp.preload is not None and os.path.exists(hp.model_dir + '/model_epoch_%02d.pth' % hp.preload):
    model.load_state_dict(torch.load(hp.model_dir + '/model_epoch_%02d.pth' % hp.preload))

startepoch = int(hp.preload) if hp.preload is not None else 1
for epoch in range(startepoch, hp.num_epochs + 1):
    current_batch = 0
    for index, current_index in get_batch_indices(len(X), hp.batch_size):
        tic = time.time()
        x_batch = Variable(torch.LongTensor(X[index]).cuda())
        y_batch = Variable(torch.LongTensor(Y[index]).cuda())
        toc = time.time()
        tic_r = time.time()
        torch.cuda.synchronize()
        optimizer.zero_grad()
        loss, _, acc = model(x_batch, y_batch)
        loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        toc_r = time.time()
        current_batches += 1
        current_batch += 1
        if current_batches % 10 == 0:
            writer.add_scalar('./loss', loss.data.cpu().numpy().item(), current_batches)
            writer.add_scalar('./acc', acc.data.cpu().numpy().item(), current_batches)
        if current_batches % 5 == 0:
            print('epoch %d, batch %d/%d, loss %f, acc %f' % (epoch, current_batch, num_batch, loss.item(), acc.item()))
            print('batch loading used time %f, model forward used time %f' % (toc - tic, toc_r - tic_r))
        if current_batches % 100 == 0:
            writer.export_scalars_to_json(hp.model_dir + '/all_scalars.json')
            with open(hp.model_dir + '/history.pkl', 'wb') as out_file:
                pickle.dump(history, out_file)
    checkpoint_path = hp.model_dir + '/model_epoch_%02d' % epoch + '.pth'
    torch.save(model.state_dict(), checkpoint_path)
    torch.save(optimizer.state_dict(), hp.model_dir + '/optimizer.pth')
X, Sources, Targets = load_test_data()
de2idx, idx2de = load_de_vocab()
en2idx, idx2en = load_en_vocab()
enc_voc = len(de2idx)
dec_voc = len(en2idx)

# load model
model = scl_models(enc_voc, dec_voc)
model.load_state_dict(torch.load(hp.model_dir + '/model_epoch_%02d' % hp.eval_epoch + '.pth'))
print('Model Loaded.')
model.eval()
model.cuda()

# Inference
if not os.path.exists('results'):
    os.mkdir('results')
with codecs.open('results/model%d.txt' % hp.eval_epoch, 'w', 'utf-8') as fout:
    list_of_refs, hypotheses = [], []
    for i in range(len(X) // hp.batch_size2):
        # Get mini-batches
        x = X[i * hp.batch_size2: (i + 1) * hp.batch_size2]
        sources = Sources[i * hp.batch_size2: (i + 1) * hp.batch_size2]
        targets = Targets[i * hp.batch_size2: (i + 1) * hp.batch_size2]

        # Autoregressive inference
        x_ = Variable(torch.LongTensor(x).cuda())
        preds_t = torch.LongTensor(np.zeros((hp.batch_size2, hp.maxlen), np.int32)).cuda()
        preds = Variable(preds_t)
        for j in range(hp.maxlen):
            _, _preds, _ = model(x_, preds)
            preds_t[:, j] = _preds.data[:, j]
            preds = Variable(preds_t.long())
        preds = preds.data.cpu().numpy()

        # Write to file
        for source, target, pred in zip(sources, targets, preds):  # sentence-wise
            got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
            fout.write("- source: " + source + "\n")
            fout.write("- expected: " + target + "\n")
            fout.write("- got: " + got + "\n\n")
            fout.flush()

            # bleu score
            ref = target.split()
            hypothesis = got.split()
            if len(ref) > 3 and len(hypothesis) > 3:
                list_of_refs.append([ref])
                hypotheses.append(hypothesis)

    # Calculate bleu score
    score = corpus_bleu(list_of_refs, hypotheses)
    fout.write("Bleu Score = " + str(100 * score))
class Hyperparams:
    source_train = 'data/train.tags.de-en.de'  ### path of the training-set source sentences
    target_train = 'data/train.tags.de-en.en'  ### path of the training-set target sentences
    source_test = 'data/IWSLT16.TED.tst2014.de-en.de.xml'
    target_test = 'data/IWSLT16.TED.tst2014.de-en.en.xml'

    # training
    batch_size = 512   # alias = N
    batch_size2 = 32   # alias = N (batch size used for evaluation)
    lr = 0.0001        # learning rate. In the paper, the learning rate is adjusted with the global step.
    logdir = 'logdir'  # log directory
    model_dir = './models/'  # directory where models are saved

    # model
    maxlen = 10         # Maximum number of words in a sentence. alias = T.
                        # Feel free to increase this if you are ambitious.
    min_cnt = 20        # words that occur fewer than min_cnt times are encoded as <UNK>.
    hidden_units = 512  # alias = C
    num_blocks = 6      # number of encoder/decoder blocks
    num_epochs = 200    # number of epochs
    num_heads = 8       # number of attention heads
    dropout_rate = 0.1
    eval_epoch = 135    # epoch of the model used for evaluation
    preload = 20        # epoch of the preloaded model when resuming training