Competition link: 2024 iFLYTEK AI Developer Competition – iFLYTEK Open Platform
Natural Language Processing (NLP) is a branch of linguistics and artificial intelligence that aims to enable computers to process, understand, and generate language. NLP tasks can be roughly grouped into four categories: sequence labeling, classification, sentence-pair relation judgment, and generation tasks.
The mainstream approach to machine translation today is neural-network modeling. Seq2Seq opened the door to applying classic deep neural networks (DNNs) to translation, automatic text summarization, question answering, and some regression/prediction tasks. Its core idea is to map an input sequence to an output sequence in two stages, encoding and decoding: the encoder compresses the input sequence into a fixed-length vector, which is handed to the decoder to produce an output sequence of variable length.
```python
import math
import random

import torch
import torch.nn as nn
import torch.nn.functional as F


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [batch size, src len]
        embedded = self.dropout(self.embedding(src))
        # embedded = [batch size, src len, emb dim]

        outputs, hidden = self.gru(embedded)
        # outputs = [batch size, src len, hid dim * n directions]
        # hidden = [n layers * n directions, batch size, hid dim]

        return outputs, hidden
```
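As a quick sanity check, the encoder can be run on a batch of random token IDs to confirm the shapes annotated in the comments above (the vocabulary size, dimensions, and batch shape below are made up purely for illustration):

```python
# Hypothetical sizes, only to verify the annotated shapes.
enc = Encoder(input_dim=1000, emb_dim=64, hid_dim=128, n_layers=2, dropout=0.1)
src = torch.randint(0, 1000, (4, 12))   # [batch size = 4, src len = 12]
outputs, hidden = enc(src)
print(outputs.shape)                     # torch.Size([4, 12, 128])
print(hidden.shape)                      # torch.Size([2, 4, 128])
```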

A traditional Seq2Seq model relies only on the encoder's final hidden state during decoding, which works poorly on long sequences. The attention mechanism lets the decoder, when generating each output word, look at all the intermediate states produced by the encoder, so the information in the source sequence is used more fully. Concretely, given the vector sequence h_1, h_2, ..., h_m produced by the encoder for the source sentence, attention adaptively retrieves the relevant information from this sequence according to what the decoder needs at each translation step.
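The Attention module below implements an additive (Bahdanau-style) scoring of this kind. Written out, for the decoder's top-layer hidden state s_t and encoder states h_i it computes (the notation here is mine, chosen to match the code):

$$e_{t,i} = v^\top \tanh\big(W[s_t;\,h_i] + b\big), \qquad \alpha_{t,i} = \frac{\exp(e_{t,i})}{\sum_{j=1}^{m}\exp(e_{t,j})}, \qquad c_t = \sum_{i=1}^{m}\alpha_{t,i}\,h_i$$

where W and b correspond to `self.attn`, v to `self.v`, and the context vector c_t is the `weighted` tensor later formed in the decoder via `torch.bmm`.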
```python
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [1, batch size, hid dim]
        # encoder_outputs = [batch size, src len, hid dim]

        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]

        hidden = hidden.repeat(src_len, 1, 1).transpose(0, 1)
        # hidden = [batch size, src len, hid dim]

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy = [batch size, src len, hid dim]

        attention = self.v(energy).squeeze(2)
        # attention = [batch size, src len]

        return F.softmax(attention, dim=1)
```

```python
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(hid_dim + emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size]  (one target token per sequence)
        # hidden = [n layers, batch size, hid dim]
        # encoder_outputs = [batch size, src len, hid dim]

        input = input.unsqueeze(1)
        # input = [batch size, 1]
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb dim]

        a = self.attention(hidden[-1:], encoder_outputs)
        # a = [batch size, src len]

        a = a.unsqueeze(1)
        # a = [batch size, 1, src len]

        weighted = torch.bmm(a, encoder_outputs)
        # weighted = [batch size, 1, hid dim]

        rnn_input = torch.cat((embedded, weighted), dim=2)
        # rnn_input = [batch size, 1, emb dim + hid dim]

        output, hidden = self.gru(rnn_input, hidden)
        # output = [batch size, 1, hid dim]
        # hidden = [n layers, batch size, hid dim]

        embedded = embedded.squeeze(1)
        output = output.squeeze(1)
        weighted = weighted.squeeze(1)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        # prediction = [batch size, output dim]

        return prediction, hidden
```

```python
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [batch size, src len]
        # trg = [batch size, trg len]

        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is the start-of-sequence token at position 0 of trg.
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output
            # With probability teacher_forcing_ratio, feed the ground-truth token
            # at step t; otherwise feed the model's own most likely prediction.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs
```
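A minimal sketch of wiring these modules together end to end; every dimension, vocabulary size, and tensor below is made up for illustration. Note that the encoder and decoder must share hid_dim and n_layers so the hidden state can be passed across:

```python
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hypothetical sizes chosen only for this example.
INPUT_DIM, OUTPUT_DIM = 1000, 1200
ENC_EMB, DEC_EMB, HID, LAYERS, DROPOUT = 64, 64, 128, 2, 0.1

attn = Attention(HID)
enc = Encoder(INPUT_DIM, ENC_EMB, HID, LAYERS, DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB, HID, LAYERS, DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)

src = torch.randint(0, INPUT_DIM, (4, 12)).to(device)   # [batch, src len]
trg = torch.randint(0, OUTPUT_DIM, (4, 15)).to(device)  # [batch, trg len]

logits = model(src, trg)   # [4, 15, OUTPUT_DIM]; position 0 is left as zeros
# Skip position 0 when computing the loss (in practice also ignore <pad> tokens).
loss = F.cross_entropy(logits[:, 1:].reshape(-1, OUTPUT_DIM), trg[:, 1:].reshape(-1))
loss.backward()
```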

The Transformer was first proposed, in the original paper, precisely for machine translation, and its arrival pushed both the quality and the efficiency of machine translation to a new level. It abandons recurrence entirely and models the global dependencies within the source and target sequences purely through attention. When extracting the contextual features of each word, the Transformer uses self-attention to weigh how important every other word in the context is to the current word.

The main components of the Transformer are the encoder, the decoder, and the attention layers. At its core is multi-head self-attention, which lets the representation at each position depend not only on that position but draw directly on the representations at all other positions. Since its introduction, the Transformer has achieved breakthrough results in machine translation, text generation, and other NLP tasks, and has become the dominant model architecture in NLP.
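At the heart of every attention layer is scaled dot-product attention. The sketch below is a minimal, standalone illustration of that computation (shapes and names are my own and are independent of the model code that follows):

```python
def scaled_dot_product_attention(q, k, v, mask=None):
    """q, k, v: [batch, heads, seq len, d_k]; positions where mask is True are blocked."""
    d_k = q.size(-1)
    # Similarity of every query with every key, scaled by sqrt(d_k).
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)  # [batch, heads, q len, k len]
    if mask is not None:
        scores = scores.masked_fill(mask, float('-inf'))
    weights = F.softmax(scores, dim=-1)
    # Weighted sum of the values.
    return torch.matmul(weights, v)                                  # [batch, heads, q len, d_k]
```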
```python
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)   # [max_len, 1, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x = [seq len, batch size, d_model]
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
```
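Written out, the buffer built above is the standard sinusoidal encoding from the original Transformer paper:

$$PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)$$

where `div_term` in the code corresponds to the factor $10000^{-2i/d_{\text{model}}}$.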

```python
# Transformer
class TransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
        self.src_embedding = nn.Embedding(len(src_vocab), d_model)
        self.tgt_embedding = nn.Embedding(len(tgt_vocab), d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.fc_out = nn.Linear(d_model, len(tgt_vocab))
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.d_model = d_model

    def forward(self, src, tgt):
        # nn.Transformer defaults to sequence-first layout, so move batch to dim 1.
        src = src.transpose(0, 1)  # (src len, batch size)
        tgt = tgt.transpose(0, 1)  # (tgt len, batch size)

        # Only the decoder input needs a causal (subsequent) mask; the encoder is
        # allowed to attend to the whole source sentence, so no src_mask is used.
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)

        src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)
        tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)

        src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))

        output = self.transformer(src_embedded, tgt_embedded,
                                  src_mask=None, tgt_mask=tgt_mask, memory_mask=None,
                                  src_key_padding_mask=src_padding_mask,
                                  tgt_key_padding_mask=tgt_padding_mask,
                                  memory_key_padding_mask=src_padding_mask)
        return self.fc_out(output).transpose(0, 1)


def initialize_model(src_vocab, tgt_vocab, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
    model = TransformerModel(src_vocab, tgt_vocab, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
    return model
```
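A small smoke test of the model above. The toy vocabularies here are hypothetical dict-style token-to-index mappings containing a '<pad>' entry, which is all the class assumes about them; the sizes are deliberately tiny:

```python
# Hypothetical toy vocabularies: token -> index mappings that include '<pad>'.
src_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, 'hello': 3, 'world': 4}
tgt_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '你好': 3, '世界': 4}

model = initialize_model(src_vocab, tgt_vocab, d_model=128, nhead=8,
                         num_encoder_layers=2, num_decoder_layers=2,
                         dim_feedforward=256, dropout=0.1)

src = torch.tensor([[1, 3, 4, 2, 0, 0]])   # [batch = 1, src len = 6], right-padded
tgt_in = torch.tensor([[1, 3, 4]])          # decoder input: <sos> plus the prefix so far
logits = model(src, tgt_in)                 # [1, 3, len(tgt_vocab)]
print(logits.shape)
```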