Source: the official PyTorch tutorials
The overall framework consists of the following parts:
data.py: data preprocessing, including splitting the text and converting it to tokens;
model.py: building the model;
main.py: the main script, responsible for training the model;
generate.py: generating new text with the trained model.
The code in each script is explained in detail below.
data.py contains two main classes:
Dictionary and Corpus
The first class, Dictionary, maintains the mapping between words and indices:
import os
from io import open
import torch


class Dictionary(object):
    def __init__(self):
        self.word2idx = {}  # maps a word to its index
        self.idx2word = []  # maps an index back to its word (used when generating text)

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)  # append the word to the end of the list
            # the word's index is its position in the idx2word list
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)
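As a quick sanity check, here is a minimal usage sketch (the sample words are made up for illustration):

d = Dictionary()
print(d.add_word('hello'))  # 0: new word, appended at index 0
print(d.add_word('world'))  # 1
print(d.add_word('hello'))  # 0: already present, the existing index is returned
print(d.word2idx)           # {'hello': 0, 'world': 1}
print(d.idx2word[1])        # 'world'
print(len(d))               # 2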
The second class defines the corpus:
class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()  # the corpus dictionary, covering every word in the datasets
        self.train = self.tokenize(os.path.join(path, 'train.txt'))  # load and tokenize the training set
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))  # load and tokenize the validation set
        self.test = self.tokenize(os.path.join(path, 'test.txt'))    # load and tokenize the test set

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)  # make sure the file exists
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # split on whitespace and append an end-of-sentence marker to every line
                words = line.split() + ['<eos>']
                for word in words:
                    # add each word to the corpus dictionary (updates word2idx and idx2word)
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []  # collects the index tensors of all lines
            for line in f:
                # split on whitespace and append an end-of-sentence marker to every line
                words = line.split() + ['<eos>']
                ids = []  # indices of the words in the current line
                for word in words:
                    ids.append(self.dictionary.word2idx[word])  # record each word's index
                # turn the line's index list into an int64 tensor and collect it
                idss.append(torch.tensor(ids).type(torch.int64))
            # concatenate the per-line tensors into a single flat 1-D tensor of token ids
            ids = torch.cat(idss)

        return ids
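A minimal usage sketch, assuming a directory ./data that contains train.txt, valid.txt and test.txt (the path is illustrative):

corpus = Corpus('./data')      # hypothetical data directory
print(corpus.train.shape)      # torch.Size([N]): one flat tensor of N token ids
print(len(corpus.dictionary))  # vocabulary size
# map the first ten token ids of the training set back to words:
print([corpus.dictionary.idx2word[i] for i in corpus.train[:10]])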
The next important script is model.py:
It contains two main classes: PositionalEncoding and TransformerModel
(the RNNModel class in the same script is not covered here)
The first class implements positional encoding:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


# Temporarily leave PositionalEncoding module here. Will be moved somewhere else.
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens in
    the sequence. The positional encodings have the same dimension as the embeddings, so
    that the two can be summed. Here, we use sine and cosine functions of different
    frequencies.

    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos/10000^{2i/d_{model}})
        \text{PosEncoder}(pos, 2i+1) = \cos(pos/10000^{2i/d_{model}})

    where pos is the word position and i is the embed idx.

    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).

    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    # __init__ runs automatically when the class is instantiated
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()  # call the parent class's __init__
        self.dropout = nn.Dropout(p=dropout)

        # initialize the positional-encoding tensor
        pe = torch.zeros(max_len, d_model)
        # arange(start, end, step) builds the position indices; unsqueeze(1) adds a
        # column dimension (0 would add a row dimension), giving shape [max_len, 1]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # frequency term of the formula below, computed in log space
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions use sine
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions use cosine
        pe = pe.unsqueeze(0).transpose(0, 1)  # shape becomes [max_len, 1, d_model]
        # register pe as a buffer: saved with the model, but not a trainable parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """
        x = x + self.pe[:x.size(0), :]  # add positional encodings to the embeddings
        return self.dropout(x)
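To see the shapes involved, a small sketch (sequence length 10, batch size 2 and d_model=512 are arbitrary values chosen for illustration):

pos_encoder = PositionalEncoding(d_model=512, dropout=0.1)
x = torch.zeros(10, 2, 512)  # [sequence length, batch size, embed dim]
out = pos_encoder(x)
print(out.shape)             # torch.Size([10, 2, 512]), same shape as the input
# since x is all zeros, out is just the positional encoding itself (plus dropout)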
The positional-encoding formulas are:

    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

(a detailed walkthrough: https://kazemnejad.com/blog/transformer_architecture_positional_encoding)
Here pos is the position of each token in the sentence, d_model is the dimension of the positional-encoding embedding (usually equal to the token-embedding dimension so that the two can be summed), and i takes values in [0, d_model/2).
For example, with d_model = 512, the positional encoding of the token at position 1 is:
PE(1) = [sin(1/10000^(0/512)), cos(1/10000^(0/512)), sin(1/10000^(2/512)), cos(1/10000^(2/512)), …]
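This can be checked against the module's pe buffer; a small sketch using the same pos = 1 and d_model = 512 as the example above:

import math
d_model, pos = 512, 1
# the first four entries of PE(1), computed straight from the formula (i = 0 and i = 1):
manual = [math.sin(pos / 10000 ** (0 / d_model)),
          math.cos(pos / 10000 ** (0 / d_model)),
          math.sin(pos / 10000 ** (2 / d_model)),
          math.cos(pos / 10000 ** (2 / d_model))]
pe = PositionalEncoding(d_model).pe  # buffer shape: [max_len, 1, d_model]
print(manual)
print(pe[pos, 0, :4].tolist())       # matches the values computed by hand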
The second class builds the Transformer framework (some parts are not yet fully commented and will be filled in later):
class TransformerModel(nn.Module):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""

    # ntoken:  vocabulary size (number of distinct tokens in the dictionary)
    # ninp:    embedding dim (dimension of the tensor produced by the embedding layer)
    # nhead:   number of heads in the multi-head attention
    # nhid:    dimension of the feed-forward network (number of hidden-layer nodes)
    # nlayers: number of transformer encoder layers
    # dropout: dropout rate
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except BaseException as e:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or '
                              'lower.') from e
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        # the transformer encoder outputs vectors of dim ninp (not nhid, which is only
        # the feed-forward width inside each layer), so the decoder maps ninp -> ntoken
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    # build the attention mask; sz is the length of the input src
    def _generate_square_subsequent_mask(self, sz):
        # triu: keep the upper triangle of the matrix (diagonal included), zero the rest
        # == 1: convert the values to booleans
        # transpose then turns it into a lower-triangular matrix
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        # masked_fill(mask matrix, fill value): positions above the diagonal (the region
        # outside the lower triangle) become -inf, the rest become 0.0
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    # initialize the encoder and decoder parameters
    def init_weights(self):
        initrange = 0.1
        # nn.init.uniform_(tensor, a, b): fill the tensor with values drawn from the
        # uniform distribution U(a, b); here encoder is the nn.Embedding(ntoken, ninp)
        # defined above
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        # nn.init.zeros_(tensor): fill the input tensor with the scalar value 0
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    # forward() is wired into nn.Module's __call__() method, so once the model is
    # instantiated, calling the instance with arguments invokes forward() directly
    # (see https://zhuanlan.zhihu.com/p/370234492)
    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.encoder(src) * math.sqrt(self.ninp)
        # the next two lines illustrate the point above: calling an instance created in
        # __init__() invokes its forward() via __call__()
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)
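A minimal end-to-end sketch (all hyperparameters are small toy values chosen for illustration, not the tutorial's defaults):

ntoken, ninp, nhead, nhid, nlayers = 1000, 64, 4, 128, 2
model = TransformerModel(ntoken, ninp, nhead, nhid, nlayers, dropout=0.2)

# the mask lets position t attend only to positions <= t:
print(model._generate_square_subsequent_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])

src = torch.randint(0, ntoken, (35, 20))  # [sequence length, batch size] of token ids
output = model(src)                       # log-probabilities over the vocabulary
print(output.shape)                       # torch.Size([35, 20, 1000])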
The remaining two scripts: to be continued…