赞
踩
huggingface源码地址 https://github.com/huggingface/transformers
由于源码一直在更新迭代,我的版本可能不太新了,不过大致意思差不多
modeling_bert.py
预训练模型的下载地址,如果加载时 参数设置没用下好的模型地址,则会自动从这些地址上下载
# Download URLs for the pretrained PyTorch weight files; every entry follows
# the same S3 naming scheme "<base>/<model-name>-pytorch_model.bin".
BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    model_name: "https://s3.amazonaws.com/models.huggingface.co/bert/" + model_name + "-pytorch_model.bin"
    for model_name in (
        'bert-base-uncased',
        'bert-large-uncased',
        'bert-base-cased',
        'bert-large-cased',
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-chinese',
        'bert-base-german-cased',
        'bert-large-uncased-whole-word-masking',
        'bert-large-cased-whole-word-masking',
        'bert-large-uncased-whole-word-masking-finetuned-squad',
        'bert-large-cased-whole-word-masking-finetuned-squad',
        'bert-base-cased-finetuned-mrpc',
    )
}
json格式存储的参数键值
# Download URLs for the JSON configuration files; same S3 naming scheme as the
# weight map, with a "-config.json" suffix instead of the model binary.
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    model_name: "https://s3.amazonaws.com/models.huggingface.co/bert/" + model_name + "-config.json"
    for model_name in (
        'bert-base-uncased',
        'bert-large-uncased',
        'bert-base-cased',
        'bert-large-cased',
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-chinese',
        'bert-base-german-cased',
        'bert-large-uncased-whole-word-masking',
        'bert-large-cased-whole-word-masking',
        'bert-large-uncased-whole-word-masking-finetuned-squad',
        'bert-large-cased-whole-word-masking-finetuned-squad',
        'bert-base-cased-finetuned-mrpc',
    )
}
加载tf权重的函数:(主要是从checkpoints中获取weight和bias)
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Load TensorFlow checkpoint weights into a PyTorch BERT model.

    Walks every variable stored in the TF checkpoint, maps its
    slash-separated name onto the attribute path of ``model`` and copies the
    numpy array into the matching parameter's ``.data``.

    Args:
        model: target PyTorch BERT model; its parameters are overwritten in place.
        config: unused in this function (kept in the signature for the callers).
        tf_checkpoint_path: path (prefix) of the TensorFlow checkpoint to convert.

    Returns:
        The same ``model`` instance with weights initialized from the checkpoint.

    Raises:
        ImportError: if TensorFlow is not installed.
        AssertionError: if a mapped tensor's shape does not match the PyTorch parameter.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:  # TF is only needed for this conversion path
        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)  # normalize to an absolute path
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)  # returns (name, shape) for every variable in the checkpoint
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):  # iterate (variable name, numpy array) pairs
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
            logger.info("Skipping {}".format("/".join(name)))
            continue
        # Walk down the module tree, one slash-separated name segment at a time.
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                # e.g. "layer_11" -> ['layer', '11', '']: attribute name + list index
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            # Translate TF variable names to the PyTorch attribute names.
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            else:
                try:
                    pointer = getattr(pointer, l[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    # NOTE(review): this `continue` only skips the current name
                    # segment (inner loop), not the whole variable; later
                    # segments are still resolved against the stale `pointer`.
                    continue
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]  # index into a ModuleList-style container
        # `m_name` here is the last segment of the variable name.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            # TF stores dense kernels as (in, out); PyTorch Linear expects (out, in).
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)  # attach both shapes for easier debugging
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
gelu激活函数 gelu(高斯误差线性单元)
激活函数的作用:给网络模型加非线性因子,让wx+b这样的线性变换加非线性
def gelu(x):
    """Gaussian Error Linear Unit activation, exact erf formulation.

    Computes ``x * 0.5 * (1 + erf(x / sqrt(2)))``. Note that OpenAI GPT uses
    the tanh approximation instead, which gives slightly different results:
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    See https://arxiv.org/abs/1606.08415
    """
    erf_term = 1.0 + torch.erf(x / math.sqrt(2.0))
    return x * 0.5 * erf_term
swish 激活函数(也叫 SiLU):计算 x·sigmoid(x),是一种平滑的门控激活函数,可作为 relu/gelu 的替代
def swish(x):
    """Swish (SiLU) activation: ``x * sigmoid(x)``."""
    gate = torch.sigmoid(x)
    return x * gate
BertConfig bert参数类(存BERT模型配置参数的类)
参数 :vocab_size_or_config_json_file=30522: `BertModel`的`inputs_ids`的词典大小(如果是个路径str就直接读里面的参数)
hidden_size=768: 嵌入层和池化层的大小
num_hidden_layers=12: Transformer encoder.的隐藏层数
num_attention_heads=12: 每个注意力层的注意力头数
intermediate_size=3072: The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act='gelu': 非线性激活函数in the encoder and pooler. 支持 "gelu", "relu" and "swish"
hidden_dropout_prob=0.1: 全连接层的dropout率 in the embeddings, encoder, and pooler.
attention_probs_dropout_prob=0.1: The dropout ratio for the attention probabilities.
max_position_embeddings=512: 最大嵌入长度(e.g., 512 or 1024 or 2048).
type_vocab_size=2: `token_type_ids` 的词典类型大小(非0即1)
initializer_range=0.02: truncated_normal_initializer的标准差用于初始化所有的权重矩阵
layer_norm_eps=1e-12: 层归一化(layerNorm)的epsilon值
class BertConfig(PretrainedConfig):
    r"""Configuration container for :class:`BertModel`.

    The first constructor argument is overloaded: pass an ``int`` to use it as
    the vocabulary size together with the remaining keyword defaults, or pass
    the path of a JSON file whose key/value pairs are copied verbatim onto the
    instance.

    Arguments:
        vocab_size_or_config_json_file: vocabulary size of `inputs_ids` in
            `BertModel`, or a path to a pretrained config JSON file.
        hidden_size: size of the encoder layers and the pooler layer.
        num_hidden_layers: number of hidden layers in the Transformer encoder.
        num_attention_heads: number of attention heads for each attention
            layer in the Transformer encoder.
        intermediate_size: size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
        hidden_act: non-linear activation (function or string) in the encoder
            and pooler; as a string, "gelu", "relu" and "swish" are supported.
        hidden_dropout_prob: dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob: dropout ratio for the attention
            probabilities.
        max_position_embeddings: maximum sequence length this model might
            ever be used with (e.g., 512 or 1024 or 2048).
        type_vocab_size: vocabulary size of the `token_type_ids` passed into
            `BertModel`.
        initializer_range: stddev of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps: epsilon used by LayerNorm.
    """
    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 **kwargs):
        super(BertConfig, self).__init__(**kwargs)
        # On Python 2 a path may also arrive as `unicode`; short-circuiting on
        # the version check keeps that name from being evaluated on Python 3.
        got_json_path = isinstance(vocab_size_or_config_json_file, str) or (
            sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode))
        if got_json_path:
            # Copy every key from the JSON file straight onto the instance.
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                for key, value in json.load(reader).items():
                    self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
            self.layer_norm_eps = layer_norm_eps
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
apex 是 NVIDIA 的混合精度/融合算子加速库:这里优先使用它的 FusedLayerNorm(融合 CUDA 核,更快),导入失败时退回到 torch.nn.LayerNorm
# Prefer NVIDIA apex's fused LayerNorm CUDA kernel when apex is available;
# otherwise fall back to the stock torch.nn.LayerNorm. The exception value is
# not needed (fix: the previous `as e` binding was never used).
try:
    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
except (ImportError, AttributeError):
    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
    BertLayerNorm = torch.nn.LayerNorm
做embedding嵌入的处理
nn.Embedding 做lookup操作把对应标号的向量表示揪出来,分三个处理 ,
1. word_embedding相当于input_id就是字向量表示
2. position_embeddings位置嵌入,如果没处理就用0,1,2,3....这样的表明字所对应的位置
3.token_type_embeddings NSP操作 ,用0或1区分是上一句还是下一句 类似1111100000这样,现在很多论文说这个没用
三个处理完相加得到embedding并做层归一化和dropout
class BertEmbeddings(nn.Module):
    """Sum of word, position and token-type embeddings, followed by LayerNorm
    and dropout.
    """
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm keeps the TensorFlow capitalization so that TF
        # checkpoint variables can be loaded by name.
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        # Default position ids are 0..seq_len-1, broadcast to the batch shape.
        if position_ids is None:
            seq_len = input_ids.size(1)
            position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        # Without explicit segment ids, treat every token as segment 0.
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        summed = (self.word_embeddings(input_ids)
                  + self.position_embeddings(position_ids)
                  + self.token_type_embeddings(token_type_ids))
        return self.dropout(self.LayerNorm(summed))
BertSelfAttention自注意力机制
embedding完维度依然 [batch,max_len,hidden_size]
linear层后依然不变 transpose时变成[batch,max_len,num_heads,head_size] permute后[batch,num_heads,max_len,head_size]
matmul 后 [batch,num_heads,max_len,max_len] 除以根号64也就是8
然后用attention_scores + attention_mask(这里的 mask 是 padding 掩码:padding 位置加上很大的负数,softmax 后其注意力权重趋于 0;注意这不是 MLM 的 [MASK] 处理)
torch.matmul(attention_probs, value_layer) 每个字V乘上对应的0-1的分数 维度[batch,num_heads,max_len,max_len] *[batch,num_heads,max_len,head_size]变成维度[batch,num_heads,max_len,head_size]
permute后 [batch,max_len,num_heads,head_size]
context_layer.size()[:-2] + (self.all_head_size,) 后变成 [batch,max_len,all_head_size],即把 num_heads 和 head_size 两维重新拼接回隐藏层维度
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Projects the hidden states to queries/keys/values, splits them into
    `num_attention_heads` heads, applies softmax(QK^T / sqrt(d) + mask) V and
    concatenates the heads back into the hidden dimension.
    """
    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        # The hidden size must split evenly across the attention heads.
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.output_attentions = config.output_attentions  # also return the attention probabilities?
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        # [batch, seq, hidden] -> [batch, heads, seq, head_size]
        split_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(*split_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask, head_mask=None):
        q = self.transpose_for_scores(self.query(hidden_states))
        k = self.transpose_for_scores(self.key(hidden_states))
        v = self.transpose_for_scores(self.value(hidden_states))

        # Raw attention scores: QK^T scaled by sqrt(head size).
        scores = torch.matmul(q, k.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)
        # Additive mask, precomputed for all layers in BertModel.forward().
        scores = scores + attention_mask

        # Softmax over the key dimension turns scores into probabilities.
        probs = nn.Softmax(dim=-1)(scores)
        # This drops entire tokens to attend to, which may look unusual but
        # follows the original Transformer paper.
        probs = self.dropout(probs)
        if head_mask is not None:
            probs = probs * head_mask

        ctx = torch.matmul(probs, v)
        # [batch, heads, seq, head_size] -> [batch, seq, all_head_size]
        ctx = ctx.permute(0, 2, 1, 3).contiguous()
        merged_shape = ctx.size()[:-2] + (self.all_head_size,)
        ctx = ctx.view(*merged_shape)

        if self.output_attentions:
            return (ctx, probs)
        return (ctx,)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。