
PyTorch BERT: a code walkthrough of modeling_bert

modeling_bert

HuggingFace source code: https://github.com/huggingface/transformers

The source is updated constantly, so my copy may be somewhat out of date, but the overall structure is the same.

modeling_bert.py

Download URLs for the pretrained models. If the loading call is not pointed at an already-downloaded local copy, the weights are fetched automatically from these URLs.

BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
}
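To make the role of this map concrete, here is a minimal, hedged usage sketch (assuming a pytorch_transformers / early transformers installation of this vintage): passing one of the short names above triggers a download from the corresponding URL and caches it, while passing a local directory path skips the download.

from pytorch_transformers import BertModel

model = BertModel.from_pretrained('bert-base-chinese')      # downloaded from the URL above and cached
# model = BertModel.from_pretrained('/path/to/local/bert')  # uses a local config.json + pytorch_model.bin instead

The same mechanism applies to the configuration files listed next.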

The configuration files, stored as JSON key/value pairs:

BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
}

The function that loads TF weights (it mainly pulls the weights and biases out of a TensorFlow checkpoint and copies them into the corresponding PyTorch modules):

def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:  # raised when TensorFlow is not installed
        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)  # absolute path of the checkpoint
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)  # returns list of all variables in the checkpoint
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)
    for name, array in zip(names, arrays):  # iterate over matching (name, array) pairs
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
            logger.info("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            else:
                try:
                    pointer = getattr(pointer, l[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
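For context, a hedged sketch of how this function is typically driven (the library ships a similar conversion script; the checkpoint paths below are placeholders, and the exact import path of load_tf_weights_in_bert is an assumption):

import torch
from pytorch_transformers import BertConfig, BertForPreTraining
from pytorch_transformers.modeling_bert import load_tf_weights_in_bert  # assumed import path

config = BertConfig.from_json_file('uncased_L-12_H-768_A-12/bert_config.json')      # placeholder path
model = BertForPreTraining(config)                                                  # freshly initialized PyTorch skeleton
load_tf_weights_in_bert(model, config, 'uncased_L-12_H-768_A-12/bert_model.ckpt')   # copy the TF weights in
torch.save(model.state_dict(), 'pytorch_model.bin')                                 # save as reusable PyTorch weights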

The GELU activation function (Gaussian Error Linear Unit).

The role of an activation function is to add non-linearity to the network, so that a linear transform such as wx + b is followed by a non-linear mapping.

def gelu(x):
    """Implementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

The swish activation function (it is less clear when this one is the better choice).

def swish(x):
    return x * torch.sigmoid(x)
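A quick numerical sanity check (my own sketch, not part of the source) showing that the erf-based GELU and the tanh approximation mentioned in the docstring differ only slightly, alongside swish:

import math
import torch

x = torch.linspace(-3.0, 3.0, steps=7)

gelu_erf = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))  # the exact form above
gelu_tanh = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))  # GPT variant
swish_out = x * torch.sigmoid(x)

print(torch.max(torch.abs(gelu_erf - gelu_tanh)))  # the two GELU variants differ only by a small amount
print(swish_out)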

BertConfig: the class that stores the BERT model's configuration parameters.

Parameters:
            vocab_size_or_config_json_file=30522: vocabulary size of `inputs_ids` in `BertModel` (if a path string is passed instead, the parameters are read directly from that file)
            hidden_size=768: size of the encoder layers and the pooler layer
            num_hidden_layers=12: number of hidden layers in the Transformer encoder
            num_attention_heads=12: number of attention heads for each attention layer
            intermediate_size=3072: size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder
            hidden_act='gelu': non-linear activation function in the encoder and pooler; "gelu", "relu" and "swish" are supported
            hidden_dropout_prob=0.1: dropout probability for the fully connected layers in the embeddings, encoder, and pooler
            attention_probs_dropout_prob=0.1: dropout ratio for the attention probabilities
            max_position_embeddings=512: maximum sequence length (e.g., 512, 1024 or 2048)
            type_vocab_size=2: vocabulary size of `token_type_ids` (either 0 or 1)
            initializer_range=0.02: standard deviation of the truncated_normal_initializer used to initialize all weight matrices
            layer_norm_eps=1e-12: epsilon used by LayerNorm

class BertConfig(PretrainedConfig):
    r"""
        :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a `BertModel`.

        Arguments:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The stddev of the truncated_normal_initializer for
                initializing all weight matrices.
            layer_norm_eps: The epsilon used by LayerNorm.
    """
    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 **kwargs):
        super(BertConfig, self).__init__(**kwargs)
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
            self.layer_norm_eps = layer_norm_eps
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
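A brief, hedged sketch of the two construction paths the __init__ above supports: an integer vocabulary size plus keyword overrides, or a path to a config JSON (the values below are illustrative only).

from pytorch_transformers import BertConfig  # assumed import

config = BertConfig(vocab_size_or_config_json_file=21128,  # e.g. the bert-base-chinese vocabulary size
                    num_hidden_layers=6)                   # hypothetical smaller model
print(config.hidden_size, config.num_hidden_layers)        # 768 6

# config = BertConfig('/path/to/bert_config.json')         # reads every key from the JSON file instead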

NVIDIA apex provides a fused LayerNorm kernel that is used for extra speed when apex is installed; otherwise the code falls back to torch.nn.LayerNorm:

try:
    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
except (ImportError, AttributeError) as e:
    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
    BertLayerNorm = torch.nn.LayerNorm
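As a reminder of what LayerNorm actually computes, here is a minimal sketch of my own (with gamma = 1 and beta = 0, as at initialization) that matches torch.nn.LayerNorm over the last dimension:

import torch

x = torch.randn(2, 5, 768)                       # [batch, seq_len, hidden_size]
eps = 1e-12                                      # config.layer_norm_eps

mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
manual = (x - mean) / torch.sqrt(var + eps)      # normalize each hidden vector

reference = torch.nn.LayerNorm(768, eps=eps)(x)  # default weight=1, bias=0
print(torch.allclose(manual, reference, atol=1e-5))  # True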

The embedding layer.

nn.Embedding performs a lookup that pulls out the vector corresponding to each index. Three embeddings are combined:

1. word_embeddings: the token embedding looked up from input_ids, i.e. the vector representation of each token.

2. position_embeddings: the positional embedding; if no position_ids are supplied, positions 0, 1, 2, 3, ... are used to mark where each token sits.

3. token_type_embeddings: the segment embedding used for NSP, 0 for the first sentence and 1 for the second (e.g. 0000011111); several later papers argue this signal contributes little.

The three embeddings are summed, then LayerNorm and dropout are applied.

class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        seq_length = input_ids.size(1)  # the second dimension is max_len
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # 0 .. seq_length-1
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # broadcast to the shape of input_ids
        if token_type_ids is None:  # if None, no sentence-pair (NSP) distinction is made: every token gets segment 0
            token_type_ids = torch.zeros_like(input_ids)
        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
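A small usage sketch (mine, not from the source; the import path is an assumption) showing the shapes involved:

import torch
from pytorch_transformers import BertConfig
from pytorch_transformers.modeling_bert import BertEmbeddings  # assumed import path

config = BertConfig()                                       # defaults: vocab 30522, hidden 768
embeddings = BertEmbeddings(config)

input_ids = torch.randint(0, config.vocab_size, (2, 16))    # [batch=2, max_len=16]
out = embeddings(input_ids)                                 # token_type_ids / position_ids fall back to defaults
print(out.shape)                                            # torch.Size([2, 16, 768])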

BertSelfAttention: the self-attention mechanism.

After the embedding layer the shape is [batch, max_len, hidden_size].

The query/key/value linear layers keep that shape; transpose_for_scores views it as [batch, max_len, num_heads, head_size] and permutes it to [batch, num_heads, max_len, head_size].

The Q·Kᵀ matmul gives scores of shape [batch, num_heads, max_len, max_len], which are divided by sqrt(head_size), i.e. sqrt(64) = 8 for BERT-base.

Then attention_mask is added to attention_scores. Note that this is the padding mask prepared in BertModel.forward (large negative values at padded positions so they vanish after softmax), not the [MASK] corruption used for the MLM objective in the paper.

torch.matmul(attention_probs, value_layer) weights each token's value vector by its 0-to-1 attention score: [batch, num_heads, max_len, max_len] x [batch, num_heads, max_len, head_size] -> [batch, num_heads, max_len, head_size].

After the permute the shape is [batch, max_len, num_heads, head_size].

The view with context_layer.size()[:-2] + (self.all_head_size,) merges the last two dimensions, giving [batch, max_len, all_head_size].

class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:  # hidden_size must be divisible by the number of heads
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.output_attentions = config.output_attentions  # whether attention_probs is included in the output
        self.num_attention_heads = config.num_attention_heads  # number of attention heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)  # per-head size, 64 for BERT-base
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask, head_mask=None):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)  # each row over the last dimension sums to 1

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
        return outputs
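To tie the shape walkthrough above to running code, a hedged sketch (the import path and the output_attentions keyword are assumptions for this version) that builds the additive padding mask the same way BertModel.forward does and checks the output shapes:

import torch
from pytorch_transformers import BertConfig
from pytorch_transformers.modeling_bert import BertSelfAttention  # assumed import path

config = BertConfig(output_attentions=True)              # ask for attention_probs in the output
attention = BertSelfAttention(config)

hidden_states = torch.randn(2, 16, config.hidden_size)   # [batch, max_len, hidden_size]
padding_mask = torch.ones(2, 16)                          # 1 = real token, 0 = padding
padding_mask[:, 12:] = 0                                  # pretend the last 4 positions are padding

# Same trick as BertModel.forward: broadcastable shape, large negative values at padded positions.
extended_mask = (1.0 - padding_mask[:, None, None, :]) * -10000.0

context, probs = attention(hidden_states, extended_mask)
print(context.shape)  # torch.Size([2, 16, 768])
print(probs.shape)    # torch.Size([2, 12, 16, 16])  -> [batch, num_heads, max_len, max_len]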

 
