
A Detailed Walkthrough of the BERT Code: modeling.py


Start by cloning the official BERT repository from GitHub:

git clone  https://github.com/google-research/bert.git

The repository contains seven main Python files, and the goal of this series is to walk through all seven of them.

The first file covered is modeling.py, the core of the BERT implementation. It consists of 2 classes and 17 functions, described below:

I. Classes

1.class BertConfig(object):

class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,                       # size of the vocabulary (number of distinct tokens)
               hidden_size=768,                  # embedding size, which is also the width of the encoder and pooler layers
               num_hidden_layers=12,             # number of Transformer (encoder) layers
               num_attention_heads=12,           # number of attention heads in each encoder layer
               intermediate_size=3072,           # size of the intermediate (feed-forward) layer in each encoder block
               hidden_act="gelu",                # activation function for the encoder and pooler
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    # Reads the parameters from `json_object` into the config's attribute dictionary.
    """Constructs a `BertConfig` from a Python dictionary of parameters."""
    config = BertConfig(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a `BertConfig` from a json file of parameters."""
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
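
As a quick orientation, here is a minimal sketch of how BertConfig is typically used. The checkpoint directory name is a placeholder, and `import modeling` assumes the file is imported as a module the way the official run scripts do:

import modeling

# Load the configuration shipped with a pretrained checkpoint
# (the directory name below is a placeholder).
config = modeling.BertConfig.from_json_file("uncased_L-12_H-768_A-12/bert_config.json")
print(config.to_json_string())  # dumps every hyperparameter as JSON

# Or build a smaller configuration by hand for quick experiments.
tiny_config = modeling.BertConfig(
    vocab_size=32000, hidden_size=256, num_hidden_layers=4,
    num_attention_heads=4, intermediate_size=1024)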

2.class BertModel(object):

class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers")."""

  def __init__(self,
               config,                        # a BertConfig instance (the first class above)
               is_training,                   # whether the model is being trained; controls dropout
               input_ids,                     # int32 Tensor of shape [batch_size, seq_length]
               input_mask=None,               # optional int32 Tensor of shape [batch_size, seq_length]
               token_type_ids=None,           # optional int32 Tensor of shape [batch_size, seq_length]
               use_one_hot_embeddings=False,  # use one-hot multiplication instead of tf.gather() for the embedding lookup
               scope=None):                   # variable scope; defaults to "bert"
    # Raises ValueError if the config is invalid or the input shape is wrong.
    config = copy.deepcopy(config)
    if not is_training:
      # At inference time, disable dropout.
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
      # If no mask is given, attend to every position.
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
      # If no segment ids are given, treat everything as segment 0.
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids.
        (self.embedding_output, self.embedding_table) = embedding_lookup(
            input_ids=input_ids,
            vocab_size=config.vocab_size,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

      with tf.variable_scope("encoder"):
        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
        # mask of shape [batch_size, seq_length, seq_length] which is used
        # for the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            input_ids, input_mask)

        # Run the stacked transformer.
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)

      self.sequence_output = self.all_encoder_layers[-1]
      # The "pooler" converts the encoded sequence tensor of shape
      # [batch_size, seq_length, hidden_size] to a tensor of shape
      # [batch_size, hidden_size]. This is necessary for segment-level
      # (or segment-pair-level) classification tasks where we need a fixed
      # dimensional representation of the segment.
      with tf.variable_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained.
        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
        self.pooled_output = tf.layers.dense(
            first_token_tensor,
            config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer(config.initializer_range))

  def get_pooled_output(self):
    return self.pooled_output

  def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the final hidden of the transformer encoder.
    """
    return self.sequence_output

  def get_all_encoder_layers(self):
    return self.all_encoder_layers

  def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the output of the embedding layer, after summing the word
      embeddings with the positional embeddings and the token type embeddings,
      then performing layer normalization. This is the input to the transformer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    return self.embedding_table
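
Besides the pooled [CLS] vector, the per-token encodings are available through get_sequence_output(). A minimal sketch for a token-level task; the `model` variable and `num_labels` are illustrative placeholders (see the full example at the end of this post):

# Assuming `model` is a BertModel instance built as in the example at the end of this post.
sequence_output = model.get_sequence_output()  # [batch_size, seq_length, hidden_size]
pooled_output = model.get_pooled_output()      # [batch_size, hidden_size]

# Per-token logits, e.g. for sequence labeling (num_labels is a made-up value).
num_labels = 9
token_logits = tf.layers.dense(sequence_output, num_labels)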

II. The 17 helper functions

1.def gelu(x)

def gelu(x):
  """Gaussian Error Linear Unit.

  GELU(x) = x * P(X <= x) = x * Φ(x); this is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415

  Args:
    x: float Tensor to perform activation.

  Returns:
    `x` with the GELU activation applied.
  """
  cdf = 0.5 * (1.0 + tf.tanh(
      (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
  return x * cdf
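
A quick sanity check of the tanh approximation (a sketch for TF 1.x graph mode, assuming `gelu` from this file is in scope); the printed values should be close to x * Φ(x):

import tensorflow as tf

x = tf.constant([-2.0, -1.0, 0.0, 1.0, 2.0])
with tf.Session() as sess:
    # Roughly [-0.045, -0.159, 0.0, 0.841, 1.955]
    print(sess.run(gelu(x)))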

2.def get_activation(activation_string)

def get_activation(activation_string):
  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.

  Args:
    activation_string: String name of the activation function.

  Returns:
    A Python function corresponding to the activation function. If
    `activation_string` is None, empty, or "linear", this will return None.
    If `activation_string` is not a string, it will return `activation_string`.

  Raises:
    ValueError: The `activation_string` does not correspond to a known
      activation.
  """

  # We assume that anything that's not a string is already an activation
  # function, so we just return it.
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:
    return None

  act = activation_string.lower()
  if act == "linear":
    return None
  elif act == "relu":
    return tf.nn.relu
  elif act == "gelu":
    return gelu
  elif act == "tanh":
    return tf.tanh
  else:
    raise ValueError("Unsupported activation: %s" % act)
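
The mapping can be checked directly (a small sketch, assuming the functions above are in scope):

assert get_activation("gelu") is gelu
assert get_activation("relu") is tf.nn.relu
assert get_activation("linear") is None      # "linear" and None both mean "no activation"
assert get_activation(tf.tanh) is tf.tanh    # non-strings are passed through unchanged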

3.def get_assignment_map_from_checkpoint(tvars, init_checkpoint)

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
  # Builds a mapping between the variables in the current graph and the
  # variables stored in the checkpoint.
  """Compute the union of the current variables and checkpoint variables."""
  assignment_map = {}
  initialized_variable_names = {}

  # An ordered dictionary keyed by variable name (insertion order preserved).
  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  init_vars = tf.train.list_variables(init_checkpoint)

  assignment_map = collections.OrderedDict()
  for x in init_vars:
    (name, var) = (x[0], x[1])
    if name not in name_to_variable:
      continue
    assignment_map[name] = name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ":0"] = 1

  return (assignment_map, initialized_variable_names)
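
In the accompanying run scripts this function is paired with tf.train.init_from_checkpoint to warm-start training from a pretrained checkpoint. A minimal sketch, where the checkpoint path is a placeholder:

tvars = tf.trainable_variables()
init_checkpoint = "uncased_L-12_H-768_A-12/bert_model.ckpt"  # placeholder path

assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
    tvars, init_checkpoint)

# Variables whose names appear in the checkpoint are initialized from it;
# everything else (e.g. a new classification layer) keeps its random init.
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)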

4.def dropout(input_tensor, dropout_prob)

def dropout(input_tensor, dropout_prob):
  """Perform dropout.

  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value (NOT of
      *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of `input_tensor` with dropout applied.
  """
  # tf.nn.dropout(x, keep_prob, noise_shape=None, seed=None, name=None) takes a
  # *keep* probability, so the drop probability is converted with 1.0 - dropout_prob.
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor

  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return output

5.def layer_norm(input_tensor, name=None)

def layer_norm(input_tensor, name=None):
  # This is layer normalization, not the more familiar batch normalization:
  # each token's embedding vector is normalized over its last dimension.
  """Run layer normalization on the last dimension of the tensor."""
  return tf.contrib.layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)

6.def layer_norm_and_dropout(input_tensor, dropout_prob, name=None)

def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Runs layer normalization followed by dropout."""
  # Normalize the input first, then apply dropout, and return the result.
  output_tensor = layer_norm(input_tensor, name)
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor

7.def create_initializer(initializer_range=0.02)

def create_initializer(initializer_range=0.02):
  """Creates a `truncated_normal_initializer` with the given range."""
  # Values are drawn from a truncated normal distribution: samples that fall
  # more than two standard deviations from the mean are discarded and redrawn.
  # Here the mean is 0 and the standard deviation is `initializer_range` (0.02 by default).
  return tf.truncated_normal_initializer(stddev=initializer_range)

8.def embedding_lookup

# Looks up the embedding vectors corresponding to the word ids.
def embedding_lookup(input_ids,                  # int32 Tensor of shape [batch_size, seq_length] containing word ids
                     vocab_size,                 # number of words in the vocabulary
                     embedding_size=128,         # width of the embeddings
                     initializer_range=0.02,     # stddev used to initialize the embedding table
                     word_embedding_name="word_embeddings",  # name of the embedding table
                     use_one_hot_embeddings=False):          # if True use one-hot matmul, otherwise tf.gather()
  """Looks up word embeddings for a tensor of ids.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  # tf.get_variable(name, shape, initializer): `name` is the variable name,
  # `shape` its dimensions and `initializer` how it is initialized. Common
  # initializers: tf.constant_initializer (constant), tf.random_normal_initializer
  # (normal), tf.truncated_normal_initializer (truncated normal),
  # tf.random_uniform_initializer (uniform), tf.zeros_initializer / tf.ones_initializer
  # (all zeros / all ones), tf.uniform_unit_scaling_initializer (uniform values that
  # preserve the output's order of magnitude).
  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))  # truncated normal initialization

  # tf.reshape(tensor, shape, name=None): flatten the ids into a single row.
  flat_input_ids = tf.reshape(input_ids, [-1])
  if use_one_hot_embeddings:
    # Convert every id to a one-hot vector of shape [num_ids, vocab_size] and
    # multiply by the table, giving an output of shape [num_ids, embedding_size].
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    output = tf.matmul(one_hot_input_ids, embedding_table)
  else:
    # tf.gather() pulls out the rows of `embedding_table` selected by the 1-D
    # index array, also giving shape [num_ids, embedding_size].
    output = tf.gather(embedding_table, flat_input_ids)

  input_shape = get_shape_list(input_ids)  # e.g. [batch_size, seq_length, 1]

  # Reshape back to [batch_size, seq_length, embedding_size].
  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return (output, embedding_table)
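
A small shape check (a sketch with toy sizes, assuming the functions of modeling.py are in scope):

input_ids = tf.constant([[1, 2, 3], [4, 5, 0]])   # [batch_size=2, seq_length=3]
output, table = embedding_lookup(input_ids, vocab_size=10, embedding_size=8)
print(output.shape)   # (2, 3, 8)  -> [batch_size, seq_length, embedding_size]
print(table.shape)    # (10, 8)    -> [vocab_size, embedding_size]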

9.def get_shape_list(tensor, expected_rank=None, name=None)

def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if name is None:
    name = tensor.name

  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape
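
The mix of static and dynamic dimensions is easiest to see with a placeholder (a TF 1.x graph-mode sketch):

t = tf.placeholder(tf.int32, shape=[None, 128])
shape = get_shape_list(t, expected_rank=2)
# shape[1] is the Python int 128 (static), while shape[0] is a scalar
# tf.Tensor that resolves to the batch size at run time (dynamic).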

10.def assert_rank(tensor, expected_rank, name=None)

def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  """
  if name is None:
    name = tensor.name

  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):
    expected_rank_dict[expected_rank] = True
  else:
    for x in expected_rank:
      expected_rank_dict[x] = True

  actual_rank = tensor.shape.ndims
  if actual_rank not in expected_rank_dict:
    scope_name = tf.get_variable_scope().name
    raise ValueError(
        "For the tensor `%s` in scope `%s`, the actual rank "
        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))

11.def embedding_postprocessor

# Applies further processing to a word-embedding tensor: segment (token type)
# embeddings, position embeddings, layer normalization and dropout.
def embedding_postprocessor(input_tensor,                      # float Tensor of shape [batch_size, seq_length, embedding_size]
                            use_token_type=False,              # whether to add token-type (segment) embeddings
                            token_type_ids=None,               # int32 Tensor of shape [batch_size, seq_length]
                            token_type_vocab_size=16,          # vocabulary size of `token_type_ids`
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,      # whether to add position embeddings
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,            # stddev for weight initialization
                            max_position_embeddings=512,       # maximum sequence length; may be much larger than the real length, but never smaller
                            dropout_prob=0.1):                 # dropout applied to the final output
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  # Segment embeddings: a single sentence uses one segment id, a sentence pair uses two.
  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))  # up to 16 segment embeddings
    # One-hot multiplication is used here.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  # Position embeddings.
  if use_position_embeddings:
    # Check that seq_length is not larger than max_position_embeddings.
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # The full table has max_position_embeddings rows to speed up training;
      # tf.slice(input_, begin, size, name=None) then takes only the first
      # seq_length rows.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`), so
      # we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
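
The function leaves the shape unchanged and only mixes in segment and position information. A toy-sized sketch:

word_embeddings = tf.random_normal([2, 5, 32])   # [batch, seq_length, width]
token_type_ids = tf.constant([[0, 0, 0, 1, 1], [0, 1, 1, 1, 1]])
out = embedding_postprocessor(word_embeddings,
                              use_token_type=True,
                              token_type_ids=token_type_ids,
                              token_type_vocab_size=2,
                              max_position_embeddings=16)
print(out.shape)   # (2, 5, 32): same shape, now with segment and position embeddings added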

12.def create_attention_mask_from_input_mask(from_tensor, to_mask)

def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Creates a 3D attention mask from a 2D tensor mask.

  After padding, the padded positions carry no useful information and must be
  kept out of the attention computation.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank=2)
  to_seq_length = to_shape[1]

  # Cast the mask to float32.
  to_mask = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume that `from_tensor` is a mask (although it could be). We
  # don't actually care if we attend *from* padding tokens (only *to* padding)
  # tokens so we create a tensor of all ones.
  #
  # `broadcast_ones` = [batch_size, from_seq_length, 1]
  broadcast_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Here we broadcast along two dimensions to create the mask.
  mask = broadcast_ones * to_mask

  return mask
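
A concrete sketch of the broadcast:

input_ids = tf.constant([[7, 8, 9], [7, 8, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
mask = create_attention_mask_from_input_mask(input_ids, input_mask)
# mask has shape (2, 3, 3). For the second example the last column is all
# zeros, so no position may attend *to* the padding token, while the padding
# position may still attend *from* (its own output is simply ignored later).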

13.def attention_layer

# Multi-headed attention.
def attention_layer(from_tensor,                   # float Tensor of shape [batch_size, from_seq_length, from_width]
                    to_tensor,                     # float Tensor of shape [batch_size, to_seq_length, to_width]
                    attention_mask=None,           # int32 Tensor of shape [batch_size, from_seq_length, to_seq_length];
                                                   # values are 0 or 1: positions with 0 get a large negative attention
                                                   # score (effectively masked out), positions with 1 are left unchanged
                    num_attention_heads=1,         # number of attention heads
                    size_per_head=512,
                    query_act=None,                # activation for the query transform
                    key_act=None,                  # activation for the key transform
                    value_act=None,                # activation for the value transform
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,     # True: output shape [batch_size * from_seq_length, num_attention_heads * size_per_head];
                                                   # False: output shape [batch_size, from_seq_length, num_attention_heads * size_per_head]
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  1. If `from_tensor` and `to_tensor` are the same, this is self-attention.
     Each timestep in `from_tensor` attends to the corresponding sequence in
     `to_tensor` and returns a fixed-width vector. `from_tensor` is projected
     into a "query" tensor and `to_tensor` into "key" and "value" tensors,
     each of shape [batch_size, seq_length, size_per_head].
  2. The query and key tensors are dot-producted and softmaxed, the result is
     multiplied by the value tensors, and the heads are concatenated into a
     single tensor which is returned.
  3. In practice the multi-head split is implemented with transposes and
     reshapes rather than by actually slicing the tensors.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar shorthand used in the comments below:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # For reference, the full signature of tf.layers.dense is:
  #   tf.layers.dense(inputs, units, activation=None, use_bias=True,
  #                   kernel_initializer=None,
  #                   bias_initializer=tf.zeros_initializer(),
  #                   kernel_regularizer=None, bias_regularizer=None,
  #                   activity_regularizer=None, kernel_constraint=None,
  #                   bias_constraint=None, trainable=True, name=None,
  #                   reuse=None)

  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # `query_layer` = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer
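
A shape-only sketch with toy sizes (B=2, F=T=3, two heads of size 4):

from_tensor = tf.random_normal([2, 3, 8])             # [B, F, width]
context = attention_layer(from_tensor, from_tensor,   # self-attention
                          num_attention_heads=2, size_per_head=4)
print(context.shape)   # (2, 3, 8), i.e. [B, F, N*H]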

14.def reshape_to_matrix(input_tensor)

def reshape_to_matrix(input_tensor):
  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
  # A rank-2 tensor is returned unchanged; anything of higher rank is
  # reshaped to [-1, width].
  ndims = input_tensor.shape.ndims
  if ndims < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
  if ndims == 2:
    return input_tensor

  width = input_tensor.shape[-1]
  output_tensor = tf.reshape(input_tensor, [-1, width])
  return output_tensor

15.def reshape_from_matrix(output_tensor, orig_shape_list)

def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
  # Restores a 2D tensor to its original higher-rank shape.
  if len(orig_shape_list) == 2:
    return output_tensor

  output_shape = get_shape_list(output_tensor)

  orig_dims = orig_shape_list[0:-1]
  width = output_shape[-1]

  return tf.reshape(output_tensor, orig_dims + [width])

16.def transpose_for_scores(input_tensor, batch_size, num_attention_heads, seq_length, width)

def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                         seq_length, width):
  # Reshapes a [batch_size * seq_length, num_heads * width] tensor to
  # [batch_size, num_heads, seq_length, width] so that attention scores can be
  # computed per head.
  output_tensor = tf.reshape(
      input_tensor, [batch_size, seq_length, num_attention_heads, width])

  output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
  return output_tensor
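
For example (a sketch with B=2, F=3, N=2, H=4):

x = tf.random_normal([2 * 3, 2 * 4])   # [B*F, N*H]
y = transpose_for_scores(x, batch_size=2, num_attention_heads=2,
                         seq_length=3, width=4)
print(y.shape)   # (2, 2, 3, 4), i.e. [B, N, F, H]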

17.def transformer_model

def transformer_model(input_tensor,                    # float Tensor of shape [batch_size, seq_length, hidden_size]
                      attention_mask=None,             # int32 Tensor of shape [batch_size, seq_length, seq_length]
                      hidden_size=768,                 # hidden size of the Transformer
                      num_hidden_layers=12,            # number of Transformer blocks
                      num_attention_heads=12,
                      intermediate_size=3072,          # size of the intermediate (feed-forward) layer
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):     # return all layers, or only the final one
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is the encoder part of the Transformer.

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer adds residual connections on every layer, so the width of
  # the input must match the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  prev_output = reshape_to_matrix(input_tensor)  # flatten to a 2D matrix

  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output
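
A sketch of calling the encoder directly on ready-made embeddings with the default BERT-Base settings:

embeddings = tf.random_normal([2, 16, 768])   # [batch, seq_length, hidden_size]
all_layers = transformer_model(input_tensor=embeddings, do_return_all_layers=True)
print(len(all_layers))        # 12 encoder blocks
print(all_layers[-1].shape)   # (2, 16, 768)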

The second class, BertModel, is the interface to the whole of modeling.py: it calls the classes and functions above to produce the final outputs.

Reference: https://mp.weixin.qq.com/s/rxJ0jAFKsP6ByWeVv6Tr5Q

Example usage:

import tensorflow as tf

import modeling

# Assume the input has already been tokenized into word ids. shape=[2, 3]
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
# Segment embedding: in the first example the first two tokens belong to
# sentence 1 and the last token to sentence 2; in the second example the first
# token belongs to sentence 1, the second to sentence 2, and the trailing 0 is padding.
token_type_ids = tf.constant([[0, 0, 1], [0, 1, 0]])

# Create a BertConfig instance.
# (num_attention_heads must divide hidden_size, so 8 heads are used for hidden_size=512.)
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
                             num_hidden_layers=8, num_attention_heads=8,
                             intermediate_size=1024)

# Create a BertModel instance.
model = modeling.BertModel(config=config, is_training=True,
                           input_ids=input_ids, input_mask=input_mask,
                           token_type_ids=token_type_ids)

label_embeddings = tf.get_variable(...)
# The final-layer representation of the first token ([CLS]) can be treated as
# a sentence-level embedding.
pooled_output = model.get_pooled_output()
logits = tf.matmul(pooled_output, label_embeddings)

 
