From the official BERT repository on GitHub:
git clone https://github.com/google-research/bert.git
The repository contains seven main Python files, and I intend to work through all seven of them in this series!
First up is modeling.py, the core of the BERT implementation. It consists of 2 classes and 17 functions, as follows:
I. Classes
1.class BertConfig(object):
- class BertConfig(object):
- """Configuration for `BertModel`."""
-
- def __init__(self,
- vocab_size,  # number of words in the vocabulary
- hidden_size=768,  # width of the word embeddings; also the width of the encoder and pooler layers
- num_hidden_layers=12,  # number of Transformer hidden layers
- num_attention_heads=12,  # number of attention heads in each encoder layer
- intermediate_size=3072,  # size of the intermediate (e.g. feed-forward) layer in the encoder
- hidden_act="gelu",  # activation function for the encoder and pooler
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=16,
- initializer_range=0.02):
-
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.hidden_act = hidden_act
- self.intermediate_size = intermediate_size
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.initializer_range = initializer_range
-
- @classmethod
- def from_dict(cls, json_object):  # reads json_object and copies the config parameters into the instance dict
- """Constructs a `BertConfig` from a Python dictionary of parameters."""
- config = BertConfig(vocab_size=None)
- for (key, value) in six.iteritems(json_object):
- config.__dict__[key] = value
- return config
-
- @classmethod
- def from_json_file(cls, json_file):
- """Constructs a `BertConfig` from a json file of parameters."""
- with tf.gfile.GFile(json_file, "r") as reader:
- text = reader.read()
- return cls.from_dict(json.loads(text))
-
- def to_dict(self):
- """Serializes this instance to a Python dictionary."""
- output = copy.deepcopy(self.__dict__)
- return output
-
- def to_json_string(self):
- """Serializes this instance to a JSON string."""
- return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
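As a quick sanity check of the configuration class (my own sketch, not part of the repo; it assumes the cloned modeling.py is importable as `modeling`), the config round-trips through a plain dict or a JSON string:
- import modeling  # the modeling.py cloned above
-
- config = modeling.BertConfig(vocab_size=32000)            # every other field keeps its default
- print(config.to_json_string())                            # dump all hyperparameters as JSON
- clone = modeling.BertConfig.from_dict(config.to_dict())   # rebuild a config from a plain dict
- assert clone.hidden_size == 768 and clone.num_hidden_layers == 12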
2.class BertModel(object):
- class BertModel(object):
- """BERT model ("Bidirectional Encoder Representations from Transformers").
- def __init__(self,
- config,  # an instance of BertConfig (the first class above)
- is_training,  # whether the model is being trained (controls dropout)
- input_ids,  # int32 Tensor of shape [batch_size, seq_length]
- input_mask=None,  # optional int32 Tensor of shape [batch_size, seq_length]
- token_type_ids=None,  # optional int32 Tensor of shape [batch_size, seq_length]
- use_one_hot_embeddings=False,  # optional; use one-hot embeddings, otherwise tf.embedding_lookup() on the pretrained embedding table
- scope=None):  # variable scope; defaults to "bert"
- # Raises ValueError if the config is invalid or an input tensor has the wrong shape
- config = copy.deepcopy(config)
- if not is_training:  # when not training, disable dropout
- config.hidden_dropout_prob = 0.0
- config.attention_probs_dropout_prob = 0.0
- input_shape = get_shape_list(input_ids, expected_rank=2)
- batch_size = input_shape[0]
- seq_length = input_shape[1]
- if input_mask is None:  # if no mask is given, attend to every position
- input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
- if token_type_ids is None:  # if no token types are given, treat everything as segment 0
- token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
- with tf.variable_scope(scope, default_name="bert"):
- with tf.variable_scope("embeddings"):
- # Perform embedding lookup on the word ids.
- (self.embedding_output, self.embedding_table) = embedding_lookup(
- input_ids=input_ids,
- vocab_size=config.vocab_size,
- embedding_size=config.hidden_size,
- initializer_range=config.initializer_range,
- word_embedding_name="word_embeddings",
- use_one_hot_embeddings=use_one_hot_embeddings)
- # Add positional embeddings and token type embeddings, then layer
- # normalize and perform dropout.
- self.embedding_output = embedding_postprocessor(
- input_tensor=self.embedding_output,
- use_token_type=True,
- token_type_ids=token_type_ids,
- token_type_vocab_size=config.type_vocab_size,
- token_type_embedding_name="token_type_embeddings",
- use_position_embeddings=True,
- position_embedding_name="position_embeddings",
- initializer_range=config.initializer_range,
- max_position_embeddings=config.max_position_embeddings,
- dropout_prob=config.hidden_dropout_prob)
- with tf.variable_scope("encoder"):
- # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
- # mask of shape [batch_size, seq_length, seq_length] which is used
- # for the attention scores.
- attention_mask = create_attention_mask_from_input_mask(
- input_ids, input_mask)
- # Run the stacked transformer.
- # `sequence_output` shape = [batch_size, seq_length, hidden_size].
- self.all_encoder_layers = transformer_model(
- input_tensor=self.embedding_output,
- attention_mask=attention_mask,
- hidden_size=config.hidden_size,
- num_hidden_layers=config.num_hidden_layers,
- num_attention_heads=config.num_attention_heads,
- intermediate_size=config.intermediate_size,
- intermediate_act_fn=get_activation(config.hidden_act),
- hidden_dropout_prob=config.hidden_dropout_prob,
- attention_probs_dropout_prob=config.attention_probs_dropout_prob,
- initializer_range=config.initializer_range,
- do_return_all_layers=True)
- self.sequence_output = self.all_encoder_layers[-1]
- # The "pooler" converts the encoded sequence tensor of shape
- # [batch_size, seq_length, hidden_size] to a tensor of shape
- # [batch_size, hidden_size]. This is necessary for segment-level
- # (or segment-pair-level) classification tasks where we need a fixed
- # dimensional representation of the segment.
- with tf.variable_scope("pooler"):
- # We "pool" the model by simply taking the hidden state corresponding
- # to the first token. We assume that this has been pre-trained
- first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
- self.pooled_output = tf.layers.dense(
- first_token_tensor,
- config.hidden_size,
- activation=tf.tanh,
- kernel_initializer=create_initializer(config.initializer_range))
- def get_pooled_output(self):
- return self.pooled_output
- def get_sequence_output(self):
- """Gets final hidden layer of encoder.
-
- Returns:
- float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
- to the final hidden of the transformer encoder.
- """
- return self.sequence_output
- def get_all_encoder_layers(self):
- return self.all_encoder_layers
- def get_embedding_output(self):
- """Gets output of the embedding lookup (i.e., input to the transformer).
-
- Returns:
- float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
- to the output of the embedding layer, after summing the word
- embeddings with the positional embeddings and the token type embeddings,
- then performing layer normalization. This is the input to the transformer.
- """
- return self.embedding_output
- def get_embedding_table(self):
- return self.embedding_table
II. The 17 helper functions
1.def gelu(x)
- def gelu(x):
- """Gaussian Error Linear Unit.#高斯错误线性单元,GELU(x)=xP(X<=x)=xΦ(x)
- This is a smoother version of the RELU.
- Original paper: https://arxiv.org/abs/1606.08415
- Args:
- x: float Tensor to perform activation.
- Returns:
- `x` with the GELU activation applied.
- """
- cdf = 0.5 * (1.0 + tf.tanh(
- (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
- return x * cdf
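The formula above is the tanh approximation of GELU. A small NumPy check of my own (not part of modeling.py) against the exact definition x·Φ(x), written with the error function:
- import numpy as np
- from math import erf
-
- x = np.linspace(-3.0, 3.0, 7)
- approx = 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))
- exact = np.array([0.5 * v * (1.0 + erf(v / np.sqrt(2.0))) for v in x])
- print(np.max(np.abs(approx - exact)))   # the difference is small (well below 1e-2)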
2.def get_activation(activation_string)
- def get_activation(activation_string):
- """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
- # Maps the string name of an activation function to the corresponding Python function
- Args:
- activation_string: String name of the activation function.
- Returns:
- A Python function corresponding to the activation function. If
- `activation_string` is None, empty, or "linear", this will return None.
- If `activation_string` is not a string, it will return `activation_string`.
- Raises:
- ValueError: The `activation_string` does not correspond to a known
- activation.
- """
-
- # We assume that anything that's not a string is already an activation
- # function, so we just return it.
- if not isinstance(activation_string, six.string_types):
- return activation_string
-
- if not activation_string:
- return None
-
- act = activation_string.lower()
- if act == "linear":
- return None
- elif act == "relu":
- return tf.nn.relu
- elif act == "gelu":
- return gelu
- elif act == "tanh":
- return tf.tanh
- else:
- raise ValueError("Unsupported activation: %s" % act)
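A short usage sketch (my own example, assuming TensorFlow 1.x as `tf` and the module above as `modeling`):
- import tensorflow as tf
- import modeling
-
- assert modeling.get_activation("gelu") is modeling.gelu    # string -> function
- assert modeling.get_activation("linear") is None           # "linear" means no activation
- assert modeling.get_activation(tf.nn.relu) is tf.nn.relu   # non-strings are passed through
- # modeling.get_activation("swish") would raise ValueError: Unsupported activation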
3.def get_assignment_map_from_checkpoint(tvars, init_checkpoint)
- def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
- # Computes the overlap between the variables in the current graph and the variables stored in the checkpoint
- """Compute the union of the current variables and checkpoint variables."""
- assignment_map = {}
- initialized_variable_names = {}
- # an OrderedDict keeps the variables in insertion order
- name_to_variable = collections.OrderedDict()
- for var in tvars:
- name = var.name
- m = re.match("^(.*):\\d+$", name)
- if m is not None:
- name = m.group(1)
- name_to_variable[name] = var
-
- init_vars = tf.train.list_variables(init_checkpoint)
-
- assignment_map = collections.OrderedDict()
- for x in init_vars:
- (name, var) = (x[0], x[1])
- if name not in name_to_variable:
- continue
- assignment_map[name] = name
- initialized_variable_names[name] = 1
- initialized_variable_names[name + ":0"] = 1
-
- return (assignment_map, initialized_variable_names)
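This helper is used when warm-starting a model from a pre-trained checkpoint, roughly as below (a hedged sketch of my own; the checkpoint path is a placeholder):
- import tensorflow as tf
- import modeling
-
- init_checkpoint = "uncased_L-12_H-768_A-12/bert_model.ckpt"   # placeholder path
- tvars = tf.trainable_variables()                              # variables of the graph built so far
- (assignment_map,
-  initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)  # overrides the variable initializers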
4.def dropout(input_tensor, dropout_prob)
- def dropout(input_tensor, dropout_prob):
- """Perform dropout.
- # Applies dropout; dropout_prob is the probability of DROPPING a value, so tf.nn.dropout is called with 1 - dropout_prob (the keep probability)
- Args:
- input_tensor: float Tensor.
- dropout_prob: Python float. The probability of dropping out a value (NOT of
- *keeping* a dimension as in `tf.nn.dropout`).
- Returns:
- A version of `input_tensor` with dropout applied.
- """
- #tf.nn.dropout(x, keep_prob, noise_shape=None, seed=None, name=None)
- if dropout_prob is None or dropout_prob == 0.0:
- return input_tensor
-
- output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
- return output
5.def layer_norm(input_tensor, name=None)
- def layer_norm(input_tensor, name=None):
- # Layer normalization (not the more familiar batch normalization): it normalizes over the last dimension, i.e. each token's embedding vector
- """Run layer normalization on the last dimension of the tensor."""
- return tf.contrib.layers.layer_norm(
- inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
6.def layer_norm_and_dropout(input_tensor, dropout_prob, name=None)
- def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
- """Runs layer normalization followed by dropout."""
- # Layer-normalize the input first, then apply dropout, and return the result
- output_tensor = layer_norm(input_tensor, name)
- output_tensor = dropout(output_tensor, dropout_prob)
- return output_tensor
7.def create_initializer(initializer_range=0.02)
- def create_initializer(initializer_range=0.02):
- """Creates a `truncated_normal_initializer` with the given range."""
- # Draws values from a truncated normal distribution: samples further than two standard deviations from the mean are discarded and redrawn.
- # Here the mean is 0 and the standard deviation is initializer_range, i.e. 0.02 by default.
- return tf.truncated_normal_initializer(stddev=initializer_range)
8.def embedding_lookup
- # Looks up the embedding vectors for the given word ids
- def embedding_lookup(input_ids,  # int32 Tensor of shape [batch_size, seq_length] containing word ids
- vocab_size,  # number of words in the corpus vocabulary
- embedding_size=128,  # width of the embeddings
- initializer_range=0.02,  # stddev used to initialize the embedding table
- word_embedding_name="word_embeddings",  # name of the embedding table
- use_one_hot_embeddings=False):  # use a one-hot matmul if True, otherwise tf.gather()
- # tf.gather: picks the rows of a tensor given a 1-D array of indices
-
- Returns:
- float Tensor of shape [batch_size, seq_length, embedding_size].
- """
- # This function assumes that the input is of shape [batch_size, seq_length,
- # num_inputs].
- #
- # If the input is a 2D tensor of shape [batch_size, seq_length], we
- # reshape to [batch_size, seq_length, 1].
- if input_ids.shape.ndims == 2:
- input_ids = tf.expand_dims(input_ids, axis=[-1])
- # tf.get_variable(name, shape, initializer): name is the variable name, shape its dimensions,
- # and initializer how it is initialized. Common initializers: tf.constant_initializer (constant),
- # tf.random_normal_initializer (normal), tf.truncated_normal_initializer (truncated normal),
- # tf.random_uniform_initializer (uniform), tf.zeros_initializer (all zeros), tf.ones_initializer (all ones),
- # tf.uniform_unit_scaling_initializer (uniform, preserving the scale of the outputs)
- embedding_table = tf.get_variable(
- name=word_embedding_name,
- shape=[vocab_size, embedding_size],
- initializer=create_initializer(initializer_range))  # initialized from a truncated normal distribution
- #tf.reshape(tensor,shape,name=None)
- flat_input_ids = tf.reshape(input_ids, [-1])  # flatten all ids into a single vector
- if use_one_hot_embeddings:
- one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
- # converts each id into a one-hot vector; shape becomes [num_ids, vocab_size]
- output = tf.matmul(one_hot_input_ids, embedding_table)
- # output shape: [num_ids, embedding_size]
- else:
- output = tf.gather(embedding_table, flat_input_ids)
- # without one-hot, simply gather the matching rows from embedding_table; shape [num_ids, embedding_size]
- input_shape = get_shape_list(input_ids)  # [batch_size, seq_length, 1] after the expand_dims above
- output = tf.reshape(output,
- input_shape[0:-1] + [input_shape[-1] * embedding_size])
- # final output shape: [batch_size, seq_length, embedding_size]
- return (output, embedding_table)
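A toy shape walk-through of my own (TF 1.x graph mode, hypothetical sizes):
- import tensorflow as tf
- import modeling
-
- input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])      # [batch_size=2, seq_length=3]
- emb_out, emb_table = modeling.embedding_lookup(
-     input_ids, vocab_size=32000, embedding_size=128)
- # emb_out:   [2, 3, 128]  -- one 128-dimensional vector per token
- # emb_table: [32000, 128] -- also returned, e.g. so an output layer can reuse it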
9.def get_shape_list(tensor, expected_rank=None, name=None)
- def get_shape_list(tensor, expected_rank=None, name=None):
- """Returns a list of the shape of tensor, preferring static dimensions.
- Args:
- tensor: A tf.Tensor object to find the shape of.
- expected_rank: (optional) int. The expected rank of `tensor`. If this is
- specified and the `tensor` has a different rank, and exception will be
- thrown.
- name: Optional name of the tensor for the error message.
- Returns:
- A list of dimensions of the shape of tensor. All static dimensions will
- be returned as python integers, and dynamic dimensions will be returned
- as tf.Tensor scalars.
- """
- if name is None:
- name = tensor.name
-
- if expected_rank is not None:
- assert_rank(tensor, expected_rank, name)
-
- shape = tensor.shape.as_list()
-
- non_static_indexes = []
- for (index, dim) in enumerate(shape):
- if dim is None:
- non_static_indexes.append(index)
-
- if not non_static_indexes:
- return shape
-
- dyn_shape = tf.shape(tensor)
- for index in non_static_indexes:
- shape[index] = dyn_shape[index]
- return shape
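A small example of the static/dynamic mix (my own sketch):
- import tensorflow as tf
- import modeling
-
- x = tf.placeholder(tf.int32, shape=[None, 128])      # batch dimension unknown at graph-build time
- shape = modeling.get_shape_list(x, expected_rank=2)
- # shape[0] is a scalar tf.Tensor (the dynamic batch size),
- # shape[1] is the plain Python int 128 (a static dimension)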
10.def assert_rank(tensor, expected_rank, name=None)
- def assert_rank(tensor, expected_rank, name=None):
- """Raises an exception if the tensor rank is not of the expected rank.
- Args:
- tensor: A tf.Tensor to check the rank of.
- expected_rank: Python integer or list of integers, expected rank.
- name: Optional name of the tensor for the error message.
- Raises:
- ValueError: If the expected shape doesn't match the actual shape.
- """
- if name is None:
- name = tensor.name
-
- expected_rank_dict = {}
- if isinstance(expected_rank, six.integer_types):
- expected_rank_dict[expected_rank] = True
- else:
- for x in expected_rank:
- expected_rank_dict[x] = True
-
- actual_rank = tensor.shape.ndims
- if actual_rank not in expected_rank_dict:
- scope_name = tf.get_variable_scope().name
- raise ValueError(
- "For the tensor `%s` in scope `%s`, the actual rank "
- "`%d` (shape = %s) is not equal to the expected rank `%s`" %
- (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
11.def embedding_postprocessor
- # Applies various post-processing steps to a word-embedding tensor
- def embedding_postprocessor(input_tensor,  # [batch_size, seq_length, embedding_size]
- use_token_type=False,  # whether to add token-type (segment) embeddings
- token_type_ids=None,  # [batch_size, seq_length]
- token_type_vocab_size=16,  # vocabulary size of token_type_ids
- token_type_embedding_name="token_type_embeddings",
- use_position_embeddings=True,  # whether to add position embeddings
- position_embedding_name="position_embeddings",
- initializer_range=0.02,  # stddev for weight initialization
- max_position_embeddings=512,  # maximum sequence length; it may be much larger than the actual sequence length, but never smaller
- dropout_prob=0.1):  # dropout applied to the final output
- input_shape = get_shape_list(input_tensor, expected_rank=3)
- batch_size = input_shape[0]
- seq_length = input_shape[1]
- width = input_shape[2]
- output = input_tensor
- # Segment embeddings: a single sentence uses only one segment embedding, a sentence pair uses two
- if use_token_type:
- if token_type_ids is None:
- raise ValueError("`token_type_ids` must be specified if"
- "`use_token_type` is True.")
- token_type_table = tf.get_variable(
- name=token_type_embedding_name,
- shape=[token_type_vocab_size, width],
- initializer=create_initializer(initializer_range))  # a table of token_type_vocab_size (16) segment embeddings
- # This vocabulary is small, so one-hot multiplication is used here (faster for small vocabularies)
- flat_token_type_ids = tf.reshape(token_type_ids, [-1])
- one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
- token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
- token_type_embeddings = tf.reshape(token_type_embeddings,
- [batch_size, seq_length, width])
- output += token_type_embeddings
- # Position embeddings
- if use_position_embeddings:
- # assert that seq_length is not larger than max_position_embeddings
- assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
- with tf.control_dependencies([assert_op]):
- full_position_embeddings = tf.get_variable(
- name=position_embedding_name,
- shape=[max_position_embeddings, width],
- initializer=create_initializer(initializer_range))
- # full_position_embeddings has max_position_embeddings rows (created once, for faster training); tf.slice below takes only the first seq_length rows
- #tf.slice(input_,begin,size,name=None)
- position_embeddings = tf.slice(full_position_embeddings, [0, 0],
- [seq_length, -1])
- num_dims = len(output.shape.as_list())
- # Only the last two dimensions are relevant (`seq_length` and `width`), so
- # we broadcast among the first dimensions, which is typically just
- # the batch size.
- position_broadcast_shape = []
- for _ in range(num_dims - 2):
- position_broadcast_shape.append(1)
- position_broadcast_shape.extend([seq_length, width])
- position_embeddings = tf.reshape(position_embeddings,
- position_broadcast_shape)
- output += position_embeddings
- output = layer_norm_and_dropout(output, dropout_prob)
- return output
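A hedged call sketch with toy shapes (token_type_vocab_size is set to 2 here only because the toy input uses two segment ids; the released BERT configs use 16):
- import tensorflow as tf
- import modeling
-
- word_emb = tf.random_normal([2, 3, 128])               # stand-in for the embedding_lookup output
- full_emb = modeling.embedding_postprocessor(
-     input_tensor=word_emb,
-     use_token_type=True,
-     token_type_ids=tf.constant([[0, 0, 1], [0, 1, 0]]),
-     token_type_vocab_size=2,
-     dropout_prob=0.1)
- # full_emb = LayerNorm(word + segment + position embeddings) followed by dropout; shape stays [2, 3, 128]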
12.def create_attention_mask_from_input_mask(from_tensor, to_mask)
- def create_attention_mask_from_input_mask(from_tensor, to_mask):
- # Creates a 3D attention mask from a 2D input mask: after padding, the padded positions carry no information and must not take part in the computation
- """
- Args:
- from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
- to_mask: int32 Tensor of shape [batch_size, to_seq_length].
- Returns:
- float Tensor of shape [batch_size, from_seq_length, to_seq_length].
- """
- from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
- batch_size = from_shape[0]
- from_seq_length = from_shape[1]
-
- to_shape = get_shape_list(to_mask, expected_rank=2)
- to_seq_length = to_shape[1]
- # cast to float32
- to_mask = tf.cast(
- tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
-
- # We don't assume that `from_tensor` is a mask (although it could be). We
- # don't actually care if we attend *from* padding tokens (only *to* padding)
- # tokens so we create a tensor of all ones.
- #
- # `broadcast_ones` = [batch_size, from_seq_length, 1]
- broadcast_ones = tf.ones(
- shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
-
- # Here we broadcast along two dimensions to create the mask.
- mask = broadcast_ones * to_mask
-
- return mask
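A toy illustration of the broadcast (my own sketch):
- import tensorflow as tf
- import modeling
-
- input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])     # only its shape is used here
- input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])        # the second sample ends with one padding token
- attn_mask = modeling.create_attention_mask_from_input_mask(input_ids, input_mask)
- # attn_mask: [2, 3, 3]; in the second sample every row equals [1., 1., 0.],
- # i.e. each query position may attend to the two real tokens but not to the padded one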
13.def attention_layer
- # Multi-head attention
- def attention_layer(from_tensor,  # [batch_size, from_seq_length, from_width]
- to_tensor,  # [batch_size, to_seq_length, to_width]
- attention_mask=None,  # [batch_size, from_seq_length, to_seq_length] of 0s and 1s; attention scores are effectively masked out (pushed towards -infinity) where the mask is 0, and left unchanged where it is 1
- num_attention_heads=1,  # number of attention heads
- size_per_head=512,
- query_act=None,  # activation function for the query transform
- key_act=None,  # activation function for the key transform
- value_act=None,  # activation function for the value transform
- attention_probs_dropout_prob=0.0,
- initializer_range=0.02,
- do_return_2d_tensor=False,  # True -> output shape [batch_size * from_seq_length, num_attention_heads * size_per_head]; False -> [batch_size, from_seq_length, num_attention_heads * size_per_head]
- batch_size=None,
- from_seq_length=None,
- to_seq_length=None):
-
- ① If `from_tensor` and `to_tensor` are the same, this is self-attention.
- Each timestep in `from_tensor` attends to the corresponding sequence in `to_tensor` and returns a fixed-width vector.
- `from_tensor` is projected into a "query" tensor, and `to_tensor` into "key" and "value" tensors; per head, each has shape [batch_size, seq_length, size_per_head].
- ② The query and key tensors are dot-multiplied and scaled, a softmax is applied, the result is multiplied with the value tensor, and the heads are concatenated into a single tensor that is returned.
- ③ In practice the multi-head split is implemented with transposes and reshapes rather than by actually slicing the tensors.
- Returns:
- float Tensor of shape [batch_size, from_seq_length,
- num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
- true, this will be of shape [batch_size * from_seq_length,
- num_attention_heads * size_per_head]).
- from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
- to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
- if len(from_shape) != len(to_shape):
- raise ValueError(
- "The rank of `from_tensor` must match the rank of `to_tensor`.")
- if len(from_shape) == 3:
- batch_size = from_shape[0]
- from_seq_length = from_shape[1]
- to_seq_length = to_shape[1]
- elif len(from_shape) == 2:
- if (batch_size is None or from_seq_length is None or to_seq_length is None):
- raise ValueError(
- "When passing in rank 2 tensors to attention_layer, the values "
- "for `batch_size`, `from_seq_length`, and `to_seq_length` "
- "must all be specified.")
- # Shorthand for the dimensions used in the shape comments below:
- # B = batch size (number of sequences)
- # F = `from_tensor` sequence length
- # T = `to_tensor` sequence length
- # N = `num_attention_heads`
- # H = `size_per_head`
- from_tensor_2d = reshape_to_matrix(from_tensor)
- to_tensor_2d = reshape_to_matrix(to_tensor)
- # `query_layer` = [B*F, N*H]
- query_layer = tf.layers.dense(
- from_tensor_2d,
- num_attention_heads * size_per_head,
- activation=query_act,
- name="query",
- kernel_initializer=create_initializer(initializer_range))
- # For reference, the full signature of tf.layers.dense is:
- # tf.layers.dense(inputs, units, activation=None, use_bias=True,
- #                 kernel_initializer=None, bias_initializer=tf.zeros_initializer(),
- #                 kernel_regularizer=None, bias_regularizer=None,
- #                 activity_regularizer=None, kernel_constraint=None,
- #                 bias_constraint=None, trainable=True, name=None, reuse=None)
- # `key_layer` = [B*T, N*H]
- key_layer = tf.layers.dense(
- to_tensor_2d,
- num_attention_heads * size_per_head,
- activation=key_act,
- name="key",
- kernel_initializer=create_initializer(initializer_range))
- # `value_layer` = [B*T, N*H]
- value_layer = tf.layers.dense(
- to_tensor_2d,
- num_attention_heads * size_per_head,
- activation=value_act,
- name="value",
- kernel_initializer=create_initializer(initializer_range))
- # `query_layer` = [B, N, F, H]
- query_layer = transpose_for_scores(query_layer, batch_size,
- num_attention_heads, from_seq_length,
- size_per_head)
- # `key_layer` = [B, N, T, H]
- key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
- to_seq_length, size_per_head)
- # Take the dot product between "query" and "key" to get the raw
- # attention scores.
- # `attention_scores` = [B, N, F, T]
- attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
- attention_scores = tf.multiply(attention_scores,
- 1.0 / math.sqrt(float(size_per_head)))
- if attention_mask is not None:
- # `attention_mask` = [B, 1, F, T]
- attention_mask = tf.expand_dims(attention_mask, axis=[1])
- # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
- # masked positions, this operation will create a tensor which is 0.0 for
- # positions we want to attend and -10000.0 for masked positions.
- adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
- # Since we are adding it to the raw scores before the softmax, this is
- # effectively the same as removing these entirely.
- attention_scores += adder
- # Normalize the attention scores to probabilities.
- # `attention_probs` = [B, N, F, T]
- attention_probs = tf.nn.softmax(attention_scores)
- # This is actually dropping out entire tokens to attend to, which might
- # seem a bit unusual, but is taken from the original Transformer paper.
- attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
- # `value_layer` = [B, T, N, H]
- value_layer = tf.reshape(
- value_layer,
- [batch_size, to_seq_length, num_attention_heads, size_per_head])
- # `value_layer` = [B, N, T, H]
- value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
- # `context_layer` = [B, N, F, H]
- context_layer = tf.matmul(attention_probs, value_layer)
- # `context_layer` = [B, F, N, H]
- context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
- if do_return_2d_tensor:
- # `context_layer` = [B*F, N*H]
- context_layer = tf.reshape(
- context_layer,
- [batch_size * from_seq_length, num_attention_heads * size_per_head])
- else:
- # `context_layer` = [B, F, N*H]
- context_layer = tf.reshape(
- context_layer,
- [batch_size, from_seq_length, num_attention_heads * size_per_head])
- return context_layer
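A minimal self-attention sketch over a toy [2, 3, 128] tensor, using 4 heads of size 32 (hypothetical sizes chosen so that 4 × 32 matches the input width):
- import tensorflow as tf
- import modeling
-
- seq = tf.random_normal([2, 3, 128])                      # [batch, from_seq_length, width]
- mask = tf.ones([2, 3, 3])                                # attend everywhere in this toy case
- context = modeling.attention_layer(
-     from_tensor=seq,
-     to_tensor=seq,                                       # from == to  ->  self-attention
-     attention_mask=mask,
-     num_attention_heads=4,
-     size_per_head=32)
- # context: [2, 3, 4 * 32] = [2, 3, 128]  (do_return_2d_tensor defaults to False)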
14.def reshape_to_matrix(input_tensor)
- def reshape_to_matrix(input_tensor):
- # If the input is already rank 2 it is returned unchanged; a higher-rank tensor is reshaped to [-1, width]
- """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
- ndims = input_tensor.shape.ndims
- if ndims < 2:
- raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
- (input_tensor.shape))
- if ndims == 2:
- return input_tensor
-
- width = input_tensor.shape[-1]
- output_tensor = tf.reshape(input_tensor, [-1, width])
- return output_tensor
15.def reshape_from_matrix(output_tensor, orig_shape_list)
- def reshape_from_matrix(output_tensor, orig_shape_list):
- # Reshapes a rank-2 tensor back to its original (rank >= 2) shape
- """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
- if len(orig_shape_list) == 2:
- return output_tensor
- output_shape = get_shape_list(output_tensor)
- orig_dims = orig_shape_list[0:-1]
- width = output_shape[-1]
-
- return tf.reshape(output_tensor, orig_dims + [width])
16.def transpose_for_scores(input_tensor, batch_size, num_attention_heads,seq_length, width)
- def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
- seq_length, width):
- output_tensor = tf.reshape(
- input_tensor, [batch_size, seq_length, num_attention_heads, width])
-
- output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
- return output_tensor
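Shape-wise, this is the reshape/transpose trick mentioned in point ③ of attention_layer; a small sketch of my own:
- import tensorflow as tf
- import modeling
-
- x = tf.zeros([2 * 3, 4 * 32])                  # [B*F, N*H], as produced by the query/key/value dense layers
- y = modeling.transpose_for_scores(x, batch_size=2, num_attention_heads=4,
-                                   seq_length=3, width=32)
- # y: [B, N, F, H] = [2, 4, 3, 32] -- each head now owns its own [F, H] block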
17.def transformer_model
- def transformer_model(input_tensor,  # [batch_size, seq_length, hidden_size]
- attention_mask=None,  # [batch_size, seq_length, seq_length]
- hidden_size=768,  # hidden size of the Transformer
- num_hidden_layers=12,  # number of Transformer blocks
- num_attention_heads=12,
- intermediate_size=3072,  # size of the intermediate (feed-forward) layer
- intermediate_act_fn=gelu,
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- initializer_range=0.02,
- do_return_all_layers=False):  # return all layers, or only the last one
- """
- Multi-headed, multi-layer Transformer from "Attention is All You Need"; this is the encoder part of the Transformer.
- Also see:
- https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
- Returns:
- float Tensor of shape [batch_size, seq_length, hidden_size], the final
- hidden layer of the Transformer.
- Raises:
- ValueError: A Tensor shape or parameter is invalid.
- """
- if hidden_size % num_attention_heads != 0:
- raise ValueError(
- "The hidden size (%d) is not a multiple of the number of attention "
- "heads (%d)" % (hidden_size, num_attention_heads))
- attention_head_size = int(hidden_size / num_attention_heads)
- input_shape = get_shape_list(input_tensor, expected_rank=3)
- batch_size = input_shape[0]
- seq_length = input_shape[1]
- input_width = input_shape[2]
- # The Transformer adds residual connections over all layers, so the input width must match the hidden size
- if input_width != hidden_size:
- raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
- (input_width, hidden_size))
- prev_output = reshape_to_matrix(input_tensor)  # flatten to a rank-2 matrix
- all_layer_outputs = []
- for layer_idx in range(num_hidden_layers):
- with tf.variable_scope("layer_%d" % layer_idx):
- layer_input = prev_output
- with tf.variable_scope("attention"):
- attention_heads = []
- with tf.variable_scope("self"):
- attention_head = attention_layer(
- from_tensor=layer_input,
- to_tensor=layer_input,
- attention_mask=attention_mask,
- num_attention_heads=num_attention_heads,
- size_per_head=attention_head_size,
- attention_probs_dropout_prob=attention_probs_dropout_prob,
- initializer_range=initializer_range,
- do_return_2d_tensor=True,
- batch_size=batch_size,
- from_seq_length=seq_length,
- to_seq_length=seq_length)
- attention_heads.append(attention_head)
- attention_output = None
- if len(attention_heads) == 1:
- attention_output = attention_heads[0]
- else:
- # In the case where we have other sequences, we just concatenate
- # them to the self-attention head before the projection.
- attention_output = tf.concat(attention_heads, axis=-1)
- # Run a linear projection of `hidden_size` then add a residual
- # with `layer_input`.
- with tf.variable_scope("output"):
- attention_output = tf.layers.dense(
- attention_output,
- hidden_size,
- kernel_initializer=create_initializer(initializer_range))
- attention_output = dropout(attention_output, hidden_dropout_prob)
- attention_output = layer_norm(attention_output + layer_input)
- # The activation is only applied to the "intermediate" hidden layer.
- with tf.variable_scope("intermediate"):
- intermediate_output = tf.layers.dense(
- attention_output,
- intermediate_size,
- activation=intermediate_act_fn,
- kernel_initializer=create_initializer(initializer_range))
- # Down-project back to `hidden_size` then add the residual.
- with tf.variable_scope("output"):
- layer_output = tf.layers.dense(
- intermediate_output,
- hidden_size,
- kernel_initializer=create_initializer(initializer_range))
- layer_output = dropout(layer_output, hidden_dropout_prob)
- layer_output = layer_norm(layer_output + attention_output)
- prev_output = layer_output
- all_layer_outputs.append(layer_output)
- if do_return_all_layers:
- final_outputs = []
- for layer_output in all_layer_outputs:
- final_output = reshape_from_matrix(layer_output, input_shape)
- final_outputs.append(final_output)
- return final_outputs
- else:
- final_output = reshape_from_matrix(prev_output, input_shape)
- return final_output
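Putting the encoder together with toy sizes (a hedged sketch of my own; the input width must equal hidden_size, and hidden_size must be divisible by num_attention_heads):
- import tensorflow as tf
- import modeling
-
- seq = tf.random_normal([2, 3, 128])                       # [batch, seq_length, hidden]
- mask = modeling.create_attention_mask_from_input_mask(
-     tf.zeros([2, 3], dtype=tf.int32), tf.constant([[1, 1, 1], [1, 1, 0]]))
- all_layers = modeling.transformer_model(
-     input_tensor=seq,
-     attention_mask=mask,
-     hidden_size=128,
-     num_hidden_layers=2,
-     num_attention_heads=4,
-     intermediate_size=512,
-     do_return_all_layers=True)
- # all_layers is a list of 2 tensors, each [2, 3, 128]; all_layers[-1] is what
- # BertModel exposes as sequence_output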
The second class, BertModel, is the public interface of the whole modeling module: by calling the class and functions described above, it produces the final outputs.
Reference: https://mp.weixin.qq.com/s/rxJ0jAFKsP6ByWeVv6Tr5Q
Example:
- # Assume the input has already been tokenized into word_ids. shape=[2, 3]
- input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
- input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
- # segment embedding: in the first sample the first two tokens belong to sentence 1
- # and the last token to sentence 2; in the second sample the first token belongs to
- # sentence 1, the second to sentence 2, and the trailing 0 marks padding
- token_type_ids = tf.constant([[0, 0, 1], [0, 1, 0]])
-
- # Create a BertConfig instance (hidden_size must be divisible by num_attention_heads,
- # so 8 heads of size 64 are used here)
- config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
-     num_hidden_layers=8, num_attention_heads=8, intermediate_size=1024)
-
- # Create a BertModel instance
- model = modeling.BertModel(config=config, is_training=True,
-     input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
-
-
- label_embeddings = tf.get_variable(...)
- # The final-layer representation of the first token ([CLS]), which can be viewed as an embedding of the whole sentence
- pooled_output = model.get_pooled_output()
- logits = tf.matmul(pooled_output, label_embeddings)