赞
踩
今天要介绍的是BERT最主要的模型实现部分-----BertModel,代码位于
modeling.py模块
如有解读不正确,请务必指出~
这部分代码主要定义了BERT模型的一些默认参数,另外包括了一些文件处理函数。
class BertConfig(object): """BERT模型的配置类.""" def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02): self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range @classmethod def from_dict(cls, json_object): """Constructs a `BertConfig` from a Python dictionary of parameters.""" config = BertConfig(vocab_size=None) for (key, value) in six.iteritems(json_object): config.__dict__[key] = value return config @classmethod def from_json_file(cls, json_file): """Constructs a `BertConfig` from a json file of parameters.""" with tf.gfile.GFile(json_file, "r") as reader: text = reader.read() return cls.from_dict(json.loads(text)) def to_dict(self): """Serializes this instance to a Python dictionary.""" output = copy.deepcopy(self.__dict__) return output def to_json_string(self): """Serializes this instance to a JSON string.""" return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
参数具体含义:
vocab_size: 单词表大小
hidden_size: 隐藏层神经元节点数 H(指Feed Forward输出向量的维数)
num_hidden_layers: Transformer encoder中的隐藏层数 L
num_attention_heads: multi-head attention 的head数 A
intermediate_size: encoder的“中间”隐层神经元数(例如feed-forward layer)
hidden_act: 隐藏层激活函数
hidden_dropout_prob: 隐层dropout率
attention_probs_dropout_prob: 注意力部分的dropout
max_position_embeddings: 最大位置编码
type_vocab_size: token_type_ids的词典大小,这个名字很模糊
initializer_range: truncated_normal_initializer初始化方法的stdev
这里要注意一点,可能刚看的时候对 type_vocab_size这个参数会有点不理解,其实就是在next sentence prediction任务里的Segment A和 Segment B,用在multi-sentence任务中。在下载的bert_config.json文件里也有说明,默认值应该为2。
论文定义了两个版本,一个是base版本,一个是large版本。Large版本(L=24, H=1024, A=16, Total Parameters=340M)。base版本( L=12, H=768, A=12, Total Parameters=110M)。L代表网络层数,H代表隐藏层神经元节点数,A代表self attention head的数量。
BertModel模型详解:
class BertModel(object): """BERT model ("Bidirectional Encoder Representations from Transformers"). Example usage: ```python # Already been converted into WordPiece token ids input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) config = modeling.BertConfig(vocab_size=32000, hidden_size=512, num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) model = modeling.BertModel(config=config, is_training=True, input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) label_embeddings = tf.get_variable(...) pooled_output = model.get_pooled_output() logits = tf.matmul(pooled_output, label_embeddings) ... """ def __init__(self, config, is_training, input_ids, input_mask=None, token_type_ids=None, use_one_hot_embeddings=False, scope=None): """Constructor for BertModel. Args: config: `BertConfig` instance. is_training: bool. true for training model, false for eval model. Controls whether dropout will be applied. input_ids: int32 Tensor of shape [batch_size, seq_length]. input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. use_one_hot_embeddings: (optional) bool. Whether to use one-hot word embeddings or tf.embedding_lookup() for the word embeddings. scope: (optional) variable scope. Defaults to "bert". Raises: ValueError: The config is invalid or one of the input tensor shapes is invalid. """ config = copy.deepcopy(config) if not is_training: config.hidden_dropout_prob = 0.0 config.attention_probs_dropout_prob = 0.0 input_shape = get_shape_list(input_ids, expected_rank=2) batch_size = input_shape[0] seq_length = input_shape[1] if input_mask is None:#输入没有被masked input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) if token_type_ids is None:#只有一个句子,不用区分token属于哪个句子 token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) with tf.variable_scope(scope, default_name="bert"): with tf.variable_scope("embeddings"): # Perform embedding lookup on the word ids. (self.embedding_output, self.embedding_table) = embedding_lookup( input_ids=input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) # 添加 positional embeddings and token type embeddings, then layer # normalize and perform dropout. self.embedding_output = embedding_postprocessor( input_tensor=self.embedding_output, use_token_type=True, token_type_ids=token_type_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob) with tf.variable_scope("encoder"): # This converts a 2D mask of shape [batch_size, seq_length] to a 3D # mask of shape [batch_size, seq_length, seq_length] which is used # for the attention scores. attention_mask = create_attention_mask_from_input_mask( input_ids, input_mask) # Run the stacked transformer. # `sequence_outpu
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。