当前位置:   article > 正文

【Bert4keras】解决Key bert/embeddings/word_embeddings not found in checkpoint

key bert/embeddings/word_embeddings not found in checkpoint

1 问题

使用苏神的bert4keras,预训练后产生了多个文件,但是在训练加载预训练模型的时候出错Key bert/embeddings/word_embeddings not found in checkpoint
(1)pretrain.py文件

# 预训练脚本
import os
os.environ['TF_KERAS'] = '1'  # 必须使用tf.keras

import tensorflow as tf
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.optimizers import extend_with_gradient_accumulation
from bert4keras.optimizers import extend_with_layer_adaptation
from bert4keras.optimizers import extend_with_piecewise_linear_lr
from bert4keras.optimizers import extend_with_weight_decay
from keras.layers import Input, Lambda
from keras.models import Model

from data_utils import TrainingDatasetRoBERTa

# 语料路径和模型保存路径
model_saved_path = 'pre_models/bert_model.ckpt'
corpus_paths = [f'corpus_tfrecord/corpus.{i}.tfrecord' for i in range(10)]

# 其他配置
sequence_length = 512
batch_size = 64
config_path = 'pre_models/bert_config.json'
checkpoint_path = None  # 如果从零训练,就设为None
learning_rate = 0.00176
weight_decay_rate = 0.01
num_warmup_steps = 3125
num_train_steps = 5000
steps_per_epoch = 100
grad_accum_steps = 16   # 大于1即表明使用梯度累积
epochs = num_train_steps * grad_accum_steps // steps_per_epoch
exclude_from_weight_decay = ['Norm', 'bias']
tpu_address = None      # 如果用多GPU跑,直接设为None
which_optimizer = 'lamb'    # adam 或 lamb,均自带weight decay
lr_schedule = {
    num_warmup_steps * grad_accum_steps: 1.0,
    num_train_steps * grad_accum_steps: 0.0,
}
floatx = K.floatx()

# 读取数据集,构建数据张量
dataset = TrainingDatasetRoBERTa.load_tfrecord(
    record_names=corpus_paths,
    sequence_length=sequence_length,
    batch_size=batch_size // grad_accum_steps,
)


def build_transformer_model_with_mlm():
    """带mlm的bert模型。"""
    bert = build_transformer_model(
        config_path, with_mlm='linear', return_keras_model=False
    )
    proba = bert.model.output

    # 辅助输入
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')   # 目标id
    is_masked = Input(shape=(None,), dtype=floatx, name='is_masked')    # mask标记

    def mlm_loss(inputs):
        """计算loss的函数,需要封装为一个层。"""
        y_true, y_pred, mask = inputs
        loss = K.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True
        )
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        y_true = K.cast(y_true, floatx)
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])

    train_model = Model(
        bert.model.inputs + [token_ids, is_masked], [mlm_loss, mlm_acc]
    )

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss


def build_transformer_model_for_pretraining():
    """构建训练模型,通用于TPU/GPU
    注意全程要用keras标准的层写法,一些比较灵活的“移花接木”式的
    写法可能会在TPU上训练失败。此外,要注意的是TPU并非支持所有
    tensorflow算子,尤其不支持动态(变长)算子,因此编写相应运算
    时要格外留意。
    """
    bert, train_model, loss = build_transformer_model_with_mlm()

    # 优化器
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # 模型定型
    train_model.compile(loss=loss, optimizer=optimizer)

    # 如果传入权重,则加载。注:须在此处加载,才保证不报错。
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model,bert


if tpu_address is None:
    # 单机多卡模式(多机多卡也类似,但需要硬软件配合,请参考https://tf.wiki)
    strategy = tf.distribute.MirroredStrategy()
else:
    # TPU模式
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=tpu_address
    )
    tf.config.experimental_connect_to_host(resolver.master())
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

with strategy.scope():
    train_model,bert = build_transformer_model_for_pretraining()
    train_model.summary()


class ModelCheckpoint(keras.callbacks.Callback):
    """自动保存最新模型。"""
    def on_epoch_end(self, epoch, logs=None):
        self.model.save_weights(model_saved_path, overwrite=True)


checkpoint = ModelCheckpoint()  # 保存模型
csv_logger = keras.callbacks.CSVLogger('training.log')  # 记录日志

# 模型训练
train_model.fit(
    dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    callbacks=[checkpoint, csv_logger],
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163

(2)train.py文件
在train.py中加载模型时,报错Key bert/embeddings/word_embeddings not found in checkpoint

# 加载预训练模型
bert = build_transformer_model(
    config_path=BERT_CONFIG_PATH,
    checkpoint_path=model_saved_path,
    return_keras_model=False,
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6

2 解决办法

根据苏神的博客介绍到,需要把预训练的模型加载后,重新生成权重模型,再使用该权重模型,就不会出错。所以在pretrain.py文件中最后加两行代码,并注释掉train_model.fit。重新执行一遍该文件。再执行train.py就不会出现问题

# 以上代码一样,此处省略
# 、、、、
# 模型训练
# train_model.fit(
#     dataset,
#     steps_per_epoch=steps_per_epoch,
#     epochs=epochs,
#     callbacks=[checkpoint, csv_logger],
# )
train_model.load_weights(model_saved_path)
bert.save_weights_as_checkpoint(filename='bert_model/bert_model.ckpt')
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/羊村懒王/article/detail/499784
推荐阅读
相关标签
  

闽ICP备14008679号