当前位置:   article > 正文

NER问题的模型训练5种相关提升tricks_cmedqaner

cmedqaner

NER问题的模型训练5种相关提升tricks

一.cMedQANER数据集介绍

用于医疗领域的命名实体识别任务

实体标签:{'physiology', 'test', 'disease', 'time', 'drug', 'symptom', 'body', 'department', 'crowd', 'feature', 'treatment'}

训练数据采用BIO的形式进行标注

  1. 便 B_disease
  2. 秘 I_disease
  3. 两 O
  4. 个 O
  5. 多 O
  6. 月 O
  7. 不 O
  8. 清 O
  9. 楚 O

二.模型选择

采用bert+bilstm+crf

2.1数据清洗

  1. def load_data(data_path,max_len):
  2. """加载数据
  3. 单条格式:[(片段1, 标签1), (片段2, 标签2), (片段3, 标签3), ...]
  4. """
  5. datasets = []
  6. samples_len = []
  7. X = []
  8. y = []
  9. sentence = []
  10. labels = []
  11. split_pattern = re.compile(r'[;;。,、?!\.\?,! ]')
  12. with open(data_path,'r',encoding = 'utf8') as f:
  13. for line in f.readlines():
  14. #每行为一个字符和其tag,中间用tab或者空格隔开
  15. # sentence = [w1,w2,w3,...,wn], labels=[B-xx,I-xxx,,,...,O]
  16. line = line.strip().split()
  17. if(not line or len(line) < 2):
  18. X.append(sentence)
  19. y.append(labels)
  20. sentence = []
  21. labels = []
  22. continue
  23. word, tag = line[0], line[1].replace('_','-').replace('M','I').replace('E','I').replace('S','B') # BMES -> BIO
  24. if split_pattern.match(word) and len(sentence)+8 >= max_len:
  25. sentence.append(word)
  26. labels.append(tag)
  27. X.append(sentence)
  28. y.append(labels)
  29. sentence = []
  30. labels = []
  31. else:
  32. sentence.append(word)
  33. labels.append(tag)
  34. if len(sentence):
  35. X.append(sentence)
  36. sentence = []
  37. y.append(labels)
  38. labels = []
  39. for token_seq,label_seq in zip(X,y):
  40. #目标sample_seq=[['XXXX','disease'],['asaa','drug'],[],...]
  41. if len(token_seq) < 2:
  42. continue
  43. sample_seq, last_flag = [], ''
  44. for token, this_flag in zip(token_seq,label_seq):
  45. if this_flag == 'O' and last_flag == 'O':
  46. sample_seq[-1][0] += token
  47. elif this_flag == 'O' and last_flag != 'O':
  48. sample_seq.append([token, 'O'])
  49. elif this_flag[:1] == 'B':
  50. sample_seq.append([token, this_flag[2:]]) # B-city
  51. else:
  52. if sample_seq:
  53. sample_seq[-1][0] += token
  54. last_flag = this_flag
  55. datasets.append(sample_seq)
  56. samples_len.append(len(token_seq))
  57. if len(token_seq) > 200:
  58. print(token_seq)
  59. df = pd.DataFrame(samples_len)
  60. print(data_path,'\n',df.describe())
  61. print(sorted(set([i for arr in y for i in arr])))
  62. #datasets训练数据
  63. return datasets,y

2.2模型的搭建

  1. def bert_bilstm_crf(config_path,checkpoint_path,num_labels,lstm_units,drop_rate,leraning_rate):
  2. bert = build_transformer_model(
  3. config_path = config_path,
  4. checkpoint_path = checkpoint_path,
  5. model = 'bert',
  6. return_keras_model = False
  7. )
  8. x = bert.model.output # [batch_size, seq_length, 768]
  9. lstm = keras.layers.Bidirectional(
  10. keras.layers.LSTM(
  11. lstm_units,
  12. kernel_initializer='he_normal',
  13. return_sequences=True
  14. )
  15. )(x) # [batch_size, seq_length, lstm_units * 2]
  16. x = keras.layers.concatenate(
  17. [lstm,x],
  18. axis=-1
  19. ) # [batch_size, seq_length, lstm_units * 2 + 768]
  20. x = keras.layers.TimeDistributed(
  21. keras.layers.Dropout(drop_rate)
  22. )(x) # [batch_size, seq_length, lstm_units * 2 + 768]
  23. x = keras.layers.TimeDistributed(
  24. keras.layers.Dense(
  25. num_labels,
  26. activation='relu',
  27. kernel_initializer='he_normal',
  28. )
  29. )(x) # [batch_size, seq_length, num_labels]
  30. crf = ConditionalRandomField()
  31. output = crf(x)
  32. model = keras.models.Model(bert.input, output)
  33. model.summary()
  34. model.compile(
  35. loss=crf.sparse_loss,
  36. optimizer=Adam(leraning_rate),
  37. metrics=[crf.sparse_accuracy]
  38. )
  39. return model,crf

2.3保存最佳模型的回调函数设计

  1. checkpoint = keras.callbacks.ModelCheckpoint(
  2. checkpoint_save_path,
  3. monitor='val_sparse_accuracy',
  4. verbose=1,
  5. save_best_only=True,
  6. mode='max'
  7. )
  8. model.fit(
  9. train_generator.forfit(),
  10. steps_per_epoch=len(train_generator),
  11. validation_data=valid_generator.forfit(),
  12. validation_steps=len(valid_generator),
  13. epochs=epochs,
  14. callbacks=[checkpoint]
  15. )

注意这里模型保存最佳模型的时候是根据验证数据中对标签(B-disease,I-drug)这种准确度进行的,而我们的评估指标f1不是基于标签的,是基于实体的,这也导致了训练监控指标和评估指标不一致的出现,模型还可以进一步优化,在之后的tricks里面会说做法。

2.4 CRF解码器

把模型预测出的标签解码成实体

  1. class NamedEntityRecognizer(ViterbiDecoder):
  2. """命名实体识别器
  3. """
  4. def recognize(self, text):
  5. #text = ['','','','']
  6. tokens = tokenizer.tokenize(text)
  7. while len(tokens) > max_len: #移除倒数第二个位置
  8. tokens.pop(-2)
  9. """
  10. rematch:给出原始的text和tokenize后的tokens的映射关系
  11. """
  12. mapping = tokenizer.rematch(text, tokens)
  13. token_ids = tokenizer.tokens_to_ids(tokens)
  14. segment_ids = [0] * len(token_ids)
  15. token_ids, segment_ids = to_array([token_ids], [segment_ids]) # ndarray
  16. nodes = model.predict([token_ids, segment_ids])[0] # [seq_len,43]
  17. labels = self.decode(nodes) # id [sqe_len,], [0 0 0 0 0 7 8 8 0 0 0 0 0 0 0]
  18. entities, starting = [], False
  19. """
  20. test_data[1:2][0]:
  21. [ ['浙江省', 'prov'],
  22. ['杭州市', 'city'],
  23. ['余杭', 'district'],
  24. ['乔司街道', 'town'],
  25. ['博卡路', 'road'],
  26. ['0号', 'roadno'],
  27. ['博卡制衣', 'poi']]
  28. text:'浙江省杭州市余杭乔司街道博卡路0号博卡制衣'
  29. tokens:['[CLS]','浙','江','省','杭','州','市','余','杭','乔','司','街','道','博','卡','路','0','号','博','卡','制','衣','[SEP]']
  30. nodes:(23, 43)
  31. labels:(23,)[ 0 27 28 28 5 6 6 15 16 39 40 40 40 31 32 32 33 34 25 26 26 26 0]
  32. entities:
  33. [[[1, 2, 3], 'prov'],
  34. [[4, 5, 6], 'city'],
  35. [[7, 8], 'district'],
  36. [[9, 10, 11, 12], 'town'],
  37. [[13, 14, 15], 'road'],
  38. [[16, 17], 'roadno'],
  39. [[18, 19, 20, 21], 'poi']]
  40. mapping:[[],[0],[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],[]]
  41. """
  42. for i, label in enumerate(labels):
  43. if label > 0:
  44. #奇数是B- 开始id
  45. if label % 2 == 1:
  46. starting = True
  47. entities.append([[i], id2label[(label - 1) // 2]])
  48. elif starting:
  49. entities[-1][0].append(i)
  50. else:
  51. starting = False
  52. else:
  53. starting = False
  54. """
  55. return:[ ('浙江省', 'prov'),
  56. ('杭州市', 'city'),
  57. ('余杭', 'district'),
  58. ('乔司街道', 'town'),
  59. ('博卡路', 'road'),
  60. ('0号', 'roadno'),
  61. ('博卡制衣', 'poi')]
  62. """
  63. return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l) for w, l in entities]

上面我以天池的某街道识别数据针对解码这一过程进行了详细的注释,同样适用于我们现在使用的医疗数据集的解码过程,可以看到最后返回的不再是一个个标签,而是由标签组成的实体(‘浙江省’, ‘prov’)

三.模型优化的5种tricks

3.1统一训练监控指标和评估指标

这里希望在保存最优模型时也采用实体级的 F1 作为监控指标，与评估时保持一致

  1. #假设data就是验证集
  2. def ner_metrics(data):
  3. X,Y,Z = 1e-6,1e-6,1e-6
  4. for d in tqdm(data):
  5. text = ''.join([i[0] for i in d])
  6. pred = NER.recognize(text) #注:这里的NER就是上面的CRF解码器实例化的一个名字
  7. R = set(pred)
  8. T = set([tuple(i) for i in d if i[1] != 'O'])
  9. X += len(R & T)
  10. Y += len(R)
  11. Z += len(T)
  12. f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
  13. return f1, precision, recall

#评估回调函数

  1. class Evaluator(keras.callbacks.Callback):
  2. def __init__(self):
  3. super(Evaluator,self).__init__()
  4. self.best_val_f1 = 0
  5. def on_epoch_end(self,epoch,logs=None):
  6. #每一轮都需要更新概率矩阵
  7. NER.trans = K.eval(CRF.trans)
  8. f1, precision, recall = ner_metrics(valid_data)
  9. if f1 > self.best_val_f1:
  10. model.save_weights(checkpoint_save_path)
  11. self.best_val_f1 = f1
  12. print('save model to {}'.format(checkpoint_save_path))
  13. else:
  14. global leraning_rate
  15. leraning_rate = leraning_rate / 5
  16. print(
  17. 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
  18. (f1, precision, recall, self.best_val_f1)
  19. )
  1. checkpoint = keras.callbacks.ModelCheckpoint(
  2. checkpoint_save_path,
  3. monitor='val_sparse_accuracy',
  4. verbose=1,
  5. save_best_only=True,
  6. mode='max'
  7. )
  8. evaluator = Evaluator() #实例化回调函数(改变指标为实体f1
  9. model.fit(
  10. train_generator.forfit(),
  11. steps_per_epoch=len(train_generator),
  12. validation_data=valid_generator.forfit(),
  13. validation_steps=len(valid_generator),
  14. epochs=epochs,
  15. callbacks=[evaluator] #回调函数使用evaluator
  16. )

3.2学习率衰减策略
参考博文
Keras中那些学习率衰减策略_西檬饭的博客-CSDN博客_keras 学习率衰减
利用学习率衰减找到最优结果_mb5ffd6eef9281a的技术博客_51CTO博客

  1. def scheduler(epoch):
  2. return leraning_rate /(max(2*(epoch-1),1))
  3. lr_scheduler = keras.callbacks.LearningRateScheduler(scheduler)
  4. model.fit(
  5. train_generator.forfit(),
  6. steps_per_epoch=len(train_generator),
  7. validation_data=valid_generator.forfit(),
  8. validation_steps=len(valid_generator),
  9. epochs=epochs,
  10. callbacks=[evaluator,lr_scheduler]
  11. )

3.3分层学习率技巧

因为这里使用bert模型进行微调,则后面的bilstm和dense层的权重都使用的是随机初始化的方式,那在有限的epoch里面,可能效果并不是很好,另一方面设置的学习率是1e-5,学习率也很小所以这里采用:分层设置学习率,非bert层要大

  1. class SetLearningRate:
  2. """
  3. 层的一个包装,用来设置当前层的学习率
  4. """
  5. def __init__(self, layer, lamb, is_ada=False):
  6. self.layer = layer
  7. self.lamb = lamb # 学习率比例
  8. self.is_ada = is_ada # 是否自适应学习率优化器
  9. def __call__(self, inputs):
  10. with K.name_scope(self.layer.name):
  11. if not self.layer.built:
  12. input_shape = K.int_shape(inputs)
  13. self.layer.build(input_shape)
  14. self.layer.built = True
  15. if self.layer._initial_weights is not None:
  16. self.layer.set_weights(self.layer._initial_weights)
  17. for key in ['kernel', 'bias', 'embeddings', 'depthwise_kernel', 'pointwise_kernel', 'recurrent_kernel', 'gamma', 'beta']:
  18. if hasattr(self.layer, key):
  19. weight = getattr(self.layer, key)
  20. if self.is_ada:
  21. lamb = self.lamb # 自适应学习率优化器直接保持lamb比例
  22. else:
  23. lamb = self.lamb**0.5 # SGD(包括动量加速),lamb要开平方
  24. K.set_value(weight, K.eval(weight) / lamb) # 更改初始化
  25. setattr(self.layer, key, weight * lamb) # 按比例替换
  26. return self.layer(inputs)
  1. #模型加入分层学习率设置
  2. def bert_bilstm_crf(config_path,checkpoint_path,num_labels,lstm_units,drop_rate,leraning_rate):
  3. bert = build_transformer_model(
  4. config_path = config_path,
  5. checkpoint_path = checkpoint_path,
  6. model = 'bert',
  7. return_keras_model = False
  8. )
  9. x = bert.model.output # [batch_size, seq_length, 768]
  10. lstm = SetLearningRate(
  11. keras.layers.Bidirectional(
  12. keras.layers.LSTM(
  13. lstm_units,
  14. kernel_initializer='he_normal',
  15. return_sequences=True
  16. )
  17. ),
  18. 500, #是bert学习率的500
  19. True
  20. )(x) # [batch_size, seq_length, lstm_units * 2]
  21. x = keras.layers.concatenate(
  22. [lstm,x],
  23. axis=-1
  24. ) # [batch_size, seq_length, lstm_units * 2 + 768]
  25. x = keras.layers.TimeDistributed(
  26. keras.layers.Dropout(drop_rate)
  27. )(x) # [batch_size, seq_length, lstm_units * 2 + 768]
  28. x = SetLearningRate(
  29. keras.layers.TimeDistributed(
  30. keras.layers.Dense(
  31. num_labels,
  32. activation='relu',
  33. kernel_initializer='he_normal',
  34. )
  35. ),
  36. 500,
  37. True
  38. )(x) # [batch_size, seq_length, num_labels]
  39. crf = ConditionalRandomField(lr_multiplier=500)
  40. output = crf(x)
  41. model = keras.models.Model(bert.input, output)
  42. model.summary()
  43. model.compile(
  44. loss=crf.sparse_loss,
  45. optimizer=Adam(leraning_rate),
  46. metrics=[crf.sparse_accuracy]
  47. )
  48. return model,crf

3.4使用对抗训练提升模型鲁棒性

  1. def adversarial_training(model, embedding_name, epsilon=1):
  2. """
  3. 给模型添加对抗训练
  4. 其中model是需要添加对抗训练的keras模型
  5. 这里针对的是bert的embdedding层
  6. """
  7. if model.train_function is None: # 如果还没有训练函数
  8. model._make_train_function() # 手动make
  9. old_train_function = model.train_function # 备份旧的训练函数
  10. # 查找Embedding层
  11. for output in model.outputs:
  12. embedding_layer = search_layer(output, embedding_name)
  13. if embedding_layer is not None:
  14. break
  15. if embedding_layer is None:
  16. raise Exception('Embedding layer not found')
  17. # 求Embedding梯度
  18. embeddings = embedding_layer.embeddings # Embedding矩阵
  19. gradients = K.gradients(model.total_loss, [embeddings]) # Embedding梯度
  20. gradients = K.zeros_like(embeddings) + gradients[0] # 转为dense tensor
  21. # 封装为函数
  22. inputs = (
  23. model._feed_inputs + model._feed_targets + model._feed_sample_weights
  24. ) # 所有输入层
  25. embedding_gradients = K.function(
  26. inputs=inputs,
  27. outputs=[gradients],
  28. name='embedding_gradients',
  29. ) # 封装为函数
  30. def train_function(inputs):
  31. # 重新定义训练函数
  32. grads = embedding_gradients(inputs)[0] # Embedding梯度
  33. delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8) # 计算扰动
  34. K.set_value(embeddings, K.eval(embeddings) + delta) # 注入扰动
  35. outputs = old_train_function(inputs) # 梯度下降
  36. K.set_value(embeddings, K.eval(embeddings) - delta) # 删除扰动
  37. return outputs
  38. model.train_function = train_function # 覆盖原训练函数

3.5更精细化调参

例如调整学习率or batch_size or lamb等等

四.各种tricks加持之下的模型提升效果

之后补上,最近服务器上不去

声明:本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号