
[NLP: Named Entity Recognition] Implementing a BERT/ALBERT + CRF Model


Module imports

2021/3/8, Monday: Hit a bug in the module-import section below; reinstalled Anaconda and TensorFlow, which fixed it.

    import numpy as np
    from bert4keras.backend import keras, K
    from bert4keras.models import build_transformer_model
    from bert4keras.tokenizers import Tokenizer
    from bert4keras.optimizers import Adam
    from bert4keras.snippets import sequence_padding, DataGenerator
    from bert4keras.snippets import open, ViterbiDecoder
    from bert4keras.layers import ConditionalRandomField
    from keras.layers import Dense
    from keras.models import Model
    from tqdm import tqdm
    from tensorflow import ConfigProto
    from tensorflow import InteractiveSession
    # Under TF 2.x, the two imports above come from tensorflow.compat.v1 instead

Causes and fixes: (1) The Python version bundled with Anaconda was incompatible with the installed TensorFlow; I had not noticed that TensorFlow 1.x is a poor fit for Python versions above 3.7. Reinstalling an Anaconda build that ships Python 3.6, then reinstalling a matching TensorFlow, solved it. (2) I tried installing tensorflow-gpu several times and failed every time; it turned out that my machine's graphics card simply does not support the GPU build.
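Alternatively, if reinstalling is not an option, a small import shim can bridge the TF 1.x / 2.x difference. A minimal sketch, assuming only these two symbols are needed:

    try:
        from tensorflow import ConfigProto, InteractiveSession  # TF 1.x
    except ImportError:
        from tensorflow.compat.v1 import ConfigProto, InteractiveSession  # TF 2.x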

Takeaway: when bugs pile up, a clean uninstall and reinstall can actually be the faster path.
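For reference, recreating such an environment might look like the commands below (the TensorFlow version is an assumption; pick the 1.x build that matches your machine, CPU-only if, as in my case, the GPU is unsupported):

    conda create -n ner python=3.6
    conda activate ner
    pip install tensorflow==1.15 bert4keras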

2021/3/9, Tuesday: Group meeting; took the criticism on board and reflected on my work.

Saving, loading, and using a trained BERT/ALBERT-CRF model

2021/3/10: Put the trained BERT/ALBERT-CRF model to use and, on top of it, added a BiLSTM layer to obtain a modified ALBERT-BiLSTM-CRF model (see the next article); training of that model is underway. A sketch of where the layer slots in follows.
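As a preview, here is a minimal, hypothetical sketch of where such a layer could slot in, reusing the names from the full listing further below (lstm_units is an assumed hyperparameter; the real implementation is in the next article):

    from keras.layers import Bidirectional, LSTM

    lstm_units = 128  # assumed value
    output = model.get_layer(output_layer).get_output_at(bert_layers - 1)
    output = Bidirectional(LSTM(lstm_units, return_sequences=True))(output)  # the extra BiLSTM
    output = Dense(num_labels)(output)
    output = CRF(output)
    model = Model(model.input, output)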

    # The training entry point, commented out once best_model.weights exists;
    # only the loading and inference part below still runs.
    '''
    if __name__ == '__main__':
        evaluator = Evaluate()
        train_generator = data_generator(train_data, batch_size)
        model.fit_generator(
            train_generator.forfit(),
            steps_per_epoch=len(train_generator),
            epochs=epochs,
            callbacks=[evaluator]
        )
    else:
        model.load_weights('best_model.weights')
    '''
    model.load_weights('best_model.weights')
    NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])
    ner = NER.recognize("我在厦门")  # "I am in Xiamen"
    print(ner)

Workflow: train the model first to obtain the trained weights file (at this point you can also save the trained model itself, as sketched below). Then load that weights file and change only the final calling code.
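A minimal sketch of saving the full model in addition to the weights (the .h5 filename is my own; since the model contains the custom ConditionalRandomField layer, reloading needs custom_objects, and compile=False sidesteps the custom CRF loss):

    model.save('best_model.h5')  # hypothetical filename

    # later, to restore without rebuilding the graph:
    # from keras.models import load_model
    # model = load_model('best_model.h5', compile=False,
    #                    custom_objects={'ConditionalRandomField': ConditionalRandomField})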

Note: watch where classes are instantiated. NamedEntityRecognizer snapshots the CRF transition matrix via K.eval(CRF.trans) at construction time, so the instance must be created after the weights are loaded; otherwise it decodes with untrained transitions.

Complete BERT/ALBERT-CRF model code

Training and evaluating the model only:

    import numpy as np
    from bert4keras.backend import keras, K
    from bert4keras.models import build_transformer_model
    from bert4keras.tokenizers import Tokenizer
    from bert4keras.optimizers import Adam
    from bert4keras.snippets import sequence_padding, DataGenerator
    from bert4keras.snippets import open, ViterbiDecoder
    from bert4keras.layers import ConditionalRandomField
    from keras.layers import Dense
    from keras.models import Model
    from tqdm import tqdm
    from tensorflow import ConfigProto
    from tensorflow import InteractiveSession
    # Under TF 2.x, the two imports above come from tensorflow.compat.v1 instead

    config = ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.2
    config.gpu_options.allow_growth = True
    session = InteractiveSession(config=config)

    maxlen = 256
    epochs = 1  # 10
    batch_size = 16
    bert_layers = 12
    learning_rate = 1e-5  # the smaller bert_layers is, the larger this should be
    crf_lr_multiplier = 10  # enlarge the CRF layer's learning rate when necessary (e.g. 1000)

    # # bert config
    # config_path = './bert_model/chinese_L-12_H-768_A-12/bert_config.json'
    # checkpoint_path = './bert_model/chinese_L-12_H-768_A-12/bert_model.ckpt'
    # dict_path = './bert_model/chinese_L-12_H-768_A-12/vocab.txt'
    # albert config
    config_path = './bert_model/albert_large/albert_config.json'
    checkpoint_path = './bert_model/albert_large/model.ckpt-best'
    dict_path = './bert_model/albert_large/vocab_chinese.txt'


    def load_data(filename):
        """Load BIO-tagged data: one 'char tag' pair per line,
        sentences separated by blank lines; adjacent chars with the
        same entity tag are merged into one span."""
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
        for l in f.split('\n\n'):
            if not l:
                continue
            d, last_flag = [], ''
            for c in l.split('\n'):
                char, this_flag = c.split(' ')
                if this_flag == 'O' and last_flag == 'O':
                    d[-1][0] += char
                elif this_flag == 'O' and last_flag != 'O':
                    d.append([char, 'O'])
                elif this_flag[:1] == 'B':
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
        return D


    # labelled data
    train_data = load_data('./data/example.train')
    valid_data = load_data('./data/example.dev')
    test_data = load_data('./data/example.test')

    # build the tokenizer
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    # label mapping: O plus a B/I pair per class
    labels = ['PER', 'LOC', 'ORG']
    id2label = dict(enumerate(labels))
    label2id = {j: i for i, j in id2label.items()}
    num_labels = len(labels) * 2 + 1


    class data_generator(DataGenerator):
        """Data generator."""
        def __iter__(self, random=False):
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
            for is_end, item in self.sample(random):
                token_ids, labels = [tokenizer._token_start_id], [0]
                for w, l in item:
                    w_token_ids = tokenizer.encode(w)[0][1:-1]
                    if len(token_ids) + len(w_token_ids) < maxlen:
                        token_ids += w_token_ids
                        if l == 'O':
                            labels += [0] * len(w_token_ids)
                        else:
                            B = label2id[l] * 2 + 1
                            I = label2id[l] * 2 + 2
                            labels += ([B] + [I] * (len(w_token_ids) - 1))
                    else:
                        break
                token_ids += [tokenizer._token_end_id]
                labels += [0]
                segment_ids = [0] * len(token_ids)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append(labels)
                if len(batch_token_ids) == self.batch_size or is_end:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_labels = sequence_padding(batch_labels)
                    yield [batch_token_ids, batch_segment_ids], batch_labels
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []


    # The active branch below builds an ALBERT model; to use BERT instead,
    # swap in the commented-out lines.
    model = build_transformer_model(
        config_path,
        checkpoint_path,
        model='albert',
    )
    output_layer = 'Transformer-FeedForward-Norm'
    output = model.get_layer(output_layer).get_output_at(bert_layers - 1)

    # model = build_transformer_model(
    #     config_path,
    #     checkpoint_path,
    # )
    # output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
    # output = model.get_layer(output_layer).output

    output = Dense(num_labels)(output)
    CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
    output = CRF(output)

    model = Model(model.input, output)
    model.summary()

    model.compile(
        loss=CRF.sparse_loss,
        optimizer=Adam(learning_rate),
        metrics=[CRF.sparse_accuracy]
    )


    class NamedEntityRecognizer(ViterbiDecoder):
        """Named entity recognizer."""
        def recognize(self, text):
            tokens = tokenizer.tokenize(text)
            while len(tokens) > 512:
                tokens.pop(-2)
            mapping = tokenizer.rematch(text, tokens)
            token_ids = tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            nodes = model.predict([[token_ids], [segment_ids]])[0]
            labels = self.decode(nodes)
            entities, starting = [], False
            for i, label in enumerate(labels):
                if label > 0:
                    if label % 2 == 1:
                        starting = True
                        entities.append([[i], id2label[(label - 1) // 2]])
                    elif starting:
                        entities[-1][0].append(i)
                    else:
                        starting = False
                else:
                    starting = False
            return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                    for w, l in entities]


    NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


    def evaluate(data):
        """Evaluation function."""
        X, Y, Z = 1e-10, 1e-10, 1e-10
        for d in tqdm(data):
            text = ''.join([i[0] for i in d])
            R = set(NER.recognize(text))
            T = set([tuple(i) for i in d if i[1] != 'O'])
            X += len(R & T)
            Y += len(R)
            Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        return f1, precision, recall


    class Evaluate(keras.callbacks.Callback):
        def __init__(self):
            self.best_val_f1 = 0

        def on_epoch_end(self, epoch, logs=None):
            trans = K.eval(CRF.trans)
            NER.trans = trans
            print(NER.trans)
            f1, precision, recall = evaluate(valid_data)
            # save the best model
            if f1 >= self.best_val_f1:
                self.best_val_f1 = f1
                model.save_weights('best_model.weights')
            print(
                'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
                (f1, precision, recall, self.best_val_f1)
            )
            f1, precision, recall = evaluate(test_data)
            print(
                'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
                (f1, precision, recall)
            )


    if __name__ == '__main__':
        evaluator = Evaluate()
        train_generator = data_generator(train_data, batch_size)
        model.fit_generator(
            train_generator.forfit(),
            steps_per_epoch=len(train_generator),
            epochs=epochs,
            callbacks=[evaluator]
        )
    else:
        model.load_weights('best_model.weights')
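For context, load_data expects the classic one-character-per-line BIO format, with a space between character and tag and a blank line between sentences. A hypothetical excerpt of ./data/example.train:

    我 O
    在 O
    厦 B-LOC
    门 I-LOC

With labels = ['PER', 'LOC', 'ORG'] this yields seven CRF states: O = 0, B-PER = 1, I-PER = 2, B-LOC = 3, I-LOC = 4, B-ORG = 5, I-ORG = 6, which is exactly what num_labels = len(labels) * 2 + 1 and the label2id[l] * 2 + 1 / + 2 expressions in the data generator compute.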

Using the trained model, a minimal example:

The inference script is identical to the full listing above, with two changes: the module-level NER = NamedEntityRecognizer(...) line that followed the NamedEntityRecognizer class is removed, and the if __name__ == '__main__': training block at the end is replaced by the four lines below, so the recognizer is only built once the trained weights are in place:

    model.load_weights('best_model.weights')
    NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])
    ner = NER.recognize("我在厦门")  # "I am in Xiamen"
    print(ner)
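recognize returns a list of (entity, label) tuples, so for the toy sentence 我在厦门 the expected output is something like [('厦门', 'LOC')], assuming training went well enough for the model to tag 厦门 as a location.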

Overall takeaway: keep experimenting and revising, and keep programming fundamentals (such as how classes and instantiation work) solid.
