
【NLP: Named Entity Recognition】Training, Evaluating and Using an Albert+BiLSTM+CRF Model


Model Training

2021/3/10: Start from the trained Bert/Albert-CRF model and, on top of it, add a BiLSTM layer to obtain the modified Albert-BiLSTM-CRF model (see the next article), then begin training.

Modification approach: take the existing Albert+CRF model code as the base, refer to Albert+BiLSTM+CRF implementations found online, and make a few small changes. The only thing that really needs care is the type of the data passed between the "three models", e.g. the embeddings produced by the Albert model must be fed into the BiLSTM (reference: ALBERT+BiLSTM+CRF实现序列标注 - 光彩照人 - 博客园); a sketch of this handoff is given below, ahead of the full script.
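
The handoff itself is only a few lines: ALBERT's per-token output, a tensor of shape (batch_size, seq_len, hidden_size), becomes the input sequence of the BiLSTM, and the BiLSTM output is projected to num_labels scores for the CRF. A minimal sketch with bert4keras, condensed from the full script below (config_path, checkpoint_path, bert_layers, num_labels and crf_lr_multiplier are the variables defined there):

    from bert4keras.models import build_transformer_model
    from bert4keras.layers import ConditionalRandomField
    from keras.layers import Bidirectional, LSTM, Dense
    from keras.models import Model

    # ALBERT encoder; the per-token output of its last transformer block is the
    # "embedding" sequence handed to the BiLSTM
    albert = build_transformer_model(config_path, checkpoint_path, model='albert')
    seq_output = albert.get_layer('Transformer-FeedForward-Norm').get_output_at(bert_layers - 1)
    lstm_out = Bidirectional(LSTM(units=128, return_sequences=True))(seq_output)  # keep every timestep
    scores = Dense(num_labels)(lstm_out)                # per-token label scores
    crf = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
    tagger = Model(albert.input, crf(scores))           # the CRF decodes the tag sequence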

Debugging process: along the way the command line is needed several times to install the required libraries and toolkits; just work through it step by step.

    import numpy as np
    from bert4keras.backend import keras, K
    from bert4keras.models import build_transformer_model
    from bert4keras.tokenizers import Tokenizer
    from bert4keras.optimizers import Adam
    from bert4keras.snippets import sequence_padding, DataGenerator
    from bert4keras.snippets import open, ViterbiDecoder
    from bert4keras.layers import ConditionalRandomField
    from keras.layers import Dense
    from keras.models import Model
    from tqdm import tqdm
    from tensorflow import ConfigProto
    from tensorflow import InteractiveSession
    from numpy import array
    from keras.models import Sequential
    from keras.layers import LSTM
    from keras.layers import Bidirectional
    from keras.layers import Dropout
    from keras.layers import TimeDistributed
    # the keras_contrib imports below are not actually used; the name CRF is
    # re-bound to bert4keras's ConditionalRandomField further down
    from keras_contrib.layers import CRF
    from keras_contrib.losses import crf_loss
    from keras_contrib.metrics import crf_accuracy, crf_viterbi_accuracy

    # let TensorFlow allocate GPU memory on demand
    config = ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.2
    config.gpu_options.allow_growth = True
    session = InteractiveSession(config=config)

    maxlen = 256
    epochs = 1  # 10
    batch_size = 16
    bert_layers = 12
    learing_rate = 1e-5  # the smaller bert_layers is, the larger the learning rate should be
    crf_lr_multiplier = 10  # enlarge the CRF layer's learning rate when necessary  # 1000

    # # bert configuration
    # config_path = './bert_model/chinese_L-12_H-768_A-12/bert_config.json'
    # checkpoint_path = './bert_model/chinese_L-12_H-768_A-12/bert_model.ckpt'
    # dict_path = './bert_model/chinese_L-12_H-768_A-12/vocab.txt'
    # albert configuration
    config_path = './bert_model/albert_large/albert_config.json'
    checkpoint_path = './bert_model/albert_large/model.ckpt-best'
    dict_path = './bert_model/albert_large/vocab_chinese.txt'


    def load_data(filename):
        """Load annotated data; sentences are separated by a blank line and
        each line is 'char label'."""
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                if not l:
                    continue
                d, last_flag = [], ''
                for c in l.split('\n'):
                    char, this_flag = c.split(' ')
                    if this_flag == 'O' and last_flag == 'O':
                        d[-1][0] += char
                    elif this_flag == 'O' and last_flag != 'O':
                        d.append([char, 'O'])
                    elif this_flag[:1] == 'B':
                        d.append([char, this_flag[2:]])
                    else:
                        d[-1][0] += char
                    last_flag = this_flag
                D.append(d)
        return D


    # annotated data
    train_data = load_data('./data/example.train')
    valid_data = load_data('./data/example.dev')
    test_data = load_data('./data/example.test')

    # build the tokenizer
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    # label mapping
    labels = ['PER', 'LOC', 'ORG']
    id2label = dict(enumerate(labels))
    label2id = {j: i for i, j in id2label.items()}
    num_labels = len(labels) * 2 + 1


    class data_generator(DataGenerator):
        """Data generator."""
        def __iter__(self, random=False):
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
            for is_end, item in self.sample(random):
                token_ids, labels = [tokenizer._token_start_id], [0]
                for w, l in item:
                    w_token_ids = tokenizer.encode(w)[0][1:-1]
                    if len(token_ids) + len(w_token_ids) < maxlen:
                        token_ids += w_token_ids
                        if l == 'O':
                            labels += [0] * len(w_token_ids)
                        else:
                            B = label2id[l] * 2 + 1
                            I = label2id[l] * 2 + 2
                            labels += ([B] + [I] * (len(w_token_ids) - 1))
                    else:
                        break
                token_ids += [tokenizer._token_end_id]
                labels += [0]
                segment_ids = [0] * len(token_ids)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append(labels)
                if len(batch_token_ids) == self.batch_size or is_end:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_labels = sequence_padding(batch_labels)
                    yield [batch_token_ids, batch_segment_ids], batch_labels
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []


    # The original bert4keras example builds a bert-type model; since an ALBERT
    # checkpoint is used here, pass model='albert':
    model = build_transformer_model(
        config_path,
        checkpoint_path,
        model='albert',
    )

    output_layer = 'Transformer-FeedForward-Norm'
    albert_output = model.get_layer(output_layer).get_output_at(bert_layers - 1)
    lstm = Bidirectional(LSTM(units=128, return_sequences=True), name="bi_lstm")(albert_output)
    drop = Dropout(0.1, name="dropout")(lstm)
    dense = TimeDistributed(Dense(num_labels, activation="softmax"), name="time_distributed")(drop)
    output = Dense(num_labels)(dense)
    CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
    output = CRF(output)
    model = Model(model.input, output)
    model.summary()

    model.compile(
        loss=CRF.sparse_loss,
        optimizer=Adam(learing_rate),
        metrics=[CRF.sparse_accuracy]
    )


    class NamedEntityRecognizer(ViterbiDecoder):
        """Named entity recognizer."""
        def recognize(self, text):
            tokens = tokenizer.tokenize(text)
            while len(tokens) > 512:
                tokens.pop(-2)
            mapping = tokenizer.rematch(text, tokens)
            token_ids = tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            nodes = model.predict([[token_ids], [segment_ids]])[0]
            labels = self.decode(nodes)
            entities, starting = [], False
            for i, label in enumerate(labels):
                if label > 0:
                    if label % 2 == 1:
                        starting = True
                        entities.append([[i], id2label[(label - 1) // 2]])
                    elif starting:
                        entities[-1][0].append(i)
                    else:
                        starting = False
                else:
                    starting = False
            return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                    for w, l in entities]


    # Viterbi decoder over the CRF transition matrix; needed by evaluate() below,
    # and its trans is refreshed after every epoch by the Evaluate callback
    NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


    def evaluate(data):
        """Evaluation function: micro-averaged f1, precision and recall."""
        X, Y, Z = 1e-10, 1e-10, 1e-10
        for d in tqdm(data):
            text = ''.join([i[0] for i in d])
            R = set(NER.recognize(text))
            T = set([tuple(i) for i in d if i[1] != 'O'])
            X += len(R & T)
            Y += len(R)
            Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        return f1, precision, recall


    class Evaluate(keras.callbacks.Callback):
        def __init__(self):
            self.best_val_f1 = 0

        def on_epoch_end(self, epoch, logs=None):
            trans = K.eval(CRF.trans)
            NER.trans = trans
            print(NER.trans)
            f1, precision, recall = evaluate(valid_data)
            # keep the best weights
            if f1 >= self.best_val_f1:
                self.best_val_f1 = f1
                model.save_weights('best_model.weights')
            print(
                'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
                (f1, precision, recall, self.best_val_f1)
            )
            f1, precision, recall = evaluate(test_data)
            print(
                'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
                (f1, precision, recall)
            )


    if __name__ == '__main__':
        evaluator = Evaluate()
        train_generator = data_generator(train_data, batch_size)
        model.fit_generator(
            train_generator.forfit(),
            steps_per_epoch=len(train_generator),
            epochs=epochs,
            callbacks=[evaluator]
        )
    else:
        model.load_weights('best_model.weights')

Model Evaluation

2021/3/11: This morning I checked the results of the Albert+BiLSTM+CRF run and found its accuracy was very low, only around 0.8, while on the same data the Albert+CRF model scores above 0.95. Thinking about the cause and adjusting the code: ① tuning the LSTM-related parameters (dropout), even removing dropout entirely, brought no improvement; ② removing both the dropout and the TimeDistributed dense layer did help. What is dropout for? Preventing overfitting, but I think whether to use it depends on the scenario (reference: 为什么模型加入dropout层后变得更差了?). And the final dense layer? I read it as a classification output layer, and since the model already has a CRF to transform the output, this dense layer is probably unnecessary (reference: LSTM模型后增加Dense(全连接)层的目的是什么?). After commenting out the last two lines below, the Albert+BiLSTM+CRF model again reaches accuracy above 0.95. The underlying reason is left for further study; a sketch of the simplified head follows the snippet.

    lstm = Bidirectional(LSTM(units=128, return_sequences=True), name="bi_lstm")(albert_output)
    #drop = Dropout(0.2, name="dropout")(lstm)
    #dense = TimeDistributed(Dense(num_labels, activation="softmax"), name="time_distributed")(drop)
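
A sketch of the simplified head after this change (names as in the training script above; routing the Dense projection from lstm instead of the removed dense layer is my reading of the change, since the commented-out layers no longer exist):

    lstm = Bidirectional(LSTM(units=128, return_sequences=True), name="bi_lstm")(albert_output)
    output = Dense(num_labels)(lstm)   # plain linear projection; the CRF scores the sequence, no softmax needed
    CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
    output = CRF(output)
    model = Model(model.input, output)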

Reading and Writing Files

2021/3/12: I spent the whole morning, a good two hours, on something as simple as reading and writing files in Python. Cause: the code kept raising "'open' object has no attribute 'readlines'". Approach: in a fresh .py file the same read/write statements work fine, yet in the Albert+BiLSTM+CRF model script they fail. This shows the statements themselves are correct; some variable or function name in the model script must conflict with them. Indeed, at the top of that script there is "from bert4keras.snippets import open, ViterbiDecoder", and that "open" is not the built-in "open" (a workaround is sketched after the snippet below).

    model.load_weights('best_model.weights')
    NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])
    r = open("D:\Asian elephant\gao A_geography_NER\A_geography_NER\data\\result.txt", 'w')
    with open("D:\Asian elephant\gao A_geography_NER\A_geography_NER\data\\t.txt", 'r', encoding='utf-8') as tt:
        content = tt.readlines()
    for line in content:
        ner = NER.recognize(line)
        print(ner, file=r)
    r.close()  # make sure the results are flushed to disk
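
One way to avoid the name clash, sketched below: either import the bert4keras helper under an alias so the built-in open stays usable, or reach the built-in explicitly through the builtins module (the file path shown is just a placeholder):

    # Option 1: alias the helper instead of shadowing the built-in open
    from bert4keras.snippets import open as b4k_open, ViterbiDecoder

    # Option 2: keep the shadowing import but call the built-in explicitly
    import builtins
    with builtins.open('data/t.txt', 'r', encoding='utf-8') as tt:
        content = tt.readlines()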

Model Training

2021/3/14: Trained the model (3 epochs, CRF learning-rate multiplier set to 1000; the other parameter settings are listed below).

Training data: the existing annotated dataset plus my own annotated data; test data: my own annotated data; validation data: my own annotated data.

Time cost: CPU only; one epoch takes roughly 7 hours.

Results are poor:

epoch 1 → 1304/1304: loss: 3.9929 - sparse_accuracy: 0.9648; test: f1: 0.13333, precision: 0.41176, recall: 0.07955; valid: f1: 0.15493, precision: 0.64706, recall: 0.08800, best f1: 0.15493

epoch 2 → 1304/1304: loss: 0.5454 - sparse_accuracy: 0.9849; test: f1: 0.25455, precision: 0.63636, recall: 0.15909; valid: f1: 0.18919, precision: 0.60870, recall: 0.11200, best f1: 0.18919

epoch 3 → precision on both test and valid reaches above 0.7

    maxlen = 256              # maximum text length kept
    epochs = 3                # number of training epochs
    batch_size = 16           # number of samples fed to the model per training step
    bert_layers = 12
    learing_rate = 1e-5       # the smaller bert_layers is, the larger the learning rate should be
    crf_lr_multiplier = 1000  # enlarge the CRF layer's learning rate when necessary

Various Bugs and Their Fixes

ValueError: substring not found


2021/3/5: Problem: the model trained for three epochs had finished, but feeding all of the data through it raised the error above. Fix: resolving it is not hard and does not even require understanding the cause; just compare. After a few tries a pattern emerged in the lines that triggered the error. I first suspected punctuation, but after checking it turned out that letters were the problem (a small loop for isolating the offending lines is sketched below).
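
A minimal way to find the offending lines, assuming the NER.recognize call from the script above is what raises the error (the input file name is a placeholder):

    import builtins  # the training script shadows open via bert4keras.snippets

    # wrap each line so the exact input that raises "ValueError: substring not found"
    # can be printed and inspected
    with builtins.open('data/t.txt', 'r', encoding='utf-8') as f:
        for lineno, line in enumerate(f, 1):
            try:
                NER.recognize(line.strip())
            except ValueError as err:
                print(lineno, repr(line), err)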

ValueError: not enough values to unpack (expected 2, got 1)

2021/3/13: Problem: with every other setting identical and only the data changed, the accuracy results differ drastically. Training the Albert+BiLSTM+CRF model on the training data shipped with the code package, and using my own small set of annotated data for testing and validation, gives good results; but adding my own annotated data into the training set gives very poor results. Fix: again, spot the difference. How does my annotated data differ from the original data? Answer: whether the '\n' is there. This "inconspicuous" '\n' turns out to matter a great deal (see below).

2021/3/15: Added some more of my own annotated data, and the program errored again. Cause: similar to the 2021/3/13 error, it is again a format problem in the data (extra or missing characters, spaces or newlines), but this time more subtle: the two newline characters at the end of the file were missing, and those two newlines matter a lot (see for l in f.split('\n\n') in the code, which splits on the blank line between sentences). The fix was once more to compare the correct data against my failing data; it took so long because ① I assumed it was a whitespace problem (the cause of the previous error) and kept obsessing over spaces, and ② the supposedly "correct data" I compared against was not the original, truly correct data. A small format checker covering these pitfalls is sketched after the code below.

    def load_data(filename):  # load annotated data: training, test and validation sets
        D = []
        with open(filename, encoding='utf-8') as f:  # open the file
            f = f.read()  # read the whole file content
            for l in f.split('\n\n'):  # split on the double newline (blank line) between sentences
                if not l:  # empty segment (e.g. after the trailing blank line)
                    continue  # skip to the next iteration (break would exit the whole loop)
                d, last_flag = [], ''
                for c in l.split('\n'):  # one "char label" pair per line
                    char, this_flag = c.split(' ')
                    if this_flag == 'O' and last_flag == 'O':
                        d[-1][0] += char
                    elif this_flag == 'O' and last_flag != 'O':
                        d.append([char, 'O'])
                    elif this_flag[:1] == 'B':  # this_flag[:1] is the first character, i.e. the tag starts with 'B'
                        d.append([char, this_flag[2:]])  # this_flag[2:] drops the "B-" prefix; consecutive non-O characters are merged, e.g. "梁子老寨" gives ('梁子老寨', 'LOC')
                    else:
                        d[-1][0] += char  # I- tag: append the character to the current entity
                    last_flag = this_flag
                D.append(d)
        return D
    # result format: [('良子', 'LOC'), ('勐乃通达', 'LOC'), ('梁子老寨', 'LOC'), ('黑山', 'LOC'), ('黑山', 'LOC'), ('勐乃通达', 'LOC')]
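
Because load_data assumes every non-empty line is exactly "char label" with a single space and that the file ends with a blank line, a small checker like the one below (my own helper, not part of the original code) catches the problems described above before training:

    import builtins  # avoid the open() shadowed by bert4keras.snippets if pasted into the training script

    def check_ner_file(filename):
        """Report lines that would break load_data: wrong field count or a
        missing trailing blank line."""
        with builtins.open(filename, encoding='utf-8') as f:
            text = f.read()
        if not text.endswith('\n\n'):
            print('warning: file does not end with a blank line (\\n\\n)')
        for lineno, line in enumerate(text.split('\n'), 1):
            if not line:  # blank separator line between sentences, fine
                continue
            parts = line.split(' ')
            if len(parts) != 2 or not parts[0] or not parts[1]:
                print('suspicious line %d: %r' % (lineno, line))

    check_ner_file('./data/example.train')  # example call; replace with your own file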
