当前位置:   article > 正文

Python BiLSTM_CRF实现代码,电子病历命名实体识别和关系抽取,序列标注_bilstm-crf模型代码

bilstm-crf模型代码

参考刘焕勇老师的代码，并根据自己的训练数据集进行了修改。

1.原始数据样式:

2.清洗后数据样式:

 

3.训练数据

  1. import numpy as np
  2. import matplotlib.pyplot as plt
  3. import os
  4. from keras import backend as K
  5. from keras.preprocessing.sequence import pad_sequences
  6. from keras.models import Sequential
  7. from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
  8. from keras_contrib.layers.crf import CRF
  9. os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
  10. '''
  11. 默认为0,显示所有日志要过滤INFO日志,
  12. 请将其设置为1WARNINGS另外,2并进一步过滤掉ERROR日志,
  13. 将其设置为3可以执行以下操作使警告静音
  14. '''
  15. class LSTMNER:
  16. def __init__(self):
  17. cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
  18. self.train_path = os.path.join(cur, 'data/train.txt')
  19. self.vocab_path = os.path.join(cur, 'model/vocab.txt')
  20. self.embedding_file = os.path.join(cur, 'model/token_vec_300.bin')
  21. self.model_path = os.path.join(cur, 'model/tokenvec_bilstm2_crf_model_20.h5')
  22. self.datas, self.word_dict = self.build_data()
  23. self.class_dict = {
  24. 'O': 0,
  25. 'B-pro': 1,
  26. 'I-pro': 2,
  27. 'B-sym': 3,
  28. 'I-sym': 4,
  29. 'B-dis': 5,
  30. 'I-dis': 6,
  31. 'B-equ': 7,
  32. 'I-equ': 8,
  33. 'B-dru': 9,
  34. 'I-dru': 10,
  35. 'B-ite': 11,
  36. 'I-ite': 12,
  37. 'B-bod': 13,
  38. 'I-bod': 14,
  39. 'I-dep': 15,
  40. 'B-dep': 16,
  41. 'I-mic': 17,
  42. 'B-mic': 18
  43. }
  44. self.EMBEDDING_DIM = 300
  45. self.EPOCHS = 50
  46. self.BATCH_SIZE = 64
  47. self.NUM_CLASSES = len(self.class_dict)
  48. self.VOCAB_SIZE = len(self.word_dict)
  49. self.TIME_STAMPS = 150
  50. self.embedding_matrix = self.build_embedding_matrix()
  51. '''构造数据集'''
  52. def build_data(self):
  53. datas = []
  54. sample_x = []
  55. sample_y = []
  56. vocabs = {'UNK'}
  57. for line in open(self.train_path,encoding = 'utf-8'):
  58. # if line=='$':
  59. # continue
  60. # else:
  61. line = line.rstrip().strip('$').split('\t')
  62. if not line:
  63. continue
  64. char = line[0]
  65. if not char:
  66. continue
  67. cate = line[-1]
  68. sample_x.append(char)
  69. sample_y.append(cate)
  70. vocabs.add(char)
  71. if char in ['。','?','!','!','?']:
  72. datas.append([sample_x, sample_y])
  73. sample_x = []
  74. sample_y = []
  75. word_dict = {wd:index for index, wd in enumerate(list(vocabs))}
  76. self.write_file(list(vocabs), self.vocab_path)
  77. return datas, word_dict
  78. '''将数据转换成keras所需的格式'''
  79. def modify_data(self):
  80. x_train = [[self.word_dict[char] for char in data[0]] for data in self.datas]
  81. y_train = [[self.class_dict[label] for label in data[1]] for data in self.datas]
  82. x_train = pad_sequences(x_train, self.TIME_STAMPS)
  83. y = pad_sequences(y_train, self.TIME_STAMPS)
  84. y_train = np.expand_dims(y, 2)
  85. return x_train, y_train
  86. '''保存字典文件'''
  87. def write_file(self, wordlist, filepath):
  88. with open(filepath, 'w+',encoding = 'utf-8') as f:
  89. f.write('\n'.join(wordlist))
  90. '''加载预训练词向量'''
  91. def load_pretrained_embedding(self):
  92. embeddings_dict = {}
  93. with open(self.embedding_file, 'r',encoding = 'utf-8') as f:
  94. for line in f:
  95. values = line.strip().split(' ')
  96. if len(values) < 300:
  97. continue
  98. word = values[0]
  99. coefs = np.asarray(values[1:], dtype='float32')
  100. embeddings_dict[word] = coefs
  101. print('Found %s word vectors.' % len(embeddings_dict))
  102. return embeddings_dict
  103. '''加载词向量矩阵'''
  104. def build_embedding_matrix(self):
  105. embedding_dict = self.load_pretrained_embedding()
  106. embedding_matrix = np.zeros((self.VOCAB_SIZE + 1, self.EMBEDDING_DIM))
  107. for word, i in self.word_dict.items():
  108. embedding_vector = embedding_dict.get(word)
  109. if embedding_vector is not None:
  110. embedding_matrix[i] = embedding_vector
  111. return embedding_matrix
  112. '''使用预训练向量进行模型训练'''
  113. def tokenvec_bilstm2_crf_model(self):
  114. model = Sequential()
  115. embedding_layer = Embedding(self.VOCAB_SIZE + 1,
  116. self.EMBEDDING_DIM,
  117. weights=[self.embedding_matrix],
  118. input_length=self.TIME_STAMPS,
  119. trainable=False,
  120. mask_zero=True)
  121. model.add(embedding_layer)
  122. model.add(Bidirectional(LSTM(128, return_sequences=True)))
  123. model.add(Dropout(0.5))
  124. model.add(Bidirectional(LSTM(64, return_sequences=True)))
  125. model.add(Dropout(0.5))
  126. model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
  127. crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
  128. model.add(crf_layer)
  129. model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
  130. model.summary()
  131. return model
  132. '''训练模型'''
  133. def train_model(self):
  134. x_train, y_train = self.modify_data()
  135. print(x_train.shape, y_train.shape)
  136. model = self.tokenvec_bilstm2_crf_model()
  137. history = model.fit(x_train[:],
  138. y_train[:],
  139. validation_split=0.2,
  140. batch_size=self.BATCH_SIZE,
  141. epochs=self.EPOCHS)
  142. # self.draw_train(history)
  143. model.save(self.model_path)
  144. return model
  145. '''绘制训练曲线'''
  146. def draw_train(self, history):
  147. # Plot training & validation accuracy values
  148. plt.plot(history.history['acc'])
  149. plt.title('Model accuracy')
  150. plt.ylabel('Accuracy')
  151. plt.xlabel('Epoch')
  152. plt.legend(['Train'], loc='upper left')
  153. plt.show()
  154. # Plot training & validation loss values
  155. plt.plot(history.history['loss'])
  156. plt.title('Model loss')
  157. plt.ylabel('Loss')
  158. plt.xlabel('Epoch')
  159. plt.legend(['Train'], loc='upper left')
  160. plt.show()
  161. # 7836/7836 [==============================] - 205s 26ms/step - loss: 17.1782 - acc: 0.9624
  162. '''
  163. 6268/6268 [==============================] - 145s 23ms/step - loss: 18.5272 - acc: 0.7196 - val_loss: 15.7497 - val_acc: 0.8109
  164. 6268/6268 [==============================] - 142s 23ms/step - loss: 17.8446 - acc: 0.9099 - val_loss: 15.5915 - val_acc: 0.8378
  165. 6268/6268 [==============================] - 136s 22ms/step - loss: 17.7280 - acc: 0.9485 - val_loss: 15.5570 - val_acc: 0.8364
  166. 6268/6268 [==============================] - 133s 21ms/step - loss: 17.6918 - acc: 0.9593 - val_loss: 15.5187 - val_acc: 0.8451
  167. 6268/6268 [==============================] - 144s 23ms/step - loss: 17.6723 - acc: 0.9649 - val_loss: 15.4944 - val_acc: 0.8451
  168. '''
  169. if __name__ == '__main__':
  170. ner = LSTMNER()
  171. ner.train_model()

4.预测数据:

  1. import numpy as np
  2. from keras import backend as K
  3. from keras.preprocessing.sequence import pad_sequences
  4. from keras.models import Sequential,load_model
  5. from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
  6. from keras_contrib.layers.crf import CRF
  7. import matplotlib.pyplot as plt
  8. import os
  9. os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
  10. class LSTMNER:
  11. def __init__(self):
  12. cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
  13. self.train_path = os.path.join(cur, 'data/train.txt')
  14. self.vocab_path = os.path.join(cur, 'model/vocab.txt')
  15. self.embedding_file = os.path.join(cur, 'model/token_vec_300.bin')
  16. self.model_path = os.path.join(cur, 'model/tokenvec_bilstm2_crf_model_20.h5')
  17. self.word_dict = self.load_worddict()
  18. self.class_dict = {
  19. 'O': 0,
  20. 'B-pro': 1,
  21. 'I-pro': 2,
  22. 'B-sym': 3,
  23. 'I-sym': 4,
  24. 'B-dis': 5,
  25. 'I-dis': 6,
  26. 'B-equ': 7,
  27. 'I-equ': 8,
  28. 'B-dru': 9,
  29. 'I-dru': 10,
  30. 'B-ite': 11,
  31. 'I-ite': 12,
  32. 'B-bod': 13,
  33. 'I-bod': 14,
  34. 'I-dep': 15,
  35. 'B-dep': 16,
  36. 'I-mic': 17,
  37. 'B-mic': 18
  38. }
  39. self.label_dict = {j:i for i,j in self.class_dict.items()}
  40. self.EMBEDDING_DIM = 300
  41. self.EPOCHS = 20
  42. self.BATCH_SIZE = 64
  43. self.NUM_CLASSES = len(self.class_dict)
  44. self.VOCAB_SIZE = len(self.word_dict)
  45. self.TIME_STAMPS = 150
  46. self.embedding_matrix = self.build_embedding_matrix()
  47. self.model = self.tokenvec_bilstm2_crf_model()
  48. self.model.load_weights(self.model_path)
  49. '加载词表'
  50. def load_worddict(self):
  51. vocabs = [line.strip() for line in open(self.vocab_path)]
  52. word_dict = {wd: index for index, wd in enumerate(vocabs)}
  53. return word_dict
  54. '''构造输入,转换成所需形式'''
  55. def build_input(self, text):
  56. x = []
  57. for char in text:
  58. if char not in self.word_dict:
  59. char = 'UNK'
  60. x.append(self.word_dict.get(char))
  61. x = pad_sequences([x], self.TIME_STAMPS)
  62. return x
  63. def predict(self, text):
  64. str = self.build_input(text)
  65. raw = self.model.predict(str)[0][-self.TIME_STAMPS:]
  66. result = [np.argmax(row) for row in raw]
  67. chars = [i for i in text]
  68. tags = [self.label_dict[i] for i in result][len(result)-len(text):]
  69. res = list(zip(chars, tags))
  70. print(res)
  71. return res
  72. '''加载预训练词向量'''
  73. def load_pretrained_embedding(self):
  74. embeddings_dict = {}
  75. with open(self.embedding_file, 'r') as f:
  76. for line in f:
  77. values = line.strip().split(' ')
  78. if len(values) < 300:
  79. continue
  80. word = values[0]
  81. coefs = np.asarray(values[1:], dtype='float32')
  82. embeddings_dict[word] = coefs
  83. print('Found %s word vectors.' % len(embeddings_dict))
  84. return embeddings_dict
  85. '''加载词向量矩阵'''
  86. def build_embedding_matrix(self):
  87. embedding_dict = self.load_pretrained_embedding()
  88. embedding_matrix = np.zeros((self.VOCAB_SIZE + 1, self.EMBEDDING_DIM))
  89. for word, i in self.word_dict.items():
  90. embedding_vector = embedding_dict.get(word)
  91. if embedding_vector is not None:
  92. embedding_matrix[i] = embedding_vector
  93. return embedding_matrix
  94. '''使用预训练向量进行模型训练'''
  95. def tokenvec_bilstm2_crf_model(self):
  96. model = Sequential()
  97. embedding_layer = Embedding(self.VOCAB_SIZE + 1,
  98. self.EMBEDDING_DIM,
  99. weights=[self.embedding_matrix],
  100. input_length=self.TIME_STAMPS,
  101. trainable=False,
  102. mask_zero=True)
  103. model.add(embedding_layer)
  104. model.add(Bidirectional(LSTM(128, return_sequences=True)))
  105. model.add(Dropout(0.5))
  106. model.add(Bidirectional(LSTM(64, return_sequences=True)))
  107. model.add(Dropout(0.5))
  108. model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
  109. crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
  110. model.add(crf_layer)
  111. model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
  112. model.summary()
  113. return model
  114. if __name__ == '__main__':
  115. ner = LSTMNER()
  116. while 1:
  117. s = input('enter an sent:').strip()
  118. ner.predict(s)
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/繁依Fanyi0/article/detail/648496
推荐阅读
相关标签
  

闽ICP备14008679号