赞
踩
我们将数据集构建完成后,下面就需要做关系抽取,这是本项目的第二步。
关系抽取是为了将句子中可能存在的关系提取出来。有人会问:前面不是已经结构化定义了关系了吗?前面我是假设两个不同实体之间有且仅有一种关系,但如果之后存在多种可能的关系,那么关系抽取就必不可少了。这里我只是提供最简单、可跑通的技术方案,如果有更好的研究那是最好的。
在具体的沉积学领域下,目前我所接触到的仍然是两种实体之间的唯一关系,因此只能假设存在不同关系并进行实验。这里我一共定义四种关系:unknown|0、part|1、TimSub|2、property|3。
词向量这里都换成了英文词向量,都已整理好放到 GitHub 中(关系抽取代码非原创,只是将中文关系抽取模型替换为英文词向量和本项目数据集,构建出完成本项目的数据代码)。传送门:原作者模型描述
传送门作者中文关系抽取源代码
因此这里只展示部分代码,具体沉积学关系抽取可移步至GitHub:https://github.com/zhichen-roger/Relationship_etract_for_Sedimentology.git
模型训练train_GRU.py
import tensorflow as tf
import numpy as np
import time
import datetime
import os
import network
from tensorflow.contrib.tensorboard.plugins import projector
# TF1-style command-line flags; `summary_dir` controls where TensorBoard
# summaries are written (defaults to the current directory).
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('summary_dir', '.', 'path to store summary')
def main(_):
    """Train the attention-GRU relation-extraction model.

    Loads pre-built word embeddings and training arrays from ./data/,
    builds the network defined in the project-local `network` module,
    and runs a TF1 session-based training loop, periodically logging
    accuracy and checkpointing the model to ./model/.

    NOTE(review): indentation was reconstructed from a flat paste;
    structure follows the upstream TensorFlow-RE train_GRU.py layout.
    """
    # the path to save models
    save_path = './model/'
    print('reading wordembedding')
    # vec.npy: pre-trained word-embedding matrix, one row per vocab entry.
    wordembedding = np.load('./data/vec.npy')
    print('reading training data')
    # train_y: one-hot relation labels; the *_word/_pos1/_pos2 arrays are
    # ragged (object) arrays of per-bag token/position sequences, hence
    # allow_pickle=True.
    train_y = np.load('./data/train_y.npy')
    train_word = np.load('./data/train_word.npy', allow_pickle=True)
    train_pos1 = np.load('./data/train_pos1.npy', allow_pickle=True)
    train_pos2 = np.load('./data/train_pos2.npy', allow_pickle=True)
    settings = network.Settings()
    # Derive vocab size and class count from the loaded data rather than
    # hard-coding them.
    settings.vocab_size = len(wordembedding)
    settings.num_classes = len(train_y[0])
    # big_num: number of entity-pair "bags" per training batch
    # (multi-instance learning) — presumably defined in network.Settings.
    big_num = settings.big_num
    with tf.Graph().as_default():
        sess = tf.Session()
        with sess.as_default():
            initializer = tf.contrib.layers.xavier_initializer()
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                m = network.GRU(is_training=True, word_embeddings=wordembedding, settings=settings)
            global_step = tf.Variable(0, name="global_step", trainable=False)
            # Fixed learning rate of 5e-4; no decay schedule.
            optimizer = tf.train.AdamOptimizer(0.0005)
            train_op = optimizer.minimize(m.final_loss, global_step=global_step)
            sess.run(tf.global_variables_initializer())
            # max_to_keep=None keeps every checkpoint ever saved.
            saver = tf.train.Saver(max_to_keep=None)
            merged_summary = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train_loss', sess.graph)

            def train_step(word_batch, pos1_batch, pos2_batch, y_batch, big_num):
                """Run one optimization step on a batch of bags.

                Flattens the ragged per-bag sequences into single arrays and
                records bag boundaries in `total_shape` (offsets of each bag,
                plus the final total) so the model can segment them again.
                """
                feed_dict = {}
                total_shape = []
                total_num = 0
                total_word = []
                total_pos1 = []
                total_pos2 = []
                for i in range(len(word_batch)):
                    # Start offset of bag i in the flattened arrays.
                    total_shape.append(total_num)
                    total_num += len(word_batch[i])
                    for word in word_batch[i]:
                        total_word.append(word)
                    for pos1 in pos1_batch[i]:
                        total_pos1.append(pos1)
                    for pos2 in pos2_batch[i]:
                        total_pos2.append(pos2)
                # Closing sentinel: total number of instances across all bags.
                total_shape.append(total_num)
                total_shape = np.array(total_shape)
                total_word = np.array(total_word)
                total_pos1 = np.array(total_pos1)
                total_pos2 = np.array(total_pos2)
                feed_dict[m.total_shape] = total_shape
                feed_dict[m.input_word] = total_word
                feed_dict[m.input_pos1] = total_pos1
                feed_dict[m.input_pos2] = total_pos2
                feed_dict[m.input_y] = y_batch
                temp, step, loss, accuracy, summary, l2_loss, final_loss = sess.run(
                    [train_op, global_step, m.total_loss, m.accuracy, merged_summary, m.l2_loss, m.final_loss],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                # Per-bag accuracies, averaged into one scalar for logging.
                accuracy = np.reshape(np.array(accuracy), (big_num))
                acc = np.mean(accuracy)
                summary_writer.add_summary(summary, step)
                # Log every 5 steps (upstream code used 50).
                if step % 5 == 0:  # 50
                    tempstr = "{}: step {}, softmax_loss {:g}, acc {:g}".format(time_str, step, loss, acc)
                    print(tempstr)

            for one_epoch in range(settings.num_epochs):
                # Shuffle bag order each epoch.
                temp_order = list(range(len(train_word)))
                np.random.shuffle(temp_order)
                for i in range(int(len(temp_order) / float(settings.big_num))):
                    temp_word = []
                    temp_pos1 = []
                    temp_pos2 = []
                    temp_y = []
                    # Indices of the bags in this batch.
                    temp_input = temp_order[i * settings.big_num:(i + 1) * settings.big_num]
                    for k in temp_input:
                        temp_word.append(train_word[k])
                        temp_pos1.append(train_pos1[k])
                        temp_pos2.append(train_pos2[k])
                        temp_y.append(train_y[k])
                    # Skip batches whose total token count exceeds 1500,
                    # presumably to cap memory use — the batch is dropped,
                    # not split.
                    num = 0
                    for single_word in temp_word:
                        num += len(single_word)
                    if num > 1500:
                        print('out of range')
                        continue
                    temp_word = np.array(temp_word)
                    temp_pos1 = np.array(temp_pos1)
                    temp_pos2 = np.array(temp_pos2)
                    temp_y = np.array(temp_y)
                    train_step(temp_word, temp_pos1, temp_pos2, temp_y, settings.big_num)
                    current_step = tf.train.global_step(sess, global_step)
                    # Checkpoint every 10 steps after warm-up
                    # (upstream code used 8000 and 100).
                    if current_step > 100 and current_step % 10 == 0:  # 8000,100
                        print('saving model')
                        path = saver.save(sess, save_path + 'ATT_GRU_model', global_step=current_step)
                        tempstr = 'have saved model to ' + path
                        print(tempstr)
# TF1 entry point: tf.app.run() parses flags and calls main(_).
if __name__ == "__main__":
    tf.app.run()
沉积学关系抽取
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。