# Fine-tune Chinese BERT for event subject extraction: given (text, event type),
# predict the start and end token positions of the subject entity in the text.
import tensorflow as tf
import numpy as np
from bert import modeling
from bert import tokenization
from bert import optimization
import os
import pandas as pd

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer('train_batch_size', 32, 'define the train batch size')
flags.DEFINE_integer('num_train_epochs', 3, 'define the num train epochs')
flags.DEFINE_float('warmup_proportion', 0.1, 'define the warmup proportion')
flags.DEFINE_float('learning_rate', 5e-5, 'the initial learning rate for adam')
flags.DEFINE_bool('is_training', True, 'define whether to fine-tune the bert model')

# training CSV: column 1 = text, column 2 = event type, column 3 = subject entity;
# rows labelled 其他 ("other") carry no entity and are dropped
data = pd.read_csv('data/event_type_entity_extract_train.csv', encoding='UTF-8', header=None)
data = data[data[2] != u'其他']
classes = set(data[2])

train_data = []
for t, c, n in zip(data[1], data[2], data[3]):
    train_data.append((t.strip(), c.strip(), n.strip()))
np.random.shuffle(train_data)

def get_start_end_index(text, subtext):
    # return the first (start, end) span of subtext inside text (end inclusive),
    # or (-1, -1) if subtext does not occur; works on strings and on token lists
    for i in range(len(text)):
        if text[i:i + len(subtext)] == subtext:
            return (i, i + len(subtext) - 1)
    return (-1, -1)
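
# Quick sanity check of the span helper on a made-up string (illustration only,
# not dataset content): this should print (2, 3).
print(get_start_end_index('ABCDE', 'CD'))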

# keep only the samples whose entity actually appears in the text
tmp_train_data = []
for item in train_data:
    start, end = get_start_end_index(item[0], item[2])
    if start != -1:
        tmp_train_data.append(item)

train_data = tmp_train_data
np.random.shuffle(train_data)

# (text, event type) pairs used for prediction; note that the same training CSV
# is loaded here, so substitute the real evaluation file if one is available
data = pd.read_csv('data/event_type_entity_extract_train.csv', encoding='UTF-8', header=None)
test_data = []
for t, c in zip(data[1], data[2]):
    test_data.append((t.strip(), c.strip()))

config_path = r'D:\NLP_SOUNDAI\learnTensor\package9\bert\chinese_L-12_H-768_A-12\bert_config.json'
checkpoint_path = r'D:\NLP_SOUNDAI\learnTensor\package9\bert\chinese_L-12_H-768_A-12\bert_model.ckpt'
dict_path = r'D:\NLP_SOUNDAI\learnTensor\package9\bert\chinese_L-12_H-768_A-12\vocab.txt'
bert_config = modeling.BertConfig.from_json_file(config_path)
tokenizer = tokenization.FullTokenizer(vocab_file=dict_path, do_lower_case=False)

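# Quick check that the vocabulary loaded (illustration only; the sentence below is
# made up, not from the dataset): the Chinese tokenizer splits text roughly
# character by character.
print(tokenizer.tokenize('公司发生债务违约'))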

def input_str_concat(inputList):
    # encode one (text, event type) pair as a single BERT input:
    # '__<event type>__<text>' wrapped in [CLS] ... [SEP]
    assert len(inputList) == 2
    t, c = inputList
    newStr = '__%s__%s' % (c, t)
    tokens = tokenizer.tokenize(newStr)
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(input_ids)
    return tokens, (input_ids, input_mask, segment_ids)

# print one encoded training example and one encoded test example
for i in train_data:
    print(input_str_concat(i[:-1]))
    break

for i in test_data:
    print(input_str_concat(i))
    break

def sequence_padding(sequence):
    # zero-pad every sequence in the batch to the length of the longest one
    lenlist = [len(item) for item in sequence]
    maxlen = max(lenlist)
    return np.array([
        np.concatenate([item, [0] * (maxlen - len(item))]) if len(item) < maxlen else item for item in sequence
    ])
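
# Tiny illustration of the padding helper on made-up lists (not dataset values):
# this should print a 2 x 3 array, [[1 2 3], [4 5 0]].
print(sequence_padding([[1, 2, 3], [4, 5]]))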

# batch generator: yield padded training batches of token ids, masks, segment ids
# and one-hot start/end labels
def get_data_batch():
    batch_size = FLAGS.train_batch_size
    epoch = FLAGS.num_train_epochs
    for oneEpoch in range(epoch):
        num_batches = ((len(train_data) - 1) // batch_size) + 1
        for i in range(num_batches):
            batch_data = train_data[i * batch_size:(i + 1) * batch_size]
            yield_batch_data = {
                'input_ids': [],
                'input_mask': [],
                'segment_ids': [],
                'start_ids': [],
                'end_ids': []
            }
            for item in batch_data:
                tokens, (input_ids, input_mask, segment_ids) = input_str_concat(item[:-1])
                # locate the entity's token span inside the full token sequence,
                # so the labels line up with input_ids ([CLS] and the '__type__'
                # prefix are already part of `tokens`); skip the rare misses
                entity_tokens = tokenizer.tokenize(item[2])
                start, end = get_start_end_index(tokens, entity_tokens)
                if start == -1:
                    continue
                start_ids = [0] * len(input_ids)
                end_ids = [0] * len(input_ids)
                start_ids[start] = 1
                end_ids[end] = 1
                yield_batch_data['input_ids'].append(input_ids)
                yield_batch_data['input_mask'].append(input_mask)
                yield_batch_data['segment_ids'].append(segment_ids)
                yield_batch_data['start_ids'].append(start_ids)
                yield_batch_data['end_ids'].append(end_ids)
            if not yield_batch_data['input_ids']:
                continue
            yield_batch_data['input_ids'] = sequence_padding(yield_batch_data['input_ids'])
            yield_batch_data['input_mask'] = sequence_padding(yield_batch_data['input_mask'])
            yield_batch_data['segment_ids'] = sequence_padding(yield_batch_data['segment_ids'])
            yield_batch_data['start_ids'] = sequence_padding(yield_batch_data['start_ids'])
            yield_batch_data['end_ids'] = sequence_padding(yield_batch_data['end_ids'])
            yield yield_batch_data

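# Peek at the first padded batch to confirm that all five arrays share the same
# (batch_size, max_len) shape (illustration only; this just runs the generator once).
for batch in get_data_batch():
    print({k: np.shape(v) for k, v in batch.items()})
    break
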
with tf.Graph().as_default(), tf.Session() as sess:
    input_ids_p = tf.placeholder(dtype=tf.int64, shape=[None, None], name='input_ids_p')
    input_mask_p = tf.placeholder(dtype=tf.int64, shape=[None, None], name='input_mask_p')
    segment_ids_p = tf.placeholder(dtype=tf.int64, shape=[None, None], name='segment_ids_p')
    start_p = tf.placeholder(dtype=tf.int64, shape=[None, None], name='start_p')
    end_p = tf.placeholder(dtype=tf.int64, shape=[None, None], name='end_p')

    # is_training=False only disables dropout inside BERT; the encoder weights are
    # still fine-tuned, because the optimizer below updates all trainable variables
    model = modeling.BertModel(config=bert_config,
                               is_training=False,
                               input_ids=input_ids_p,
                               input_mask=input_mask_p,
                               token_type_ids=segment_ids_p,
                               use_one_hot_embeddings=False)
    # token-level output: [batch_size, sentence_max_len, word_dim]
    output_layer = model.get_sequence_output()

    # batch size and sequence length stay dynamic; the hidden size must be a static
    # python int because it is used as a variable shape below
    batch_size, sentence_max_len = tf.shape(output_layer)[0], tf.shape(output_layer)[1]
    word_dim = bert_config.hidden_size

    output_reshape = tf.reshape(output_layer, shape=[-1, word_dim], name='output_reshape')

    with tf.variable_scope('weight_and_bias', reuse=tf.AUTO_REUSE, initializer=tf.truncated_normal_initializer(mean=0., stddev=0.05)):
        # two pointer heads: one linear layer scoring the start position per token,
        # one scoring the end position
        weight_start = tf.get_variable(name='weight_start', shape=[word_dim, 1])
        bias_start = tf.get_variable(name='bias_start', shape=[1])
        weight_end = tf.get_variable(name='weight_end', shape=[word_dim, 1])
        bias_end = tf.get_variable(name='bias_end', shape=[1])

    with tf.name_scope('predict_start_and_end'):
        # one start logit and one end logit per token
        pred_start = tf.nn.bias_add(tf.matmul(output_reshape, weight_start), bias_start)
        pred_start = tf.reshape(pred_start, shape=[batch_size, sentence_max_len])

        pred_end = tf.nn.bias_add(tf.matmul(output_reshape, weight_end), bias_end)
        pred_end = tf.reshape(pred_end, shape=[batch_size, sentence_max_len])

    with tf.name_scope('loss'):
        start_labels = tf.cast(start_p, tf.float32)
        end_labels = tf.cast(end_p, tf.float32)
        loss1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred_start, labels=start_labels))
        # mask the end logits so that positions before the labelled start become
        # effectively impossible (cumsum of the start labels is 0 before the start)
        pred_end_masked = pred_end - (1 - tf.cumsum(start_labels, axis=1)) * 1e10
        loss2 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred_end_masked, labels=end_labels))
        loss = loss1 + loss2

    with tf.name_scope('acc_predict'):
        # per-example indicators (1.0 / 0.0) for a correct start / end position
        start_acc = tf.cast(tf.equal(tf.argmax(start_p, axis=1), tf.argmax(pred_start, axis=1)), dtype=tf.float32)
        end_acc = tf.cast(tf.equal(tf.argmax(end_p, axis=1), tf.argmax(pred_end, axis=1)), dtype=tf.float32)
        start_acc_val = tf.reduce_mean(start_acc)
        end_acc_val = tf.reduce_mean(end_acc)
        # an example counts as fully correct only when both boundaries are right
        total_acc = tf.reduce_mean(start_acc * end_acc)


    with tf.name_scope('train_op'):
        num_train_steps = int(
            len(train_data) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        train_op = optimization.create_optimizer(
            loss, FLAGS.learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)


    # map graph variables to the pre-trained BERT checkpoint, then run the global
    # initialiser (mapped variables now initialise from the checkpoint values)
    tvars = tf.trainable_variables()
    (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, checkpoint_path)
    tf.train.init_from_checkpoint(checkpoint_path, assignment_map)
    sess.run(tf.global_variables_initializer())

    total_steps = 0
    for yield_batch_data in get_data_batch():
        total_steps += 1
        feed_dict = {
            input_ids_p: yield_batch_data['input_ids'],
            input_mask_p: yield_batch_data['input_mask'],
            segment_ids_p: yield_batch_data['segment_ids'],
            start_p: yield_batch_data['start_ids'],
            end_p: yield_batch_data['end_ids']
        }
        fetches = [train_op, loss, start_acc_val, end_acc_val, total_acc]

        _, loss_val, start_acc_now, end_acc_now, total_acc_val = sess.run(fetches, feed_dict=feed_dict)

        print('step : %s, loss : %s, start_acc : %s, end_acc : %s, total_acc : %s'
              % (total_steps, loss_val, start_acc_now, end_acc_now, total_acc_val))
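
    # ---- minimal inference sketch (an illustration, not part of the training loop) ----
    # Greedy decoding, assuming the session and the tensors defined above are still
    # live: take the argmax start, then the best end at or after it, and join the
    # tokens in between.
    t, c = test_data[0]
    tokens, (input_ids, input_mask, segment_ids) = input_str_concat((t, c))
    feed = {
        input_ids_p: [input_ids],
        input_mask_p: [input_mask],
        segment_ids_p: [segment_ids],
    }
    start_logits, end_logits = sess.run([pred_start, pred_end], feed_dict=feed)
    s = int(np.argmax(start_logits[0]))
    e = s + int(np.argmax(end_logits[0][s:]))
    print('predicted entity:', ''.join(tokens[s:e + 1]).replace('##', ''))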
