赞
踩
由于此次代码升级的更改全部在 Google Colab 上完成,而 Google Colab 可能与大多数传统 Jupyter 环境不同:Google Colab 以两个空格作为标准缩进
需要修改的函数 inference
def inference(images, batch_size, n_classes, keep_prob=0.8):
    """Build the CNN forward pass and return the unnormalised class logits.

    The network is: conv1 -> max-pool + LRN -> conv2 -> LRN + max-pool ->
    two 128-unit fully connected layers (each followed by dropout) -> a
    linear output layer.

    Args:
        images: Input image batch fed to ``tf.nn.conv2d``. The first kernel
            has shape [3, 3, 3, 16], so the channel dimension must be 3
            (presumably NHWC float32 -- confirm against the input pipeline).
        batch_size: Number of images per batch; used to flatten the feature
            maps to ``[batch_size, -1]``.
        n_classes: Number of output classes (width of the final layer).
        keep_prob: Keep probability for both dropout layers. Defaults to
            0.8, the value that used to be hard-coded. Pass 1.0 when
            building an evaluation graph so dropout is disabled.

    Returns:
        A ``[batch_size, n_classes]`` tensor of logits (no softmax applied).
    """
    # conv1: 3x3 kernels, 3 input channels -> 16 feature maps.
    with tf.variable_scope('conv1') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 3, 16],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[16],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)

    # pool1 and norm1: stride-2 max pooling followed by local response norm.
    with tf.variable_scope('pooling1_lrn') as scope:
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pooling1')
        norm1 = tf.nn.lrn(pool1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm1')

    # conv2: 3x3 kernels, 16 -> 16 feature maps.
    with tf.variable_scope('conv2') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 16, 16],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[16],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name='conv2')

    # pool2 and norm2: here normalisation comes first and pooling uses stride 1.
    with tf.variable_scope('pooling2_lrn') as scope:
        norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1],
                               padding='SAME', name='pooling2')

    # local3: flatten and apply the first fully connected layer.
    with tf.variable_scope('local3') as scope:
        reshape = tf.reshape(pool2, shape=[batch_size, -1])
        # The flattened width must be statically known to size the weights.
        dim = reshape.get_shape()[1].value
        weights = tf.get_variable(
            'weights',
            shape=[dim, 128],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.005, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[128],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
        local3_dropout = tf.nn.dropout(local3, keep_prob=keep_prob)

    # local4: second fully connected layer.
    with tf.variable_scope('local4') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[128, 128],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.005, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[128],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3_dropout, weights) + biases, name='local4')
        local4_dropout = tf.nn.dropout(local4, keep_prob=keep_prob)

    # softmax: linear output layer; softmax itself is applied inside the loss.
    with tf.variable_scope('softmax_linear') as scope:
        # NOTE: the weight variable is named 'softmax_linear' (not 'weights').
        # Keep it as-is; renaming it would break restoring existing checkpoints.
        weights = tf.get_variable(
            'softmax_linear',
            shape=[128, n_classes],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.005, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[n_classes],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4_dropout, weights), biases,
                                name='softmax_linear')

    return softmax_linear


def losses(logits, labels):
    """Return the mean sparse softmax cross-entropy and log it as a summary.

    Args:
        logits: Logits tensor as produced by ``inference``.
        labels: Integer class-index tensor, one label per example.

    Returns:
        Scalar loss tensor (mean cross-entropy over the batch).
    """
    with tf.variable_scope('loss') as scope:
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels, name='xentropy_per_example')
        loss = tf.reduce_mean(cross_entropy, name='loss')
        tf.summary.scalar(scope.name + '/loss', loss)
    return loss


def trainning(loss, learning_rate):
    """Create the Adam training op that minimises ``loss``.

    Args:
        loss: Scalar loss tensor.
        learning_rate: Learning rate passed to ``tf.train.AdamOptimizer``.

    Returns:
        The op that applies one optimisation step and increments the
        non-trainable ``global_step`` variable created here.
    """
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = optimizer.minimize(loss, global_step=global_step)
    return train_op


def evaluation(logits, labels):
    """Return the top-1 accuracy of ``logits`` against ``labels``.

    Args:
        logits: Logits tensor as produced by ``inference``.
        labels: Integer class-index tensor, one label per example.

    Returns:
        Scalar float32 tensor: the fraction of examples whose true label is
        the top-1 prediction. Also recorded as a scalar summary.
    """
    with tf.variable_scope('accuracy') as scope:
        correct = tf.nn.in_top_k(logits, labels, 1)
        # float32 rather than float16: half precision cannot represent
        # fractions such as 987/1000 accurately, which skews the per-batch
        # correct counts that callers derive from this mean.
        correct = tf.cast(correct, tf.float32)
        accuracy = tf.reduce_mean(correct)
        tf.summary.scalar(scope.name + '/accuracy', accuracy)
    return accuracy
我没有包装成函数的形式,将原代码中的 run_training 注释即可
# Training script with checkpoint resume: builds the graph, restores the
# latest checkpoint from ./training (if any) and continues from the epoch
# stored in the `epoch` variable.
N_CLASSES = 2
CAPACITY = 2000
learning_rate = 0.0001
tf.reset_default_graph()
train_epochs = 50
BATCH_SIZE = 50
total_batch = int(25000/50)  # batches per epoch: 25000 images / batch of 50
tfrecords_file = './data/dog_vs_cat.tfrecords'
logs_train_dir = './training'

train_batch, train_label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE, capacity=CAPACITY)
train_logits = inference(train_batch, BATCH_SIZE, N_CLASSES)
train_loss = losses(train_logits, train_label_batch)
train_op = trainning(train_loss, learning_rate)
train__acc = evaluation(train_logits, train_label_batch)

# Number of completed epochs; saved in the checkpoint so training can resume.
epoch = tf.Variable(0, name='epoch', trainable=False)
# Build the increment op once instead of adding a new assign op to the graph
# on every epoch.
increment_epoch = tf.assign_add(epoch, 1)

startTime = time()
sess = tf.Session()
train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
summary_op = tf.summary.merge_all()
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)

ckpt_dir = 'training/'
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)

saver = tf.train.Saver(max_to_keep=1)
ckpt = tf.train.latest_checkpoint(ckpt_dir)
if ckpt is not None:
    # Overwrites the freshly initialised variables, including `epoch`.
    saver.restore(sess, ckpt)
else:
    print("Training from scratch.")

start = sess.run(epoch)
print("Training starts from {} epoch.".format(start + 1))

try:
    for ep in range(start, train_epochs):
        stopped = False
        for i in range(total_batch):
            if coord.should_stop():
                stopped = True
                break
            if i % 50 == 0:
                # Fetch the summary in the same run as the training step so
                # it describes the batch whose metrics are printed and no
                # extra batch is pulled from the input queue.
                _, tra_loss, tra_acc, summary_str = sess.run(
                    [train_op, train_loss, train__acc, summary_op])
                print('Step %d, train loss = %.2f, train accuracy = %.2f%%' % (i, tra_loss, tra_acc*100.0))
                # Use a monotonically increasing step so points within one
                # epoch do not overwrite each other in TensorBoard.
                train_writer.add_summary(summary_str, ep * total_batch + i)
            else:
                _, tra_loss, tra_acc = sess.run([train_op, train_loss, train__acc])
        if stopped:
            # The coordinator asked to stop: do not record this partial
            # epoch as completed and do not keep looping over later epochs.
            break
        # Mark the epoch as done BEFORE saving, so a restored checkpoint
        # resumes with the next epoch instead of repeating this one.
        sess.run(increment_epoch)
        print("Train epoch:", '%02d' % (ep + 1), "Loss=", "{:.6f}".format(tra_loss), "Accuracy=", tra_acc)
        checkpoint_path = os.path.join(logs_train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=ep)
except tf.errors.OutOfRangeError:
    print('Done training -- epoch limit reached')
finally:
    coord.request_stop()
    coord.join(threads)
    train_writer.close()  # flush pending summary events to disk
    sess.close()
先训练一段时间然后中断
继续训练,发现训练会从上一次中断时保存的 epoch 处接着开始
1.BATCH_SIZE 自行设置,根据自己的想法与电脑配置设置
2.CAPACITY 自己设置,根据自己的想法与电脑配置设置
3.Turns 的计算方法:用验证所使用的数据集图片数量除以 BATCH_SIZE 后取整
4.每次验证完,总体错误数可能略有偏差:由于 CAPACITY、BATCH_SIZE 的设置,处理的轮数取整后可能无法完全遍历数据集,而且队列中可能仍残留一些图片。CAPACITY、BATCH_SIZE 的设置请自行摸索,实在不会,自行百度
为了方便,我用训练时的 tfrecord 进行评估,根据前面两篇大家可以自行将 验证集图像转化成 tfrecord 形式进行评估
IMG_W = 224
IMG_H = 224
CAPACITY = 64
BATCH_SIZE = 1000
Turns = int(25000/BATCH_SIZE)  # number of evaluation batches
tfrecords_file = './data/dog_vs_cat.tfrecords'


def evaluate_tfrecord_all_image():
    """Evaluate the latest checkpoint on batches read from the TFRecord file.

    Runs ``Turns`` batches of ``BATCH_SIZE`` examples, prints per-batch loss
    and accuracy, then the total number of wrong predictions and the overall
    accuracy. If no checkpoint is found in ``./training/`` a message is
    printed and the function returns without evaluating.

    Relies on the module-level ``N_CLASSES``, ``BATCH_SIZE``, ``CAPACITY``,
    ``Turns`` and ``tfrecords_file``.
    """
    tf.reset_default_graph()
    test_batch, test_label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE, capacity=CAPACITY)
    logits = inference(test_batch, BATCH_SIZE, N_CLASSES)
    testloss = losses(logits, test_label_batch)
    testacc = evaluation(logits, test_label_batch)
    logs_train_dir = './training/'
    saver = tf.train.Saver()
    sess = tf.Session()

    print("Reading checkpoints...")
    ckpt = tf.train.get_checkpoint_state(logs_train_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
        # Without restored weights the variables are uninitialised, so
        # running the graph would fail; stop here instead.
        print('No checkpoint file found')
        sess.close()
        return
    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
    saver.restore(sess, ckpt.model_checkpoint_path)
    print('Loading success, global_step is %s' % global_step)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        wrong_sum = 0
        for i in range(Turns):
            print('INFO:The {} batch has been dealt'.format(i + 1))
            test_loss, test_acc = sess.run([testloss, testacc])
            print('The model\'s loss is %.2f' % test_loss)
            # Round rather than truncate: the accuracy is a float mean, and
            # truncation can under-count correct predictions by one.
            correct = int(round(BATCH_SIZE * float(test_acc)))
            print('Correct : %d' % correct)
            wrong = BATCH_SIZE - correct
            wrong_sum += wrong
            print('Wrong : %d' % wrong)
            print('The accuracy in the batch is %.2f%%' % (test_acc*100.0))
        # Base the overall accuracy on the images actually evaluated, so
        # images skipped by the integer division are not counted as correct.
        evaluated = Turns * BATCH_SIZE
        total_accuracy = (evaluated - wrong_sum) / float(evaluated) if evaluated else 0.0
        print('------------------*******------------------')
        print('INFO:{} images are detected wrong'.format(wrong_sum))
        print('INFO:The eval file accuracy is {:4}%'.format(total_accuracy*100))
    finally:
        # Always stop the queue threads and release the session.
        coord.request_stop()
        coord.join(threads)
        sess.close()
实际效果如下:在全部 25000 张图中,错误为 318 张,整体正确率约为 98.73%
当然有的读者喜欢使用原图,或者已经使用原图进行训练,那么原图验证如下,当然运行起来确实慢了不少,tfrecord 模式全部验证完,可能原图一个 batch 还没跑完,最终显示有 592 张不同,为何结果不一样,参照上述
IMG_W = 224
IMG_H = 224
CAPACITY = 64
BATCH_SIZE = 1000
Turns = int(25000/BATCH_SIZE)  # number of evaluation batches


def evaluate_all_image():
    """Evaluate the latest checkpoint on the original image files.

    Lists the images under ``./data/train1/`` with ``get_files``, batches
    them with ``get_batch``, runs ``Turns`` batches of ``BATCH_SIZE`` images
    and prints per-batch loss and accuracy followed by the total number of
    wrong predictions and the overall accuracy. If no checkpoint is found in
    ``./training/`` a message is printed and the function returns without
    evaluating.

    Relies on the module-level ``IMG_W``, ``IMG_H``, ``BATCH_SIZE``,
    ``CAPACITY`` and ``Turns``.
    """
    tf.reset_default_graph()
    test_dir = './data/train1/'
    N_CLASSES = 2
    test, test_label = get_files(test_dir)
    IMAGES_NUM = len(test_label)
    print('There are %d test images totally..' % IMAGES_NUM)
    test_batch, test_label_batch = get_batch(test, test_label, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
    logits = inference(test_batch, BATCH_SIZE, N_CLASSES)
    testloss = losses(logits, test_label_batch)
    testacc = evaluation(logits, test_label_batch)
    logs_train_dir = './training/'
    saver = tf.train.Saver()
    sess = tf.Session()

    print("Reading checkpoints...")
    ckpt = tf.train.get_checkpoint_state(logs_train_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
        # Without restored weights the variables are uninitialised, so
        # running the graph would fail; stop here instead.
        print('No checkpoint file found')
        sess.close()
        return
    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
    saver.restore(sess, ckpt.model_checkpoint_path)
    print('Loading success, global_step is %s' % global_step)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        wrong_sum = 0
        for i in range(Turns):
            print('INFO:The {} batch has been dealt'.format(i + 1))
            test_loss, test_acc = sess.run([testloss, testacc])
            print('The model\'s loss is %.2f' % test_loss)
            # Round rather than truncate: the accuracy is a float mean, and
            # truncation can under-count correct predictions by one.
            correct = int(round(BATCH_SIZE * float(test_acc)))
            print('Correct : %d' % correct)
            wrong = BATCH_SIZE - correct
            wrong_sum += wrong
            print('Wrong : %d' % wrong)
            print('The accuracy in the batch is %.2f%%' % (test_acc*100.0))
        # Base the overall accuracy on the images actually evaluated, so
        # images skipped by the integer division are not counted as correct.
        evaluated = Turns * BATCH_SIZE
        total_accuracy = (evaluated - wrong_sum) / float(evaluated) if evaluated else 0.0
        print('------------------*******------------------')
        print('INFO:{} images are detected wrong'.format(wrong_sum))
        print('INFO:The eval file accuracy is {:4}%'.format(total_accuracy*100))
    finally:
        # Always stop the queue threads and release the session, even if a
        # batch fails part-way through.
        coord.request_stop()
        coord.join(threads)
        sess.close()
升级后的 Tensorboard 面板显示 accuracy 与 loss 如下
计算图如下
可能我水平有限,但这确实是我踩过的坑:训练时用的是 TensorFlow 的标准化函数,如果验证单张图片时使用 feed_dict 方式喂入,则容易造成验证结果与预期差距很大。原因是 feed_dict 方式喂入的是图像的矩阵,而训练时喂入的是张量;使用其他库对图像进行标准化得到的矩阵,与使用 TensorFlow 内置函数标准化得到的张量(这个张量可以可视化为矩阵)在数值上并不相同。例如,使用 PIL 的 Image 处理得到的图像与 TensorFlow 的结果相差很大
完结撒花,准备 GitHub 去了
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。