Data augmentation produces additional training samples from the existing images through transformations such as scaling, stretching, rotation, shearing, and flipping.
This article uses Keras's ImageDataGenerator to perform the augmentation; a small preview sketch follows, and the two examples after it compare training with and without it.
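Before the two full examples, here is a minimal preview sketch (not part of the original tutorial; the sample image path is an assumption and can be replaced with any local picture) that feeds one image through an ImageDataGenerator configured with the transforms listed above and plots several randomly augmented variants:

# A minimal sketch showing what ImageDataGenerator's random transforms look like
# on a single image. `sample_path` is a placeholder; point it at any local JPEG.
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

sample_path = '/tmp/cats_and_dogs_filtered/train/cats/cat.0.jpg'  # placeholder path

img = img_to_array(load_img(sample_path, target_size=(150, 150)))  # shape (150, 150, 3)
img = np.expand_dims(img, axis=0)                                  # shape (1, 150, 150, 3)

demo_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

# Draw four randomly augmented versions of the same image and display them
plt.figure(figsize=(8, 2))
flow = demo_datagen.flow(img, batch_size=1)
for i in range(4):
    batch = next(flow)
    plt.subplot(1, 4, i + 1)
    plt.imshow(batch[0])
    plt.axis('off')
plt.show()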
【Example 1】Without data augmentation
!wget --no-check-certificate \
    https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip \
    -O /tmp/cats_and_dogs_filtered.zip

import os
import zipfile
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Unzip the downloaded dataset to /tmp
local_zip = '/tmp/cats_and_dogs_filtered.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('/tmp')
zip_ref.close()

base_dir = '/tmp/cats_and_dogs_filtered'
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')

# Directory with our training cat pictures
train_cats_dir = os.path.join(train_dir, 'cats')

# Directory with our training dog pictures
train_dogs_dir = os.path.join(train_dir, 'dogs')

# Directory with our validation cat pictures
validation_cats_dir = os.path.join(validation_dir, 'cats')

# Directory with our validation dog pictures
validation_dogs_dir = os.path.join(validation_dir, 'dogs')

model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(learning_rate=1e-4),  # 'lr' is deprecated; use 'learning_rate'
              metrics=['acc'])

# All images will be rescaled by 1./255
train_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Flow training images in batches of 20 using the train_datagen generator
train_generator = train_datagen.flow_from_directory(
    train_dir,               # This is the source directory for training images
    target_size=(150, 150),  # All images will be resized to 150x150
    batch_size=20,
    # Since we use binary_crossentropy loss, we need binary labels
    class_mode='binary')

# Flow validation images in batches of 20 using the test_datagen generator
validation_generator = test_datagen.flow_from_directory(
    validation_dir,
    target_size=(150, 150),
    batch_size=20,
    class_mode='binary')

# fit_generator() was removed in recent TensorFlow versions; fit() accepts generators directly
history = model.fit(
    train_generator,
    steps_per_epoch=100,     # 2000 images = batch_size * steps
    epochs=100,
    validation_data=validation_generator,
    validation_steps=50,     # 1000 images = batch_size * steps
    verbose=2)

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'bo', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
【Results】
【Analysis】
The training accuracy approaches 100%, while the validation accuracy stays at only about 70%, which indicates severe overfitting.
The remedy is to obtain more training data through data augmentation, as Example 2 does; a quick numeric check of the gap is sketched below.
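As a quick sanity check on this diagnosis, a short sketch (assuming the `history` object from Example 1 is still in scope) prints the final accuracies and their gap:

# Assumes `history` from Example 1 is available in the current session
final_train_acc = history.history['acc'][-1]
final_val_acc = history.history['val_acc'][-1]
print(f"final training accuracy:   {final_train_acc:.3f}")
print(f"final validation accuracy: {final_val_acc:.3f}")
print(f"generalization gap:        {final_train_acc - final_val_acc:.3f}")  # a large gap signals overfitting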
【Example 2】Cats-vs-dogs model with data augmentation
'''
!wget --no-check-certificate \
    https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip \
    -O /tmp/cats_and_dogs_filtered.zip
'''

import os
import zipfile
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator

local_zip = '/tmp/cats_and_dogs_filtered.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('/tmp')
zip_ref.close()

base_dir = '/tmp/cats_and_dogs_filtered'
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')

# Directory with our training cat pictures
train_cats_dir = os.path.join(train_dir, 'cats')

# Directory with our training dog pictures
train_dogs_dir = os.path.join(train_dir, 'dogs')

# Directory with our validation cat pictures
validation_cats_dir = os.path.join(validation_dir, 'cats')

# Directory with our validation dog pictures
validation_dogs_dir = os.path.join(validation_dir, 'dogs')

model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(learning_rate=1e-4),  # 'lr' is deprecated; use 'learning_rate'
              metrics=['acc'])

# This code has changed. Instead of ImageDataGenerator just rescaling
# the image, we also rotate and do other operations.
# Updated to do image augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

test_datagen = ImageDataGenerator(rescale=1./255)

# Flow training images in batches of 20 using the train_datagen generator
train_generator = train_datagen.flow_from_directory(
    train_dir,               # This is the source directory for training images
    target_size=(150, 150),  # All images will be resized to 150x150
    batch_size=20,
    # Since we use binary_crossentropy loss, we need binary labels
    class_mode='binary')

# Flow validation images in batches of 20 using the test_datagen generator
validation_generator = test_datagen.flow_from_directory(
    validation_dir,
    target_size=(150, 150),
    batch_size=20,
    class_mode='binary')

# fit_generator() was removed in recent TensorFlow versions; fit() accepts generators directly
history = model.fit(
    train_generator,
    steps_per_epoch=100,     # 2000 images = batch_size * steps
    epochs=100,
    validation_data=validation_generator,
    validation_steps=50,     # 1000 images = batch_size * steps
    verbose=2)

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'bo', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
The only difference from Example 1 is the set of parameters passed to ImageDataGenerator:
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')
Where:
rescale=1./255 scales pixel values from [0, 255] down to [0, 1].
rotation_range=40 rotates images randomly by up to 40 degrees.
width_shift_range=0.2 / height_shift_range=0.2 shift images horizontally / vertically by up to 20% of their width / height.
shear_range=0.2 applies random shear transformations.
zoom_range=0.2 zooms in or out randomly by up to 20%.
horizontal_flip=True flips images horizontally at random.
fill_mode='nearest' fills pixels exposed by the transformations with the nearest existing pixel values.
A rough equivalent using the newer Keras preprocessing layers is sketched below.
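For reference, roughly the same augmentation policy can be expressed with the Keras preprocessing layers available in TensorFlow 2.6 and later. This is a hedged sketch rather than a drop-in replacement: the factors are approximate conversions of the ImageDataGenerator arguments above, and shear_range has no direct counterpart among these core layers.

# Approximate equivalent of the ImageDataGenerator settings above, using Keras
# preprocessing layers (TensorFlow >= 2.6).
import tensorflow as tf

data_augmentation = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1./255),                        # rescale=1./255 (always applied)
    tf.keras.layers.RandomFlip("horizontal"),                 # horizontal_flip=True
    tf.keras.layers.RandomRotation(40 / 360,                  # ~40 degrees (factor is a fraction of a full turn)
                                   fill_mode='nearest'),
    tf.keras.layers.RandomTranslation(0.2, 0.2,               # height/width_shift_range=0.2
                                      fill_mode='nearest'),
    tf.keras.layers.RandomZoom(0.2, fill_mode='nearest'),     # zoom_range=0.2
    # Note: no core-layer counterpart for shear_range here.
])

# Usage: prepend to the model; the Random* layers only transform inputs during
# training, while Rescaling is applied at both training and inference time.
# model = tf.keras.Sequential([data_augmentation, tf.keras.layers.Conv2D(32, (3, 3), ...), ...])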
【Results】
【Analysis】
The training and validation accuracies now track each other closely, a clear improvement over Example 1.