赞
踩
训练神经网络模型之前,需要先获取训练数据集和测试数据集,本文介绍的获取数据集(get_data_train_test)的方法包括以下步骤:
1 在数据集文件夹中,不同类别图像分别放在以各自类别名称命名的文件夹中;
2 获取所有图像路径以及分类;
3 将分类转为字典格式;
4 将所有图像路径打乱;
5 将所有图像路径切分为训练部分和测试部分;
6 获取x部分
6.1 获取图像;
6.2 图像尺寸调整;
6.3 图像降维;
6.4 图像像素值取反;
6.5 图像像素值归一化;
7 获取y部分
7.1 获取图像的类别名称;
7.2 找到类别名称对应的id;
7.3 列表推到;
import os import random import math import sys import cv2 import numpy as np from PIL import Image #数据集路径 DATASET_TRAIN_TEST_DIR = 'D:/word/data_train_test' DATASET_TEST_DIR = 'D:/word/data_test' #随机种子 RANDOM_SEED = 0 #验证集数量 NUM_TEST = 20 #分类数量 NUM_CLASS = 10 #获取所有文件以及分类 def get_filenames_and_classes(dataset_dir): #数据目录 directories = [] #分类名称 class_names = [] for filename in os.listdir(dataset_dir): #合并文件路径 path = os.path.join(dataset_dir, filename) #判断该路径是否为目录 if os.path.isdir(path): #加入数据目录 directories.append(path) #加入类别名称 class_names.append(filename) photo_filenames = [] #循环每个分类的文件夹 for directory in directories: for filename in os.listdir(directory): path = os.path.join(directory, filename) #把图片加入图片列表 photo_filenames.append(path) return photo_filenames, class_names def get_xs(filenames): xs = [] for i in range(len(filenames)): image = Image.open(filenames[i]).convert('L') blank = Image.new('L',[28,28],(255)) max_length = np.max(image.size) w = int(image.size[0]*28/max_length) h = int(image.size[1]*28/max_length) #图像尺寸不超过28*28 image = image.resize((w,h), Image.NEAREST) #图像尺寸调整为28*28 blank.paste(image, ((28-w)//2, (28-h)//2)) #图像尺寸调整为1*784 x = blank.resize((1,784)) #图像转换为数组 x = np.array(x) #图像降维,如[[1],[2],[3]]变为[1,2,3] x = x.squeeze() #图像像素值取反 x = np.full(784, 255) - x #图像像素值归一化 max = np.max(x) x = x / np.full(784, max) #获取多幅图像数据 xs.append(x) return xs def get_ys(filenames, class_names_to_ids): ys = [] for i in range(len(filenames)): #获得图片的类别名称 class_name = os.path.basename(os.path.dirname(filenames[i])) #找到类别名称对应的id class_id = class_names_to_ids[class_name] #列表推到 y=[1 if id==class_id else 0 for id in range(NUM_CLASS)] ys.append(y) return ys def get_data_train_test(): #获得所有图片路径以及分类 photo_filenames, class_names = get_filenames_and_classes(DATASET_TRAIN_TEST_DIR) #把分类转为字典格式,类似于{'A':0, 'B':1, 'C':2} class_names_to_ids = dict(zip(class_names, range(len(class_names)))) #把数据切分为训练集和测试集 random.seed(RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[NUM_TEST:] testing_filenames = photo_filenames[:NUM_TEST] train_xs = get_xs(training_filenames) train_ys = get_ys(training_filenames, class_names_to_ids) test_xs = get_xs(testing_filenames) test_ys = get_ys(testing_filenames, class_names_to_ids) return train_xs, train_ys, test_xs, test_ys def get_data_test(): filenames = [] for filename in os.listdir(DATASET_TEST_DIR): #合并文件路径 path = os.path.join(DATASET_TEST_DIR, filename) filenames.append(path) xs = get_xs(filenames) return xs
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。