A key step in getting started with machine learning is the data preprocessing that happens before any algorithm runs. Cloning code from GitHub and running the demo by following the README still leaves you some distance from a real start.
This article introduces entry-level data preprocessing. Public datasets such as MNIST come prepackaged, which makes it awkward to read out only part of the data. Below we write a little Python to filter a dataset down to selected classes.
python 3.7.6
pytorch+cuda 11.6
Basic packages such as numpy are taken for granted and not listed. The environment above is for reference only; the code in this article has no particular version requirements.
First, here is a baseline program that loads the MNIST dataset and classifies it with LeNet.
```python
import torch
from torch import nn
from torch.nn import init
import numpy as np
import sys
import torchvision
import torchvision.transforms as transforms
import time
from torch.utils.data import Dataset
from torch.utils import data


# Wrap the full MNIST data (loaded with the official PyTorch utilities) in DataLoaders
# (the function name is kept as-is even though we load MNIST rather than Fashion-MNIST)
def load_data_fashion_mnist(mnist_train, mnist_test, batch_size):
    if sys.platform.startswith('win'):
        num_workers = 0  # multi-process loading is problematic on Windows
    else:
        num_workers = 4
    train_iter = data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter


# LeNet definition
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 6, 5),   # in_channels, out_channels, kernel_size
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),   # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(16*4*4, 120),  # 16 channels of 4x4 feature maps after two conv+pool stages on 28x28 input
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )

    def forward(self, img):
        feature = self.conv(img)
        output = self.fc(feature.view(img.shape[0], -1))
        return output


# Evaluation helper
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # If no device is given, use the device the net lives on
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            net.eval()  # evaluation mode; this disables dropout
            acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
            net.train()  # switch back to training mode
            n += y.shape[0]
    return acc_sum / n


# Training loop
def train(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    loss = torch.nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, batch_count, start = 0.0, 0.0, 0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))


if __name__ == '__main__':
    # Load the dataset with the official torchvision helper
    mnist_train = torchvision.datasets.MNIST(root='../data', train=True, download=True, transform=transforms.ToTensor())
    mnist_test = torchvision.datasets.MNIST(root='../data', train=False, download=True, transform=transforms.ToTensor())
    batch_size = 128
    train_iter, test_iter = load_data_fashion_mnist(mnist_train, mnist_test, batch_size)
    # Use CUDA if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    net = LeNet()
    lr, num_epochs = 0.001, 5
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    train(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
```
The code above comes from the web and runs as-is. It is not the focus of this article, so we will not go into it further.
MNIST contains 10 classes, the handwritten digits 0-9. We now declare which classes we want to extract:
```python
# numclass lists the digits to extract; they need not be in order
numclass = [0, 1, 3, 5, 6, 7, 8]
```
Here we extract 0, 1, 3, 5, 6, 7 and 8 for the demonstration.
Next we define a function that pulls the wanted MNIST data out of the iterator (the DataLoader, passed in as data_iter) produced by the official PyTorch code:
```python
# getdataset extracts the wanted digits from the data loaded by torch
def getdataset(data_iter, numclass):
    # Since this is a lookup, we want as few passes as possible; eval() lets us
    # build the whole boolean test in one go. cmd holds the expression to evaluate.
    cmd = ''
    for i, j in enumerate(numclass):
        cmd += "(" + "y==" + str(j) + ")"
        if i != (len(numclass) - 1):
            cmd += "^"  # the classes are mutually exclusive, so XOR acts as OR here
    # Run the lookup batch by batch
    for i, (X, y) in enumerate(data_iter):
        if i == 0:
            index = np.where(eval(cmd))
            x_out = X[index]
            y_out = y[index]
        else:
            index = np.where(eval(cmd))
            x_out = torch.cat([x_out, X[index]], dim=0)
            y_out = torch.cat([y_out, y[index]], dim=0)
    # The labels of the extracted dataset must be re-indexed from 0, otherwise
    # training will raise an error. Remap on a copy so that an unordered numclass
    # cannot remap an already-remapped label a second time.
    y_new = y_out.clone()
    for i, j in enumerate(numclass):
        index = np.where(y_out == j)
        y_new[index] = i
    return x_out, y_new
```
Because the extraction works by scanning a lookup over the labels, the fewer passes the better, so this article builds the whole boolean test once with eval(). eval() is a Python built-in that evaluates a string as an expression; see the documentation for details.
cmd holds a code fragment determined by the numclass variable; after the first for loop it contains
`(y==0)^(y==1)^(y==3)^(y==5)^(y==6)^(y==7)^(y==8)`
Since a label matches at most one of these tests, `^` (XOR) gives the same result as `|` (OR) here.
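If you would rather not use eval(), the same mask can be built directly. Here is a minimal eval-free sketch; class_mask is a name introduced purely for illustration:

```python
import torch

# Build the same boolean mask by OR-ing the per-class tests
# (y is a batch of labels, numclass the wanted digits)
def class_mask(y, numclass):
    mask = torch.zeros_like(y, dtype=torch.bool)
    for j in numclass:
        mask |= (y == j)  # equivalent to the generated (y==0)^(y==1)^... expression
    return mask

y = torch.tensor([0, 2, 3, 4, 5, 9])
print(class_mask(y, [0, 1, 3, 5, 6, 7, 8]))
# tensor([ True, False,  True, False,  True, False])
```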
The second for loop performs the lookup: it reads X and y from the data_iter passed in, where X holds the handwritten-digit images and y the labels, and the search runs over y.
np.where does the actual lookup and returns the indices of the matches; see its documentation for details. The if statement initializes the output tensors on the first batch and concatenates onto them afterwards, so the tensor type of the data never changes.
X[index] and y[index] are ordinary Python-style (fancy) indexing, and torch.cat() concatenates the pieces.
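A tiny sketch of these three operations on made-up tensors:

```python
import numpy as np
import torch

X = torch.arange(12.0).view(6, 2)      # stand-in for a batch of images
y = torch.tensor([0, 2, 3, 4, 5, 9])   # stand-in for the labels
index = np.where((y == 0) ^ (y == 3) ^ (y == 5))
print(index)      # (array([0, 2, 4]),) -- positions of the matching labels
print(y[index])   # tensor([0, 3, 5])
print(torch.cat([X[index], X[index]], dim=0).shape)  # torch.Size([6, 2])
```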
Once the second for loop finishes, we have the extracted subset of MNIST. But we are not done yet: the labels y must be re-indexed to start from 0, because CrossEntropyLoss expects class indices in [0, num_classes), and PyTorch would otherwise raise an error during training.
The third for loop is another lookup; after it runs, the labels become consecutive, as the sketch below shows:
before: 0, 1, 3, 5, 6, 7, 8
after: 0, 1, 2, 3, 4, 5, 6
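A quick sketch of the remapping, and of why the code writes into a copy: with an unordered numclass such as [1, 0], remapping in place would first turn every 1 into 0 and then turn all of those 0s into 1:

```python
import torch

numclass = [0, 1, 3, 5, 6, 7, 8]
y_out = torch.tensor([3, 8, 0, 5, 7])
y_new = y_out.clone()            # remap on a copy to avoid double-remapping
for i, j in enumerate(numclass):
    y_new[y_out == j] = i
print(y_new)                     # tensor([2, 6, 0, 3, 5])
```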
With that, the data can be returned.
To split the result into iterators with data.DataLoader, we need our own dataset class, so we define MyDataSet as a subclass of Dataset:
```python
# MyDataSet: a Dataset subclass so that data.DataLoader can turn the tensors into iterators
class MyDataSet(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label
        self.length = data.shape[0]

    def __getitem__(self, mask):
        return self.data[mask], self.label[mask]

    def __len__(self):
        return self.length
```
A Dataset subclass must provide three members: __init__, __getitem__ and __len__.
We will not dwell on Dataset subclasses; they look much the same from one program to the next. Search the docs if you are interested.
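A minimal usage sketch with random stand-in tensors:

```python
import torch
from torch.utils import data

images = torch.randn(10, 1, 28, 28)    # 10 fake 28x28 grayscale images
labels = torch.randint(0, 7, (10,))    # 10 fake labels in [0, 7)
dataset = MyDataSet(images, labels)
loader = data.DataLoader(dataset, batch_size=4, shuffle=True)
for X, y in loader:
    print(X.shape, y.shape)            # e.g. torch.Size([4, 1, 28, 28]) torch.Size([4])
```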
Note that because the number of classes changes, this article makes a small modification to the LeNet definition so that the output layer adapts to the class count.
```python
import torch
from torch import nn
from torch.nn import init
import numpy as np
import sys
import torchvision
import torchvision.transforms as transforms
import time
from torch.utils.data import Dataset
from torch.utils import data

# numclass lists the digits to extract; they need not be in order
numclass = [0, 1, 3, 5, 6, 7, 8]


# getdataset extracts the wanted digits from the data loaded by torch
def getdataset(data_iter, numclass):
    # Build the whole boolean test once with eval() to keep the lookup to a single pass;
    # cmd holds the expression to evaluate
    cmd = ''
    for i, j in enumerate(numclass):
        cmd += "(" + "y==" + str(j) + ")"
        if i != (len(numclass) - 1):
            cmd += "^"  # the classes are mutually exclusive, so XOR acts as OR here
    # Run the lookup batch by batch
    for i, (X, y) in enumerate(data_iter):
        if i == 0:
            index = np.where(eval(cmd))
            x_out = X[index]
            y_out = y[index]
        else:
            index = np.where(eval(cmd))
            x_out = torch.cat([x_out, X[index]], dim=0)
            y_out = torch.cat([y_out, y[index]], dim=0)
    # The extracted labels must be re-indexed from 0, otherwise training will raise
    # an error; remap on a copy so an unordered numclass cannot remap a label twice
    y_new = y_out.clone()
    for i, j in enumerate(numclass):
        index = np.where(y_out == j)
        y_new[index] = i
    return x_out, y_new


# MyDataSet: a Dataset subclass so that data.DataLoader can turn the tensors into iterators
class MyDataSet(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label
        self.length = data.shape[0]

    def __getitem__(self, mask):
        return self.data[mask], self.label[mask]

    def __len__(self):
        return self.length


# Wrap the MNIST data in DataLoaders
# (the function name is kept as-is even though we load MNIST rather than Fashion-MNIST)
def load_data_fashion_mnist(mnist_train, mnist_test, batch_size):
    if sys.platform.startswith('win'):
        num_workers = 0  # multi-process loading is problematic on Windows
    else:
        num_workers = 4
    train_iter = data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter


# LeNet definition
class LeNet(nn.Module):
    # The network sizes its final fully connected layer according to numclass
    def __init__(self, numofclass=len(numclass)):
        super(LeNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 6, 5),   # in_channels, out_channels, kernel_size
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),   # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(16*4*4, 120),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, numofclass)
        )

    def forward(self, img):
        feature = self.conv(img)
        output = self.fc(feature.view(img.shape[0], -1))
        return output


# Evaluation helper
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # If no device is given, use the device the net lives on
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            net.eval()  # evaluation mode; this disables dropout
            acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
            net.train()  # switch back to training mode
            n += y.shape[0]
    return acc_sum / n


# Training loop
def train(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    loss = torch.nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, batch_count, start = 0.0, 0.0, 0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))


if __name__ == '__main__':
    # Load the dataset with the official torchvision helper
    mnist_train = torchvision.datasets.MNIST(root='../data', train=True, download=True, transform=transforms.ToTensor())
    mnist_test = torchvision.datasets.MNIST(root='../data', train=False, download=True, transform=transforms.ToTensor())
    batch_size = 128
    train_iter, test_iter = load_data_fashion_mnist(mnist_train, mnist_test, batch_size)

    # Pick the wanted digits out of the loaded iterators
    tdata, tlabel = getdataset(train_iter, numclass)
    mnist_train = MyDataSet(tdata, tlabel)
    tdata, tlabel = getdataset(test_iter, numclass)
    mnist_test = MyDataSet(tdata, tlabel)
    print("dataset rebuilt: " + str(numclass))
    train_iter, test_iter = load_data_fashion_mnist(mnist_train, mnist_test, batch_size)

    # Use CUDA if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    net = LeNet()
    lr, num_epochs = 0.001, 5
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    train(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
```
Because this dataset is small, we can preprocess it and train on it in the same program. For larger datasets, save the filtered data to disk and load it back in later runs, or memory will not hold up.
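A minimal sketch of that save-then-load pattern; the file name mnist_subset.pt is only illustrative:

```python
import torch

# After filtering, persist the subset once...
x_out, y_out = getdataset(train_iter, numclass)
torch.save({'data': x_out, 'label': y_out}, 'mnist_subset.pt')

# ...and in later runs load it back instead of re-filtering
loaded = torch.load('mnist_subset.pt')
mnist_train = MyDataSet(loaded['data'], loaded['label'])
```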
This article has shown how to extract part of a public dataset, using MNIST as the example. Treat it as a starting point and adapt the idea to your own programs.
My knowledge is limited, so corrections to any mistakes are welcome.