赞
踩
数据预处理部分显得非常笨拙……请大家指教。
- import numpy as np
- import pandas as pd
- from network import RNN
- from dataset import MyDataset
- from sklearn.model_selection import KFold
- import torch
- import torch.nn as nn
- import torch.utils.data.dataloader as DataLoader
- import matplotlib.pyplot as plt
-
# Prepare data: extract the hourly PM2.5 series from train.csv and build
# sliding-window training pairs (9 consecutive hours -> the next hour).
data = pd.read_csv("train.csv")  # read_csv already returns a DataFrame
pmData = data.query('測項=="PM2.5"')

# Drop the metadata columns, then the first row and first column,
# leaving only the hourly numeric readings.
trainData = np.array(pmData.drop(['日期', '測站', '測項'], axis=1))
trainData = np.delete(trainData, 0, 0)
trainData = np.delete(trainData, 0, 1)
trainData = trainData.flatten()

(n,) = trainData.shape
print(n)

# Build all length-9 windows in one pass.  The original appended one row at a
# time with np.append, reallocating the whole array every iteration (O(n^2));
# stacking a list comprehension is O(n) and produces the identical matrix.
cycleData = np.array([trainData[i:i + 9] for i in range(n - 9)],
                     dtype=np.float32)
# The label for the window starting at i is the reading at hour i + 9.
cycleLabel = trainData[9:].reshape((-1, 1)).astype(np.float32)

print(cycleData.shape)  # (5488, 9)
print(cycleLabel.shape)  # (5488, 1)
-
# cuda setting — this script hard-requires a GPU.
assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing the GPU

# training setting
errors = []          # per-fold list of per-epoch training losses
k_folds = 8          # 8-fold cross validation
epochs = 30
lr = [0.001, 0.001, 0.004, 0.008, 0.01, 0.04, 0.08, 0.1]  # one learning rate per fold
batch_size = 7
# FIX: `reduce=True` has been deprecated for years; `reduction='mean'`
# is the modern spelling of the same (default) behaviour.
criterion = nn.MSELoss(reduction='mean')
criterion.cuda()
# Initial hidden state, shape (num_layers, batch, hidden_size);
# 32 matches the hidden size declared in the RNN network definition.
h_state = torch.zeros([1, batch_size, 32])
-
kf = KFold(n_splits=k_folds)

fold = 0
for train_index, test_index in kf.split(cycleData):

    # To save time, only the first fold is actually trained.
    if fold != 0:
        break

    dataset = MyDataset(cycleData[train_index], cycleLabel[train_index])
    # drop_last keeps every batch at exactly batch_size, matching the fixed
    # batch dimension of h_state (a ragged final batch would not).
    dataloader = DataLoader.DataLoader(dataset, batch_size=batch_size,
                                       shuffle=True, drop_last=True)

    net = RNN()
    net = net.float()
    net.cuda()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr[fold])

    # training loss curve for this fold
    err_list = []
    print("\ntraining...")
    for _ in range(epochs):
        epoch_loss = 0
        num_batches = 0

        for i, (x, lbs) in enumerate(dataloader):
            outputs, h_state = net(x.float().cuda(), h_state.cuda())
            # Keep only the prediction at the last time step of each sequence.
            outputs = outputs[:, -1, :]

            # Detach so gradients do not propagate across batches.
            h_state = h_state.detach()
            lbs = lbs.squeeze(1)
            loss = criterion(outputs, lbs.cuda())

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += torch.sum((outputs - lbs.cuda()) ** 2).item() / batch_size
            num_batches += 1

        # BUG FIX: the original divided by `i` (the *index* of the last batch),
        # under-counting by one and crashing with ZeroDivisionError when the
        # epoch has a single batch.  Divide by the actual batch count.
        err_list.append(epoch_loss / num_batches)

    print(err_list)
    errors.append(err_list)

    # paint the learning curve
    plt.figure(fold)
    plt.plot(range(len(errors[-1])), errors[-1])
    plt.title("k=" + str(fold) + ", lr=" + str(lr[fold]))
    plt.show()
    fold += 1

# NOTE: fold was already incremented above, so this writes 'RNN_1.pkl' —
# the same filename the evaluation script loads.
torch.save(net, 'RNN_' + str(fold) + '.pkl')
-
-
一个网上到处都是的简单结构
- import torch.nn as nn
-
-
class RNN(nn.Module):
    """Single-layer vanilla RNN (hidden size 32) with a linear read-out
    projecting every hidden state down to a scalar prediction."""

    def __init__(self):
        super(RNN, self).__init__()
        # batch_first=True -> inputs arrive as (batch, seq_len, features)
        self.rnn = nn.RNN(input_size=1, hidden_size=32, num_layers=1,
                          batch_first=True)
        self.out = nn.Linear(32, 1)

    def forward(self, x, h):
        """Run the sequence through the recurrent layer, project each time
        step, and return (prediction, new_hidden_state)."""
        rnn_out, new_h = self.rnn(x, h)
        prediction = self.out(rnn_out)
        return prediction, new_h
简单粗暴的dataset,好像使用的必要性不大,但是习惯了这种……
- from torch.utils.data import Dataset
- import torch
-
-
class MyDataset(Dataset):
    """Wrap paired (samples, labels) numpy arrays as a torch Dataset.

    A trailing singleton dimension is appended to both arrays, so each
    sample comes out shaped (seq_len, 1) — the per-step feature layout a
    batch_first RNN expects.
    """

    def __init__(self, a, b):
        self.file = torch.from_numpy(a).unsqueeze(-1)
        self.label = torch.from_numpy(b).unsqueeze(-1)

    def __getitem__(self, index):
        return self.file[index], self.label[index]

    def __len__(self):
        return len(self.label)
把training.py改了改……其实数据处理都是重复的
- import numpy as np
- import pandas as pd
- from network import RNN
- from dataset import MyDataset
- from sklearn.model_selection import KFold
- import torch
- import torch.nn as nn
- import torch.utils.data.dataloader as DataLoader
- import matplotlib.pyplot as plt
-
# Prepare data — identical preprocessing to the training script:
# extract the PM2.5 series and build 9-hour sliding windows.
data = pd.read_csv("train.csv")  # read_csv already returns a DataFrame
pmData = data.query('測項=="PM2.5"')

trainData = np.array(pmData.drop(['日期', '測站', '測項'], axis=1))
trainData = np.delete(trainData, 0, 0)
trainData = np.delete(trainData, 0, 1)
trainData = trainData.flatten()

(n,) = trainData.shape
print(n)

# Build all length-9 windows at once — the original grew the array with
# np.append inside a loop, which reallocates every iteration (O(n^2)).
cycleData = np.array([trainData[i:i + 9] for i in range(n - 9)],
                     dtype=np.float32)
# Label for the window starting at i is the reading at hour i + 9.
cycleLabel = trainData[9:].reshape((-1, 1)).astype(np.float32)

print(cycleData.shape)
print(cycleLabel.shape)
-
# cuda setting — this script hard-requires a GPU.
assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing the GPU

# evaluation settings (mirror the training script's values)
errors = []
k_folds = 8
epochs = 30
lr = [0.001, 0.001, 0.004, 0.008, 0.01, 0.04, 0.08, 0.1]
batch_size = 1  # NOTE: evaluation feeds one sample at a time — keep this at 1!
# FIX: `reduce=True` has been deprecated for years; `reduction='mean'`
# is the modern spelling of the same (default) behaviour.
criterion = nn.MSELoss(reduction='mean')
criterion.cuda()
# Hidden state shape (num_layers, batch, hidden_size); 32 = the net's hidden size.
h_state = torch.zeros([1, batch_size, 32])
-
kf = KFold(n_splits=8)

fold = 0
for train_index, test_index in kf.split(cycleData):
    # Only the model trained on the first fold is evaluated.
    if fold != 0:
        break

    testset = MyDataset(cycleData[test_index], cycleLabel[test_index])
    testloader = DataLoader.DataLoader(testset)  # default batch_size = 1

    labels = cycleLabel[test_index].flatten().tolist()

    # NOTE(review): torch.load of a whole pickled model is fragile across code
    # changes; saving/loading a state_dict is the recommended pattern.
    net = torch.load('RNN_1.pkl')

    net.eval()
    predict = []
    with torch.no_grad():
        for x, lbs in testloader:
            outputs, h_state = net(x.float().cuda(), h_state.cuda())
            outputs = outputs[:, -1, :]  # keep only the last time step

            h_state = h_state.detach()

            # FIX: the original computed criterion(outputs, lbs) here and
            # never used the result; that dead computation is removed.
            predict.append(outputs.squeeze(1).item())

    # Plot ground truth vs. model prediction for the held-out fold.
    plt.figure(fold)
    x = range(len(predict))
    plt.plot(x, labels, color='orange', label='label')
    plt.plot(x, predict, color='cyan', label='predict')
    plt.legend()
    plt.title("model RNN_1")
    plt.show()

    fold += 1
1、个人认为,按此方法处理数据以后,其实并没有使用RNN的必要,普通NN将九个输入视为九个特征完全可以达到差不多的效果。但显然RNN参数更少。时间有限,此模型没有调参,但基于样本量,完全可以使RNN结构更复杂一点。
2、代码学习借鉴了网络上各种博客,感谢!同时欢迎纠错和提问。
3、模型资源已上传,审核通过后会附上链接。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。