This post uses attentional feature fusion (AFF) to combine multiple features, a method that still sees relatively little use. The papers I found only describe the method without a worked example, so after reading them I decided to write my own implementation, with a few innovations of my own on the neural-network side.
I have often seen people complain that nine out of ten models gain nothing from bolting on an attention mechanism. This one finally does show a gain, which makes it worth writing up and sharing!
The air quality prediction in this post uses a dataset from the Max Planck Institute for Biogeochemistry. It records weather data collected every 10 minutes since 2003, consolidated into files every six months. To get a reasonably large sample, we use the period from January 1, 2019 to June 30, 2020.
The dataset contains 22 features in total. The ones used for this air quality prediction task are: CO2 concentration (CO2 (ppm)), air pressure (P (mbar)), air density (rho (g/m**3)), and temperature (T (°C)).
```python
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import MinMaxScaler
```
Preprocessing is critical to the whole experiment. The pipeline covers handling missing values, splitting the data, and extracting features. Before that, the raw files need some preparation, such as removing redundant rows, filtering out noisy records, and smoothing. These steps yield clean model inputs, which improves training and the final predictions. As shown below, the data is split 80% / 10% / 10% into training, validation, and test sets.
```python
def data_concat(file):
    # Concatenate the header file with each half-year data file into one DataFrame
    file_head = 'D:\\Jupyter Notebook\\数据集'
    file_concat = pd.read_csv(file_head + '\\Header.csv')
    for i in range(len(file)):
        file_concat = pd.concat([file_concat, pd.read_csv(file_head + '\\' + file[i])])
    return file_concat

data = ['mpi_roof_2019a/mpi_roof_2019a.csv', 'mpi_roof_2019b/mpi_roof_2019b.csv',
        'mpi_roof_2020a/mpi_roof_2020a.csv']

dataset = data_concat(data)
```
```python
# Convert the 'DD.MM.YYYY HH:MM:SS' date strings to datetime objects
def str_to_datetime(s):
    split = s.split('.')
    year, month, day = int(split[2][:4]), int(split[1]), int(split[0])
    hour, minute, second = int(split[2][-8:-6]), int(split[2][-5:-3]), int(split[2][-2:])
    return datetime.datetime(year=year, month=month, day=day, hour=hour, minute=minute, second=second)

dataset['Date Time'] = dataset['Date Time'].apply(str_to_datetime)
```
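If the timestamps follow the `DD.MM.YYYY HH:MM:SS` pattern throughout (as they appear to here), pandas can do the same conversion in one vectorized call; a sketch:

```python
# Equivalent vectorized conversion (assuming a uniform 'DD.MM.YYYY HH:MM:SS' format)
dataset['Date Time'] = pd.to_datetime(dataset['Date Time'], format='%d.%m.%Y %H:%M:%S')
```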
```python
# Feature extraction: keep only the columns we model on
def Feature_extraction(data_file, features_considered):
    dataset_main = data_file[features_considered]
    return dataset_main

# The original post does not show this list; based on the features named above,
# it should be the timestamp plus the four model inputs:
features_considered = ['Date Time', 'p (mbar)', 'T (degC)', 'rho (g/m**3)', 'CO2 (ppm)']

dataset = Feature_extraction(dataset, features_considered)
dataset['target'] = dataset['CO2 (ppm)'].shift(-1)
dataset = dataset.dropna()  # shift() necessarily leaves NaNs at the end; drop those rows
# The -1 in shift() makes 'target' the next CO2 reading, i.e. single-step prediction;
# increase its magnitude to predict further ahead.
```
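For example, since samples arrive every 10 minutes, predicting one hour ahead just means shifting six steps; a sketch, not in the original code:

```python
# Hypothetical one-hour-ahead target: 6 steps at 10-minute sampling
dataset['target'] = dataset['CO2 (ppm)'].shift(-6)
dataset = dataset.dropna()
```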
Normalization and X/y separation
```python
# Normalize the data and separate it into inputs X and labels y
def normalization(dataframe):
    dataframe = dataframe.to_numpy()
    dates = dataframe[:, 0]              # first column: timestamps
    middle_matrix = dataframe[:, 1:-1]   # middle columns: input features
    Y = dataframe[:, -1].reshape(-1, 1)  # last column: target
    scaler_data = MinMaxScaler()         # one sklearn scaler each for inputs and labels
    scaler_label = MinMaxScaler()
    middle_matrix = scaler_data.fit_transform(middle_matrix)
    X = middle_matrix.reshape((len(dates), middle_matrix.shape[1], 1))
    Y = scaler_label.fit_transform(Y)
    return dates, X.astype(np.float32), Y.astype(np.float32), scaler_label

dates, X, y, scaler_label = normalization(dataset)
```
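The function returns the fitted label scaler so that normalized predictions can later be mapped back to ppm; a one-line sketch with a hypothetical `y_pred` array:

```python
# Map normalized model outputs back to the original CO2 scale (y_pred is hypothetical)
y_pred_ppm = scaler_label.inverse_transform(y_pred.reshape(-1, 1))
```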
Splitting the dataset
```python
# Split the data chronologically into training, validation, and test sets
q_80 = int(len(dates) * 0.8)
q_90 = int(len(dates) * 0.9)

dates_train, X_train, y_train = dates[:q_80], X[:q_80], y[:q_80]
dates_val, X_val, y_val = dates[q_80:q_90], X[q_80:q_90], y[q_80:q_90]
dates_test, X_test, y_test = dates[q_90:], X[q_90:], y[q_90:]

fig = plt.figure(figsize=(18, 5))
plt.plot(dates_train, y_train)
plt.plot(dates_val, y_val)
plt.plot(dates_test, y_test)
plt.legend(['Train', 'Validation', 'Test'])
```
Converting the data to tensors
```python
# Convert the numpy arrays to PyTorch tensors
X_train = torch.Tensor(X_train)
y_train = torch.Tensor(y_train)
X_test = torch.Tensor(X_test)
y_test = torch.Tensor(y_test)
X_val = torch.Tensor(X_val)
y_val = torch.Tensor(y_val)
```
Setting up the data loaders
```python
def data_generator(x_train, y_train, x_val, y_val, x_test, y_test, batch_size):
    train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
    test_dataset = torch.utils.data.TensorDataset(x_test, y_test)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    # Wrap the datasets in DataLoaders to make them iterable; shuffle is off to
    # preserve temporal order, and drop_last keeps every batch at full size
    # (the AFF module below relies on a fixed batch size)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    return train_loader, val_loader, test_loader

# batch_size (64) is defined in the parameter settings below
train_loader, val_loader, test_loader = data_generator(X_train, y_train, X_val, y_val, X_test, y_test, batch_size=batch_size)
```
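A quick sanity check on one batch (shapes assume batch_size=64 and the four input features):

```python
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)  # torch.Size([64, 4, 1]) torch.Size([64, 1])
```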
Adding TensorBoard
```python
# Create a TensorBoard writer; view the curves later with: tensorboard --logdir logs_AFF_LSTM
writer = SummaryWriter('logs_AFF_LSTM')
```
Defining the training device
```python
# Train on the GPU if one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
```
Defining the neural network
For time-series data the natural choice is an LSTM or GRU; I use an LSTM here.
Strictly speaking, AFF needs at least two feature streams to fuse. The trick in this post is to fuse the LSTM's output sequence with the hidden state h that the LSTM returns alongside it, treating the hidden state as the second stream; that is the main innovation here.
Also, conventional wisdom says regularization doesn't belong in time-series models, yet when I tried it on a whim, dropout actually improved the results, for reasons I don't yet understand. I'd welcome insight from readers on this point.
```python
class AFF(nn.Module):
    # Attentional feature fusion: a local (per-position) and a global (pooled)
    # attention branch produce fusion weights between the two input streams.
    def __init__(self, channels=64, r=4):
        super(AFF, self).__init__()
        inter_channels = int(channels // r)
        # Local attention: pointwise convolutions over every position
        self.local_att = nn.Sequential(
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )
        # Global attention: the same bottleneck applied to a globally pooled summary
        self.global_att = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, residual):
        # x: LSTM output (batch, seq_len, hidden); residual: hidden state (num_layers, batch, hidden).
        # After unsqueeze/transpose both become (*, batch, hidden, 1), so the batch
        # dimension plays the role of the Conv2d channel dimension; this is why
        # channels must equal batch_size and the loaders use drop_last=True.
        x = x.unsqueeze(-1)
        residual = residual.unsqueeze(-1)
        x = x.transpose(0, 1)                        # (seq_len, batch, hidden, 1)
        xa = x + residual                            # broadcasts to (num_layers, batch, hidden, 1) since seq_len = 1
        xl = self.local_att(xa)
        xg = self.global_att(xa)
        xlg = xl + xg
        wei = self.sigmoid(xlg)                      # fusion weights in (0, 1)
        xo = 2 * x * wei + 2 * residual * (1 - wei)  # weighted blend of the two streams
        xo = xo.squeeze(-1)
        xo = xo.transpose(0, 1)                      # (batch, num_layers, hidden)
        return xo
```
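A quick shape check of the fusion, using the hyperparameters from the settings below (batch_size=64, hidden_size=12, num_layers=3, seq_len=1); just a sanity-check sketch:

```python
aff = AFF(channels=64)         # channels must match the batch size here
out = torch.randn(64, 1, 12)   # LSTM output: (batch, seq_len, hidden)
h = torch.randn(3, 64, 12)     # hidden state: (num_layers, batch, hidden)
print(aff(out, h).shape)       # torch.Size([64, 3, 12])
```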
```python
# LSTM backbone that fuses its output with its hidden state via AFF
class net_lstm(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, batch_size):
        super(net_lstm, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.batch_size = batch_size
        self.num_directions = 1  # unidirectional LSTM

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                            batch_first=True)  # LSTM layer
        self.aff = AFF()
        self.drop = nn.Dropout(0.03)
        self.linear1 = nn.Linear(hidden_size, 64)  # fully connected head
        self.linear2 = nn.Linear(64, 32)
        self.linear3 = nn.Linear(32, 1)
        self.relu1 = nn.ReLU()

    def forward(self, x):
        x = x.transpose(1, 2)  # (batch, features, 1) -> (batch, 1, features), i.e. seq_len = 1
        batch_size, seq_len = x.size()[0], x.size()[1]
        # Random initial states, moved to the training device
        h_0 = torch.randn(self.num_directions * self.num_layers, x.size(0), self.hidden_size).to(device)
        c_0 = torch.randn(self.num_directions * self.num_layers, x.size(0), self.hidden_size).to(device)
        output, (h, c) = self.lstm(x, (h_0, c_0))  # output: (batch, 1, hidden_size)
        output = self.aff(output, h)               # fuse output with hidden state -> (batch, num_layers, hidden_size)
        output = self.linear1(output)
        output = self.drop(output)
        output = self.relu1(output)
        output = self.linear2(output)
        output = self.relu1(output)
        pred = self.linear3(output)                # (batch, num_layers, 1)
        pred = pred[:, -1, :]                      # keep the last position -> (batch, 1)
        return pred
```
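A minimal forward-pass smoke test under the settings below; the dummy input just mimics one batch from the loaders:

```python
m = net_lstm(input_size=4, hidden_size=12, num_layers=3, output_size=1, batch_size=64).to(device)
dummy = torch.randn(64, 4, 1).to(device)  # one batch: (batch, features, 1)
print(m(dummy).shape)                     # torch.Size([64, 1])
```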
Parameter settings
```python
# Parameter settings
input_size = 4
num_layers = 3
hidden_size = 12
batch_size = 64
lr = 0.0001
output_size = 1
num_epochs = 150
model = net_lstm(input_size, hidden_size, num_layers, output_size, batch_size).to(device)
loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Training loop
torch.cuda.empty_cache()
total_train_step = 0
total_test_step = 0
for epochs in range(num_epochs):
    print('----- Starting epoch {} -----'.format(epochs + 1))
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_x)
        optimizer.zero_grad()  # clear the gradients accumulated by the previous step
        loss = loss_fn(outputs, batch_y)  # compute the loss
        loss.backward()  # backpropagation
        optimizer.step()
        total_train_step += 1
        if total_train_step % 100 == 0:
            print('Training step: {}, MSE loss: {:.4f}'.format(total_train_step, loss.item()))
            writer.add_scalar('train_loss', loss.item(), total_train_step)

    # Evaluate on the test set after each epoch
    model.eval()
    total_test_loss = 0
    with torch.no_grad():  # no gradient tracking during evaluation
        for i, (batch_x, batch_y) in enumerate(test_loader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            outputs = model(batch_x)
            loss = loss_fn(outputs, batch_y)  # compute the loss
            total_test_loss = total_test_loss + loss.item()
        print('Loss on the whole test set: {:.4f}'.format(total_test_loss))
        writer.add_scalar('test_loss', total_test_loss, total_test_step)
        total_test_step = total_test_step + 1
writer.close()
```
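The comparison below reports MSE, MAE, and explained variance. The original post doesn't show the evaluation code; here is a sketch of how such metrics could be computed on the test set, inverse-transforming predictions back to ppm via the scaler returned by normalization() (note the post doesn't state which scale each reported metric uses):

```python
# Hypothetical final evaluation on the held-out test set (not shown in the original post)
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

model.eval()
preds, trues = [], []
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        out = model(batch_x.to(device)).cpu().numpy()
        preds.append(scaler_label.inverse_transform(out))  # back to ppm
        trues.append(scaler_label.inverse_transform(batch_y.numpy()))
preds, trues = np.concatenate(preds), np.concatenate(trues)
print('MSE:', mean_squared_error(trues, preds))
print('MAE:', mean_absolute_error(trues, preds))
print('Explained variance:', explained_variance_score(trues, preds))
```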
To verify the superiority and effectiveness of the proposed attention-based multi-feature fusion model for air quality prediction, we also compare against iAFF-LSTM, a variant whose attention fusion is iterated over multiple passes, and measure everything with the evaluation metrics below. The compared models are:
(1) LSTM: the long short-term memory network, a classic deep model that often serves as the baseline in such comparisons.
(2) AFF-LSTM: an LSTM augmented with the AFF attention module, so that the two different feature streams are fused effectively.
The table below lists the evaluation scores of this chapter's model and the ablation models.
| Model | MSE | MAE | Explained variance score |
| --- | --- | --- | --- |
| LSTM | 0.0511 | 2.0369 | 0.9741 |
| AFF-LSTM | 0.0311 | 2.2548 | 0.9719 |
| iAFF-LSTM | 0.0286 | 2.1274 | 0.9722 |
Looking at the error curves in TensorBoard: although LSTM alleviates the RNN gradient problem to some degree, that is far from enough for a dataset this large, and LSTM alone struggles on even longer sequences. After the same number of epochs, AFF-LSTM reaches an MSE 0.02 lower than the plain LSTM. This drop shows that attention-based feature fusion has a real advantage in time-series prediction, and that a hybrid model exploits each module better than a single LSTM. Compared with AFF-LSTM, iAFF-LSTM adds an extra AFF stage, which produces better initial features. As Figures 4-3 and 4-4 show, after 150 training epochs AFF-LSTM brings the MSE loss down to 0.0311, while iAFF-LSTM reaches 0.0286 in the same number of epochs. Overall, iAFF-LSTM fits this dataset best and predicts better than the traditional single-network models.
Around 150 epochs are needed here; with too few epochs the prediction may come out as a flat line.
The table above also includes iAFF, the iterated version of AFF: while keeping the model's quality, it pushes the loss down one step further. Having worked through AFF-LSTM, you should have no trouble building iAFF-LSTM yourself; a sketch of what the iterated module could look like follows.
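The post doesn't show the iAFF code. Below is my reconstruction of an iterated fusion module in the spirit of the AFF class above: a first attention pass produces an initial fusion, and a second pass over that fusion yields the final weights. The structure and shape handling follow the AFF class; treat it as a sketch, not the author's exact implementation.

```python
class iAFF(nn.Module):
    # Iterated attentional feature fusion (sketch): stage 1 fuses the two
    # streams; stage 2 recomputes the weights from that initial fusion.
    def __init__(self, channels=64, r=4):
        super(iAFF, self).__init__()
        inter_channels = int(channels // r)

        def local_branch():  # pointwise-conv attention over every position
            return nn.Sequential(
                nn.Conv2d(channels, inter_channels, kernel_size=1),
                nn.BatchNorm2d(inter_channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(inter_channels, channels, kernel_size=1),
                nn.BatchNorm2d(channels),
            )

        def global_branch():  # the same bottleneck on a pooled summary
            return nn.Sequential(nn.AdaptiveAvgPool2d(1), *local_branch())

        self.local_att1, self.global_att1 = local_branch(), global_branch()
        self.local_att2, self.global_att2 = local_branch(), global_branch()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, residual):
        # Same shape handling as AFF.forward above
        x = x.unsqueeze(-1).transpose(0, 1)    # (seq_len, batch, hidden, 1)
        residual = residual.unsqueeze(-1)      # (num_layers, batch, hidden, 1)
        xa = x + residual
        wei = self.sigmoid(self.local_att1(xa) + self.global_att1(xa))
        xi = x * wei + residual * (1 - wei)    # initial fusion
        wei2 = self.sigmoid(self.local_att2(xi) + self.global_att2(xi))
        xo = 2 * x * wei2 + 2 * residual * (1 - wei2)
        return xo.squeeze(-1).transpose(0, 1)  # (batch, num_layers, hidden)
```

Under these assumptions, swapping `self.aff = AFF()` for `self.aff = iAFF()` in net_lstm is enough to reproduce the iAFF-LSTM variant.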