基于LSTM的多变量(Features)多输入\多时间跨度(Timesteps)的股票预测模型构建(Keras, Tensorflow, python)_lstm 跨特征变量预测

网上有很多LSTM模型的示例代码,但让人头疼的是,大部分都是用v(t-n)时刻的feature来预测v(t)。那么当需要预测v(t+1), v(t+2)...v(t+n)的时候,估计就会有人采用笨办法,即先预测v(t),之后再用v(t)预测v(t+1),愚公移山,慢慢挖。。。(*@ο@*) 哇~既然写,就写点和别人不一样的,那么我们今天的模型就是用v(t-n)时刻的数据来预测v(t+m)时刻感兴趣的特征值。




  1. import numpy as np
  2. import time
  3. import argparse
  4. import json
  5. from math import sqrt, ceil
  6. from matplotlib import pyplot
  7. from pandas import read_csv
  8. from pandas import DataFrame
  9. from pandas import concat
  10. from sklearn.preprocessing import MinMaxScaler
  11. from sklearn.preprocessing import LabelEncoder
  12. from sklearn.metrics import mean_squared_error
  13. from keras.models import Sequential
  14. from keras.layers import Dense
  15. from keras.layers import LSTM

第二步:加载文件数据,我使用的是.csv的原数据,这只股票相关的feature截图如下图所示(如果没有这数据,可以通过修改自己手头数据的列名称搞定)。加载.csv的函数中,需要设定一些变量,如file_path:文件路径, header_row_index:列表头所在行位置(一般都是第一行0), index_col_name:索引列名,没有索引列时直接设置为None, col_to_predict:需要预测的列名, cols_to_drop:要丢掉的列名,比如这里的index_code和date就直接丢掉,如果不想设置太多参数,可以使用被注释掉的带默认参数值的函数定义。数据加载的过程中,会按照设定删除不需要的列,此外,还会将要预测的列调整到第0列的位置。最终返回col_names:使用到的feature列名组,values:各feature的值(float32格式),values.shape[1]:模型建模中用到的feature数量, output_col_name:预测feature名(本例是open)


  1. # load data set
  2. #def load_dataset(file_path='dataset.csv', header_row_index=0, index_col_name =None, col_to_predict, cols_to_drop=None):
  3. def _load_dataset(file_path, header_row_index, index_col_name, col_to_predict, cols_to_drop):
  4. """
  5. file_path: the csv file path
  6. header_row_index: the header row index in the csv file
  7. index_col_name: the index column (can be None if no index is there)
  8. col_to_predict: the column name/index to predict
  9. cols_to_drop: the column names/indices to drop (single label or list-like)
  10. """
  11. # read dataset from disk
  12. dataset = read_csv(file_path, header=header_row_index, index_col=False)
  13. #print(dataset)
  14. # set index col,设置索引列,参数输入列的名字列表
  15. if index_col_name:
  16. dataset.set_index(index_col_name, inplace=True)
  17. # drop nonused colums,删除不需要的列,参数输入列的名字列表
  18. '''if cols_to_drop:
  19. if type(cols_to_drop[0]) == int:
  20. dataset.drop(index=cols_to_drop, axis=0, inplace=True)
  21. else:
  22. dataset.drop(columns=cols_to_drop, axis=1, inplace=True)'''
  23. if cols_to_drop:
  24. dataset.drop(cols_to_drop, axis =1, inplace = True)
  25. #print('\nprint data set again\n',dataset)
  26. # get rows and column names
  27. col_names = dataset.columns.values.tolist()
  28. values = dataset.values
  29. #print(col_names, '\n values\n', values)
  30. # move the column to predict to be the first col: 把预测列调至第一列
  31. col_to_predict_index = col_to_predict if type(col_to_predict) == int else col_names.index(col_to_predict)
  32. output_col_name = col_names[col_to_predict_index]
  33. if col_to_predict_index > 0:
  34. col_names = [col_names[col_to_predict_index]] + col_names[:col_to_predict_index] + col_names[col_to_predict_index+1:]
  35. values = np.concatenate((values[:, col_to_predict_index].reshape((values.shape[0], 1)), values[:,:col_to_predict_index], values[:,col_to_predict_index+1:]), axis=1)
  36. #print(col_names, '\n values2\n', values)
  37. # ensure all data is float
  38. values = values.astype("float32")
  39. #print(col_names, '\n values3\n', values)
  40. return col_names, values,values.shape[1], output_col_name


  1. # scale dataset
  2. #def _scale_dataset(values, scale_range = (0,1)):
  3. def _scale_dataset(values, scale_range):
  4. """
  5. values: dataset values
  6. scale_range: scale range to fit data in
  7. """
  8. # normalize features
  9. scaler = MinMaxScaler(feature_range=scale_range or (0, 1))
  10. scaled = scaler.fit_transform(values)
  11. return (scaler, scaled)

第四步:将数据格式转化为监督学习的格式。在本例中,我们将n_in_timestep和n_out_timestep设置为3和1,意味着我们将使用T-3, T-2, T-1时刻的所有feature数据来预测T时刻的open值。在生成新格式的过程中,会有一部分单元格被填充为NaN,将dropnan参数设置为True即可把包含NaN的行直接删除。

  1. # convert series to supervised learning (ex: var1(t)_row1 = var1(t-1)_row2),列表打印出来一看就明白了
  2. #def _series_to_supervised(values, n_in=3, n_out=1, dropnan=True, col_names, verbose=True):
  3. def _series_to_supervised(values, n_in, n_out, dropnan, col_names, verbose):
  4. """
  5. values: dataset scaled values
  6. n_in: number of time lags (intervals) to use in each neuron, 与多少个之前的time_step相关,和后面的n_intervals是一样
  7. n_out: number of time-steps in future to predict,预测未来多少个time_step
  8. dropnan: whether to drop rows with NaN values after conversion to supervised learning
  9. col_names: name of columns for dataset
  10. verbose: whether to output some debug data
  11. """
  12. n_vars = 1 if type(values) is list else values.shape[1]
  13. if col_names is None: col_names = ["var%d" % (j+1) for j in range(n_vars)]
  14. df = DataFrame(values)
  15. cols, names = list(), list()
  16. # input sequence (t-n, ... t-1)
  17. for i in range(n_in, 0, -1):
  18. cols.append(df.shift(i))
  19. names += [("%s(t-%d)" % (col_names[j], i)) for j in range(n_vars)]
  20. # forecast sequence (t, t+1, ... t+n)
  21. for i in range(0, n_out):
  22. cols.append(df.shift(-i)) #这里循环结束后cols是个列表,每个列表都是一个shift过的矩阵
  23. if i == 0:
  24. names += [("%s(t)" % (col_names[j])) for j in range(n_vars)]
  25. else:
  26. names += [("%s(t+%d)" % (col_names[j], i)) for j in range(n_vars)]
  27. # put it all together
  28. agg = concat(cols, axis=1) #将cols中的每一行元素一字排开,连接起来,vala t-n_in, valb t-n_in ... valta t, valb t... vala t+n_out-1, valb t+n_out-1
  29. agg.columns = names
  30. # drop rows with NaN values
  31. if dropnan:
  32. agg.dropna(inplace=True)
  33. if verbose:
  34. print("\nsupervised data shape:", agg.shape)
  35. return agg



  1. # split into train and test sets
  2. #def _split_data_to_train_test_sets(values, n_intervals=3, n_features, train_percentage=0.67, verbose=True):
  3. def _split_data_to_train_test_sets(values, n_intervals, n_features, train_percentage, verbose):
  4. """
  5. values: dataset supervised values
  6. n_intervals: number of time lags (intervals) to use in each neuron
  7. n_features: number of features (variables) per neuron
  8. train_percentage: percentage of train data related to the dataset series size; (1-train_percentage) will be for test data
  9. verbose: whether to output some debug data
  10. """
  11. n_train_intervals = ceil(values.shape[0] * train_percentage) #ceil(x)->得到最接近的一个不小于x的整数,如ceil(2.001)=3
  12. train = values[:n_train_intervals, :]
  13. test = values[n_train_intervals:, :]
  14. # split into input and outputs
  15. n_obs = n_intervals * n_features
  16. train_X, train_y = train[:, :n_obs], train[:, -n_features] #train_Y直接赋值倒数第六列,刚好是t + n_out_timestep-1时刻的0号要预测列
  17. #train_X此时的shape为[train.shape[0], timesteps * features]
  18. #print('before reshape\ntrain_X shape:', train_X.shape)
  19. test_X, test_y = test[:, :n_obs], test[:, -n_features]
  20. # reshape input to be 3D [samples, timesteps, features]
  21. train_X = train_X.reshape((train_X.shape[0], n_intervals, n_features))
  22. test_X = test_X.reshape((test_X.shape[0], n_intervals, n_features))
  23. if verbose:
  24. print("")
  25. print("train_X shape:", train_X.shape)
  26. print("train_y shape:", train_y.shape)
  27. print("test_X shape:", test_X.shape)
  28. print("test_y shape:", test_y.shape)
  29. return (train_X, train_y, test_X, test_y)


  1. # create the nn model
  2. #def _create_model(train_X, train_y, test_X, test_y, n_neurons=20, n_batch=50, n_epochs=60, is_stateful=False, has_memory_stack=False, loss_function='mae', optimizer_function='adam', draw_loss_plot=True, output_col_name, verbose=True):
  3. def _create_model(train_X, train_y, test_X, test_y, n_neurons, n_batch, n_epochs, is_stateful, has_memory_stack, loss_function, optimizer_function, draw_loss_plot, output_col_name, verbose):
  4. """
  5. train_X: train inputs
  6. train_y: train targets
  7. test_X: test inputs
  8. test_y: test targets
  9. n_neurons: number of neurons for LSTM nn
  10. n_batch: nn batch size
  11. n_epochs: training epochs
  12. is_stateful: whether the model has memory states
  13. has_memory_stack: whether the model has memory stack
  14. loss_function: the model loss function evaluator
  15. optimizer_function: the loss optimizer function
  16. draw_loss_plot: whether to draw the loss history plot
  17. output_col_name: name of the output/target column to be predicted
  18. verbose: whether to output some debug data
  19. """
  20. # design network
  21. model = Sequential()
  22. if is_stateful:
  23. # calculate new compatible batch size
  24. for i in range(n_batch, 0, -1):
  25. if train_X.shape[0] % i == 0 and test_X.shape[0] % i == 0:
  26. if verbose and i != n_batch:
  27. print ("\n*In stateful network, batch size should be dividable by training and test sets; had to decrease it to %d." % i)
  28. n_batch = i
  29. break
  30. model.add(LSTM(n_neurons, batch_input_shape=(n_batch, train_X.shape[1], train_X.shape[2]), stateful=True, return_sequences=has_memory_stack))
  31. if has_memory_stack:
  32. model.add(LSTM(n_neurons, batch_input_shape=(n_batch, train_X.shape[1], train_X.shape[2]), stateful=True))
  33. else:
  34. model.add(LSTM(n_neurons, input_shape=(train_X.shape[1], train_X.shape[2])))
  35. model.add(Dense(1))
  36. model.compile(loss=loss_function, optimizer=optimizer_function)
  37. if verbose:
  38. print("")
  39. # fit network
  40. losses = []
  41. val_losses = []
  42. if is_stateful:
  43. for i in range(n_epochs):
  44. history = model.fit(train_X, train_y, epochs=1, batch_size=n_batch,
  45. validation_data=(test_X, test_y), verbose=0, shuffle=False)
  46. if verbose:
  47. print("Epoch %d/%d" % (i + 1, n_epochs))
  48. print("loss: %f - val_loss: %f" % (history.history["loss"][0], history.history["val_loss"][0]))
  49. losses.append(history.history["loss"][0])
  50. val_losses.append(history.history["val_loss"][0])
  51. model.reset_states()
  52. else:
  53. history = model.fit(train_X, train_y, epochs=n_epochs, batch_size=n_batch,
  54. validation_data=(test_X, test_y), verbose=2 if verbose else 0, shuffle=False)
  55. if draw_loss_plot:
  56. pyplot.plot(history.history["loss"] if not is_stateful else losses, label="Train Loss (%s)" % output_col_name)
  57. pyplot.plot(history.history["val_loss"] if not is_stateful else val_losses, label="Test Loss (%s)" % output_col_name)
  58. pyplot.legend()
  59. pyplot.show()
  60. print(history.history)
  61. #model.save('./my_model_%s.h5'%datetime.datetime.now())
  62. return (model, n_batch)


  1. # make a prediction
  2. #def _make_prediction(model, train_X, train_y, test_X, test_y, compatible_n_batch, n_intervals=3, n_features, scaler=(0,1), draw_prediction_fit_plot=True, output_col_name, verbose=True):
  3. def _make_prediction(model, train_X, train_y, test_X, test_y, compatible_n_batch, n_intervals, n_features, scaler, draw_prediction_fit_plot, output_col_name, verbose):
  4. """
  5. train_X: train inputs
  6. train_y: train targets
  7. test_X: test inputs
  8. test_y: test targets
  9. compatible_n_batch: modified (compatible) nn batch size
  10. n_intervals: number of time lags (intervals) to use in each neuron
  11. n_features: number of features (variables) per neuron
  12. scaler: the scaler object used to invert transformation to real scale
  13. draw_prediction_fit_plot: whether to draw the the predicted vs actual fit plot
  14. output_col_name: name of the output/target column to be predicted
  15. verbose: whether to output some debug data
  16. """
  17. if verbose:
  18. print("")
  19. yhat = model.predict(test_X, batch_size=compatible_n_batch, verbose = 1 if verbose else 0)
  20. test_X = test_X.reshape((test_X.shape[0], n_intervals*n_features))
  21. # invert scaling for forecast
  22. inv_yhat = np.concatenate((yhat, test_X[:, (1-n_features):]), axis=1)
  23. inv_yhat = scaler.inverse_transform(inv_yhat)
  24. inv_yhat = inv_yhat[:,0]
  25. # invert scaling for actual
  26. test_y = test_y.reshape((len(test_y), 1))
  27. inv_y = np.concatenate((test_y, test_X[:, (1-n_features):]), axis=1)
  28. inv_y = scaler.inverse_transform(inv_y)
  29. inv_y = inv_y[:,0]
  30. # calculate RMSE
  31. rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
  32. # calculate average error percentage
  33. avg = np.average(inv_y)
  34. error_percentage = rmse / avg
  35. if verbose:
  36. print("")
  37. print("Test Root Mean Square Error: %.3f" % rmse)
  38. print("Test Average Value for %s: %.3f" % (output_col_name, avg))
  39. print("Test Average Error Percentage: %.2f/100.00" % (error_percentage * 100))
  40. if draw_prediction_fit_plot:
  41. pyplot.plot(inv_y, label="Actual (%s)" % output_col_name)
  42. pyplot.plot(inv_yhat, label="Predicted (%s)" % output_col_name)
  43. pyplot.legend()
  44. pyplot.show()
  45. return (inv_y, inv_yhat, rmse, error_percentage)


  1. #!input
  2. file_path= 'data2_2_2.csv'
  3. header_row_index = 0
  4. index_col_name = None
  5. col_to_predict ='open'
  6. cols_to_drop = ['index_code','date']
  7. col_names, values,n_features, output_col_name = _load_dataset(file_path, header_row_index,
  8. index_col_name, col_to_predict, cols_to_drop)
  9. scaler, values = _scale_dataset(values, None)
  10. print('values before _series_to_supervised\n', values, '\nvalue shape:', values.shape)
  11. #!input
  12. n_in_timestep = 3
  13. n_out_timestep = 1
  14. verbose = 2
  15. dropnan = True
  16. agg1 = _series_to_supervised(values, n_in_timestep, n_out_timestep, dropnan, col_names, verbose)
  17. #agg2 = _series_to_supervised(values, 1, 2, dropnan, col_names, verbose)
  18. #agg3 = _series_to_supervised(values, 2, 1, dropnan, col_names, verbose)
  19. #agg4 = _series_to_supervised(values, 3, 2, dropnan, col_names, verbose)
  20. '''
  21. #不懂_series_to_supervised()中n_in和n_out作用的话把下面被注释掉的列表一打出来就明白了
  22. print('agg1:\n', agg1.columns)
  23. print('agg2:\n', agg2.columns)
  24. print('agg3:\n', agg3.columns)
  25. print('agg4:\n', agg4.columns)
  26. #print(agg1)
  27. agg3
  28. '''
  29. print('agg1.value:\n', agg1.values, '\nagg1.shape:', agg1.shape, '\nagg1.columns:', agg1.columns) #agg1和agg1.value是不一样的,agg1是DataFrame,agg1.value是np.array
  30. #print('\nagg1\n', agg1)
  31. #!input
  32. train_percentage = 0.67
  33. train_X, train_Y, test_X, test_Y =_split_data_to_train_test_sets(agg1.values, n_in_timestep, n_features,
  34. train_percentage, verbose)
  35. #!input
  36. n_neurons=20
  37. n_batch=50
  38. n_epochs=60
  39. is_stateful=False
  40. has_memory_stack=False
  41. loss_function='mae'
  42. optimizer_function='adam'
  43. draw_loss_plot=True
  44. model, compatible_n_batch = _create_model(train_X, train_Y, test_X, test_Y, n_neurons, n_batch, n_epochs,
  45. is_stateful, has_memory_stack, loss_function, optimizer_function,
  46. draw_loss_plot, output_col_name, verbose)
  47. #model.save('./my_model_%s.h5'%datetime.datetime.now())
  48. model.save('./my_model_in time step_%d_out_timestep_%d.h5'%n_in_timestep%n_out_timestep)
  49. #!input
  50. draw_prediction_fit_plot = True
  51. actual_target, predicted_target, error_value, error_percentage = _make_prediction(model, train_X, train_Y,
  52. test_X, test_Y, compatible_n_batch,
  53. n_in_timestep, n_features, scaler,
  54. draw_prediction_fit_plot, output_col_name,
  55. verbose)



  1. #导入需要使用到的模块
  2. import urllib
  3. import re
  4. import pandas as pd
  5. import os
  6. #爬虫抓取网页函数
  7. def getHtml(url):
  8. html = urllib.request.urlopen(url).read()
  9. html = html.decode('gbk')
  10. return html
  11. #抓取网页股票代码函数
  12. def getStackCode(html):
  13. s = r'<li><a target="_blank" href="http://quote.eastmoney.com/\S\S(.*?).html">'
  14. pat = re.compile(s)
  15. code = pat.findall(html)
  16. return code
  17. Url = 'http://quote.eastmoney.com/stocklist.html'#东方财富网股票数据连接地址
  18. filepath = 'C:\\Users\\rihang\\Desktop\\my data\\my project\\stock2 prediction\\stock data\\'#定义数据文件保存路径
  19. #实施抓取
  20. code = getStackCode(getHtml(Url))
  21. #获取所有股票代码(以6开头的,应该是沪市数据)集合
  22. CodeList = []
  23. for item in code:
  24. if item[0]=='6':
  25. CodeList.append(item)
  26. #抓取数据并保存到本地csv文件
  27. for code in CodeList:
  28. print('正在获取股票%s数据'%code)
  29. url = 'http://quotes.money.163.com/service/chddata.html?code=0'+code+\
  31. urllib.request.urlretrieve(url, filepath+code+'.csv')



