
Machine Learning: Feature Engineering for Time Series

Preface: why this write-up?

The comparison above shows that each mainstream time series method has its limitations:

  • Highly interpretable methods are not flexible enough
    • e.g., ARIMA requires stationarity tests and other assumptions to hold
    • They are essentially univariate and cannot use extra inputs such as leading indicators
  • Methods with a high performance ceiling, such as neural networks and boosting, need large amounts of data and are hard to interpret

Is there an approach that is flexible, interpretable, and works with both small and large datasets?

A simple model + feature engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # more plots
from dateutil.relativedelta import relativedelta  # working with dates with style
from scipy.optimize import minimize  # for function minimization
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge,Lasso
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

Define common helper functions

In [2]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
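
# Note: plain MAPE is undefined when y_true contains zeros. If the series can
# touch zero, a symmetric variant such as sMAPE (a sketch, not used below) is
# a safer scoring choice:
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    # sMAPE in percent; defined whenever |y_true| + |y_pred| > 0
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))) * 100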


def plotModelResults(model, X_train, X_test, y_train, y_test, plot_intervals=False, plot_anomalies=False, subtitle=""):
    """
 Plots modelled vs fact values, prediction intervals and anomalies

 """

    prediction = model.predict(X_test)

    plt.figure(figsize=(15, 7))
    plt.plot(y_test.index, prediction, "g", label="prediction", linewidth=2.0)
    plt.plot(y_test.index, y_test.values, label="actual", linewidth=2.0)

    if plot_intervals:
        tscv = TimeSeriesSplit(n_splits=3)

        cv = cross_val_score(model, X_train, y_train,
                             cv=tscv,
                             scoring="neg_mean_absolute_error")
        mae = cv.mean() * (-1)
        deviation = cv.std()

        scale = 1.96
        lower = prediction - (mae + scale * deviation)
        upper = prediction + (mae + scale * deviation)

        plt.plot(y_test.index, lower, "r--", label="upper bound / lower bound", alpha=0.5)
        plt.plot(y_test.index, upper, "r--", alpha=0.5)

        if plot_anomalies:
            anomalies = np.array([np.NaN] * len(y_test))
            anomalies[y_test < lower] = y_test[y_test < lower]
            anomalies[y_test > upper] = y_test[y_test > upper]
            plt.plot(y_test.index, anomalies, "o", markersize=10, label="Anomalies")

    error = mean_absolute_percentage_error(y_test, prediction)
    plt.title((subtitle+"Mean absolute percentage error {0:.2f}%").format(error))
    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid(True)
    plt.show()


def plotModelBoostingResults(model1, model2, X_test, y_test):
    """
 Plots modelled vs fact values, prediction intervals and anomalies

 """

    prediction1 = model1.predict(X_test)
    prediction2 = model2.predict(X_test)
    prediction = prediction1 + prediction2

    plt.figure(figsize=(15, 7))
    plt.plot(y_test.index, prediction, "g", label="prediction", linewidth=2.0)
    plt.plot(y_test.index, y_test.values, label="actual", linewidth=2.0)

    error = mean_absolute_percentage_error(y_test, prediction)
    plt.title("Mean absolute percentage error {0:.2f}%".format(error))
    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid(True)
    plt.show()


def plotCoefficients(model):
    """
 Plots sorted coefficient values of the model
 """
    
    coefs = pd.DataFrame(model.coef_, X_train.columns)
    coefs.columns = ["coef"]
    coefs["abs"] = coefs.coef.apply(np.abs)
    coefs = coefs.sort_values(by="abs", ascending=False).drop(["abs"], axis=1)
    
    plt.figure(figsize=(15, 7))
    coefs.coef.plot(kind='bar')
    plt.grid(True, axis='y')
    plt.hlines(y=0, xmin=0, xmax=len(coefs), linestyles='dashed');


def code_mean(data, cat_feature, real_feature):
    """
 Returns a dictionary where keys are unique categories of the cat_feature,
 and values are means over real_feature
 """
    return dict(data.groupby(cat_feature)[real_feature].mean())
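
# A quick illustration of code_mean on a hypothetical toy frame:
# toy = pd.DataFrame({"weekday": [0, 0, 1], "y": [10.0, 20.0, 30.0]})
# code_mean(toy, "weekday", "y")  # -> {0: 15.0, 1: 30.0}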

def makeCirclyStats(data, target, cols, stats):
    """
    Adds one column per (categorical feature, statistic) pair, holding the
    group-wise statistic of the target. Note: computed over the full frame.
    """
    for col in cols:
        for stat in stats:
            data[col + '_' + stat] = data.groupby(col)[target].transform(stat)
            
def featuresRank(model, X_train):
    coefs = pd.DataFrame(model.coef_, X_train.columns)
    coefs.columns = ["coef"]
    coefs["abs"] = coefs.coef.apply(np.abs)
    coefs = coefs.sort_values(by="abs", ascending=False).drop(["abs"], axis=1)
    return coefs

def prepareData(data):
    data_ = data.copy()
    y = data_.dropna().y
    X = data_.dropna().drop(['y'], axis=1)
    # keep time order: no shuffling, the last 10% is the test window
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.1)
    # fit the scaler on the training window only to avoid leaking test statistics
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    return X_train, X_test, y_train, y_test

Import the power-plant coal consumption data

In [3]:
Df = pd.read_excel('/kaggle/input/coal-consume/data.xlsx', index_col=0, parse_dates=True)
Df = Df.dropna()
Df.columns = ["y"]
Df.head()
Out[3]:
                y
指标名称
2013-01-01   68.6
2013-01-02   68.2
2013-01-03   67.2
2013-01-04   69.4
2013-01-05   71.8

Lag (history-shift) features

In [4]:
data = Df['2015-01-01':].copy()
lags = range(1, 31)
for i in lags:
    data["lag_{}".format(i)] = data.y.shift(i)
data.head()
Out[4]:
               y  lag_1  lag_2  lag_3  lag_4  lag_5  ...  lag_26  lag_27  lag_28  lag_29  lag_30
指标名称
2015-01-02  66.7    NaN    NaN    NaN    NaN    NaN  ...     NaN     NaN     NaN     NaN     NaN
2015-01-04  68.9   66.7    NaN    NaN    NaN    NaN  ...     NaN     NaN     NaN     NaN     NaN
2015-01-05  69.3   68.9   66.7    NaN    NaN    NaN  ...     NaN     NaN     NaN     NaN     NaN
2015-01-06  72.3   69.3   68.9   66.7    NaN    NaN  ...     NaN     NaN     NaN     NaN     NaN
2015-01-07  72.7   72.3   69.3   68.9   66.7    NaN  ...     NaN     NaN     NaN     NaN     NaN

5 rows × 31 columns

In [5]:
# split into training and test sets
X_train, X_test, y_train, y_test = prepareData(data)

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# plot results
plotModelResults(lr, X_train, X_test, y_train, y_test, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)

Inspect the correlations between variables

In [6]:
plt.figure(figsize=(10, 8))
sns.heatmap(X_train.corr())
Out[6]:
<AxesSubplot:>
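
Instead of eyeballing the heatmap, highly correlated lags can also be pruned programmatically; a hedged sketch (0.9 is an illustrative threshold, not tuned here):

corr = X_train.corr().abs()
# keep only the upper triangle so each pair is counted once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [c for c in upper.columns if (upper[c] > 0.9).any()]
print(to_drop)  # candidate redundant columns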

Reduce correlation and drop redundant variables

In [7]:
data = Df['2015-01-01':].copy()
lags = [1,14,21,28]
for i in lags:
    data["lag_{}".format(i)] = data.y.shift(i)
data.head()
Out[7]:
               y  lag_1  lag_14  lag_21  lag_28
指标名称
2015-01-02  66.7    NaN     NaN     NaN     NaN
2015-01-04  68.9   66.7     NaN     NaN     NaN
2015-01-05  69.3   68.9     NaN     NaN     NaN
2015-01-06  72.3   69.3     NaN     NaN     NaN
2015-01-07  72.7   72.3     NaN     NaN     NaN
In [8]:
# split into training and test sets
X_train, X_test, y_train, y_test = prepareData(data)

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# plot results
plotModelResults(lr, X_train, X_test, y_train, y_test, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)

lag_1 carries too much weight: the model overfits to it, so the forecast simply trails the actual series by one step
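
One hedged way to tame an over-weighted lag in a linear model is L2 regularization, using the Ridge import above; a minimal sketch (alpha is an illustrative value, not tuned here):

ridge = Ridge(alpha=10.0)  # illustrative regularization strength
ridge.fit(X_train, y_train)
plotModelResults(ridge, X_train, X_test, y_train, y_test, plot_intervals=True)
plotCoefficients(ridge)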


Add calendar (date) features

In [9]:
data.index = pd.to_datetime(data.index)
data["weekday"] = data.index.weekday
data['is_weekend'] = data.weekday.isin([5, 6]) * 1
data['quarter'] = data.index.quarter
data['month'] = data.index.month
data['dayofmonth'] = data.index.day

data.head()
Out[9]:
               y  lag_1  lag_14  lag_21  lag_28  weekday  is_weekend  quarter  month  dayofmonth
指标名称
2015-01-02  66.7    NaN     NaN     NaN     NaN        4           0        1      1           2
2015-01-04  68.9   66.7     NaN     NaN     NaN        6           1        1      1           4
2015-01-05  69.3   68.9     NaN     NaN     NaN        0           0        1      1           5
2015-01-06  72.3   69.3     NaN     NaN     NaN        1           0        1      1           6
2015-01-07  72.7   72.3     NaN     NaN     NaN        2           0        1      1           7
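
Integer encodings such as month and weekday impose a false linear ordering on a linear model (December is not "twelve times" January). A common refinement, sketched here but not used in this notebook, is a cyclical sin/cos encoding:

data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
data['weekday_sin'] = np.sin(2 * np.pi * data['weekday'] / 7)
data['weekday_cos'] = np.cos(2 * np.pi * data['weekday'] / 7)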
In [10]:
weekday = code_mean(data, 'weekday', "y")
plt.figure(figsize=(7, 5))
plt.title("weekday averages")
pd.DataFrame.from_dict(weekday, orient='index')[0].plot()

quarter = code_mean(data, 'quarter', "y")
plt.figure(figsize=(7, 5))
plt.title("quarter averages")
pd.DataFrame.from_dict(quarter, orient='index')[0].plot()

month = code_mean(data, 'month', "y")
plt.figure(figsize=(7, 5))
plt.title("month averages")
pd.DataFrame.from_dict(month, orient='index')[0].plot()

dayofmonth = code_mean(data, 'dayofmonth', "y")
plt.figure(figsize=(7, 5))
plt.title("dayofmonth averages")
pd.DataFrame.from_dict(dayofmonth, orient='index')[0].plot()
Out[10]:
<AxesSubplot:title={'center':'dayofmonth averages'}>

In [11]:
# split into training and test sets
X_train, X_test, y_train, y_test = prepareData(data)

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# plot results
plotModelResults(lr, X_train, X_test, y_train, y_test, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)

Global statistics over the calendar features

"mean","max", "min", "median", "sum", "skew", "std"

In [12]:
cols = ["weekday", "month", "dayofmonth", "quarter", "is_weekend"]
stats = ["mean","max", "min", "median", "sum", "skew", "std"]
makeCirclyStats(data, "y", cols, stats)
data.head()
Out[12]:
               y  lag_1  ...  quarter_sum  quarter_skew  quarter_std  is_weekend_mean  is_weekend_max  is_weekend_min  is_weekend_median  is_weekend_sum  is_weekend_skew  is_weekend_std
指标名称
2015-01-02  66.7    NaN  ...     40348.07     -0.377789    13.283873        64.267847           93.19            29.0              63.85       108355.59        -0.046244       11.109158
2015-01-04  68.9   66.7  ...     40348.07     -0.377789    13.283873        64.644874           92.72            28.0              64.85        46027.15        -0.138350       11.039269
2015-01-05  69.3   68.9  ...     40348.07     -0.377789    13.283873        64.267847           93.19            29.0              63.85       108355.59        -0.046244       11.109158
2015-01-06  72.3   69.3  ...     40348.07     -0.377789    13.283873        64.267847           93.19            29.0              63.85       108355.59        -0.046244       11.109158
2015-01-07  72.7   72.3  ...     40348.07     -0.377789    13.283873        64.267847           93.19            29.0              63.85       108355.59        -0.046244       11.109158

5 rows × 45 columns

In [13]:
# split into training and test sets
X_train, X_test, y_train, y_test = prepareData(data)

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# plot results
plotModelResults(lr, X_train, X_test, y_train, y_test, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)

This overfits somewhat: the statistics are computed over the full sample, including the test window, so they leak information about the target. Drop the features above.
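
A hedged leakage-aware alternative (a sketch, not used in this notebook): compute the group statistics on the training window only, then map them onto the full frame:

def makeTrainOnlyStats(data, target, cols, stats, train_frac=0.9):
    # hypothetical helper: statistics come from the first train_frac of rows only
    cutoff = int(len(data) * train_frac)
    train = data.iloc[:cutoff]
    for col in cols:
        for stat in stats:
            mapping = train.groupby(col)[target].agg(stat)
            data[col + '_' + stat] = data[col].map(mapping)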

In [14]:
data.columns
Out[14]:
Index(['y', 'lag_1', 'lag_14', 'lag_21', 'lag_28', 'weekday', 'is_weekend',
       'quarter', 'month', 'dayofmonth', 'weekday_mean', 'weekday_max',
       'weekday_min', 'weekday_median', 'weekday_sum', 'weekday_skew',
       'weekday_std', 'month_mean', 'month_max', 'month_min', 'month_median',
       'month_sum', 'month_skew', 'month_std', 'dayofmonth_mean',
       'dayofmonth_max', 'dayofmonth_min', 'dayofmonth_median',
       'dayofmonth_sum', 'dayofmonth_skew', 'dayofmonth_std', 'quarter_mean',
       'quarter_max', 'quarter_min', 'quarter_median', 'quarter_sum',
       'quarter_skew', 'quarter_std', 'is_weekend_mean', 'is_weekend_max',
       'is_weekend_min', 'is_weekend_median', 'is_weekend_sum',
       'is_weekend_skew', 'is_weekend_std'],
      dtype='object')
In [15]:
data = data.drop(columns=['weekday_mean', 'month_mean',
       'dayofmonth_mean', 'quarter_mean', 'is_weekend_mean', 'weekday_max',
       'weekday_min', 'weekday_median', 'weekday_sum', 'weekday_skew',
       'weekday_std', 'month_max', 'month_min', 'month_median', 'month_sum',
       'month_skew', 'month_std', 'dayofmonth_max', 'dayofmonth_min',
       'dayofmonth_median', 'dayofmonth_sum', 'dayofmonth_skew',
       'dayofmonth_std', 'quarter_max', 'quarter_min', 'quarter_median',
       'quarter_sum', 'quarter_skew', 'quarter_std', 'is_weekend_max',
       'is_weekend_min', 'is_weekend_median', 'is_weekend_sum',
       'is_weekend_skew', 'is_weekend_std'])

Rolling features

Compute 7-day, 14-day, and 28-day moving averages

In [16]:
data['rolling_mean_7'] = data.rolling(7)['y'].mean()
data['rolling_mean_14'] = data.rolling(14)['y'].mean()
data['rolling_mean_28'] = data.rolling(28)['y'].mean()
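
Note that data.rolling(7)['y'].mean() includes the current day's y in the window, so these features would not be available as-is at forecast time; a hedged leakage-safe variant shifts the window back one step:

# sketch: each rolling mean uses only past values of y
data['rolling_mean_7'] = data['y'].shift(1).rolling(7).mean()
data['rolling_mean_14'] = data['y'].shift(1).rolling(14).mean()
data['rolling_mean_28'] = data['y'].shift(1).rolling(28).mean()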
In [17]:
# split into training and test sets
X_train, X_test, y_train, y_test = prepareData(data)

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# plot results
plotModelResults(lr, X_train, X_test, y_train, y_test, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)

Rolling statistics over the same weekday across weeks

In [18]:
data['week_rolling_mean_4'] = data.groupby('weekday')["y"].transform(lambda x: x.rolling(4).mean())
In [19]:
# split into training and test sets
X_train, X_test, y_train, y_test = prepareData(data)

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# plot results
plotModelResults(lr, X_train, X_test, y_train, y_test, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)

Expanding mean over the same calendar month across years

In [20]:
data['month_expanding_mean'] = data.groupby('month')["y"].transform(lambda x: x.expanding(10).mean())
In [21]:
# split into training and test sets
X_train, X_test, y_train, y_test = prepareData(data)

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# plot results
plotModelResults(lr, X_train, X_test, y_train, y_test, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)

No improvement, so drop the unimportant features

In [22]:
data = data.drop(columns=['month_expanding_mean','dayofmonth','is_weekend'])
In [23]:
# split into training and test sets
X_train, X_test, y_train, y_test = prepareData(data)

# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# plot results
plotModelResults(lr, X_train, X_test, y_train, y_test, plot_intervals=True, plot_anomalies=True)
plotCoefficients(lr)

Inspect the correlations among the remaining features

In [24]:
plt.figure(figsize=(10, 8))
sns.heatmap(X_train.corr())
Out[24]:
<AxesSubplot:>

Write out the model equation

In [25]:
featuresRank(lr, X_train)
Out[25]:
                          coef
week_rolling_mean_4   7.910972
lag_1                 6.853260
rolling_mean_14      -5.050413
rolling_mean_7        4.433046
rolling_mean_28      -3.149058
lag_21               -1.368408
month                 0.692110
quarter              -0.496375
lag_28                0.370576
weekday              -0.322907
lag_14               -0.264314
In [26]:
lr.intercept_
Out[26]:
62.794430379746814

$Y = 62.79 + 7.91\,X_{\text{weekma4}} + 6.85\,X_{\text{lag1}} - 5.05\,X_{\text{ma14}} + 4.43\,X_{\text{ma7}} - 3.15\,X_{\text{ma28}} - 1.37\,X_{\text{lag21}} + 0.69\,X_{\text{month}} - 0.50\,X_{\text{quarter}} + 0.37\,X_{\text{lag28}} - 0.32\,X_{\text{weekday}} - 0.26\,X_{\text{lag14}}$

(The coefficients apply to the standardized features produced by prepareData.)
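
As a sanity check, the same equation can be rebuilt directly from the fitted model; a minimal sketch, assuming lr and X_train from above are still in scope:

terms = " ".join("{:+.2f}*X_{}".format(c, name) for name, c in zip(X_train.columns, lr.coef_))
print("Y = {:.2f} {}".format(lr.intercept_, terms))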


Summary

The above is the general method and workflow of feature engineering for time series:

  1. Add first, then prune: progressively add lag, calendar, global-statistic, and rolling features, then drop features based on their effect
  2. The common code has been wrapped into functions; feel free to copy and paste them if you find them useful

How to improve the results further?

  1. Fit the residuals
  2. Add other features
    • e.g., look for leading indicators to add
    • add some technical-analysis indicators, etc.

Bonus: fitting the residuals with xgb

In [27]:
y_train_e = y_train - lr.predict(X_train)
y_train_e.head()
Out[27]:
指标名称
2015-01-31   -0.142392
2015-02-01   -0.748898
2015-02-02    1.780891
2015-02-03   -0.680682
2015-02-04   -1.622106
Name: y, dtype: float64
In [28]:
def objective(space):
    model = xgb.XGBRegressor(n_estimators=space['n_estimators'],
                             max_depth=int(space['max_depth']),
                             learning_rate=space['learning_rate'],
                             gamma=space['gamma'],
                             min_child_weight=space['min_child_weight'],
                             subsample=space['subsample'],
                             colsample_bytree=space['colsample_bytree']
                             )

    # time-series-aware cross-validation (cross_val_score clones and refits
    # the model itself, so no separate fit is needed here)
    tscv = TimeSeriesSplit(n_splits=3)
    score = cross_val_score(estimator=model, X=X_train, y=y_train_e, cv=tscv)
    CrossValMean = score.mean()
    return {'loss': 1 - CrossValMean, 'status': STATUS_OK}


space = {
    'max_depth': hp.choice('max_depth', range(5, 30, 1)),
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators': hp.choice('n_estimators', range(20, 205, 5)),
    'gamma': hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)}
In [29]:
# trials = Trials()
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=50,
#             trials=trials)
In [30]:
# print(best)
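
Note that for hp.choice dimensions, fmin returns the index of the chosen option rather than the option itself, so best should be decoded before reuse; a minimal sketch with hyperopt's space_eval (kept commented, like the search above):

# from hyperopt import space_eval
# params = space_eval(space, best)  # maps choice indices back to actual values
# xgbReg = xgb.XGBRegressor(**params)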
In [31]:
# xgbReg = xgb.XGBRegressor(n_estimators=best['n_estimators'],
#                           max_depth=best['max_depth'],
#                           learning_rate=best['learning_rate'],
#                           gamma=best['gamma'],
#                           min_child_weight=best['min_child_weight'],
#                           subsample=best['subsample'],
#                           )

xgbReg = xgb.XGBRegressor(n_estimators=20,
                        max_depth=22,
                        learning_rate=0.01,
                        gamma=0.19,
                        min_child_weight=7,
                        subsample=0.65,
                        colsample_bytree=0.14
                       )

xgbReg.fit(X_train, y_train_e)
Out[31]:
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.14,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0.19, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=22, max_leaves=0, min_child_weight=7,
             missing=nan, monotone_constraints='()', n_estimators=20, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)
In [32]:
plotModelBoostingResults(lr, xgbReg, X_test, y_test)
