
Datawhale AI Summer Camp (Season 2) | Machine Learning | Task 3 Notes: Electricity Demand Forecasting Challenge


The optimizations below follow the walkthrough in the Feishu doc "Task3:尝试使用深度学习方案 - 飞书云文档 (feishu.cn)".

1. Refining the features further

(1) Lag (historical shift) features: shifting the target back in time brings information from earlier periods into the current row.

(2) Difference features: differencing adjacent periods captures the period-over-period change, describing how the series rises and falls.

(3) Rolling-window statistics: over a window of past values, compute statistics such as the mean, max, min, median, and variance, and experiment with different window sizes.

# Lag features: pull information from earlier periods into the current row
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features: period-over-period changes of the 10-step lag
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# Rolling-window statistics; closed='left' excludes the current row, so each
# feature uses only past values. Note: the .values assignment assumes `data`
# is sorted by id and time.
for win in [15, 30, 50, 70]:
    data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').std().values

# Lag + rolling-window statistics on the 10-step lag
for win in [7, 14, 28, 35, 50, 70]:
    data[f'target_shift10_win{win}_mean'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_shift10_win{win}_max'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_shift10_win{win}_min'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_shift10_win{win}_sum'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').sum().values
    data[f'target_shift10_win{win}_std'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').std().values
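One caveat worth making explicit: the `.values` assignments above only line up correctly when `data` is sorted by `id` and time. A minimal defensive sketch (assuming each row keeps its original unique index) aligns the rolling result by index instead of by row position:

# Defensive variant: align by index rather than relying on row order
# (assumes `data` has a unique index; the '_safe' column name is illustrative)
win = 30
rolled = (
    data.groupby('id')['target']
        .rolling(window=win, min_periods=3, closed='left')
        .mean()
        .reset_index(level=0, drop=True)  # drop the added 'id' level, keep the original index
)
data[f'target_win{win}_mean_safe'] = rolled  # pandas aligns this assignment by index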

2. Improving predictions through model fusion

Model fusion: train three models, CatBoost, XGBoost, and LightGBM, to produce three sets of predictions, then blend them by averaging. The code below uses an equal-weight average; a weighted variant is sketched right after it.

import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Split the data: rows with a known target are training data, the rest are test data
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)

# Select the input features
train_cols = [f for f in train.columns if f not in ['id', 'target']]

def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    '''
    clf:      the model module/class to use
    train_x:  training features
    train_y:  training labels
    test_x:   test features
    clf_name: which model to run ('lgb', 'xgb' or 'cat')
    seed:     random seed
    '''
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])          # out-of-fold predictions
    test_predict = np.zeros(test_x.shape[0])  # test predictions averaged over folds
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread': 16,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 1000,
                              valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[])
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        if clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)
        if clf_name == "cat":
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 100, 'random_seed': 2023,
                      'allow_writing_files': False}
            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    return oof, test_predict, model

lgb_oof, lgb_test, lgb_model = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
xgb_oof, xgb_test, xgb_model = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
cat_oof, cat_test, cat_model = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')

# Equal-weight (simple average) fusion
final_test = (lgb_test + xgb_test + cat_test) / 3
print(final_test)
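The fusion above uses equal weights. Because `cv_model` also returns out-of-fold (OOF) predictions, the blend weights can instead be tuned against the OOF MAE; here is a minimal grid-search sketch (the 0.1 step size is an illustrative assumption):

from itertools import product

# Search weight triples (w1, w2, w3) that sum to 1 on a coarse 0.1 grid
best_w, best_mae = (1/3, 1/3, 1/3), float('inf')
grid = [i / 10 for i in range(11)]
for w1, w2 in product(grid, grid):
    w3 = 1.0 - w1 - w2
    if w3 < 0:
        continue
    oof_blend = w1 * lgb_oof + w2 * xgb_oof + w3 * cat_oof
    mae = mean_absolute_error(train['target'], oof_blend)
    if mae < best_mae:
        best_w, best_mae = (w1, w2, w3), mae

print('best weights:', best_w, 'OOF MAE:', best_mae)
final_test = best_w[0] * lgb_test + best_w[1] * xgb_test + best_w[2] * cat_test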

3. Visualize feature importance and adjust the feature set accordingly to try to improve the predictions (LightGBM as the example; a retraining sketch follows the plot)

import matplotlib.pyplot as plt

# Get the feature importances from the trained LightGBM booster
importances = lgb_model.feature_importance()

# Pair importances with feature names and sort in descending order
sorted_importances = sorted(zip(importances, train_cols), reverse=True)

# Use a taller figure so the long feature names fit
plt.figure(figsize=(15, 20))

# Horizontal bar chart of the importances
plt.barh([name for _, name in sorted_importances], [imp for imp, _ in sorted_importances])

# Labels and title
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance (LightGBM)')
plt.gca().invert_yaxis()  # put the most important feature at the top

# Rotate the x-axis labels for readability
plt.xticks(rotation=45, ha='right')

plt.show()
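One way to act on the plot is to keep only the strongest features and re-run the cross-validation to see whether the score improves; a minimal sketch (the cutoff `k = 200` is an arbitrary assumption to experiment with):

# Keep the top-k features by LightGBM importance and retrain (k is illustrative)
k = 200
top_cols = [name for _, name in sorted_importances[:k]]
lgb_oof2, lgb_test2, _ = cv_model(lgb, train[top_cols], train['target'], test[top_cols], 'lgb')
print('OOF MAE with top', k, 'features:', mean_absolute_error(train['target'], lgb_oof2))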
