
Notes on Climbing the Leaderboard in the iFLYTEK Electric Power Demand Forecasting Challenge (Datawhale AI Summer Camp)

1 Getting Started

Competition page: 2024 iFLYTEK A.I. Developer Competition - iFLYTEK Open Platform

Datawhale summer camp handbook: Docs

Baidu PaddlePaddle platform: 飞桨AI Studio星河社区 (PaddlePaddle AI Studio Galaxy Community, an AI learning and hands-on training community)

And don't forget my earlier post: "An Incomplete Guide to Freeloading Compute on the PaddlePaddle Platform, from Getting Started to Getting Jailed (Datawhale AI Summer Camp)" - CSDN blog

2 LightGBM, run with eyes closed: 259 points

2.1 Environment setup and data loading

```python
!pip install lightgbm==3.3.0

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
import tqdm
import sys
import os
import gc
import argparse
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
```

2.2 Feature engineering (modified)

```python
# Concatenate train and test, then sort so lag features can be built per id
data = pd.concat([test, train], axis=0, ignore_index=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Historical lag features
for i in range(10, 30):
    data[f'last{i}_target'] = data.groupby(['id'])['target'].shift(i)

# Window statistic: mean of the three most recent available lags
data['win3_mean_target'] = (data['last10_target'] + data['last11_target'] + data['last12_target']) / 3

# Split back into train (target present) and test (target missing)
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)

# Input feature columns
train_cols = [f for f in data.columns if f not in ['id', 'target']]
```

2.3 Run it

```python
def time_model(lgb, train_df, test_df, cols):
    # Time-based split: earlier days (dt >= 31) for training, the last 30 for validation
    trn_x, trn_y = train_df[train_df.dt >= 31][cols], train_df[train_df.dt >= 31]['target']
    val_x, val_y = train_df[train_df.dt <= 30][cols], train_df[train_df.dt <= 30]['target']
    # Build LightGBM datasets
    train_matrix = lgb.Dataset(trn_x, label=trn_y)
    valid_matrix = lgb.Dataset(val_x, label=val_y)
    # LightGBM parameters
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.05,
        'seed': 2024,
        'nthread': 16,
        'verbose': -1,
    }
    # Train (verbose_eval / early_stopping_rounds as keyword args still work in lightgbm 3.3.0)
    model = lgb.train(lgb_params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                      categorical_feature=[], verbose_eval=500, early_stopping_rounds=500)
    # Callback-style alternative (see Section 4.2):
    # from lightgbm.callback import log_evaluation
    # model = lgb.train(lgb_params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
    #                   callbacks=[log_evaluation(period=100)])
    # Predict on the validation and test sets
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_df[cols], num_iteration=model.best_iteration)
    # Offline score
    score = mean_squared_error(val_y, val_pred)
    print(score)
    return val_pred, test_pred

lgb_oof, lgb_test = time_model(lgb, train, test, train_cols)
# Save the submission file
test['target'] = lgb_test
test[['id', 'dt', 'target']].to_csv('submit2.csv', index=None)
```

3 Model ensembling: 236 points

3.1 Environment setup

```python
!pip install lightgbm==3.3.0
!pip install xgboost -i https://pypi.tuna.tsinghua.edu.cn/simple  # use the Tsinghua mirror
!pip install catboost -i https://pypi.tuna.tsinghua.edu.cn/simple --user
```

3.2 Feature engineering

```python
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
import tqdm
import sys
import os
import gc
import argparse
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Concatenate train and test, then sort
data = pd.concat([test, train], axis=0, ignore_index=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Historical lag features
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# Window statistics (sort=False keeps the group order identical to the row
# order of `data`, so assigning .values back lines up row by row)
for win in [15, 30, 50, 70]:
    rolled = data.groupby('id', sort=False)['target'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_win{win}_mean'] = rolled.mean().values
    data[f'target_win{win}_max'] = rolled.max().values
    data[f'target_win{win}_min'] = rolled.min().values
    data[f'target_win{win}_std'] = rolled.std().values

# Lag + window statistics
for win in [7, 14, 28, 35, 50, 70]:
    rolled = data.groupby('id', sort=False)['target_shift10'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_shift10_win{win}_mean'] = rolled.mean().values
    data[f'target_shift10_win{win}_max'] = rolled.max().values
    data[f'target_shift10_win{win}_min'] = rolled.min().values
    data[f'target_shift10_win{win}_sum'] = rolled.sum().values
    data[f'target_shift10_win{win}_std'] = rolled.std().values

# Split back into train and test
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)

# Input feature columns
train_cols = [f for f in data.columns if f not in ['id', 'target']]
```

3.3 Fixing the out-of-memory crash

If this error shows up, it means memory blew up.

Solution 1: more RAM!!!

If you're short on compute, see my "Freeloading Guide" linked above.

Solution 2: downcast float64 columns to float32 or float16.

See "Datawhale AI 夏令营-电力需求预测挑战赛" on Yuque (语雀).

I'll give this a try later.
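Until then, here is a minimal sketch of what solution 2 could look like: a hypothetical `reduce_mem` helper of my own (not the handbook's code) that downcasts float64 feature columns to float32, roughly halving their memory footprint.

```python
import numpy as np
import pandas as pd

def reduce_mem(df: pd.DataFrame) -> pd.DataFrame:
    # Downcast float64 columns to float32; float16 would save even more but
    # loses precision quickly, so float32 is a safer default for GBDT features.
    start = df.memory_usage(deep=True).sum() / 1024 ** 2
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype(np.float32)
    end = df.memory_usage(deep=True).sum() / 1024 ** 2
    print(f'{start:.1f} MB -> {end:.1f} MB')
    return df

# e.g. call it right after building the features in Section 3.2:
# data = reduce_mem(data)
```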

3.4 It runs!!!

```python
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    '''
    clf: the model module/class to use
    train_x: training data
    train_y: training labels
    test_x: test data
    clf_name: which model to run ("lgb", "xgb" or "cat")
    seed: random seed
    '''
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread': 16,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 1000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[], verbose_eval=200, early_stopping_rounds=100)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        if clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist,
                              verbose_eval=200, early_stopping_rounds=100)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)
        if clf_name == "cat":
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11,
                      'allow_writing_files': False}
            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    return oof, test_predict

# LightGBM
lgb_oof, lgb_test = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
# XGBoost
xgb_oof, xgb_test = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
# CatBoost
cat_oof, cat_test = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')

# Simple average ensemble
final_test = (lgb_test + xgb_test + cat_test) / 3
# Save the submission file
test['target'] = final_test
test[['id', 'dt', 'target']].to_csv('submit3.csv', index=None)
```
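Since `cv_model` also returns out-of-fold predictions, the blend can be sanity-checked offline before spending a submission. A small sketch of my own, reusing the variables from the block above:

```python
# Compare each model and the simple average on out-of-fold MAE
for name, oof in [('lgb', lgb_oof), ('xgb', xgb_oof), ('cat', cat_oof)]:
    print(name, mean_absolute_error(train['target'], oof))

blend_oof = (lgb_oof + xgb_oof + cat_oof) / 3
print('blend', mean_absolute_error(train['target'], blend_oof))
```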

4 LightGBM: feature engineering + callbacks

4.1 Feature engineering only

The feature-engineering code is the same as in Section 3.2.

4.2 Adding a callback

The callback code follows the "Attempt 3" part (lines 72-92) of the CSDN post "Datawhale AI夏令营 机器学习Task2 笔记打卡".

```python
from lightgbm import early_stopping, log_evaluation

def time_model(lgb, train_df, test_df, cols):
    # Time-based train/validation split
    trn_x, trn_y = train_df[train_df.dt >= 31][cols], train_df[train_df.dt >= 31]['target']
    val_x, val_y = train_df[train_df.dt <= 30][cols], train_df[train_df.dt <= 30]['target']
    # Build LightGBM datasets
    train_matrix = lgb.Dataset(trn_x, label=trn_y)
    valid_matrix = lgb.Dataset(val_x, label=val_y)
    # Parameters
    params = {
        'objective': 'regression',
        'metric': 'mse',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9
    }
    # Callbacks
    callbacks = [
        log_evaluation(period=100),          # log every 100 rounds
        early_stopping(stopping_rounds=500)  # stop after 500 rounds without improvement
    ]
    # Train
    model = lgb.train(params, train_matrix, num_boost_round=50000,
                      valid_sets=[train_matrix, valid_matrix],
                      callbacks=callbacks)
    # Predict on the validation and test sets
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_df[cols], num_iteration=model.best_iteration)
    # Offline score
    score = mean_squared_error(val_y, val_pred)
    print(score)
    return val_pred, test_pred

lgb_oof, lgb_test = time_model(lgb, train, test, train_cols)
# Save the submission file
test['target'] = lgb_test
test[['id', 'dt', 'target']].to_csv('submit2.csv', index=None)
```

5 More struggling: still in progress, will update later

I plan to try heavier feature engineering next, using solution 2 (downcasting float64 to float32 or float16) to keep memory from blowing up.

Remember to like, bookmark, and follow!
