赞
踩
比赛地址:2024 iFLYTEK A.I.开发者大赛-讯飞开放平台
datawhale夏令营手册:Docs
百度飞浆平台:飞桨AI Studio星河社区-人工智能学习与实训社区
以及别忘了我写过的一篇:飞桨PaddlePaddle平台算力白嫖从入门到入狱不完全指南(Datawhale AI 夏令营)-CSDN博客
- !pip install lightgbm==3.3.0
- import numpy as np
- import pandas as pd
- import lightgbm as lgb
- from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
- import tqdm
- import sys
- import os
- import gc
- import argparse
- import warnings
- warnings.filterwarnings('ignore')
-
# Load the raw competition data; test rows have no 'target' yet.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Concatenate test + train so lag features can be built on one frame, then
# sort each id's series by dt descending to line up the history.
data = pd.concat([test, train], axis=0, ignore_index=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Historical lag features: target shifted 10..29 steps within each id.
for lag in range(10, 30):
    data[f'last{lag}_target'] = data.groupby('id')['target'].shift(lag)

# Window statistic: mean of the three nearest usable lags.
data['win3_mean_target'] = (
    data['last10_target'] + data['last11_target'] + data['last12_target']
) / 3

# Split back: rows with a known target are train, the rest are test.
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)

# Every column except the id key and the label is a model input feature.
train_cols = [f for f in data.columns if f not in ['id', 'target']]
def time_model(lgb, train_df, test_df, cols):
    """Train a LightGBM regressor on a time-based split and predict.

    Parameters
    ----------
    lgb : the imported ``lightgbm`` module.
    train_df : labelled rows (non-null ``target``), including a ``dt`` column.
    test_df : rows to predict.
    cols : list of feature column names.

    Returns
    -------
    (val_pred, test_pred) : predictions on the validation split and on
    ``test_df``.
    """
    # Time-based split: the 30 most recent days (dt <= 30) are held out for
    # validation, everything older is used for training.
    trn_x, trn_y = train_df[train_df.dt >= 31][cols], train_df[train_df.dt >= 31]['target']
    val_x, val_y = train_df[train_df.dt <= 30][cols], train_df[train_df.dt <= 30]['target']
    # Wrap the splits in LightGBM's native Dataset containers.
    train_matrix = lgb.Dataset(trn_x, label=trn_y)
    valid_matrix = lgb.Dataset(val_x, label=val_y)
    # LightGBM hyper-parameters.
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.05,
        'seed': 2024,
        'nthread': 16,
        'verbose': -1,
    }

    # Train using callbacks instead of the verbose_eval /
    # early_stopping_rounds keyword arguments: those kwargs are deprecated in
    # lightgbm 3.3 and removed in 4.x, while the callbacks below behave the
    # same on the pinned 3.3.0 (log every 500 rounds, stop after 500 rounds
    # without improvement on the validation set).
    model = lgb.train(
        lgb_params,
        train_matrix,
        50000,
        valid_sets=[train_matrix, valid_matrix],
        categorical_feature=[],
        callbacks=[
            lgb.log_evaluation(period=500),
            lgb.early_stopping(stopping_rounds=500),
        ],
    )
    # Predict with the best iteration found by early stopping.
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_df[cols], num_iteration=model.best_iteration)
    # Offline score: MSE on the held-out days (y_true passed first, the
    # sklearn convention; MSE itself is symmetric so the value is unchanged).
    score = mean_squared_error(val_y, val_pred)
    print(score)

    return val_pred, test_pred
-
# Fit on the time split and predict the test horizon.
lgb_oof, lgb_test = time_model(lgb, train, test, train_cols)

# Attach predictions and persist only the submission columns.
test['target'] = lgb_test
submission = test[['id', 'dt', 'target']]
submission.to_csv('submit2.csv', index=None)
- !pip install lightgbm==3.3.0
- !pip install xgboost -i https://pypi.tuna.tsinghua.edu.cn/simple # 指定清华镜像
- !pip install catboost -i https://pypi.tuna.tsinghua.edu.cn/simple --user
- import numpy as np
- import pandas as pd
- import lightgbm as lgb
- from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
- import tqdm
- import sys
- import os
- import gc
- import argparse
- import warnings
- warnings.filterwarnings('ignore')
-
# Load the raw competition data.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Merge train and test so lag features can be built on one frame, and sort
# each id's series by dt descending so shift(k) looks k steps into history.
data = pd.concat([test, train], axis=0, ignore_index=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Historical lag features.
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Differences of the base (lag-10) series.
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# Rolling-window statistics on the raw target.  closed='left' excludes the
# current row from the window, avoiding target leakage.
for win in [15, 30, 50, 70]:
    roll = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left')
    for agg in ['mean', 'max', 'min', 'std']:
        data[f'target_win{win}_{agg}'] = roll.agg(agg).values

# Rolling-window statistics on the lag-10 series.  NOTE: the std column was
# originally named 'target_shift710win{win}_std' (a typo); it is renamed to
# the consistent 'target_shift10_win{win}_std'.  All generated columns are
# picked up automatically by train_cols below, so the rename is safe.
for win in [7, 14, 28, 35, 50, 70]:
    roll = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left')
    for agg in ['mean', 'max', 'min', 'sum', 'std']:
        data[f'target_shift10_win{win}_{agg}'] = roll.agg(agg).values

# Downcast feature columns from float64 to float32 to roughly halve memory
# usage — building all of the features above in float64 can exhaust RAM.
# The label column itself is left at full precision.
float_cols = [c for c in data.select_dtypes(include='float64').columns if c != 'target']
data[float_cols] = data[float_cols].astype('float32')

# Split back into train (rows with a label) and test (rows without).
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)

# All columns except the id key and the label are model inputs.
train_cols = [f for f in data.columns if f not in ['id', 'target']]
出现这个问题是因为内存爆了(内存不足)
解决方案1:更大的内存!!!
算力不够用可以参考我的《白嫖指南》
解决方案2:将浮点型数据float64转换为float32或float16
参考Datawhale AI 夏令营-电力需求预测挑战赛 · 语雀
回头我再试试
- from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
- import lightgbm as lgb
- import xgboost as xgb
- from catboost import CatBoostRegressor
- from sklearn.metrics import mean_squared_error, mean_absolute_error
def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    '''
    K-fold cross-validated training for LightGBM / XGBoost / CatBoost.

    clf: the model module (lightgbm / xgboost) or the CatBoostRegressor class
    train_x: training features (DataFrame)
    train_y: training labels
    test_x: test features (DataFrame)
    clf_name: which model to run — 'lgb', 'xgb' or 'cat'
    seed: random seed for the KFold split (the per-model seeds below are
          fixed constants, as in the original script)

    Returns (oof, test_predict): out-of-fold predictions on the training
    set and fold-averaged predictions on the test set.
    '''
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread': 16,
                'verbose': -1,
            }
            # Callbacks replace the verbose_eval / early_stopping_rounds
            # kwargs (deprecated in lightgbm 3.3, removed in 4.x) with the
            # same behaviour: log every 200 rounds, stop after 100 stagnant.
            model = clf.train(params, train_matrix, 1000,
                              valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[],
                              callbacks=[clf.log_evaluation(period=200),
                                         clf.early_stopping(stopping_rounds=100)])
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        if clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist, verbose_eval=200, early_stopping_rounds=100)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)

        if clf_name == "cat":
            # Bug fix: the original dict listed 'random_seed' twice (2023,
            # then 11); Python silently keeps only the last value, so 11 was
            # the effective seed.  The duplicate is removed and 11 kept to
            # preserve the original behaviour.
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11,
                      'allow_writing_files': False}

            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        # Collect out-of-fold predictions and average test predictions.
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits

        # Fold score on the held-out rows (MAE, matching the training metric).
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    return oof, test_predict
-
# Run 5-fold CV for each of the three learners.
features, labels = train[train_cols], train['target']
lgb_oof, lgb_test = cv_model(lgb, features, labels, test[train_cols], 'lgb')
xgb_oof, xgb_test = cv_model(xgb, features, labels, test[train_cols], 'xgb')
cat_oof, cat_test = cv_model(CatBoostRegressor, features, labels, test[train_cols], 'cat')

# Blend the three test predictions by simple averaging.
final_test = (lgb_test + xgb_test + cat_test) / 3

# Save the submission file.
test['target'] = final_test
test[['id', 'dt', 'target']].to_csv('submit3.csv', index=None)
特征工程代码见3.2部分
回调函数参考了文章Datawhale AI夏令营 机器学习Task2 笔记打卡-CSDN博客的尝试3代码部分(72~92行)
- from lightgbm.callback import log_evaluation
- from lightgbm import early_stopping, log_evaluation
-
def time_model(lgb, train_df, test_df, cols):
    """Train LightGBM on a time-based split, using callbacks for logging
    and early stopping, and return (validation, test) predictions."""
    # Hold out the 30 most recent days (dt <= 30) for validation; everything
    # older (dt >= 31) is the fit set.
    fit_rows = train_df[train_df.dt >= 31]
    val_rows = train_df[train_df.dt <= 30]
    trn_x, trn_y = fit_rows[cols], fit_rows['target']
    val_x, val_y = val_rows[cols], val_rows['target']
    # Native LightGBM containers for training and evaluation.
    train_matrix = lgb.Dataset(trn_x, label=trn_y)
    valid_matrix = lgb.Dataset(val_x, label=val_y)

    # Model hyper-parameters.
    params = dict(
        objective='regression',
        metric='mse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        feature_fraction=0.9,
    )

    # Log every 100 rounds; stop after 500 rounds with no improvement.
    callbacks = [
        log_evaluation(period=100),
        early_stopping(stopping_rounds=500),
    ]

    model = lgb.train(
        params,
        train_matrix,
        num_boost_round=50000,
        valid_sets=[train_matrix, valid_matrix],
        callbacks=callbacks,
    )

    # Predict at the best iteration found by early stopping.
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_df[cols], num_iteration=model.best_iteration)
    # Offline MSE on the held-out days.
    score = mean_squared_error(val_pred, val_y)
    print(score)

    return val_pred, test_pred
-
# Train on the time split and score the test horizon.
lgb_oof, lgb_test = time_model(lgb, train, test, train_cols)

# Write the predictions out as the submission file.
test['target'] = lgb_test
test[['id', 'dt', 'target']].to_csv('submit2.csv', index=None)
打算试试看更猛的特征工程,但是用方法2(将浮点型数据float64转换为float32或float16)解决内存爆炸。
记得点赞收藏关注
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。