赞
踩
根据《Task3:尝试使用深度学习方案》(飞书云文档,feishu.cn)中“优化方案详解”部分的教学进行优化。
1.进一步优化特征
(1)历史平移特征:通过历史平移获取上个阶段的信息;
(2)差分特征:可以帮助获取相邻阶段的增长差异,描述数据的涨减变化情况。
(3)窗口统计特征:基于窗口范围进行统计,获取均值、最大值、最小值、中位数、方差等信息,并尝试不同的窗口大小进行测试。
-
# Lag features: the target value i rows earlier within each id group.
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features: change of the 10-row lag over 1, 2 and 3 rows.
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# NOTE(review): the rolling results below are written back with `.values`,
# i.e. positionally, while groupby(...).rolling(...) returns rows ordered by
# group key. This presumably relies on `data` already being sorted so that
# rows of each id are contiguous and in key order -- confirm against the
# code that builds `data`.

# Rolling statistics of the raw target. closed='left' excludes the current
# row from its own window; min_periods=3 gives NaN until 3 values exist.
for win in [15, 30, 50, 70]:
    target_roll = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_win{win}_mean'] = target_roll.mean().values
    data[f'target_win{win}_max'] = target_roll.max().values
    data[f'target_win{win}_min'] = target_roll.min().values
    data[f'target_win{win}_std'] = target_roll.std().values

# Rolling statistics of the 10-row lag (same window semantics as above).
for win in [7, 14, 28, 35, 50, 70]:
    shift10_roll = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_shift10_win{win}_mean'] = shift10_roll.mean().values
    data[f'target_shift10_win{win}_max'] = shift10_roll.max().values
    data[f'target_shift10_win{win}_min'] = shift10_roll.min().values
    data[f'target_shift10_win{win}_sum'] = shift10_roll.sum().values
    # Column name fixed: it was 'target_shift710win{win}_std', which did not
    # follow the 'target_shift10_win{win}_*' pattern of the other statistics.
    data[f'target_shift10_win{win}_std'] = shift10_roll.std().values
- from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
2.通过模型融合改善预测结果
尝试使用 catboost、xgboost 和 lightgbm 三个模型分别输出预测结果,最终进行平均融合(下方代码对三个模型的结果取等权重平均)。
- import lightgbm as lgb
- import xgboost as xgb
- from catboost import CatBoostRegressor
- from sklearn.metrics import mean_squared_error, mean_absolute_error
-
# Split the combined frame: rows that have a target are training data,
# rows whose target is missing form the test set.
has_target = data['target'].notnull()
train = data[has_target].reset_index(drop=True)
test = data[~has_target].reset_index(drop=True)

# Model inputs: every column except the identifier and the label.
non_feature_cols = ['id', 'target']
train_cols = [col for col in train.columns if col not in non_feature_cols]
test_cols = [col for col in test.columns if col not in non_feature_cols]
def _fit_lgb_fold(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train LightGBM on one fold; return (model, val_pred, test_pred)."""
    train_matrix = clf.Dataset(trn_x, label=trn_y)
    valid_matrix = clf.Dataset(val_x, label=val_y)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mae',
        'min_child_weight': 6,
        'num_leaves': 2 ** 6,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': 2023,
        'nthread': 16,
        'verbose': -1,
    }
    model = clf.train(params, train_matrix, 1000,
                      valid_sets=[train_matrix, valid_matrix],
                      categorical_feature=[])
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    return model, val_pred, test_pred


def _fit_xgb_fold(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train XGBoost on one fold; return (model, val_pred, test_pred)."""
    xgb_params = {
        'booster': 'gbtree',
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.1,
        'tree_method': 'hist',
        'seed': 520,
        'nthread': 16,
    }
    train_matrix = clf.DMatrix(trn_x, label=trn_y)
    valid_matrix = clf.DMatrix(val_x, label=val_y)
    test_matrix = clf.DMatrix(test_x)

    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

    model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist)
    val_pred = model.predict(valid_matrix)
    test_pred = model.predict(test_matrix)
    return model, val_pred, test_pred


def _fit_cat_fold(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train CatBoost on one fold; return (model, val_pred, test_pred)."""
    # 'random_seed' used to appear twice in this dict (2023, then 11). A dict
    # literal keeps the last value, so 11 was the effective seed; it is kept.
    params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli',
              'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11,
              'allow_writing_files': False}

    model = clf(iterations=1000, **params)
    model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
              metric_period=200,
              use_best_model=True,
              cat_features=[],
              verbose=1)

    val_pred = model.predict(val_x)
    test_pred = model.predict(test_x)
    return model, val_pred, test_pred


def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    """Run 5-fold cross-validation for one gradient-boosting library.

    Args:
        clf: Library handle used for training: a module exposing
            ``Dataset``/``train`` for ``'lgb'``, a module exposing
            ``DMatrix``/``train`` for ``'xgb'``, or a regressor class for
            ``'cat'``.
        train_x: Training features; must support ``.iloc`` and ``.shape``.
        train_y: Training labels, one per row of ``train_x``.
        test_x: Test features with the same columns as ``train_x``.
        clf_name: Which branch to run: ``'lgb'``, ``'xgb'`` or ``'cat'``.
        seed: Random state of the shuffled KFold split. It is not passed to
            the models, whose seeds are fixed in their parameter dicts.

    Returns:
        A tuple ``(oof, test_predict, model)``: out-of-fold predictions for
        every training row, test predictions averaged over the folds, and
        the model fitted on the last fold only.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    fold_trainers = {
        'lgb': _fit_lgb_fold,
        'xgb': _fit_xgb_fold,
        'cat': _fit_cat_fold,
    }
    # Validate first: previously an unknown name skipped all three branches
    # and surfaced later as a NameError on `val_pred`.
    if clf_name not in fold_trainers:
        raise ValueError(
            "clf_name must be one of {}, got {!r}".format(sorted(fold_trainers), clf_name)
        )
    fit_fold = fold_trainers[clf_name]

    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # KFold yields positional indices. Indexing a pandas Series with `[...]`
    # is label-based, so convert once to an array and index by position.
    labels = np.asarray(train_y)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    model = None

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        model, val_pred, test_pred = fit_fold(clf, trn_x, trn_y, val_x, val_y, test_x)

        oof[valid_index] = val_pred
        # Each fold contributes an equal share of the final test prediction.
        test_predict += test_pred / kf.n_splits

        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    return oof, test_predict, model
-
# Cross-validate each library on the same feature columns. Each call returns
# out-of-fold predictions, fold-averaged test predictions and the model
# fitted on the last fold.
lgb_oof, lgb_test, lgb_model = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
xgb_oof, xgb_test, xgb_model = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
cat_oof, cat_test, cat_model = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')

# Blend the three models with an equal-weight mean.
final_test = (lgb_test + xgb_test + cat_test) / 3
train_cols = [f for f in train.columns if f not in ['id','target']]
test_cols = [f for f in test.columns if f not in ['id','target']]
# The print and the import were fused on one line (a syntax error); they are
# two separate statements.
print(final_test)
import matplotlib.pyplot as plt
3.对特征重要性进行可视化,根据特征重要性调整特征,尝试改善预测结果(此处以lgb为例)
- import matplotlib.pyplot as plt
-
# Pair each LightGBM importance score with its feature name and rank the
# pairs from most to least important.
importances = lgb_model.feature_importance()
ranked = sorted(zip(importances, train_cols), reverse=True)
ranked_names = [name for _, name in ranked]
ranked_scores = [score for score, _ in ranked]

# A tall figure leaves room for many long feature names.
plt.figure(figsize=(15, 20))

# Horizontal bars, one per feature.
plt.barh(ranked_names, ranked_scores)

# Axis labels and title.
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title('特征重要性 (LightGBM)')

# Flip the y axis so the most important feature is drawn at the top.
plt.gca().invert_yaxis()

# Rotate the x tick labels for readability.
plt.xticks(rotation=45, ha='right')

# Render the figure.
plt.show()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。