赞
踩
任务
通过某新区供水管网的历史压力数据、天气数据和供水管网互通图,预测未来某时间点的压力数据。
数据
主办方提供某新区供水管网数据,数据划分如下:
训练集:30个压力监测点2018年至2019年近两年的压力数据、2018年至2019年的天气数据,以及标明了30个压力监测点位置的供水管网互通图。
测试集:提供2020年4段时间每小时的压力数据和每天的天气数据(下方代码使用其中1月、3月、5月、8月的压力数据进行训练),需要分别预测对应日期(代码中为2020-02-03至02-16、04-06至04-19、06-01至06-14、09-07至09-20)每小时的压力数据。
具体数据字段描述如下:
(1)压力数据
(2)气象数据
总体思路如下:
代码如下:
"""Forecast hourly water-supply-network pressure with LightGBM.

Task (from the competition statement): predict the pressure at future time
points from a new district's historical pressure data, weather data and
network connectivity map.

* Training set: roughly two years (2018-2019) of pressure data from 30
  monitoring points, 2018-2019 weather data, and the connectivity map that
  marks the 30 monitoring points.
* Test set: hourly pressure and daily weather for four periods; the hourly
  pressure of the corresponding dates has to be predicted.

Idea noted by the original author: model adjacent pipes jointly.  This script
does not use the connectivity map or the weather data.
"""
import gc

import numpy as np
import pandas as pd

# Columns that identify a row or hold the target; all others are features.
NON_FEATURE_COLUMNS = ('id', 'pressure', 'Time', 'Time_time')

LGB_PARAMS = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'min_child_samples': 46,
    'min_child_weight': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 26,
    'max_depth': 9,
    'seed': 2019,
    'verbosity': -1,
}
N_FOLDS = 5
SEEDS = (2048, 1997)
NUM_BOOST_ROUND = 5000
EARLY_STOPPING_ROUNDS = 90
LOG_EVERY = 50

# One entry per forecast period.  All windows are inclusive (start, end).
#   train         - 2020 observations the model is fitted on
#   predict       - dates whose pressure has to be predicted
#   shift_later / shift_earlier - 2019 windows whose mean-pressure difference
#       is added to the predictions as a month-over-month correction.  The
#       earlier window is cut to the same number of days as the later one.
SEGMENTS = (
    {
        'train': ('2020-1-1', '2020-1-31'),
        'predict': ('2020-2-3', '2020-2-16'),
        'shift_later': ('2019-2-1', '2019-2-28'),
        'shift_earlier': ('2019-1-1', '2019-1-28'),
    },
    {
        'train': ('2020-3-1', '2020-3-31'),
        'predict': ('2020-4-6', '2020-4-19'),
        'shift_later': ('2019-4-1', '2019-4-30'),
        'shift_earlier': ('2019-3-1', '2019-3-30'),
    },
    {
        'train': ('2020-5-1', '2020-5-31'),
        'predict': ('2020-6-1', '2020-6-14'),
        'shift_later': ('2019-6-1', '2019-6-30'),
        'shift_earlier': ('2019-5-1', '2019-5-30'),
    },
    {
        'train': ('2020-8-1', '2020-8-31'),
        'predict': ('2020-9-7', '2020-9-20'),
        'shift_later': ('2019-9-1', '2019-9-30'),
        'shift_earlier': ('2019-8-1', '2019-8-30'),
    },
)


def reshape_data(df1):
    """Turn the wide hourly layout into one row per (Time, MeasName, Hour).

    Args:
        df1: frame with 'Time', 'MeasName' and the columns 'H0' ... 'H23'.

    Returns:
        A new frame with the columns 'Time', 'MeasName', 'Hour' (the former
        column name, e.g. 'H7') and 'pressure', sorted by Time and MeasName
        with a fresh 0..n-1 index.
    """
    hour_columns = ['H' + str(i) for i in range(24)]
    long_df = df1.melt(
        id_vars=['Time', 'MeasName'],
        value_vars=hour_columns,
        var_name='Hour',
        value_name='pressure',
    )
    long_df.sort_values(by=['Time', 'MeasName'], inplace=True)
    return long_df.reset_index(drop=True)


def abnormal(df):
    """Drop rows whose pressure is missing or outside [0.1, 0.5].

    The lower bound also removes the negative sentinel used for missing
    readings (-99999).  Rows with a NaN in any other column are dropped too.

    Args:
        df: frame with a numeric 'pressure' column.

    Returns:
        A new frame containing only the plausible rows; the input frame is
        left unmodified.
    """
    # between() is inclusive on both ends and False for NaN.
    plausible = df['pressure'].between(0.1, 0.5)
    return df[plausible].dropna()


def feature1(df):
    """Derive the integer features Day, Hour and MeasName in place.

    Args:
        df: frame with string columns 'Time' (day number after the last
            '-'), 'Hour' (such as 'H7') and 'MeasName' (such as '站点12').

    Returns:
        The same frame, with 'Day' added and 'Hour' / 'MeasName' replaced by
        their integer values.
    """
    df['Day'] = df['Time'].apply(lambda x: int(x.split('-')[-1]))
    df['Hour'] = df['Hour'].apply(lambda x: int(x.replace('H', '')))
    df['MeasName'] = df['MeasName'].apply(lambda x: int(x.replace('站点', '')))
    return df


def select_dates(df, start, end):
    """Return the rows of *df* whose 'Time_time' lies in [start, end]."""
    stamps = df['Time_time']
    return df[(stamps >= pd.Timestamp(start)) & (stamps <= pd.Timestamp(end))]


def monthly_mean_shift(history, later, earlier):
    """Return mean pressure in the *later* window minus the *earlier* one.

    Args:
        history: frame with 'Time_time' and 'pressure' columns.
        later: inclusive (start, end) date pair of the later window.
        earlier: inclusive (start, end) date pair of the earlier window.
    """
    later_mean = select_dates(history, *later)['pressure'].mean()
    earlier_mean = select_dates(history, *earlier)['pressure'].mean()
    return later_mean - earlier_mean


def fit_predict_segment(train_part, predict_part, params=None,
                        folds=N_FOLDS, seeds=SEEDS):
    """Fit seed-averaged K-fold LightGBM models and predict one segment.

    Args:
        train_part: rows to fit on; must contain 'pressure' plus features.
        predict_part: rows to predict; must contain the same feature columns.
        params: LightGBM parameters; defaults to LGB_PARAMS.
        folds: number of K-fold splits per seed.
        seeds: shuffle seeds; predictions are averaged over seeds and folds.

    Returns:
        A 1-D array with one averaged prediction per row of *predict_part*.
    """
    # Imported here so the data-preparation helpers stay importable on
    # machines without LightGBM / scikit-learn.
    import lightgbm as lgb
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold

    if params is None:
        params = LGB_PARAMS

    used_feat = [f for f in train_part.columns if f not in NON_FEATURE_COLUMNS]
    print('feat nums ', len(used_feat), used_feat)
    train_x = train_part[used_feat]
    train_y = train_part['pressure']
    test_x = predict_part[used_feat]
    print(train_x.shape, test_x.shape)

    # LightGBM 4 removed train()'s early_stopping_rounds / verbose_eval
    # keyword arguments in favour of callbacks; releases before 3.3 name the
    # logging callback print_evaluation.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    oof_train = np.zeros(len(train_x))
    preds = np.zeros(len(test_x))
    for seed in seeds:
        kfold = KFold(n_splits=folds, shuffle=True, random_state=seed)
        for fold, (trn_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
            print('fold ', fold + 1)
            x_trn, y_trn = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
            x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]
            train_set = lgb.Dataset(x_trn, y_trn)
            val_set = lgb.Dataset(x_val, y_val)
            model = lgb.train(
                params,
                train_set,
                num_boost_round=NUM_BOOST_ROUND,
                valid_sets=[train_set, val_set],
                callbacks=[
                    lgb.early_stopping(EARLY_STOPPING_ROUNDS),
                    log_evaluation(LOG_EVERY),
                ],
            )
            # Each row is out-of-fold once per seed, so average over seeds.
            oof_train[val_idx] += model.predict(x_val) / len(seeds)
            preds += model.predict(test_x) / folds / len(seeds)
            del x_trn, y_trn, x_val, y_val, model, train_set, val_set
            gc.collect()

    mse = mean_squared_error(oof_train, train_part['pressure'])
    print('-' * 120)
    print('mse ', round(mse, 5))
    return preds


def main():
    """Run the pipeline and write the predictions to 'lgb_5000.csv'.

    Only the 2019 history (for the month-over-month shift) and the 2020
    observations (for fitting) feed the model.
    """
    history = reshape_data(pd.read_csv('train_水压数据_2019.csv', engine='python'))
    observed = reshape_data(pd.read_csv('test_水压数据_2020.csv', engine='python'))
    to_predict = pd.read_csv('to_predict.csv', engine='python')

    for frame in (history, observed, to_predict):
        frame['Time_time'] = pd.to_datetime(frame['Time'])

    history = feature1(abnormal(history))
    observed = feature1(abnormal(observed))
    to_predict = feature1(to_predict)

    predicted = []
    for segment in SEGMENTS:
        train_part = select_dates(observed, *segment['train'])
        # Copy so the prediction column is written to an independent frame
        # rather than to a slice of `to_predict`.
        predict_part = select_dates(to_predict, *segment['predict']).copy()
        # Every segment gets the shift of its own pair of 2019 months.
        shift = monthly_mean_shift(
            history, segment['shift_later'], segment['shift_earlier'])
        predict_part['pressure'] = fit_predict_segment(train_part, predict_part) + shift
        predicted.append(predict_part)

    result = pd.concat(predicted, axis=0)
    result[['id', 'pressure']].to_csv('lgb_5000.csv', index=False)


if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。