import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# support vector machine
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')
train_data = pd.read_csv('./zhengqi_train.txt',sep='\t')
test_data = pd.read_csv('./zhengqi_test.txt',sep='\t')
# merge the training data and the data to be predicted
train_data["origin"]="train"
test_data["origin"]="test"
data_all=pd.concat([train_data,test_data],axis=0,ignore_index=True)
#View data
data_all
(Output: data_all — 4813 rows × 40 columns: features V0–V37 plus origin ("train"/"test") and target, where target is NaN for the test rows.)
# inspect the numeric feature columns, i.e. everything except the last two (origin, target)
data_all.columns[:-2]
Index(['V0', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17',
'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26',
'V27', 'V28', 'V29', 'V3', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35',
'V36', 'V37', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9'],
dtype='object')
# 38 features in total; drop the unimportant ones
# compare each feature's distribution between train and test; drop features whose distributions clearly disagree
for column in data_all.columns[0:-2]:
    g = sns.kdeplot(data_all[column][(data_all["origin"] == "train")], color="Red", shade=True)
    g = sns.kdeplot(data_all[column][(data_all["origin"] == "test")], ax=g, color="Blue", shade=True)
    g.set_xlabel(column)
    g.set_ylabel("Frequency")
    g = g.legend(["train", "test"])
    plt.show()
fig = plt.figure(figsize=(10, 10))
for i in range(len(data_all.columns) - 2):
    g = sns.FacetGrid(data_all, col='origin')
    g = g.map(sns.distplot, data_all.columns[i])
<Figure size 720x720 with 0 Axes>
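# Optional cross-check (a sketch, not in the original notebook): a two-sample
# Kolmogorov-Smirnov test quantifies the gap between the train and test
# distributions of each feature; the 0.01 threshold is an illustrative choice.
from scipy.stats import ks_2samp
for column in data_all.columns[:-2]:
    train_vals = data_all.loc[data_all['origin'] == 'train', column]
    test_vals = data_all.loc[data_all['origin'] == 'test', column]
    stat, p = ks_2samp(train_vals, test_vals)
    if p < 0.01:  # illustrative threshold for "distributions differ"
        print('%s: KS statistic=%.3f, p=%.3g' % (column, stat, p))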
# the plots show that 'V11','V17','V22','V5' fluctuate too much between train and test; drop them
drop_labels = ['V11','V17','V22','V5']
data_all.drop(drop_labels,axis=1,inplace=True)
# examine how strongly the features correlate
plt.figure(figsize=(20, 16))  # set the figure width and height
mcorr = train_data.corr()  # correlation matrix: pairwise correlation between every two variables
mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix with the same shape as mcorr
mask[np.triu_indices_from(mask)] = True  # True above the diagonal, so only the lower triangle is drawn
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # a matplotlib colormap
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # heatmap of pairwise correlations
plt.show()
(Figure: lower-triangle heatmap of the pairwise feature correlations)
# use the correlation coefficients to find the 7 features weakly correlated with the target
cond = mcorr.loc['target'].abs()<0.1
drop_labels = mcorr.loc['target'][cond].index
# ['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34']
# after re-checking their distributions, only drop the poorly distributed ones ('V14', 'V21')
drop_labels = ['V14', 'V21']
data_all.drop(drop_labels,axis=1,inplace=True)
# 6 features dropped so far (V11, V17, V22, V5 by distribution; V14, V21 by correlation)
data_all.shape
(4813, 34)
data = data_all.iloc[:,:-2]
minmaxscale = MinMaxScaler()
data = minmaxscale.fit_transform(data)
data
array([[0.77577505, 0.723449 , 0.22174265, ..., 0.43285165, 0.66410771,
0.73528007],
[0.83374189, 0.77878549, 0.37388724, ..., 0.43285165, 0.7548128 ,
0.73528007],
[0.84023071, 0.79600421, 0.46641489, ..., 0.43285165, 0.76237156,
0.73528007],
...,
[0.31708724, 0.25289169, 0.0074184 , ..., 0.17367095, 0.10192512,
0.64706284],
[0.31045422, 0.24211356, 0.00323712, ..., 0.24075302, 0.1563718 ,
0.67646858],
[0.35948089, 0.32216088, 0.35608309, ..., 0.24897256, 0.19971655,
0.67646858]])
# wrap the normalized data back into a DataFrame
data_all_norm = pd.DataFrame(data,columns=data_all.columns[:-2])
data_all_norm
(Output: data_all_norm — 4813 rows × 32 feature columns, every value scaled into [0, 1].)
# merge the origin and target columns back on
data_all_norm = pd.merge(data_all_norm,data_all.iloc[:,-2:],left_index=True,right_index=True)
data_all_norm.describe()
(Output: data_all_norm.describe() — each of the 32 feature columns has count 4813, min 0.0 and max 1.0; target has count 2888, mean 0.126, std 0.984, min -3.044, max 2.538.)
def scale_minmax(data):
    return (data - data.min()) / (data.max() - data.min())
# use Box-Cox to smooth the continuous features (mainly to bring them closer to a normal distribution)
from scipy import stats

fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for col in data_all_norm.columns[:10]:
    dat = data_all_norm[[col, 'target']].dropna()

    # 1) distribution plot of the original data (dist = distribution)
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[col], fit=stats.norm)
    plt.title(col + ' Original')
    plt.xlabel('')

    # 2) probability plot; skewness measures the departure from normality
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[col], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')

    # 3) scatter plot against the target
    i += 1
    plt.subplot(frows, fcols, i)
    plt.scatter(dat[col], dat['target'], alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))

    # 4) now transform the data: distribution plot after Box-Cox
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(col + ' Transformed')
    plt.xlabel('')

    # 5) skewness after the transform
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    # 6) scatter plot of the transformed feature against the target
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
# apply the Box-Cox transform to every feature column
# a data transformation commonly used in statistical modeling
# it makes the data more normal and better standardized
for col in data_all_norm.columns[:-2]:
    boxcox, maxlog = stats.boxcox(data_all_norm[col] + 1)
    data_all_norm[col] = scale_minmax(boxcox)
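# Sanity check (a sketch, not in the original notebook): after Box-Cox the
# per-column skewness should sit close to 0, i.e. a roughly symmetric shape.
skews = data_all_norm.iloc[:, :-2].apply(stats.skew)
print(skews.abs().sort_values(ascending=False).head())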
data_all_norm
(Output: data_all_norm after the Box-Cox transform — 4813 rows × 34 columns: the 32 transformed features plus origin and target.)
ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
# ground-truth target values
y_train = data_all_norm[cond]['target']

# the model can never fit the data and targets 100%
ridge.fit(X_train, y_train)

# predictions always deviate somewhat from the truth;
# treat samples with an especially large deviation as outliers
y_ = ridge.predict(X_train)
cond = abs(y_ - y_train) > y_train.std()
print(cond.sum())

# visualize the outliers
plt.figure(figsize=(12, 6))

axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)

axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')

axes = plt.subplot(1, 3, 3)
(y_train - y_).plot.hist(bins=50, ax=axes)
(y_train - y_).loc[cond].plot.hist(bins=50, ax=axes, color='r')
40
<matplotlib.axes._subplots.AxesSubplot at 0x2403c0836a0>
index = cond[cond].index
data_all_norm.drop(index,axis = 0,inplace=True)
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:,:-2]
y_train = data_all_norm[cond]['target']
cond = data_all_norm['origin'] == 'test'
X_test = data_all_norm[cond].iloc[:,:-2]
estimators = {}
estimators['forest'] = RandomForestRegressor(n_estimators=300)
estimators['gbdt'] = GradientBoostingRegressor(n_estimators=300)
estimators['ada'] = AdaBoostRegressor(n_estimators=300)
estimators['extreme'] = ExtraTreesRegressor(n_estimators=300)
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor(n_estimators=300)
estimators['xgb'] = XGBRegressor(n_estimators=300)
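# Optional (a sketch, not in the original notebook): cross-validate each model
# first to get a rough per-model MSE before trusting the blend; cross_val_score
# and the 5-fold split are illustrative additions.
from sklearn.model_selection import cross_val_score
for key, model in estimators.items():
    scores = cross_val_score(model, X_train, y_train,
                             scoring='neg_mean_squared_error', cv=5)
    print('%s: CV MSE %.4f' % (key, -scores.mean()))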
# store each model's predictions in a list and average them as the final answer
result = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    result.append(y_)
y_ = np.mean(result,axis = 0)
pd.Series(y_).to_csv('./norm.txt',index = False)
[19:51:26] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.