赞
踩
从一个或者多个输入特征衍生来的特征(标准化和缩放法创建的特征不算),主要包含以下几个类型:
(1)将一个特征与其本身或者其他特征相乘(特征组合)
(2)两个特征相除
(3)对连续特征进行分箱,将其划分为多个区间(分箱)
1 数值特征的变换和组合
常用的数值特征的变换和组合如下:
(1)多项式特征
(2)比例特征
(3)绝对值
(4)max(x1, x2), min(x1, x2), x1 or x2
2 类别特征与数值特征的组合
用N1、N2表示数值特征,C1、C2表示类别特征
利用pandas的groupby创建以下几种新特征:
(1)中位数 median(N1)_by(C1)
(2)算数平均数 mean(N1)_by(C1)
(3)众数 mode(N1)_by(C1)
(4)最小值 min(N1)_by(C1)
(5)最大值 max(N1)_by(C1)
(6)标准差 std(N1)_by(C1)
(7)方差 var(N1)_by(C1)
(8)频数 freq(C2)_by(C1)
sklearn和xgboost中的apply()和decision_path()
1 对非线性规律进行编码
2 组合独热矢量
3 使用分桶特征列训练模型
"""Cross-feature construction + PCA + LightGBM 5-fold CV for the zhengqi dataset.

Pipeline:
    1. Build pairwise cross features (add/sub/div/mul) over all raw columns.
    2. Fit PCA on the constructed training features (target excluded).
    3. Train a LightGBM regressor with 5-fold CV and report train/test MSE.
"""
import numpy as np
import pandas as pd

TRAIN_DATA_FILE = "./zhengqi_train.txt"
TEST_DATA_FILE = "./zhengqi_test.txt"

# Guard against division by zero in the 'div' cross feature.
EPSILON = 1e-5

# Pairwise cross-feature operators; extend with e.g. x*x/y, log(x)/y as needed.
FUNC_DICT = {
    'add': lambda x, y: x + y,
    'mins': lambda x, y: x - y,
    'div': lambda x, y: x / (y + EPSILON),
    'multi': lambda x, y: x * y,
}


def auto_features_make(train_data, test_data, func_dict, col_list):
    """Return copies of train/test frames augmented with pairwise cross features.

    For every ordered pair (col_i, col_j) from col_list and every operator in
    func_dict, a new column named '<col_i>-<op>-<col_j>' is added to both
    frames. Inputs are copied first, so the caller's frames are not mutated.

    Args:
        train_data: training DataFrame (may contain extra columns, e.g. target).
        test_data: test DataFrame.
        func_dict: mapping op-name -> binary callable on two Series.
        col_list: iterable of column names present in BOTH frames.

    Returns:
        (train_augmented, test_augmented) tuple of new DataFrames.
    """
    train_data, test_data = train_data.copy(), test_data.copy()
    for col_i in col_list:
        for col_j in col_list:
            for func_name, func in func_dict.items():
                for data in (train_data, test_data):
                    col_func_features = '-'.join([col_i, func_name, col_j])
                    data[col_func_features] = func(data[col_i], data[col_j])
    return train_data, test_data


def main():
    """Run the full pipeline: features -> PCA -> 5-fold LightGBM CV."""
    # Heavy third-party imports deferred so auto_features_make is importable
    # without sklearn/lightgbm installed.
    from sklearn.decomposition import PCA
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold
    import lightgbm as lgb

    train_data = pd.read_csv(TRAIN_DATA_FILE, sep='\t', encoding='utf-8')
    test_data = pd.read_csv(TEST_DATA_FILE, sep='\t', encoding='utf-8')

    # Cross features are built only from the columns the test set also has
    # (i.e. everything except the training-only 'target' column).
    train_data2, test_data2 = auto_features_make(
        train_data, test_data, FUNC_DICT, col_list=test_data.columns
    )

    # PCA dimensionality reduction.
    # BUG FIX: the original used train_data2.iloc[:, 0:-1] to drop 'target',
    # but after feature construction 'target' is no longer the last column —
    # that sliced off a derived feature and leaked 'target' into PCA.
    # Drop the target by name instead.
    pca = PCA(n_components=500)
    train_data2_pca = pca.fit_transform(train_data2.drop(columns=['target']))
    test_data2_pca = pca.transform(test_data2)
    train_data2_pca = pd.DataFrame(train_data2_pca)
    test_data2_pca = pd.DataFrame(test_data2_pca)
    train_data2_pca['target'] = train_data2['target'].values

    # NOTE: as in the original, the model below trains on the raw constructed
    # features (columns shared with the test set), not the PCA projection.
    x_train2 = train_data2[test_data2.columns].values
    y_train2 = train_data2['target']

    # 5-fold cross-validation.
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=2019)
    mse_dict = {'train_mse': [], 'test_mse': []}

    for i, (train_index, test_index) in enumerate(kf.split(x_train2)):
        lgb_reg = lgb.LGBMRegressor(
            learning_rate=0.01,
            max_depth=-1,
            n_estimators=5000,
            boosting_type='gbdt',
            random_state=2019,
            objective='regression',
        )

        # Split this fold.
        x_train_kfold = x_train2[train_index]
        x_test_kfold = x_train2[test_index]
        y_train_kfold = y_train2[train_index]
        y_test_kfold = y_train2[test_index]

        lgb_reg.fit(
            X=x_train_kfold,
            y=y_train_kfold,
            eval_set=[(x_train_kfold, y_train_kfold),
                      (x_test_kfold, y_test_kfold)],
            eval_names=['train', 'test'],
            early_stopping_rounds=100,
            eval_metric='MSE',
            verbose=50,
        )

        # Predict with the best (early-stopped) iteration.
        y_train_kfold_predict = lgb_reg.predict(
            x_train_kfold, num_iteration=lgb_reg.best_iteration_)
        y_test_kfold_predict = lgb_reg.predict(
            x_test_kfold, num_iteration=lgb_reg.best_iteration_)

        print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
        # sklearn convention is (y_true, y_pred); MSE is symmetric but we
        # follow the documented argument order anyway.
        train_mse = mean_squared_error(y_train_kfold, y_train_kfold_predict)
        print('-----\n', '训练MSE\n', train_mse, '\n-----')
        test_mse = mean_squared_error(y_test_kfold, y_test_kfold_predict)
        print('-----\n', '预测MSE\n', test_mse, '\n-----')

        mse_dict['train_mse'].append(train_mse)
        mse_dict['test_mse'].append(test_mse)

    print('-----\n', '训练MSE\n', mse_dict['train_mse'],
          '\n', np.mean(mse_dict['train_mse']), '\n-----')
    print('-----\n', '预测MSE\n', mse_dict['test_mse'],
          '\n', np.mean(mse_dict['test_mse']), '\n-----')


if __name__ == "__main__":
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。