# We use a dataset from an insurance company.
# 各种库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import mean_absolute_error
from sklearn.decompositi on import PCA
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.cross_validation import cross_val_score
from scipy import stats
import seaborn as sns
from copy import deepcopy
%matplotlib inline
# 早期版本的Jupyter可能引发异常
%config InlineBackend.figure_format = 'retina'
# We use a dataset from an insurance company.
train = pd.read_csv('train.csv')

# Log-transform the loss target before modeling.
# NOTE(review): assumes every `loss` value is > 0 — np.log would produce
# -inf/NaN otherwise; confirm against the raw data.
train['log_loss'] = np.log(train['loss'])
# Split the columns into categorical and numerical features.
# 'id' and the two target columns are excluded from the feature set.
non_features = ['id', 'loss', 'log_loss']
features = [c for c in train.columns if c not in non_features]
cat_features = [c for c in train.select_dtypes(include=['object']).columns
                if c not in non_features]
num_features = [c for c in train.select_dtypes(exclude=['object']).columns
                if c not in non_features]
print("Categorical features:", len(cat_features))
print("Numerical features:", len(num_features))
# Output observed on the original dataset:
# Categorical features: 116
# Numerical features: 14
ntrain = train.shape[0]
# Copy the feature slice so the integer-coding below mutates an independent
# frame instead of a view of `train` (avoids SettingWithCopyWarning).
train_x = train[features].copy()
train_y = train['log_loss']

# Integer-encode every categorical column (pandas category codes).
for col in cat_features:
    train_x[col] = train_x[col].astype('category').cat.codes

print("Xtrain:", train_x.shape)
print("ytrain:", train_y.shape)
# Output observed on the original dataset:
# Xtrain: (188318, 130)
# ytrain: (188318,)
# First we train a baseline XGBoost model, then tune its parameters and watch
# how the cross-validation results change. Model quality is measured with the
# mean absolute error on the original (un-logged) scale:
# mean_absolute_error(np.exp(y), np.exp(yhat)).
# XGBoost defines its own data-matrix class, DMatrix, which preprocesses the
# data once up front so every subsequent boosting iteration is cheaper.
def xg_eval_mae(yhat, dtrain):
    """Custom XGBoost eval metric: MAE on the original (un-logged) scale.

    Both the labels stored in ``dtrain`` and the predictions ``yhat`` are on
    the log scale, so both are exponentiated before comparing.

    Parameters
    ----------
    yhat : array-like
        Model predictions (log scale).
    dtrain : xgb.DMatrix
        Data matrix whose labels are log-transformed losses.

    Returns
    -------
    tuple of (str, float)
        ``('mae', value)`` — the (name, value) pair XGBoost's feval expects.
    """
    y = dtrain.get_label()
    # np.mean(np.abs(...)) is exactly sklearn's mean_absolute_error for 1-D
    # arrays, and keeps this metric self-contained.
    return 'mae', float(np.mean(np.abs(np.exp(y) - np.exp(yhat))))
dtrain = xgb.DMatrix(train_x, train['log_loss'])
xgb_params = {
'seed': 0,
'eta': 0.1,
'colsample_bytree': 0.5,
'silent': 1,
'subsample': 0.5,
'o
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。