赞
踩
Wrapper (Recursive feature elimination,RFE)
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- import datetime
- from tqdm import tqdm
- from sklearn.preprocessing import LabelEncoder
- from sklearn.feature_selection import SelectKBest
- from sklearn.feature_selection import chi2
- from sklearn.preprocessing import MinMaxScaler
- import xgboost as xgb
- import lightgbm as lgb
- from catboost import CatBoostRegressor
- import warnings
- from sklearn.model_selection import StratifiedKFold, KFold
- from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
- warnings.filterwarnings('ignore')
# Load the training set and the "A" test set from the parent directory.
data_train = pd.read_csv('../train.csv')
data_test_a = pd.read_csv('../testA.csv')

# Numeric features: every column whose dtype is not object.
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.tolist()
# Categorical features: the remaining columns, in their original order.
category_fea = [col for col in data_train.columns if col not in numerical_fea]

# The target column is numeric, so take it out of the feature list.
label = 'isDefault'
numerical_fea.remove(label)
# Three ways of filling missing values, written one after another.
# NOTE(review): these look like alternative strategies rather than a pipeline;
# once the first call has run no NaN is left for the next two to fill —
# confirm that only one of them is meant to be used.
# 1) Replace every missing value with 0.
data_train = data_train.fillna(0)
# 2) Forward fill along the rows (axis=0): a gap takes the value above it.
data_train = data_train.fillna(axis=0, method='ffill')
# 3) Backward fill along the rows, filling at most 2 consecutive missing values.
data_train = data_train.fillna(axis=0, method='bfill', limit=2)
查看缺失值情况:
- data_train.isnull().sum()
-
- # id 0
- # loanAmnt 0
- # term 0
- # interestRate 0
- # installment 0
- # grade 0
- # subGrade 0
- # employmentTitle 1
- # employmentLength 46799
- # homeOwnership 0
- # annualIncome 0
- # verificationStatus 0
- # issueDate 0
- # isDefault 0
- # purpose 0
- # postCode 1
- # regionCode 0
- # dti 239
- # delinquency_2years 0
- # ficoRangeLow 0
- # ficoRangeHigh 0
- # openAcc 0
- # pubRec 0
- # pubRecBankruptcies 405
- # revolBal 0
- # revolUtil 531
- # totalAcc 0
- # initialListStatus 0
- # applicationType 0
- # earliesCreditLine 0
- # title 1
- # policyCode 0
- # n0 40270
- # n1 40270
- # n2 40270
- # n2.1 40270
- # n4 33239
- # n5 40270
- # n6 40270
- # n7 40270
- # n8 40271
- # n9 40270
- # n10 33239
- # n11 69752
- # n12 40270
- # n13 40270
- # n14 40270
- # dtype: int64
# Fill numeric features with the median of the training set.
# The statistics are computed once, from the training data only, and reused
# for the test set so both frames are imputed with the same values.
numeric_medians = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(numeric_medians)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(numeric_medians)

# Fill categorical features with the mode of the training set.
# DataFrame.mode() returns a DataFrame (one row per tied mode). Passing that
# frame to fillna aligns on the row index, so only row 0 would be filled.
# Taking the first row gives a "column -> most frequent value" Series, which
# fillna applies to every row of the matching column.
category_modes = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(category_modes)
data_test_a[category_fea] = data_test_a[category_fea].fillna(category_modes)
- data_train.isnull().sum()
-
- # id 0
- # loanAmnt 0
- # term 0
- # interestRate 0
- # installment 0
- # grade 0
- # subGrade 0
- # employmentTitle 0
- # employmentLength 46799
- # homeOwnership 0
- # annualIncome 0
- # verificationStatus 0
- # issueDate 0
- # isDefault 0
- # purpose 0
- # postCode 0
- # regionCode 0
- # dti 0
- # delinquency_2years 0
- # ficoRangeLow 0
- # ficoRangeHigh 0
- # openAcc 0
- # pubRec 0
- # pubRecBankruptcies 0
- # revolBal 0
- # revolUtil 0
- # totalAcc 0
- # initialListStatus 0
- # applicationType 0
- # earliesCreditLine 0
- # title 0
- # policyCode 0
- # n0 0
- # n1 0
- # n2 0
- # n2.1 0
- # n4 0
- # n5 0
- # n6 0
- # n7 0
- # n8 0
- # n9 0
- # n10 0
- # n11 0
- # n12 0
- # n13 0
- # n14 0
- # dtype: int64
# Parse the issue date and derive the number of days since a reference date.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    # Convert the text column to datetime.
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Time feature: whole days elapsed between startdate and the issue date.
    elapsed = data['issueDate'] - startdate
    data['issueDateDT'] = elapsed.dt.days
- data_train['employmentLength'].value_counts(dropna=False).sort_index()
-
- # 1 year 52489
- # 10+ years 262753
- # 2 years 72358
- # 3 years 64152
- # 4 years 47985
- # 5 years 50102
- # 6 years 37254
- # 7 years 35407
- # 8 years 36192
- # 9 years 30272
- # < 1 year 64237
- # NaN 46799
- # Name: employmentLength, dtype: int64
def employmentLength_to_int(s):
    """Convert an employment-length label to a number of years.

    Args:
        s: A label such as ``'1 year'``, ``'3 years'``, ``'10 years'``,
            ``'10+ years'`` or ``'< 1 year'``; a value that is already
            numeric; or a missing value.

    Returns:
        The number of years as ``np.int8``. ``'< 1 year'`` maps to 0 and
        ``'10+ years'`` to 10. Missing values are returned unchanged.

    Raises:
        ValueError: If the leading token of a string label is not a number.
    """
    if pd.isnull(s):
        return s
    if not isinstance(s, str):
        # Already converted (e.g. when the cell is re-run): just normalise the type.
        return np.int8(s)
    text = s.strip()
    if text.startswith('<'):
        # "< 1 year" means less than one full year.
        return np.int8(0)
    # First token is the number; drop the "+" of the open-ended "10+ years".
    return np.int8(int(text.split()[0].rstrip('+')))
-
for data in [data_train, data_test_a]:
    # Normalise the two irregular labels so every value reads "<n> year(s)".
    # The result is assigned back instead of calling replace(inplace=True) on
    # data['employmentLength']: an in-place call on a selected column acts on
    # an intermediate object and is not guaranteed to update the frame.
    data['employmentLength'] = data['employmentLength'].replace(
        {'10+ years': '10 years', '< 1 year': '0 years'}
    )
    # Turn the label into a number of years; missing values stay missing.
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
-
- data['employmentLength'].value_counts(dropna=False).sort_index()
-
- # 0.0 15989
- # 1.0 13182
- # 2.0 18207
- # 3.0 16011
- # 4.0 11833
- # 5.0 12543
- # 6.0 9328
- # 7.0 8823
- # 8.0 8976
- # 9.0 7594
- # 10.0 65772
- # NaN 11742
- # Name: employmentLength, dtype: int64
- data_train['earliesCreditLine'].sample(5)
-
- # 642880 Jun-1992
- # 77423 Aug-1983
- # 356008 Mar-1999
- # 84346 Aug-2007
- # 574182 Sep-2005
- # Name: earliesCreditLine, dtype: object
for data in [data_train, data_test_a]:
    # Values look like "Jun-1992": keep the last four characters (the year)
    # and store them as integers.
    year_text = data['earliesCreditLine'].str[-4:]
    data['earliesCreditLine'] = year_text.astype('int64')
# A subset of the categorical features.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]
# Print the number of distinct values of each one in the training set.
for f in cate_features:
    n_levels = data_train[f].nunique()
    print(f, '类型数:', n_levels)
-
- # grade 类型数: 7
- # subGrade 类型数: 35
- # employmentTitle 类型数: 248683
- # homeOwnership 类型数: 6
- # verificationStatus 类型数: 3
- # purpose 类型数: 14
- # postCode 类型数: 932
- # regionCode 类型数: 51
- # applicationType 类型数: 2
- # initialListStatus 类型数: 2
- # title 类型数: 39644
- # policyCode 类型数: 1
# Map the letter grades A-G to the integers 1-7.
grade_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map(grade_codes)

# Purely nominal features with more than two levels that are not
# high-dimensional/sparse.
onehot_columns = ['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode']
for data in [data_train, data_test_a]:
    # The encoded frame goes into its own variable. Assigning it to `data`
    # would only rebind the loop variable: data_train / data_test_a would not
    # change, and `data` would be left pointing at a throwaway copy.
    # NOTE(review): later steps in this file still read the raw columns
    # (target/label encoding of subGrade, outlier flags on homeOwnership,
    # purpose, ...), so writing the encoding back would also require changing
    # them — confirm the intended pipeline before doing so.
    onehot_preview = pd.get_dummies(data, columns=onehot_columns, drop_first=True)
def find_outliers_by_3segama(data, fea, n_sigma=3):
    """Flag the values of one column that lie outside mean +/- n_sigma * std.

    Args:
        data: DataFrame holding the column; it is modified in place.
        fea: Name of the column to inspect.
        n_sigma: Half-width of the accepted band in standard deviations
            (default 3, the classic three-sigma rule).

    Returns:
        The same DataFrame with a new column ``fea + '_outliers'`` that holds
        '异常值' for values outside the band and '正常值' for all others.
        Missing values compare False on both sides and are therefore
        labelled '正常值'.
    """
    column = data[fea]
    data_std = np.std(column)
    data_mean = np.mean(column)
    outliers_cut_off = data_std * n_sigma
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # Vectorised comparison instead of a Python-level apply over every row.
    is_outlier = (column > upper_rule) | (column < lower_rule)
    data[fea + '_outliers'] = np.where(is_outlier, '异常值', '正常值')
    return data
-
# Work on a copy so the flag columns are added to a fresh frame.
data_train = data_train.copy()
for fea in numerical_fea:
    flag_column = fea + '_outliers'
    data_train = find_outliers_by_3segama(data_train, fea)
    # How many rows were flagged, and how many defaults fall in each group.
    flag_counts = data_train[flag_column].value_counts()
    defaults_per_flag = data_train.groupby(flag_column)['isDefault'].sum()
    print(flag_counts)
    print(defaults_per_flag)
    print('*' * 10)
-
- # 正常值 800000
- # Name: id_outliers, dtype: int64
- # id_outliers
- # 正常值 159610
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 800000
- # Name: loanAmnt_outliers, dtype: int64
- # loanAmnt_outliers
- # 正常值 159610
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 800000
- # Name: term_outliers, dtype: int64
- # term_outliers
- # 正常值 159610
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 794259
- # 异常值 5741
- # Name: interestRate_outliers, dtype: int64
- # interestRate_outliers
- # 异常值 2916
- # 正常值 156694
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 792046
- # 异常值 7954
- # Name: installment_outliers, dtype: int64
- # installment_outliers
- # 异常值 2152
- # 正常值 157458
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 800000
- # Name: employmentTitle_outliers, dtype: int64
- # employmentTitle_outliers
- # 正常值 159610
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 799701
- # 异常值 299
- # Name: homeOwnership_outliers, dtype: int64
- # homeOwnership_outliers
- # 异常值 62
- # 正常值 159548
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 793973
- # 异常值 6027
- # Name: annualIncome_outliers, dtype: int64
- # annualIncome_outliers
- # 异常值 756
- # 正常值 158854
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 800000
- # Name: verificationStatus_outliers, dtype: int64
- # verificationStatus_outliers
- # 正常值 159610
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 783003
- # 异常值 16997
- # Name: purpose_outliers, dtype: int64
- # purpose_outliers
- # 异常值 3635
- # 正常值 155975
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 798931
- # 异常值 1069
- # Name: postCode_outliers, dtype: int64
- # postCode_outliers
- # 异常值 221
- # 正常值 159389
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 799994
- # 异常值 6
- # Name: regionCode_outliers, dtype: int64
- # regionCode_outliers
- # 异常值 1
- # 正常值 159609
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 798440
- # 异常值 1560
- # Name: dti_outliers, dtype: int64
- # dti_outliers
- # 异常值 466
- # 正常值 159144
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 778245
- # 异常值 21755
- # Name: delinquency_2years_outliers, dtype: int64
- # delinquency_2years_outliers
- # 异常值 5089
- # 正常值 154521
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 788261
- # 异常值 11739
- # Name: ficoRangeLow_outliers, dtype: int64
- # ficoRangeLow_outliers
- # 异常值 778
- # 正常值 158832
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 788261
- # 异常值 11739
- # Name: ficoRangeHigh_outliers, dtype: int64
- # ficoRangeHigh_outliers
- # 异常值 778
- # 正常值 158832
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 790889
- # 异常值 9111
- # Name: openAcc_outliers, dtype: int64
- # openAcc_outliers
- # 异常值 2195
- # 正常值 157415
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 792471
- # 异常值 7529
- # Name: pubRec_outliers, dtype: int64
- # pubRec_outliers
- # 异常值 1701
- # 正常值 157909
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 794120
- # 异常值 5880
- # Name: pubRecBankruptcies_outliers, dtype: int64
- # pubRecBankruptcies_outliers
- # 异常值 1423
- # 正常值 158187
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 790001
- # 异常值 9999
- # Name: revolBal_outliers, dtype: int64
- # revolBal_outliers
- # 异常值 1359
- # 正常值 158251
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 799948
- # 异常值 52
- # Name: revolUtil_outliers, dtype: int64
- # revolUtil_outliers
- # 异常值 23
- # 正常值 159587
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 791663
- # 异常值 8337
- # Name: totalAcc_outliers, dtype: int64
- # totalAcc_outliers
- # 异常值 1668
- # 正常值 157942
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 800000
- # Name: initialListStatus_outliers, dtype: int64
- # initialListStatus_outliers
- # 正常值 159610
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 784586
- # 异常值 15414
- # Name: applicationType_outliers, dtype: int64
- # applicationType_outliers
- # 异常值 3875
- # 正常值 155735
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 775134
- # 异常值 24866
- # Name: title_outliers, dtype: int64
- # title_outliers
- # 异常值 3900
- # 正常值 155710
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 800000
- # Name: policyCode_outliers, dtype: int64
- # policyCode_outliers
- # 正常值 159610
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 782773
- # 异常值 17227
- # Name: n0_outliers, dtype: int64
- # n0_outliers
- # 异常值 3485
- # 正常值 156125
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 790500
- # 异常值 9500
- # Name: n1_outliers, dtype: int64
- # n1_outliers
- # 异常值 2491
- # 正常值 157119
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 789067
- # 异常值 10933
- # Name: n2_outliers, dtype: int64
- # n2_outliers
- # 异常值 3205
- # 正常值 156405
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 789067
- # 异常值 10933
- # Name: n2.1_outliers, dtype: int64
- # n2.1_outliers
- # 异常值 3205
- # 正常值 156405
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 788660
- # 异常值 11340
- # Name: n4_outliers, dtype: int64
- # n4_outliers
- # 异常值 2476
- # 正常值 157134
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 790355
- # 异常值 9645
- # Name: n5_outliers, dtype: int64
- # n5_outliers
- # 异常值 1858
- # 正常值 157752
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 786006
- # 异常值 13994
- # Name: n6_outliers, dtype: int64
- # n6_outliers
- # 异常值 3182
- # 正常值 156428
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 788430
- # 异常值 11570
- # Name: n7_outliers, dtype: int64
- # n7_outliers
- # 异常值 2746
- # 正常值 156864
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 789625
- # 异常值 10375
- # Name: n8_outliers, dtype: int64
- # n8_outliers
- # 异常值 2131
- # 正常值 157479
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 786384
- # 异常值 13616
- # Name: n9_outliers, dtype: int64
- # n9_outliers
- # 异常值 3953
- # 正常值 155657
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 788979
- # 异常值 11021
- # Name: n10_outliers, dtype: int64
- # n10_outliers
- # 异常值 2639
- # 正常值 156971
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 799434
- # 异常值 566
- # Name: n11_outliers, dtype: int64
- # n11_outliers
- # 异常值 112
- # 正常值 159498
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 797585
- # 异常值 2415
- # Name: n12_outliers, dtype: int64
- # n12_outliers
- # 异常值 545
- # 正常值 159065
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 788907
- # 异常值 11093
- # Name: n13_outliers, dtype: int64
- # n13_outliers
- # 异常值 2482
- # 正常值 157128
- # Name: isDefault, dtype: int64
- # **********
- # 正常值 788884
- # 异常值 11116
- # Name: n14_outliers, dtype: int64
- # n14_outliers
- # 异常值 3364
- # 正常值 156246
- # Name: isDefault, dtype: int64
- # **********
# Remove outliers: keep only the rows that are flagged as normal for every
# numeric feature, then renumber the index from zero.
flag_columns = [fea + '_outliers' for fea in numerical_fea]
is_normal_everywhere = (data_train[flag_columns] == '正常值').all(axis=1)
data_train = data_train[is_normal_everywhere].reset_index(drop=True)
# Fixed-width binning by integer division: each bin spans 1000 units of
# loanAmnt. The feature is written to both the training and the test frame
# explicitly rather than to whatever a previous loop left in `data`.
for data in [data_train, data_test_a]:
    data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
data['loanAmnt_bin1']
-
- # 0 14.0
- # 1 20.0
- # 2 12.0
- # 3 17.0
- # 4 35.0
- # ...
- # 199995 7.0
- # 199996 6.0
- # 199997 14.0
- # 199998 8.0
- # 199999 8.0
- # Name: loanAmnt_bin1, Length: 200000, dtype: float64
# Exponential-width binning: the bin is the integer part of log10(loanAmnt).
# The feature is written to both the training and the test frame explicitly
# rather than to whatever a previous loop left in `data`.
for data in [data_train, data_test_a]:
    data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
data['loanAmnt_bin2']
-
- # 0 4.0
- # 1 4.0
- # 2 4.0
- # 3 4.0
- # 4 4.0
- # ...
- # 199995 3.0
- # 199996 3.0
- # 199997 4.0
- # 199998 3.0
- # 199999 3.0
- # Name: loanAmnt_bin2, Length: 200000, dtype: float64
# Quantile binning: ten equal-frequency buckets. The bin edges are learned
# on the training set and reused for the test set so that a bin number means
# the same loanAmnt range in both frames.
_, loan_amnt_edges = pd.qcut(data_train['loanAmnt'], 10, labels=False, retbins=True)
# Open the outer edges so values outside the training range still get a bin.
loan_amnt_edges[0], loan_amnt_edges[-1] = -np.inf, np.inf
for data in [data_train, data_test_a]:
    data['loanAmnt_bin3'] = pd.cut(data['loanAmnt'], bins=loan_amnt_edges, labels=False)
data['loanAmnt_bin3']
-
- # 0 5
- # 1 7
- # 2 4
- # 3 6
- # 4 9
- # ..
- # 199995 2
- # 199996 1
- # 199997 5
- # 199998 2
- # 199999 2
- # Name: loanAmnt_bin3, Length: 200000, dtype: int64
for col in ['grade', 'subGrade']:
    encoded_name = col + '_target_mean'
    # Mean of isDefault per category level, computed on the training set.
    temp_dict = data_train.groupby(col)['isDefault'].mean().to_dict()

    # Look the level up in that table for both frames; levels that do not
    # occur in the training set map to NaN.
    data_train[encoded_name] = data_train[col].map(temp_dict)
    data_test_a[encoded_name] = data_test_a[col].map(temp_dict)
# Further derived features: grade divided by the mean / std of grade among
# the rows that share the same value of each n* column.
n_columns = ['n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9',
             'n10', 'n11', 'n12', 'n13', 'n14']
for df in [data_train, data_test_a]:
    for item in n_columns:
        grade_by_item = df.groupby([item])['grade']
        group_mean = grade_by_item.transform('mean')
        group_std = grade_by_item.transform('std')
        df['grade_to_mean_' + item] = df['grade'] / group_mean
        df['grade_to_std_' + item] = df['grade'] / group_std
# Label-encode employmentTitle, postCode, title and subGrade.
# High-cardinality categorical features need to be converted to integer codes.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    train_values = data_train[col].astype(str).tolist()
    test_values = data_test_a[col].astype(str).tolist()
    # Fit on the union of both frames so every level seen in either one
    # receives a code.
    le = LabelEncoder().fit(train_values + test_values)
    data_train[col] = le.transform(train_values)
    data_test_a[col] = le.transform(test_values)
print('Label Encoding 完成')
-
- # 100%|██████████| 4/4 [00:07<00:00, 1.76s/it]
- # Label Encoding 完成
- # 举例归一化过程
- # 伪代码
- for fea in [要归一化的特征列表]:
- data[fea] = ((data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea])))
方差选择法
from sklearn.feature_selection import VarianceThreshold

# `threshold` is the variance cut-off used to decide which features are kept.
variance_selector = VarianceThreshold(threshold=3)
variance_selector.fit_transform(train, target_train)
相关系数法
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr


def pearson_score(X, y):
    """Score every feature column by its Pearson correlation with ``y``.

    SelectKBest expects a function that takes the feature matrix and the
    target vector and returns a pair of arrays ``(scores, pvalues)`` whose
    i-th entries belong to the i-th feature.

    Args:
        X: 2-D feature matrix (rows are samples, columns are features).
        y: Target vector with one entry per sample.

    Returns:
        ``(scores, pvalues)`` where ``scores`` holds the absolute correlation
        coefficients, so strongly negative correlations rank as high as
        strongly positive ones.
    """
    X = np.asarray(X)
    results = [pearsonr(X[:, i], y) for i in range(X.shape[1])]
    scores = np.abs(np.array([res[0] for res in results]))
    pvalues = np.array([res[1] for res in results])
    return scores, pvalues


# Keep the k best features and return the reduced data.
# Without an explicit score function SelectKBest falls back to its default,
# so the correlation scorer has to be passed in.
SelectKBest(pearson_score, k=5).fit_transform(train, target_train)
卡方检验
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# `k` is the number of features to keep; chi2 is the scoring function.
chi2_selector = SelectKBest(chi2, k=5)
chi2_selector.fit_transform(train, target_train)
互信息法
from sklearn.feature_selection import SelectKBest
from minepy import MINE


def mic(x, y):
    """Return ``(MIC, 0.5)`` for one feature vector ``x`` and the target ``y``.

    MINE is not designed as a plain function, so this wrapper exposes it as
    one. The second element is a fixed placeholder p-value of 0.5.
    """
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)


def mic_score(X, y):
    """Score function for SelectKBest: MIC of every column of ``X`` with ``y``.

    Returns:
        ``(scores, pvalues)``, one entry per feature column.
    """
    # Build a real list first: in Python 3 map() is lazy and cannot be turned
    # into a numeric array directly.
    pairs = np.array([mic(column, y) for column in np.asarray(X).T])
    return pairs[:, 0], pairs[:, 1]


# `k` is the number of features to keep.
SelectKBest(mic_score, k=2).fit_transform(train, target_train)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination; returns the data restricted to the selected
# features.
#   estimator            - the base model
#   n_features_to_select - how many features to keep
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=2)
rfe_selector.fit_transform(train, target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Feature selection with an L1-penalised logistic regression as base model.
# The solver is set explicitly because the default solver (lbfgs) does not
# support the L1 penalty; liblinear does.
l1_model = LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
SelectFromModel(l1_model).fit_transform(train, target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

# Feature selection with a gradient-boosted tree classifier as base model.
gbdt_selector = SelectFromModel(GradientBoostingClassifier())
gbdt_selector.fit_transform(train, target_train)
# Remove the columns that are not used as features.
unused_columns = ['issueDate', 'id']
for data in [data_train, data_test_a]:
    data.drop(columns=unused_columns, inplace=True)
# Fill each remaining gap with the value from the row above (forward fill
# down the rows).
data_train = data_train.ffill(axis=0)
x_train = data_train.drop(['isDefault'], axis=1)
# Correlation (not covariance) of every feature with the target.
# Only numeric columns are used: the '*_outliers' flag columns hold strings
# and cannot be correlated.
data_corr = x_train.select_dtypes(include='number').corrwith(data_train.isDefault)
result = pd.DataFrame({'features': list(data_corr.index), 'corr': data_corr.values})
# The correlations can also be inspected visually.
# numerical_fea still lists 'id', which has been dropped from data_train by
# this point, so restrict the selection to columns that still exist.
present_numeric = [fea for fea in numerical_fea if fea in data_train.columns]
data_numeric = data_train[present_numeric]
correlation = data_numeric.corr()

f, ax = plt.subplots(figsize=(7, 7))
# The heatmap shows pairwise correlations between the numeric features; this
# dataset has no "Price" column, so the title does not mention one.
plt.title('Correlation of Numeric Features', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
# Model inputs: every training column except the identifiers, the target and
# the helper '*_outliers' flag columns.
excluded_columns = ('id', 'issueDate', 'isDefault')
features = [
    col for col in data_train.columns
    if col not in excluded_columns and '_outliers' not in col
]
y_train = data_train['isDefault']
x_train = data_train[features]
x_test = data_test_a[features]
def _take_rows(values, positions):
    """Select rows by position from a pandas object or a plain array."""
    if hasattr(values, 'iloc'):
        return values.iloc[positions]
    return values[positions]


def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train a model with 5-fold cross-validation and collect its predictions.

    Args:
        clf: Modelling backend. For ``clf_name`` "lgb" and "xgb" it must
            provide ``Dataset``/``DMatrix`` and ``train`` (the lightgbm /
            xgboost module); for "cat" it is called as
            ``clf(iterations=..., **params)`` to build an estimator.
        train_x: Training features; rows are selected with ``.iloc``.
        train_y: Training labels, aligned with ``train_x`` by position.
        test_x: Test features.
        clf_name: One of "lgb", "xgb" or "cat".

    Returns:
        ``(train, test)``: the out-of-fold predictions for every training row
        and the test predictions averaged over all folds.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("unknown clf_name: %r" % (clf_name,))

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        # KFold yields positions, so both features and labels are selected
        # by position rather than by index label.
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = _take_rows(train_y, train_index), _take_rows(train_y, valid_index)

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # 'silent' is not a known LightGBM parameter, and 'nthread' is
            # ignored when 'n_jobs' is also given, so only 'n_jobs' is kept.
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'n_jobs': 24,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            # Booster.predict works on DMatrix objects, so wrap the test
            # features as well.
            test_matrix = clf.DMatrix(test_x)

            # 'silent' is reported as unused by XGBoost and is left out.
            params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.04,
                'tree_method': 'exact',
                'seed': 2020,
                'nthread': 36,
            }

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat"
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        train[valid_index] = val_pred
        # Accumulate so the result is the average over all folds, not just
        # the last fold's prediction divided by the number of folds.
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test
def lgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the LightGBM backend.

    Returns:
        The ``(train, test)`` prediction pair produced by ``cv_model``.
    """
    return cv_model(lgb, x_train, y_train, x_test, "lgb")
-
def xgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the XGBoost backend.

    Returns:
        The ``(train, test)`` prediction pair produced by ``cv_model``.
    """
    return cv_model(xgb, x_train, y_train, x_test, "xgb")
-
def cat_model(x_train, y_train, x_test):
    """Run ``cv_model`` with ``CatBoostRegressor`` as the estimator class.

    Returns:
        The ``(train, test)`` prediction pair produced by ``cv_model``.
    """
    return cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
- lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
-
- # ************************************ 1 ************************************
- # [LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
- # [LightGBM] [Warning] Unknown parameter: silent
- # Training until validation scores don't improve for 200 rounds
- # [200] training's auc: 0.749114 valid_1's auc: 0.729275
- # [400] training's auc: 0.764716 valid_1's auc: 0.730125
- # [600] training's auc: 0.778489 valid_1's auc: 0.729928
- # Early stopping, best iteration is:
- # [446] training's auc: 0.768137 valid_1's auc: 0.730186
- # [0.7301862239949224]
- # ************************************ 2 ************************************
- # [LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
- # [LightGBM] [Warning] Unknown parameter: silent
- # Training until validation scores don't improve for 200 rounds
- # [200] training's auc: 0.748999 valid_1's auc: 0.731035
- # [400] training's auc: 0.764879 valid_1's auc: 0.731436
- # [600] training's auc: 0.778506 valid_1's auc: 0.730823
- # Early stopping, best iteration is:
- # [414] training's auc: 0.765823 valid_1's auc: 0.731478
- # [0.7301862239949224, 0.7314779648434573]
- # ************************************ 3 ************************************
- # [LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
- # [LightGBM] [Warning] Unknown parameter: silent
- # Training until validation scores don't improve for 200 rounds
- # [200] training's auc: 0.748145 valid_1's auc: 0.73253
- # [400] training's auc: 0.763814 valid_1's auc: 0.733272
- # [600] training's auc: 0.777895 valid_1's auc: 0.733354
- # Early stopping, best iteration is:
- # [475] training's auc: 0.769215 valid_1's auc: 0.73355
- # [0.7301862239949224, 0.7314779648434573, 0.7335502065719879]
- # ************************************ 4 ************************************
- # [LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
- # [LightGBM] [Warning] Unknown parameter: silent
- # Training until validation scores don't improve for 200 rounds
- # [200] training's auc: 0.749417 valid_1's auc: 0.727507
- # [400] training's auc: 0.765066 valid_1's auc: 0.728261
- # Early stopping, best iteration is:
- # [353] training's auc: 0.761647 valid_1's auc: 0.728349
- # [0.7301862239949224, 0.7314779648434573, 0.7335502065719879, 0.7283491938614568]
- # ************************************ 5 ************************************
- # [LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
- # [LightGBM] [Warning] Unknown parameter: silent
- # Training until validation scores don't improve for 200 rounds
- # [200] training's auc: 0.748562 valid_1's auc: 0.73262
- # [400] training's auc: 0.764493 valid_1's auc: 0.733365
- # Early stopping, best iteration is:
- # [394] training's auc: 0.764109 valid_1's auc: 0.733381
- # [0.7301862239949224, 0.7314779648434573, 0.7335502065719879, 0.7283491938614568, 0.7333810157041901]
- # lgb_scotrainre_list: [0.7301862239949224, 0.7314779648434573, 0.7335502065719879, 0.7283491938614568, 0.7333810157041901]
- # lgb_score_mean: 0.7313889209952029
- # lgb_score_std: 0.001966415347937543
# Store the XGBoost predictions under their own names so they do not
# overwrite the LightGBM results.
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
-
- # ************************************ 1 ************************************
- # [15:02:32] WARNING: ../src/learner.cc:516:
- # Parameters: { silent } might not be used.
-
- # This may not be accurate due to some parameters are only used in language bindings but
- # passed down to XGBoost core. Or some parameters are not used but slip through this
- # verification. Please open an issue if you find above cases.
-
-
- # [0] train-auc:0.69713 eval-auc:0.69580
- # Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
-
- # Will train until eval-auc hasn't improved in 200 rounds.
- # [200] train-auc:0.73103 eval-auc:0.72371
- # [400] train-auc:0.74040 eval-auc:0.72807
- # [600] train-auc:0.74624 eval-auc:0.72966
- # [800] train-auc:0.75132 eval-auc:0.73055
- # [1000] train-auc:0.75580 eval-auc:0.73101
- # [1200] train-auc:0.76004 eval-auc:0.73127
- # [1400] train-auc:0.76409 eval-auc:0.73156
- # [1600] train-auc:0.76791 eval-auc:0.73169
- # [1800] train-auc:0.77156 eval-auc:0.73173
- # [2000] train-auc:0.77506 eval-auc:0.73167
- # Stopping. Best iteration:
- # [1852] train-auc:0.77251 eval-auc:0.73177
-
- # [0.731769339538683]
- # ************************************ 2 ************************************
- # [15:07:16] WARNING: ../src/learner.cc:516:
- # Parameters: { silent } might not be used.
-
- # This may not be accurate due to some parameters are only used in language bindings but
- # passed down to XGBoost core. Or some parameters are not used but slip through this
- # verification. Please open an issue if you find above cases.
-
-
- # [0] train-auc:0.69687 eval-auc:0.69574
- # Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
-
- # Will train until eval-auc hasn't improved in 200 rounds.
- # [200] train-auc:0.73078 eval-auc:0.72640
- # [400] train-auc:0.74020 eval-auc:0.73023
- # [600] train-auc:0.74605 eval-auc:0.73156
- # [800] train-auc:0.75114 eval-auc:0.73231
- # [1000] train-auc:0.75562 eval-auc:0.73275
- # [1200] train-auc:0.75987 eval-auc:0.73310
- # [1400] train-auc:0.76372 eval-auc:0.73317
- # [1600] train-auc:0.76757 eval-auc:0.73330
- # [1800] train-auc:0.77123 eval-auc:0.73335
- # [2000] train-auc:0.77484 eval-auc:0.73339
- # Stopping. Best iteration:
- # [1829] train-auc:0.77173 eval-auc:0.73340
-
- # [0.731769339538683, 0.733395913606802]
- # ************************************ 3 ************************************
- # [15:11:52] WARNING: ../src/learner.cc:516:
- # Parameters: { silent } might not be used.
-
- # This may not be accurate due to some parameters are only used in language bindings but
- # passed down to XGBoost core. Or some parameters are not used but slip through this
- # verification. Please open an issue if you find above cases.
-
-
- # [0] train-auc:0.69730 eval-auc:0.69647
- # Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
-
- # Will train until eval-auc hasn't improved in 200 rounds.
- # [200] train-auc:0.73072 eval-auc:0.72604
- # [400] train-auc:0.73965 eval-auc:0.73076
- # [600] train-auc:0.74548 eval-auc:0.73241
- # [800] train-auc:0.75050 eval-auc:0.73356
- # [1000] train-auc:0.75501 eval-auc:0.73416
- # [1200] train-auc:0.75898 eval-auc:0.73460
- # [1400] train-auc:0.76303 eval-auc:0.73487
- # [1600] train-auc:0.76689 eval-auc:0.73507
- # [1800] train-auc:0.77059 eval-auc:0.73507
- # Stopping. Best iteration:
- # [1703] train-auc:0.76871 eval-auc:0.73515
-
- # [0.731769339538683, 0.733395913606802, 0.7351456720593506]
- # ************************************ 4 ************************************
- # [15:16:15] WARNING: ../src/learner.cc:516:
- # Parameters: { silent } might not be used.
-
- # This may not be accurate due to some parameters are only used in language bindings but
- # passed down to XGBoost core. Or some parameters are not used but slip through this
- # verification. Please open an issue if you find above cases.
-
-
- # [0] train-auc:0.69737 eval-auc:0.69375
- # Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
-
- # Will train until eval-auc hasn't improved in 200 rounds.
- # [200] train-auc:0.73148 eval-auc:0.72250
- # [400] train-auc:0.74044 eval-auc:0.72639
- # [600] train-auc:0.74649 eval-auc:0.72804
- # [800] train-auc:0.75154 eval-auc:0.72887
- # [1000] train-auc:0.75598 eval-auc:0.72934
- # [1200] train-auc:0.75997 eval-auc:0.72954
- # [1400] train-auc:0.76401 eval-auc:0.72977
- # [1600] train-auc:0.76793 eval-auc:0.72989
- # [1800] train-auc:0.77159 eval-auc:0.72993
- # [2000] train-auc:0.77511 eval-auc:0.73002
- # [2200] train-auc:0.77850 eval-auc:0.72996
- # Stopping. Best iteration:
- # [2011] train-auc:0.77531 eval-auc:0.73004
-
- # [0.731769339538683, 0.733395913606802, 0.7351456720593506, 0.7300361842852358]
- # ************************************ 5 ************************************
- # [15:21:18] WARNING: ../src/learner.cc:516:
- # Parameters: { silent } might not be used.
-
- # This may not be accurate due to some parameters are only used in language bindings but
- # passed down to XGBoost core. Or some parameters are not used but slip through this
- # verification. Please open an issue if you find above cases.
-
-
- # [0] train-auc:0.69647 eval-auc:0.69701
- # Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
-
- # Will train until eval-auc hasn't improved in 200 rounds.
- # [200] train-auc:0.73059 eval-auc:0.72675
- # [400] train-auc:0.73972 eval-auc:0.73089
- # [600] train-auc:0.74589 eval-auc:0.73256
- # [800] train-auc:0.75073 eval-auc:0.73347
- # [1000] train-auc:0.75523 eval-auc:0.73401
- # [1200] train-auc:0.75941 eval-auc:0.73419
- # [1400] train-auc:0.76342 eval-auc:0.73438
- # [1600] train-auc:0.76730 eval-auc:0.73458
- # [1800] train-auc:0.77105 eval-auc:0.73454
- # Stopping. Best iteration:
- # [1694] train-auc:0.76910 eval-auc:0.73464
-
- # [0.731769339538683, 0.733395913606802, 0.7351456720593506, 0.7300361842852358, 0.734639280693211]
- # xgb_scotrainre_list: [0.731769339538683, 0.733395913606802, 0.7351456720593506, 0.7300361842852358, 0.734639280693211]
- # xgb_score_mean: 0.7329972780366564
- # xgb_score_std: 0.0018839633265100187
# Store the CatBoost predictions under their own names so they do not
# overwrite the LightGBM results.
cat_train, cat_test = cat_model(x_train, y_train, x_test)
-
- # ************************************ 1 ************************************
- # 0: learn: 0.3944330 test: 0.3964727 best: 0.3964727 (0) total: 138ms remaining: 45m 59s
- # 500: learn: 0.3728126 test: 0.3756408 best: 0.3756408 (500) total: 28.1s remaining: 18m 13s
- # 1000: learn: 0.3711980 test: 0.3750523 best: 0.3750523 (1000) total: 56.2s remaining: 17m 47s
- # 1500: learn: 0.3699538 test: 0.3748118 best: 0.3748107 (1476) total: 1m 23s remaining: 17m 11s
- # 2000: learn: 0.3688546 test: 0.3746815 best: 0.3746815 (2000) total: 1m 51s remaining: 16m 44s
- # Stopped by overfitting detector (50 iterations wait)
-
- # bestTest = 0.3746253358
- # bestIteration = 2266
-
- # Shrink model to first 2267 iterations.
- # [0.7306375926022922]
- # ************************************ 2 ************************************
- # 0: learn: 0.3947513 test: 0.3951211 best: 0.3951211 (0) total: 71.1ms remaining: 23m 41s
- # 500: learn: 0.3731076 test: 0.3743412 best: 0.3743412 (500) total: 28.6s remaining: 18m 32s
- # 1000: learn: 0.3714544 test: 0.3737577 best: 0.3737570 (999) total: 56.7s remaining: 17m 56s
- # 1500: learn: 0.3702186 test: 0.3735397 best: 0.3735396 (1498) total: 1m 24s remaining: 17m 23s
- # 2000: learn: 0.3691118 test: 0.3734092 best: 0.3734074 (1977) total: 1m 52s remaining: 16m 54s
- # 2500: learn: 0.3680796 test: 0.3733234 best: 0.3733218 (2484) total: 2m 21s remaining: 16m 28s
- # Stopped by overfitting detector (50 iterations wait)
-
- # bestTest = 0.373251629
- # bestIteration = 2919
-
- # Shrink model to first 2920 iterations.
- # [0.7306375926022922, 0.7325015175914498]
- # ************************************ 3 ************************************
- # 0: learn: 0.3951060 test: 0.3937487 best: 0.3937487 (0) total: 70.2ms remaining: 23m 24s
- # 500: learn: 0.3734715 test: 0.3730983 best: 0.3730983 (500) total: 28.4s remaining: 18m 26s
- # 1000: learn: 0.3718399 test: 0.3724184 best: 0.3724184 (1000) total: 56.5s remaining: 17m 53s
- # 1500: learn: 0.3706048 test: 0.3721639 best: 0.3721639 (1500) total: 1m 24s remaining: 17m 24s
- # 2000: learn: 0.3695127 test: 0.3720199 best: 0.3720199 (2000) total: 1m 52s remaining: 16m 52s
- # 2500: learn: 0.3685041 test: 0.3719052 best: 0.3719025 (2479) total: 2m 20s remaining: 16m 20s
- # Stopped by overfitting detector (50 iterations wait)
-
- # bestTest = 0.3719024831
- # bestIteration = 2479
-
- # Shrink model to first 2480 iterations.
- # [0.7306375926022922, 0.7325015175914498, 0.7340103693991001]
- # ************************************ 4 ************************************
- # 0: learn: 0.3949491 test: 0.3943298 best: 0.3943298 (0) total: 66.8ms remaining: 22m 16s
- # 500: learn: 0.3732214 test: 0.3741316 best: 0.3741316 (500) total: 28.2s remaining: 18m 18s
- # 1000: learn: 0.3715666 test: 0.3735451 best: 0.3735414 (995) total: 56s remaining: 17m 42s
- # 1500: learn: 0.3703238 test: 0.3733058 best: 0.3733045 (1498) total: 1m 23s remaining: 17m 9s
- # 2000: learn: 0.3692105 test: 0.3731636 best: 0.3731634 (1999) total: 1m 51s remaining: 16m 41s
- # 2500: learn: 0.3681907 test: 0.3730490 best: 0.3730490 (2500) total: 2m 19s remaining: 16m 13s
- # Stopped by overfitting detector (50 iterations wait)
-
- # bestTest = 0.3730185197
- # bestIteration = 2723
-
- # Shrink model to first 2724 iterations.
- # [0.7306375926022922, 0.7325015175914498, 0.7340103693991001, 0.7291287412227256]
- # ************************************ 5 ************************************
- # 0: learn: 0.3948860 test: 0.3944692 best: 0.3944692 (0) total: 68.4ms remaining: 22m 47s
- # 500: learn: 0.3733508 test: 0.3734623 best: 0.3734623 (500) total: 28.7s remaining: 18m 37s
- # 1000: learn: 0.3717222 test: 0.3729094 best: 0.3729094 (1000) total: 57s remaining: 18m 2s
- # 1500: learn: 0.3704933 test: 0.3726407 best: 0.3726407 (1500) total: 1m 25s remaining: 17m 32s
- # 2000: learn: 0.3693930 test: 0.3725202 best: 0.3725200 (1998) total: 1m 53s remaining: 17m 4s
- # 2500: learn: 0.3683883 test: 0.3724494 best: 0.3724494 (2500) total: 2m 22s remaining: 16m 36s
- # Stopped by overfitting detector (50 iterations wait)
-
- # bestTest = 0.3724045318
- # bestIteration = 2904
-
- # Shrink model to first 2905 iterations.
- # [0.7306375926022922, 0.7325015175914498, 0.7340103693991001, 0.7291287412227256, 0.7342835786894728]
- # cat_scotrainre_list: [0.7306375926022922, 0.7325015175914498, 0.7340103693991001, 0.7291287412227256, 0.7342835786894728]
- # cat_score_mean: 0.7321123599010082
- # cat_score_std: 0.0019771188023493848
特征工程是机器学习,甚至是深度学习中最为重要的一部分,在实际应用中往往也是花费时间最多的一步。各种算法书中对特征工程部分的讲解往往少得可怜,因为特征工程和具体的数据结合得太紧密,很难系统地覆盖所有场景。本章主要通过一些常用的方法来做介绍,例如缺失值、异常值的处理方法,相信对任何数据集来说都是适用的。但对于分箱等操作,本章只给出了几种具体的思路,需要读者自己探索。在特征工程中,比赛和具体的应用还是有所不同的:在实际的金融风控评分卡制作过程中,由于强调特征的可解释性,特征分箱尤其重要。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。