赞
踩
import pandas as pd import os import gc import lightgbm as lgb import xgboost as xgb from catboost import CatBoostRegressor from sklearn.linear_model import SGDRegressor,LinearRegression,Ridge from sklearn.preprocessing import MinMaxScaler import math import numpy as np from tqdm import tqdm from sklearn.model_selection import StratifiedKFold,KFold from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,log_loss import matplotlib.pyplot as plt import time import warnings warnings.filterwarnings('ignore')
# Load the labelled training set and the unlabelled test set A, then peek at the first rows.
train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')
train.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
# Names of the columns stored with object dtype (they cannot be fed to the models as-is).
train.select_dtypes('object').columns.tolist()
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
# Stack train and test rows into one frame so preprocessing is applied to both at once.
data = pd.concat([train, testA], ignore_index=True)
可以看到很多变量不能直接用于训练,比如 'grade'、'subGrade'、'employmentLength'、'issueDate'、'earliesCreditLine',需要先进行预处理。
# Show the sorted distinct values of the two grade columns.
for grade_col in ('grade', 'subGrade'):
    print(sorted(data[grade_col].unique()))
['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
# Frequency of every employmentLength label, NaN included, ordered by label.
(
    data['employmentLength']
    .value_counts(dropna=False)
    .sort_index()
)
1 year 65671
10+ years 328525
2 years 90565
3 years 80163
4 years 59818
5 years 62645
6 years 46582
7 years 44230
8 years 45168
9 years 37866
< 1 year 80226
NaN 58541
Name: employmentLength, dtype: int64
# Rewrite the two non-numeric labels so every label starts with a plain integer.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `data['employmentLength']` acts on a temporary Series and does not update
# `data` when pandas Copy-on-Write is active.
data['employmentLength'] = data['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)
def employmentLength_to_int(s):
    """Convert an employment-length label such as '3 years' to an integer.

    Args:
        s: A label like '1 year', '10 years', '10+ years' or '< 1 year',
            or a missing value (None / NaN).

    Returns:
        The missing value unchanged when ``s`` is null, otherwise the number
        of years as ``np.int8``. '< 1 year' maps to 0 and '10+ years' to 10.
    """
    if pd.isnull(s):
        return s
    label = s.strip()
    # Handle the raw "< 1 year" label directly so the function does not depend
    # on the labels having been rewritten beforehand.
    if label.startswith('<'):
        return np.int8(0)
    # Drop a trailing '+' ("10+ years") before parsing the leading number.
    return np.int8(int(label.split()[0].rstrip('+')))
# Convert every label to its integer number of years (missing values stay missing).
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
# dropna=False keeps the NaN bucket in the counts.
(
    data['employmentLength']
    .value_counts(dropna=False)
    .sort_index()
)
0.0 80226
1.0 65671
2.0 90565
3.0 80163
4.0 59818
5.0 62645
6.0 46582
7.0 44230
8.0 45168
9.0 37866
10.0 328525
NaN 58541
Name: employmentLength, dtype: int64
# Look at a few random raw values of earliesCreditLine.
data['earliesCreditLine'].sample(n=5)
618907 Nov-2004
145773 Oct-2001
21633 Mar-2005
697120 Sep-1990
815318 Feb-2004
Name: earliesCreditLine, dtype: object
# Keep only the year: the last four characters of each value, parsed as an int.
data['earliesCreditLine'] = data['earliesCreditLine'].apply(
    lambda label: int(label[-4:])
)
data['earliesCreditLine'].describe()
count 1000000.000000
mean 1998.688632
std 7.606231
min 1944.000000
25% 1995.000000
50% 2000.000000
75% 2004.000000
max 2015.000000
Name: earliesCreditLine, dtype: float64
# Print the column dtypes and non-null counts of the combined frame.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 47 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 1000000 non-null int64 1 loanAmnt 1000000 non-null float64 2 term 1000000 non-null int64 3 interestRate 1000000 non-null float64 4 installment 1000000 non-null float64 5 grade 1000000 non-null object 6 subGrade 1000000 non-null object 7 employmentTitle 999999 non-null float64 8 employmentLength 941459 non-null float64 9 homeOwnership 1000000 non-null int64 10 annualIncome 1000000 non-null float64 11 verificationStatus 1000000 non-null int64 12 issueDate 1000000 non-null object 13 isDefault 800000 non-null float64 14 purpose 1000000 non-null int64 15 postCode 999999 non-null float64 16 regionCode 1000000 non-null int64 17 dti 999700 non-null float64 18 delinquency_2years 1000000 non-null float64 19 ficoRangeLow 1000000 non-null float64 20 ficoRangeHigh 1000000 non-null float64 21 openAcc 1000000 non-null float64 22 pubRec 1000000 non-null float64 23 pubRecBankruptcies 999479 non-null float64 24 revolBal 1000000 non-null float64 25 revolUtil 999342 non-null float64 26 totalAcc 1000000 non-null float64 27 initialListStatus 1000000 non-null int64 28 applicationType 1000000 non-null int64 29 earliesCreditLine 1000000 non-null int64 30 title 999999 non-null float64 31 policyCode 1000000 non-null float64 32 n0 949619 non-null float64 33 n1 949619 non-null float64 34 n2 949619 non-null float64 35 n3 949619 non-null float64 36 n4 958367 non-null float64 37 n5 949619 non-null float64 38 n6 949619 non-null float64 39 n7 949619 non-null float64 40 n8 949618 non-null float64 41 n9 949619 non-null float64 42 n10 958367 non-null float64 43 n11 912673 non-null float64 44 n12 949619 non-null float64 45 n13 949619 non-null float64 46 n14 949619 non-null float64 dtypes: float64(35), int64(9), object(3) memory usage: 358.6+ MB
# A subset of the categorical features.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]
# Report how many distinct values each of them takes.
for cate_col in cate_features:
    print(cate_col, '类型数:', data[cate_col].nunique())
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 298101
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 935
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 47903
policyCode 类型数: 1
# One-hot encode the features that have more than two levels but are not
# high-dimensional sparse; the first level of each is dropped.
data = pd.get_dummies(
    data,
    columns=[
        'grade',
        'subGrade',
        'homeOwnership',
        'verificationStatus',
        'purpose',
        'regionCode',
    ],
    drop_first=True,
)
# High-cardinality categorical features are replaced by derived numeric columns.
for high_card_col in ['employmentTitle', 'postCode', 'title']:
    # Number of rows that share the same category value.
    data['{}_cnts'.format(high_card_col)] = (
        data.groupby([high_card_col])['id'].transform('count')
    )
    # Descending rank of the row's id within its category group.
    data['{}_rank'.format(high_card_col)] = (
        data.groupby([high_card_col])['id'].rank(ascending=False).astype(int)
    )
    # The raw column is dropped once its derived columns exist.
    del data[high_card_col]
# Every column except the id, the raw issue date and the target is a model feature.
excluded_columns = ['id', 'issueDate', 'isDefault']
features = [col for col in data.columns if col not in excluded_columns]

# Rows with a known target are training rows; rows without one are test rows.
has_label = data['isDefault'].notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

x_train, x_test = train[features], test[features]
y_train = train['isDefault']
def _fit_predict_lgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one LightGBM fold; return (validation predictions, test predictions)."""
    train_matrix = clf.Dataset(trn_x, label=trn_y)
    valid_matrix = clf.Dataset(val_x, label=val_y)
    # 'nthread' and 'silent' were dropped: LightGBM reported that nthread is
    # overridden by n_jobs and that 'silent' is an unknown parameter.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': 2020,
        'n_jobs': 24,
        'verbose': -1,
    }
    model = clf.train(
        params,
        train_matrix,
        50000,
        valid_sets=[train_matrix, valid_matrix],
        verbose_eval=200,
        early_stopping_rounds=200,
    )
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    return val_pred, test_pred


def _fit_predict_xgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one XGBoost fold; return (validation predictions, test predictions)."""
    train_matrix = clf.DMatrix(trn_x, label=trn_y)
    valid_matrix = clf.DMatrix(val_x, label=val_y)
    test_matrix = clf.DMatrix(test_x)
    params = {
        'booster': 'gbtree',
        # Was misspelled 'binary:logietic', which is not a valid objective.
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': 2020,
        'nthread': 36,
        'silent': True,
    }
    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
    model = clf.train(
        params,
        train_matrix,
        num_boost_round=50000,
        evals=watchlist,
        verbose_eval=200,
        early_stopping_rounds=200,
    )
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    return val_pred, test_pred


def _fit_predict_cat(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one CatBoost fold; return (validation predictions, test predictions)."""
    params = {
        'learning_rate': 0.05,
        'depth': 5,
        'l2_leaf_reg': 10,
        'bootstrap_type': 'Bernoulli',
        'od_type': 'Iter',
        'od_wait': 50,
        'random_seed': 11,
        'allow_writing_files': False,
    }
    model = clf(iterations=20000, **params)
    model.fit(
        trn_x,
        trn_y,
        eval_set=(val_x, val_y),
        cat_features=[],
        use_best_model=True,
        verbose=500,
    )
    return model.predict(val_x), model.predict(test_x)


def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Run 5-fold cross-validation for one model family and average test predictions.

    Args:
        clf: The library entry point for the model: the ``lightgbm`` module for
            'lgb', the ``xgboost`` module for 'xgb', or a CatBoost estimator
            class for 'cat'.
        train_x: Training features (indexed positionally with ``.iloc``).
        train_y: Training labels, aligned row-for-row with ``train_x``.
        test_x: Test features to predict in every fold.
        clf_name: One of 'lgb', 'xgb' or 'cat'.

    Returns:
        A tuple ``(train, test)``: the out-of-fold predictions for every
        training row, and the test predictions averaged over all folds.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    fold_runners = {
        'lgb': _fit_predict_lgb,
        'xgb': _fit_predict_xgb,
        'cat': _fit_predict_cat,
    }
    if clf_name not in fold_runners:
        raise ValueError(
            "clf_name must be one of 'lgb', 'xgb' or 'cat', got %r" % (clf_name,)
        )
    fit_predict = fold_runners[clf_name]

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices, so labels are indexed as a plain array
    # rather than through a Series index that may not be 0..n-1.
    labels = np.asarray(train_y)

    train = np.zeros(train_x.shape[0])  # out-of-fold predictions, one per training row
    test = np.zeros(test_x.shape[0])    # running average of the per-fold test predictions
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('******{}******'.format(str(i + 1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        val_pred, test_pred = fit_predict(clf, trn_x, trn_y, val_x, val_y, test_x)

        train[valid_index] = val_pred
        # Accumulate each fold's share; plain assignment here would discard
        # every fold except the last one.
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    print('%s_scotrainre_list:' % clf_name, cv_scores)
    print('%s_score_mean:' % clf_name, np.mean(cv_scores))
    print('%s_score_std:' % clf_name, np.std(cv_scores))
    return train, test
K折交叉验证:
调参:
def lgb_model(x_train, y_train, x_test):
    """Cross-validate LightGBM via cv_model; return (train predictions, test predictions)."""
    oof_pred, test_pred = cv_model(lgb, x_train, y_train, x_test, 'lgb')
    return oof_pred, test_pred
def xgb_model(x_train, y_train, x_test):
    """Cross-validate XGBoost via cv_model; return (train predictions, test predictions)."""
    oof_pred, test_pred = cv_model(xgb, x_train, y_train, x_test, 'xgb')
    return oof_pred, test_pred
def cat_model(x_train, y_train, x_test):
    """Cross-validate CatBoostRegressor via cv_model; return (train predictions, test predictions)."""
    oof_pred, test_pred = cv_model(CatBoostRegressor, x_train, y_train, x_test, 'cat')
    return oof_pred, test_pred
# Run the LightGBM cross-validation; keeps the train-row and test-row predictions.
lgb_train,lgb_test=lgb_model(x_train,y_train,x_test)
******1****** [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.742898 valid_1's auc: 0.730406 [400] training's auc: 0.755553 valid_1's auc: 0.731185 [600] training's auc: 0.766567 valid_1's auc: 0.731421 [800] training's auc: 0.77656 valid_1's auc: 0.731297 Early stopping, best iteration is: [658] training's auc: 0.769561 valid_1's auc: 0.731571 [0.7315707699391983] ******2****** [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.743889 valid_1's auc: 0.726598 [400] training's auc: 0.756346 valid_1's auc: 0.727829 [600] training's auc: 0.767237 valid_1's auc: 0.728122 [800] training's auc: 0.777257 valid_1's auc: 0.728164 Early stopping, best iteration is: [700] training's auc: 0.772432 valid_1's auc: 0.728318 [0.7315707699391983, 0.7283181812019169] ******3****** [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.743204 valid_1's auc: 0.731376 [400] training's auc: 0.7554 valid_1's auc: 0.732444 [600] training's auc: 0.766372 valid_1's auc: 0.732822 [800] training's auc: 0.776228 valid_1's auc: 0.732611 Early stopping, best iteration is: [620] training's auc: 0.767377 valid_1's auc: 0.732834 [0.7315707699391983, 0.7283181812019169, 0.732833858510838] ******4****** [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. 
Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.742844 valid_1's auc: 0.730001 [400] training's auc: 0.755185 valid_1's auc: 0.731181 [600] training's auc: 0.766741 valid_1's auc: 0.731697 [800] training's auc: 0.776848 valid_1's auc: 0.731685 Early stopping, best iteration is: [722] training's auc: 0.773097 valid_1's auc: 0.731733 [0.7315707699391983, 0.7283181812019169, 0.732833858510838, 0.7317333003550207] ******5****** [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.743219 valid_1's auc: 0.729179 [400] training's auc: 0.755904 valid_1's auc: 0.730599 [600] training's auc: 0.766513 valid_1's auc: 0.731059 [800] training's auc: 0.776506 valid_1's auc: 0.730971 Early stopping, best iteration is: [735] training's auc: 0.773511 valid_1's auc: 0.731143 [0.7315707699391983, 0.7283181812019169, 0.732833858510838, 0.7317333003550207, 0.7311427854544066] lgb_scotrainre_list: [0.7315707699391983, 0.7283181812019169, 0.732833858510838, 0.7317333003550207, 0.7311427854544066] lgb_score_mean: 0.7311197790922761 lgb_score_std: 0.001507802995682687
# XGBoost reportedly takes a very long time to train, so it is not run here.
#xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
# Run the CatBoost cross-validation; keeps the train-row and test-row predictions.
cat_train,cat_test=cat_model(x_train,y_train,x_test)
******1****** 0: learn: 0.3985252 test: 0.3966187 best: 0.3966187 (0) total: 178ms remaining: 59m 12s 500: learn: 0.3771946 test: 0.3759285 best: 0.3759285 (500) total: 30.8s remaining: 19m 59s 1000: learn: 0.3756449 test: 0.3751634 best: 0.3751634 (1000) total: 1m 4s remaining: 20m 32s 1500: learn: 0.3745709 test: 0.3748276 best: 0.3748276 (1500) total: 1m 39s remaining: 20m 28s 2000: learn: 0.3736588 test: 0.3746263 best: 0.3746258 (1998) total: 2m 14s remaining: 20m 10s 2500: learn: 0.3728292 test: 0.3744849 best: 0.3744849 (2500) total: 2m 48s remaining: 19m 41s Stopped by overfitting detector (50 iterations wait) bestTest = 0.3744018679 bestIteration = 2905 Shrink model to first 2906 iterations. [0.7327200609336475] ******2****** 0: learn: 0.3979537 test: 0.3988945 best: 0.3988945 (0) total: 126ms remaining: 42m 3s 500: learn: 0.3764995 test: 0.3787237 best: 0.3787237 (500) total: 41.2s remaining: 26m 42s 1000: learn: 0.3749374 test: 0.3779174 best: 0.3779174 (1000) total: 1m 19s remaining: 24m 59s 1500: learn: 0.3738552 test: 0.3775812 best: 0.3775812 (1500) total: 1m 54s remaining: 23m 34s 2000: learn: 0.3729340 test: 0.3773443 best: 0.3773436 (1998) total: 2m 31s remaining: 22m 46s Stopped by overfitting detector (50 iterations wait) bestTest = 0.3773239679 bestIteration = 2052 Shrink model to first 2053 iterations. 
[0.7327200609336475, 0.7282917118426803] ******3****** 0: learn: 0.3980280 test: 0.3987527 best: 0.3987527 (0) total: 153ms remaining: 51m 8s 500: learn: 0.3767797 test: 0.3776461 best: 0.3776461 (500) total: 37.8s remaining: 24m 32s 1000: learn: 0.3752307 test: 0.3768433 best: 0.3768433 (1000) total: 1m 12s remaining: 22m 58s 1500: learn: 0.3741403 test: 0.3764607 best: 0.3764605 (1499) total: 1m 47s remaining: 22m 3s 2000: learn: 0.3732161 test: 0.3762495 best: 0.3762493 (1997) total: 2m 21s remaining: 21m 9s 2500: learn: 0.3723968 test: 0.3761103 best: 0.3761098 (2495) total: 2m 55s remaining: 20m 27s 3000: learn: 0.3716474 test: 0.3760186 best: 0.3760186 (3000) total: 3m 29s remaining: 19m 45s Stopped by overfitting detector (50 iterations wait) bestTest = 0.3759560948 bestIteration = 3410 Shrink model to first 3411 iterations. [0.7327200609336475, 0.7282917118426803, 0.7338399687776773] ******4****** 0: learn: 0.3980748 test: 0.3983970 best: 0.3983970 (0) total: 129ms remaining: 42m 51s 500: learn: 0.3767830 test: 0.3777709 best: 0.3777709 (500) total: 36.3s remaining: 23m 33s 1000: learn: 0.3752528 test: 0.3769020 best: 0.3769020 (1000) total: 1m 25s remaining: 26m 53s 1500: learn: 0.3741987 test: 0.3765448 best: 0.3765448 (1500) total: 2m 20s remaining: 28m 54s 2000: learn: 0.3732910 test: 0.3763156 best: 0.3763156 (2000) total: 3m 16s remaining: 29m 23s 2500: learn: 0.3724645 test: 0.3761445 best: 0.3761435 (2498) total: 4m 10s remaining: 29m 15s 3000: learn: 0.3716982 test: 0.3760409 best: 0.3760409 (3000) total: 5m 6s remaining: 28m 58s 3500: learn: 0.3709615 test: 0.3759851 best: 0.3759842 (3495) total: 6m 2s remaining: 28m 28s Stopped by overfitting detector (50 iterations wait) bestTest = 0.3759786172 bestIteration = 3597 Shrink model to first 3598 iterations. 
[0.7327200609336475, 0.7282917118426803, 0.7338399687776773, 0.7325672923232748] ******5****** 0: learn: 0.3981448 test: 0.3980859 best: 0.3980859 (0) total: 144ms remaining: 47m 51s 500: learn: 0.3767559 test: 0.3775909 best: 0.3775909 (500) total: 58.4s remaining: 37m 54s 1000: learn: 0.3752239 test: 0.3768122 best: 0.3768122 (1000) total: 1m 51s remaining: 35m 7s 1500: learn: 0.3741592 test: 0.3764654 best: 0.3764654 (1500) total: 2m 21s remaining: 28m 59s 2000: learn: 0.3732513 test: 0.3762308 best: 0.3762294 (1997) total: 2m 58s remaining: 26m 42s 2500: learn: 0.3724325 test: 0.3760785 best: 0.3760785 (2500) total: 3m 37s remaining: 25m 23s 3000: learn: 0.3716690 test: 0.3759789 best: 0.3759789 (3000) total: 4m 15s remaining: 24m 8s 3500: learn: 0.3709385 test: 0.3759029 best: 0.3759014 (3491) total: 4m 55s remaining: 23m 13s 4000: learn: 0.3702519 test: 0.3758301 best: 0.3758288 (3970) total: 5m 35s remaining: 22m 20s Stopped by overfitting detector (50 iterations wait) bestTest = 0.3758115042 bestIteration = 4164 Shrink model to first 4165 iterations. [0.7327200609336475, 0.7282917118426803, 0.7338399687776773, 0.7325672923232748, 0.7317952826099017] cat_scotrainre_list: [0.7327200609336475, 0.7282917118426803, 0.7338399687776773, 0.7325672923232748, 0.7317952826099017] cat_score_mean: 0.7318428632974363 cat_score_std: 0.0018918585561348224
# Blend the two models that were actually trained, with equal weights.
# The XGBoost run is commented out, so `xgb_test` does not exist and using it
# raises NameError; the CatBoost predictions are blended instead.
rh_test = lgb_test * 0.5 + cat_test * 0.5
testA['isDefault'] = rh_test
# `to_csv` has to be called to write anything; the bare attribute did nothing.
# NOTE(review): the output file name is a placeholder - adjust as needed.
testA[['id', 'isDefault']].to_csv('submission.csv', index=False)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-49-81ca03ed5f9f> in <module>
----> 1 rh_test=lgb_test*0.5+xgb_test*0.5
2 testA['isDefault']=rh_test
3 testA[['id','isDefault']].to_csv
NameError: name 'xgb_test' is not defined
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。