赞
踩
# Stdlib, third-party, and modeling imports for the feature-engineering pipeline.
import datetime
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

warnings.filterwarnings('ignore')

# Load the competition's train / test-A splits.
data_train = pd.read_csv('train.csv')
data_test_a = pd.read_csv('testA.csv')
ps:上面导入了很多我第一次见到的包,以下是关于这些包的简要说明和使用方法
# Minimal tqdm demo: wrap any iterable to get a live progress bar.
import time
for _ in tqdm(range(10)):
    time.sleep(0.1)
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00, 9.89it/s]
在进行特征向量的处理的时候,我们经常需要将分类的变量变化以数字形式表达的变量,比如某一个特征向量里为:‘amsterdam’, ‘paris’, 'tokyo‘ 三个地名,不能直接运用于模型中,需要转换为数字变量,比如:amsterdam对于为0,paris为1,tokyo为2,这时我们可以通过LabelEncoder对特征值进行编码。
其功能主要有以下两点:
LabelEncoder可用于规范标签
# LabelEncoder demo: map arbitrary labels to integer codes 0..n_classes-1.
# NOTE(review): `sklearn.preprocessing` is reachable after a bare `import sklearn`
# only because the submodule was imported earlier in this file; prefer an
# explicit `from sklearn.preprocessing import LabelEncoder`.
import sklearn
le=sklearn.preprocessing.LabelEncoder()
le.fit([1, 2, 2, 6]) # fit the encoder on the list [1, 2, 2, 6]
le.classes_ # sorted unique classes the encoder learned
le.transform([1, 1, 2, 6]) # encode labels as integer codes
le.inverse_transform([0, 0, 1, 2]) # map codes back to the original labels
array([1, 1, 2, 6])
# Same demo with string labels: classes are ordered alphabetically,
# so amsterdam -> 0, paris -> 1, tokyo -> 2.
le.fit(["paris", "paris", "tokyo", "amsterdam"]) # refit on string labels
le.classes_ # sorted unique classes
le.transform(["paris", "paris", "tokyo", "amsterdam"]) # encode to integer codes
le.inverse_transform([1, 1, 2, 0]) # decode back to the original strings
array(['paris', 'paris', 'tokyo', 'amsterdam'], dtype='<U9')
# Summarize the schema: dtypes and non-null counts per column.
data_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 800000 entries, 0 to 799999 Data columns (total 47 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 800000 non-null int64 1 loanAmnt 800000 non-null float64 2 term 800000 non-null int64 3 interestRate 800000 non-null float64 4 installment 800000 non-null float64 5 grade 800000 non-null object 6 subGrade 800000 non-null object 7 employmentTitle 799999 non-null float64 8 employmentLength 753201 non-null object 9 homeOwnership 800000 non-null int64 10 annualIncome 800000 non-null float64 11 verificationStatus 800000 non-null int64 12 issueDate 800000 non-null object 13 isDefault 800000 non-null int64 14 purpose 800000 non-null int64 15 postCode 799999 non-null float64 16 regionCode 800000 non-null int64 17 dti 799761 non-null float64 18 delinquency_2years 800000 non-null float64 19 ficoRangeLow 800000 non-null float64 20 ficoRangeHigh 800000 non-null float64 21 openAcc 800000 non-null float64 22 pubRec 800000 non-null float64 23 pubRecBankruptcies 799595 non-null float64 24 revolBal 800000 non-null float64 25 revolUtil 799469 non-null float64 26 totalAcc 800000 non-null float64 27 initialListStatus 800000 non-null int64 28 applicationType 800000 non-null int64 29 earliesCreditLine 800000 non-null object 30 title 799999 non-null float64 31 policyCode 800000 non-null float64 32 n0 759730 non-null float64 33 n1 759730 non-null float64 34 n2 759730 non-null float64 35 n3 759730 non-null float64 36 n4 766761 non-null float64 37 n5 759730 non-null float64 38 n6 759730 non-null float64 39 n7 759730 non-null float64 40 n8 759729 non-null float64 41 n9 759730 non-null float64 42 n10 766761 non-null float64 43 n11 730248 non-null float64 44 n12 759730 non-null float64 45 n13 759730 non-null float64 46 n14 759730 non-null float64 dtypes: float64(33), int64(9), object(5) memory usage: 286.9+ MB
# Partition columns into numeric and categorical feature name lists,
# then drop the target from the numeric set.
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.tolist()
category_fea = data_train.select_dtypes('object').columns.tolist()
label = 'isDefault'
numerical_fea.remove(label)
在比赛中数据预处理是必不可少的一部分,对于缺失值的填充往往会影响比赛的结果,在比赛中不妨尝试多种填充然后比较结果选择结果最优的一种; 比赛数据相比真实场景的数据相对要“干净”一些,但是还是会有一定的“脏”数据存在,清洗一些异常值往往会获得意想不到的效果。
# Inspect the numeric feature list.
numerical_fea
['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']
# Preview the first five rows.
data_train.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
# Count missing values per column.
data_train.isnull().sum()
id 0 loanAmnt 0 term 0 interestRate 0 installment 0 grade 0 subGrade 0 employmentTitle 1 employmentLength 46799 homeOwnership 0 annualIncome 0 verificationStatus 0 issueDate 0 isDefault 0 purpose 0 postCode 1 regionCode 0 dti 239 delinquency_2years 0 ficoRangeLow 0 ficoRangeHigh 0 openAcc 0 pubRec 0 pubRecBankruptcies 405 revolBal 0 revolUtil 531 totalAcc 0 initialListStatus 0 applicationType 0 earliesCreditLine 0 title 1 policyCode 0 n0 40270 n1 40270 n2 40270 n3 40270 n4 33239 n5 40270 n6 40270 n7 40270 n8 40271 n9 40270 n10 33239 n11 69752 n12 40270 n13 40270 n14 40270 dtype: int64
# Fill numeric features with each split's own column MEDIAN.
# (The original comment said "mean" but the code used median; median kept.)
data_train[numerical_fea]=data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea]=data_test_a[numerical_fea].fillna(data_test_a[numerical_fea].median())
# Fill categorical features with the column mode. Two fixes vs the original:
#  1. the train split took the mode of the *numeric* columns by mistake;
#  2. df.fillna(df.mode()) aligns on the row index, so only row 0 was ever
#     filled (the later isnull() check still showed 46799 missing
#     employmentLength values) -- use .mode().iloc[0] for a per-column scalar.
data_train[category_fea]=data_train[category_fea].fillna(data_train[category_fea].mode().iloc[0])
data_test_a[category_fea]=data_test_a[category_fea].fillna(data_test_a[category_fea].mode().iloc[0])
data_train.isnull().sum()
id 0 loanAmnt 0 term 0 interestRate 0 installment 0 grade 0 subGrade 0 employmentTitle 0 employmentLength 46799 homeOwnership 0 annualIncome 0 verificationStatus 0 issueDate 0 isDefault 0 purpose 0 postCode 0 regionCode 0 dti 0 delinquency_2years 0 ficoRangeLow 0 ficoRangeHigh 0 openAcc 0 pubRec 0 pubRecBankruptcies 0 revolBal 0 revolUtil 0 totalAcc 0 initialListStatus 0 applicationType 0 earliesCreditLine 0 title 0 policyCode 0 n0 0 n1 0 n2 0 n3 0 n4 0 n5 0 n6 0 n7 0 n8 0 n9 0 n10 0 n11 0 n12 0 n13 0 n14 0 dtype: int64
#查看类别特征
category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
时间格式处理
# --- Convert issueDate to datetime and build an elapsed-days feature ---
# The reference date is hoisted out of the loop (it is loop-invariant) and
# the per-row apply() is replaced by vectorized datetime subtraction.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    data.issueDate = pd.to_datetime(data.issueDate, format='%Y-%m-%d')
    # Days elapsed since 2007-06-01 (presumably the earliest issue date
    # in the dataset -- TODO confirm).
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
data
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | issueDateDT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 800000 | 14000.0 | 3 | 10.99 | 458.28 | B | B3 | 7027.0 | 10+ years | 0 | ... | 4.0 | 15.0 | 19.0 | 6.0 | 17.0 | 0.0 | 0.0 | 1.0 | 3.0 | 2587 |
1 | 800001 | 20000.0 | 5 | 14.65 | 472.14 | C | C5 | 60426.0 | 10+ years | 0 | ... | 3.0 | 3.0 | 9.0 | 3.0 | 5.0 | 0.0 | 0.0 | 2.0 | 2.0 | 2952 |
2 | 800002 | 12000.0 | 3 | 19.99 | 445.91 | D | D4 | 23547.0 | 2 years | 1 | ... | 36.0 | 5.0 | 6.0 | 4.0 | 12.0 | 0.0 | 0.0 | 0.0 | 7.0 | 3410 |
3 | 800003 | 17500.0 | 5 | 14.31 | 410.02 | C | C4 | 636.0 | 4 years | 0 | ... | 2.0 | 8.0 | 14.0 | 2.0 | 10.0 | 0.0 | 0.0 | 0.0 | 3.0 | 2710 |
4 | 800004 | 35000.0 | 3 | 17.09 | 1249.42 | D | D1 | 368446.0 | < 1 year | 1 | ... | 3.0 | 16.0 | 18.0 | 11.0 | 19.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3775 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
199995 | 999995 | 7000.0 | 3 | 11.14 | 229.64 | B | B2 | 330967.0 | 7 years | 1 | ... | 11.0 | 2.0 | 6.0 | 2.0 | 8.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1949 |
199996 | 999996 | 6000.0 | 3 | 6.24 | 183.19 | A | A2 | 38930.0 | 1 year | 1 | ... | 14.0 | 12.0 | 13.0 | 6.0 | 25.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3044 |
199997 | 999997 | 14000.0 | 5 | 15.88 | 339.57 | C | C4 | 282016.0 | 8 years | 2 | ... | 18.0 | 21.0 | 42.0 | 13.0 | 21.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2222 |
199998 | 999998 | 8000.0 | 3 | 18.06 | 289.47 | D | D2 | 97.0 | 4 years | 1 | ... | 5.0 | 8.0 | 19.0 | 6.0 | 11.0 | 0.0 | 0.0 | 0.0 | 2.0 | 3775 |
199999 | 999999 | 8000.0 | 3 | 6.68 | 245.85 | A | A3 | 320.0 | 7 years | 1 | ... | 4.0 | 3.0 | 4.0 | 2.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2802 |
200000 rows × 47 columns
# Distribution of employmentLength (including NaN), in sorted label order.
data_train['employmentLength'].value_counts(dropna=False).sort_index()
1 year 52489
10+ years 262753
2 years 72358
3 years 64152
4 years 47985
5 years 50102
6 years 37254
7 years 35407
8 years 36192
9 years 30272
< 1 year 64237
NaN 46799
Name: employmentLength, dtype: int64
对象类型特征转换到数值
def employmentLength_to_int(s):
    """Convert an employment-length string like '5 years' to an int8.

    NaN values are passed through unchanged so they stay missing.
    """
    if pd.isnull(s):
        return s
    return np.int8(s.split()[0])
# Normalize the two irregular categories in one dict-based replace,
# then convert every value to an integer year count.
for data in [data_train, data_test_a]:
    data.employmentLength.replace(
        {'10+ years': '10 years', '< 1 year': '0 years'}, inplace=True)
    data.employmentLength = data.employmentLength.apply(employmentLength_to_int)
data
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | issueDateDT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 800000 | 14000.0 | 3 | 10.99 | 458.28 | B | B3 | 7027.0 | 10.0 | 0 | ... | 4.0 | 15.0 | 19.0 | 6.0 | 17.0 | 0.0 | 0.0 | 1.0 | 3.0 | 2587 |
1 | 800001 | 20000.0 | 5 | 14.65 | 472.14 | C | C5 | 60426.0 | 10.0 | 0 | ... | 3.0 | 3.0 | 9.0 | 3.0 | 5.0 | 0.0 | 0.0 | 2.0 | 2.0 | 2952 |
2 | 800002 | 12000.0 | 3 | 19.99 | 445.91 | D | D4 | 23547.0 | 2.0 | 1 | ... | 36.0 | 5.0 | 6.0 | 4.0 | 12.0 | 0.0 | 0.0 | 0.0 | 7.0 | 3410 |
3 | 800003 | 17500.0 | 5 | 14.31 | 410.02 | C | C4 | 636.0 | 4.0 | 0 | ... | 2.0 | 8.0 | 14.0 | 2.0 | 10.0 | 0.0 | 0.0 | 0.0 | 3.0 | 2710 |
4 | 800004 | 35000.0 | 3 | 17.09 | 1249.42 | D | D1 | 368446.0 | 0.0 | 1 | ... | 3.0 | 16.0 | 18.0 | 11.0 | 19.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3775 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
199995 | 999995 | 7000.0 | 3 | 11.14 | 229.64 | B | B2 | 330967.0 | 7.0 | 1 | ... | 11.0 | 2.0 | 6.0 | 2.0 | 8.0 | 0.0 | 0.0 | 0.0 | 4.0 | 1949 |
199996 | 999996 | 6000.0 | 3 | 6.24 | 183.19 | A | A2 | 38930.0 | 1.0 | 1 | ... | 14.0 | 12.0 | 13.0 | 6.0 | 25.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3044 |
199997 | 999997 | 14000.0 | 5 | 15.88 | 339.57 | C | C4 | 282016.0 | 8.0 | 2 | ... | 18.0 | 21.0 | 42.0 | 13.0 | 21.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2222 |
199998 | 999998 | 8000.0 | 3 | 18.06 | 289.47 | D | D2 | 97.0 | 4.0 | 1 | ... | 5.0 | 8.0 | 19.0 | 6.0 | 11.0 | 0.0 | 0.0 | 0.0 | 2.0 | 3775 |
199999 | 999999 | 8000.0 | 3 | 6.68 | 245.85 | A | A3 | 320.0 | 7.0 | 1 | ... | 4.0 | 3.0 | 4.0 | 2.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2802 |
200000 rows × 47 columns
# Re-check the distribution after conversion (`data` is data_test_a here,
# the last value of the preceding loop variable).
data.employmentLength.value_counts(dropna=False).sort_index()
0.0 15989
1.0 13182
2.0 18207
3.0 16011
4.0 11833
5.0 12543
6.0 9328
7.0 8823
8.0 8976
9.0 7594
10.0 65772
NaN 11742
Name: employmentLength, dtype: int64
# Peek at five random raw earliesCreditLine values (format 'Mon-YYYY').
data_train['earliesCreditLine'].sample(5)
513127 Aug-1999
4329 Feb-2000
719226 Dec-1984
316424 Oct-1999
157586 Sep-2002
Name: earliesCreditLine, dtype: object
https://blog.csdn.net/marraybug/article/details/84972816
# Keep only the trailing 4-digit year from strings like 'Aug-1999'.
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data.earliesCreditLine.apply(lambda s: int(s[-4:]))
类别特征处理
# A subset of categorical-ish features; print each one's cardinality.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]
for f in cate_features:
    print(f, '类型数:', data[f].nunique())
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 79282
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 889
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 12058
policyCode 类型数: 1
# Ordinal-encode the loan grade: A..G -> 1..7 (grades are naturally ordered).
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({g: i for i, g in enumerate('ABCDEFG', start=1)})
# One-hot encode features with more than two categories that are not
# high-cardinality sparse and are purely categorical.
# NOTE(review): assigning to the loop variable only rebinds the local name --
# pd.get_dummies never updates data_train / data_test_a, so this loop is a
# no-op. Later cells (e.g. label-encoding 'subGrade') depend on the original
# columns still existing, so "fixing" this in place would break them; if the
# dummies are wanted, assign the results back to both frames and drop the
# later label-encoding of the same columns.
for data in [data_train,data_test_a]:
data=pd.get_dummies(data,columns=['subGrade','homeOwnership','verificationStatus', 'purpose', 'regionCode'],drop_first=True)
检测异常值的方法一:3σ原则(基于标准差)
在统计学中,如果一个数据分布近似正态,那么大约 68% 的数据值会在均值的一个标准差范围内,大约 95% 会在两个标准差范围内,大约 99.7% 会在三个标准差范围内。
def find_outliers_by_3segama(data, fea):
    """Flag values of column *fea* outside mean ± 3 * std (the 3-sigma rule).

    Adds a '<fea>_outliers' column containing '异常值' (outlier) or
    '正常值' (normal), mutating *data* in place, and returns it.
    """
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # np.where is one vectorized pass instead of a per-row apply() --
    # same labels, much faster on 800k rows.
    data[fea + '_outliers'] = np.where(
        (data[fea] > upper_rule) | (data[fea] < lower_rule), '异常值', '正常值')
    return data
# Annotate every numeric feature with its 3-sigma outlier flag and report
# how outliers relate to the default label.
data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train, fea)
    flag_col = fea + '_outliers'
    print(data_train[flag_col].value_counts())
    print(data_train.groupby(flag_col)['isDefault'].sum())
    print('*' * 10)
正常值 800000 Name: id_outliers, dtype: int64 id_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: loanAmnt_outliers, dtype: int64 loanAmnt_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: term_outliers, dtype: int64 term_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 794259 异常值 5741 Name: interestRate_outliers, dtype: int64 interestRate_outliers 异常值 2916 正常值 156694 Name: isDefault, dtype: int64 ********** 正常值 792046 异常值 7954 Name: installment_outliers, dtype: int64 installment_outliers 异常值 2152 正常值 157458 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: employmentTitle_outliers, dtype: int64 employmentTitle_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 799701 异常值 299 Name: homeOwnership_outliers, dtype: int64 homeOwnership_outliers 异常值 62 正常值 159548 Name: isDefault, dtype: int64 ********** 正常值 793973 异常值 6027 Name: annualIncome_outliers, dtype: int64 annualIncome_outliers 异常值 756 正常值 158854 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: verificationStatus_outliers, dtype: int64 verificationStatus_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 783003 异常值 16997 Name: purpose_outliers, dtype: int64 purpose_outliers 异常值 3635 正常值 155975 Name: isDefault, dtype: int64 ********** 正常值 798931 异常值 1069 Name: postCode_outliers, dtype: int64 postCode_outliers 异常值 221 正常值 159389 Name: isDefault, dtype: int64 ********** 正常值 799994 异常值 6 Name: regionCode_outliers, dtype: int64 regionCode_outliers 异常值 1 正常值 159609 Name: isDefault, dtype: int64 ********** 正常值 798440 异常值 1560 Name: dti_outliers, dtype: int64 dti_outliers 异常值 466 正常值 159144 Name: isDefault, dtype: int64 ********** 正常值 778245 异常值 21755 Name: delinquency_2years_outliers, dtype: int64 delinquency_2years_outliers 异常值 5089 正常值 154521 Name: isDefault, dtype: int64 ********** 正常值 788261 异常值 11739 Name: ficoRangeLow_outliers, dtype: int64 ficoRangeLow_outliers 异常值 778 正常值 158832 Name: isDefault, 
dtype: int64 ********** 正常值 788261 异常值 11739 Name: ficoRangeHigh_outliers, dtype: int64 ficoRangeHigh_outliers 异常值 778 正常值 158832 Name: isDefault, dtype: int64 ********** 正常值 790889 异常值 9111 Name: openAcc_outliers, dtype: int64 openAcc_outliers 异常值 2195 正常值 157415 Name: isDefault, dtype: int64 ********** 正常值 792471 异常值 7529 Name: pubRec_outliers, dtype: int64 pubRec_outliers 异常值 1701 正常值 157909 Name: isDefault, dtype: int64 ********** 正常值 794120 异常值 5880 Name: pubRecBankruptcies_outliers, dtype: int64 pubRecBankruptcies_outliers 异常值 1423 正常值 158187 Name: isDefault, dtype: int64 ********** 正常值 790001 异常值 9999 Name: revolBal_outliers, dtype: int64 revolBal_outliers 异常值 1359 正常值 158251 Name: isDefault, dtype: int64 ********** 正常值 799948 异常值 52 Name: revolUtil_outliers, dtype: int64 revolUtil_outliers 异常值 23 正常值 159587 Name: isDefault, dtype: int64 ********** 正常值 791663 异常值 8337 Name: totalAcc_outliers, dtype: int64 totalAcc_outliers 异常值 1668 正常值 157942 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: initialListStatus_outliers, dtype: int64 initialListStatus_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 784586 异常值 15414 Name: applicationType_outliers, dtype: int64 applicationType_outliers 异常值 3875 正常值 155735 Name: isDefault, dtype: int64 ********** 正常值 775134 异常值 24866 Name: title_outliers, dtype: int64 title_outliers 异常值 3900 正常值 155710 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: policyCode_outliers, dtype: int64 policyCode_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 782773 异常值 17227 Name: n0_outliers, dtype: int64 n0_outliers 异常值 3485 正常值 156125 Name: isDefault, dtype: int64 ********** 正常值 790500 异常值 9500 Name: n1_outliers, dtype: int64 n1_outliers 异常值 2491 正常值 157119 Name: isDefault, dtype: int64 ********** 正常值 789067 异常值 10933 Name: n2_outliers, dtype: int64 n2_outliers 异常值 3205 正常值 156405 Name: isDefault, dtype: int64 ********** 正常值 789067 异常值 10933 Name: n3_outliers, dtype: int64 n3_outliers 异常值 3205 
正常值 156405 Name: isDefault, dtype: int64 ********** 正常值 788660 异常值 11340 Name: n4_outliers, dtype: int64 n4_outliers 异常值 2476 正常值 157134 Name: isDefault, dtype: int64 ********** 正常值 790355 异常值 9645 Name: n5_outliers, dtype: int64 n5_outliers 异常值 1858 正常值 157752 Name: isDefault, dtype: int64 ********** 正常值 786006 异常值 13994 Name: n6_outliers, dtype: int64 n6_outliers 异常值 3182 正常值 156428 Name: isDefault, dtype: int64 ********** 正常值 788430 异常值 11570 Name: n7_outliers, dtype: int64 n7_outliers 异常值 2746 正常值 156864 Name: isDefault, dtype: int64 ********** 正常值 789625 异常值 10375 Name: n8_outliers, dtype: int64 n8_outliers 异常值 2131 正常值 157479 Name: isDefault, dtype: int64 ********** 正常值 786384 异常值 13616 Name: n9_outliers, dtype: int64 n9_outliers 异常值 3953 正常值 155657 Name: isDefault, dtype: int64 ********** 正常值 788979 异常值 11021 Name: n10_outliers, dtype: int64 n10_outliers 异常值 2639 正常值 156971 Name: isDefault, dtype: int64 ********** 正常值 799434 异常值 566 Name: n11_outliers, dtype: int64 n11_outliers 异常值 112 正常值 159498 Name: isDefault, dtype: int64 ********** 正常值 797585 异常值 2415 Name: n12_outliers, dtype: int64 n12_outliers 异常值 545 正常值 159065 Name: isDefault, dtype: int64 ********** 正常值 788907 异常值 11093 Name: n13_outliers, dtype: int64 n13_outliers 异常值 2482 正常值 157128 Name: isDefault, dtype: int64 ********** 正常值 788884 异常值 11116 Name: n14_outliers, dtype: int64 n14_outliers 异常值 3364 正常值 156246 Name: isDefault, dtype: int64 **********
# Keep only rows flagged as normal for every numeric feature.
for fea in numerical_fea:
    keep = data_train[fea + '_outliers'] == '正常值'
    data_train = data_train[keep].reset_index(drop=True)
检测异常的方法二:箱型图
1.固定宽度分箱
# Fixed-width binning: integer-divide by 1000, so each bin spans 1000 loanAmnt.
# NOTE(review): `data` here is whatever the last loop left bound
# (data_test_a) -- data_train was probably intended; verify before reuse.
data['loanAmnt_bin1']=np.floor_divide(data['loanAmnt'],1000)
# Log-scale binning: exponentially-growing bin widths via floor(log10).
data['loanAmnt_bin2']=np.floor(np.log10(data['loanAmnt']))
2.分位数分箱
# Quantile binning: 10 equal-frequency bins, integer-labeled 0..9.
data['loanAmnt_bin3']=pd.qcut(data['loanAmnt'],10,labels=False)
3.卡方分箱及其他分箱方法的尝试
交互特征的构造非常简单,使用起来却代价不菲。如果线性模型中包含有交互特征对,那它的训练时间和评分时间就会从 O(n) 增加到 O(n2),其中 n 是单一特征的数量。
难难难难难难!!!!
# Target-mean encode grade and subGrade: the mapping is fit on the training
# split only, then applied to both splits.
for col in ['grade', 'subGrade']:
    mapping = data_train.groupby(col)['isDefault'].mean().to_dict()
    data_train[col + '_target_mean'] = data_train[col].map(mapping)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(mapping)
# Derived ratio features: grade relative to the group mean / std of grade,
# grouped by each of the anonymous n0..n14 columns.
for df in [data_train, data_test_a]:
    for item in ['n' + str(i) for i in range(15)]:
        grouped_grade = df.groupby(item)['grade']
        df['grade_to_mean_' + item] = df['grade'] / grouped_grade.transform('mean')
        df['grade_to_std_' + item] = df['grade'] / grouped_grade.transform('std')
labelEncode直接放入数模型中
# Label-encode the high-cardinality categoricals. The encoder is fit on the
# concatenation of train + test values so it has seen every category that
# appears in either split.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    combined = data_train[col].astype(str).tolist() + data_test_a[col].astype(str).tolist()
    le.fit(combined)
    data_train[col] = le.transform(data_train[col].astype(str).tolist())
    data_test_a[col] = le.transform(data_test_a[col].astype(str).tolist())
print('Label Encoding 完成')
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00, 1.31s/it]
Label Encoding 完成
# Recover the original labels from the last fitted encoder. `le` and `col`
# still refer to 'subGrade', the final iteration of the loop above.
a=le.inverse_transform(list(data_train[col]))
a
data_train[col]
0 21
1 16
2 17
3 3
4 12
..
612737 21
612738 13
612739 12
612740 3
612741 7
Name: subGrade, Length: 612742, dtype: int64
逻辑回归等模型要单独增加的特征工程
# Example min-max normalization (p.175 in the referenced book).
# Pseudocode -- NOT runnable: the list placeholder below must be replaced
# with an actual list of feature names.
for fea in [要归一化的特征列表]:
data[fea]=((data[fea]-np.min(data[fea]))/(np.max(data[fea])-np.min(data[fea])))
特征选择的方法:
Filter:基于特征间的关系进行筛选
from sklearn.feature_selection import VarianceThreshold
# Filter method: drop features whose variance is below `threshold`.
# NOTE(review): `train` / `target_train` are not defined anywhere in this
# file -- these feature-selection cells are illustrative snippets only.
VarianceThreshold(threshold=3).fit_transform(train,target_train)
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
# Select the k best features and return the reduced matrix. A score function
# takes (X, y) and returns per-feature (score, p-value) pairs.
# NOTE(review): pearsonr is imported but never passed in -- with no
# score_func argument SelectKBest falls back to the default f_classif, so
# this does NOT select by Pearson correlation as the surrounding text claims.
SelectKBest(k=5).fit_transform(train,target_train)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Chi-squared filter selection: keep the k highest-scoring features.
SelectKBest(chi2,k=5).fit_transform(train,target_train)
from sklearn.feature_selection import SelectKBest
from minepy import MINE

def mic(x, y):
    """Score one feature with the Maximal Information Coefficient.

    MINE's API is not functional-style, so wrap it; the p-value slot of
    the (score, p-value) pair is fixed at 0.5 as a placeholder.
    """
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

def _mic_score_func(X, Y):
    """Column-wise MIC scores in the (scores, pvalues) shape SelectKBest expects."""
    results = np.array([mic(x, Y) for x in X.T])
    return results[:, 0], results[:, 1]

# Fixed from the Python-2 original `array(map(...))`: in Python 3 map()
# returns an iterator and bare `array` was undefined, so the call crashed.
SelectKBest(_mic_score_func, k=2).fit_transform(train, target_train)
Wrapper (Recursive feature elimination,RFE)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Wrapper method: recursive feature elimination with `estimator` as the base
# model, keeping `n_features_to_select` features.
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train,target_train)
Embedded
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Embedded method: L1-penalized logistic regression as the base model --
# L1 zeroes out weak coefficients, so SelectFromModel keeps the rest.
# Fixed: penalty="l1" requires a solver that supports it; the default
# 'lbfgs' raises ValueError in modern scikit-learn, so use 'liblinear'.
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(train,target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# Embedded method: select features by GBDT feature importance.
SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target_train)
数据处理
本数据集中我们删除非入模特征后,并对缺失值填充,然后用计算协方差的方式看一下特征间相关性,然后进行模型训练
# Drop columns that must not enter the model (raw date and row id).
for data in [data_train, data_test_a]:
    data.drop(['issueDate', 'id'], axis=1, inplace=True)
# Forward-fill remaining NaNs down each column.
# Fixed: the original called `data.fillna(...)` -- after the loop `data`
# is bound to data_test_a, so the training frame was silently replaced by
# the (ffilled) TEST split.
data_train = data_train.fillna(axis=0, method='ffill')
# Correlation of every remaining feature with the target.
x_train = data_train.drop(['isDefault'], axis=1)
data_corr = x_train.corrwith(data_train.isDefault)
data_corr = data_corr.reset_index()
data_corr.columns = ['features', 'corr']
# Heatmap of pairwise correlations between the numeric features
# (skip the first entry, the row id).
data_numeric = data_train[numerical_fea[1:]]
correlation = data_numeric.corr()
plt.figure(figsize=(7, 7))
# Fixed title: this dataset's target is isDefault, not Price -- the old
# title was a copy-paste leftover from a house-price tutorial.
plt.title('Correlation of Numeric Features with isDefault', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
plt.show()
# Final feature list: exclude identifiers, the raw date, the target, and
# the helper *_outliers flag columns.
features = [
    col for col in data_train.columns
    if col not in ['id', 'issueDate', 'isDefault'] and '_outliers' not in col
]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """5-fold CV trainer for LightGBM / XGBoost / CatBoost.

    Parameters
    ----------
    clf : the lgb or xgb module, or the CatBoostRegressor class.
    train_x, train_y : training features and binary target.
    test_x : test features.
    clf_name : one of "lgb", "xgb", "cat".

    Returns
    -------
    (train, test) : out-of-fold predictions for the training rows and
    fold-averaged predictions for the test rows.
    """
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])  # out-of-fold predictions
    test = np.zeros(test_x.shape[0])    # mean test prediction over folds
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 50000,
                              valid_sets=[train_matrix, valid_matrix],
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            # Fixed: test rows must also be wrapped in a DMatrix --
            # Booster.predict on a raw DataFrame raises a TypeError.
            test_matrix = clf.DMatrix(test_x)
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, num_boost_round=50000,
                              evals=watchlist, verbose_eval=200,
                              early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10,
                      'bootstrap_type': 'Bernoulli', 'od_type': 'Iter',
                      'od_wait': 50, 'random_seed': 11,
                      'allow_writing_files': False}
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y), cat_features=[],
                      use_best_model=True, verbose=500)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        train[valid_index] = val_pred
        # Fixed: accumulate with += so `test` is the mean over all folds;
        # the original plain assignment kept only the LAST fold's predictions.
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test
def lgb_model(x_train, y_train, x_test):
    """LightGBM wrapper around cv_model; returns (oof_train, test_preds)."""
    return cv_model(lgb, x_train, y_train, x_test, "lgb")
def xgb_model(x_train, y_train, x_test):
    """XGBoost wrapper around cv_model; returns (oof_train, test_preds)."""
    return cv_model(xgb, x_train, y_train, x_test, "xgb")
def cat_model(x_train, y_train, x_test):
    """CatBoost wrapper around cv_model; returns (oof_train, test_preds).

    Fixed: the original had no return statement, so callers always got None.
    """
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test
# Train the LightGBM CV model on the prepared matrices.
lgb_train,lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.74923 valid_1's auc: 0.729754 [400] training's auc: 0.76495 valid_1's auc: 0.730429 Early stopping, best iteration is: [387] training's auc: 0.763868 valid_1's auc: 0.730485 [0.7304850956718165] ************************************ 2 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.749181 valid_1's auc: 0.731532 [400] training's auc: 0.764625 valid_1's auc: 0.731788 Early stopping, best iteration is: [325] training's auc: 0.759049 valid_1's auc: 0.731971 [0.7304850956718165, 0.73197059577665] ************************************ 3 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.74861 valid_1's auc: 0.732765 [400] training's auc: 0.764095 valid_1's auc: 0.733776 [600] training's auc: 0.77777 valid_1's auc: 0.733716 Early stopping, best iteration is: [459] training's auc: 0.768269 valid_1's auc: 0.733943 [0.7304850956718165, 0.73197059577665, 0.7339429680725573] ************************************ 4 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. 
Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.749754 valid_1's auc: 0.728774 [400] training's auc: 0.765111 valid_1's auc: 0.729632 [600] training's auc: 0.77826 valid_1's auc: 0.729179 Early stopping, best iteration is: [401] training's auc: 0.765161 valid_1's auc: 0.729646 [0.7304850956718165, 0.73197059577665, 0.7339429680725573, 0.7296461716819063] ************************************ 5 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.74861 valid_1's auc: 0.732897 [400] training's auc: 0.764348 valid_1's auc: 0.733471 [600] training's auc: 0.778164 valid_1's auc: 0.733496 Early stopping, best iteration is: [475] training's auc: 0.769796 valid_1's auc: 0.733656 [0.7304850956718165, 0.73197059577665, 0.7339429680725573, 0.7296461716819063, 0.7336557307068068] lgb_scotrainre_list: [0.7304850956718165, 0.73197059577665, 0.7339429680725573, 0.7296461716819063, 0.7336557307068068] lgb_score_mean: 0.7319401123819473 lgb_score_std: 0.0016932184710438153
# Persist the processed datasets for later modeling stages.
data_train.to_csv('train_data_v1.csv',index=False)
data_test_a.to_csv('test_data_v1.csv',index=False)
# Same training frame again under the name the modeling notebook expects.
data_train.to_csv('data_for_model.csv',index=None)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。