Task
Starting from real-world scenarios and practical applications, participants are asked to build an accurate risk-control model from individuals' basic identity information and their housing provident fund contribution and loan records, in order to predict whether a user will repay late.
Submission notes:
Data
The training set covers 40,000 contributors and the test set 15,000, each with basic information, contribution records, and loan records. Participants can download the data, tune their algorithms locally, and submit results on the competition page.
A sample of the data is shown below:
1. Imports
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # export CUDA_VISIBLE_DEVICES=0
# print the GPUs visible to TF
print(os.environ['CUDA_VISIBLE_DEVICES'])

import warnings
import numpy as np
import pandas as pd
# import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_fscore_support, roc_curve, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif'] = ['Simhei']
plt.rcParams['axes.unicode_minus'] = False
import json
import matplotlib
from scipy.stats import chi2
import scipy
import seaborn as sns
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# from xgboost import XGBClassifier
import time
from sklearn.ensemble import GradientBoostingClassifier
# from lightgbm import LGBMClassifier
from tqdm import tqdm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
2. Utility functions
2.1 Find fields whose correlation with the label is relatively large
def relation(df, poly_num=0.15):
    """
    Uses DataFrame.corr(method='pearson', min_periods=1).
    Arguments of .corr():
        method: one of {'pearson', 'kendall', 'spearman'}
            pearson: Pearson correlation, measures how close two data sets lie on a straight line;
                     intended for linear data, so it is biased for non-linear data.
            kendall: rank correlation suited to categorical/ordinal variables and non-normal data.
            spearman: rank correlation for non-linear, non-normally distributed data.
        min_periods: minimum number of samples required.
    Returns: the union of the fields whose absolute correlation with 'label' exceeds poly_num under any of the three methods.
    """
    all_cate_2_col = []
    method = ['pearson', 'kendall', 'spearman']
    for m in method:
        poly_corrs = df[:40000].corr(method=m)['label'].sort_values()
        po_temp = []
        for i in range(len(poly_corrs)):
            if(abs(poly_corrs[i]) > poly_num and poly_corrs.index[i] not in ['label']):
                po_temp.append(poly_corrs.index[i])
        print(str(m) + ' fields with |correlation| > ' + str(poly_num) + ':\n' + str(po_temp) + '\n' + str(len(po_temp)) + ' in total')
        # take the union
        all_cate_2_col = list(set(all_cate_2_col).union(set(po_temp)))
        print(len(all_cate_2_col))
        # print(all_cate_2_col)
    return all_cate_2_col
2.2 Feature selection with recursive feature elimination and cross-validation (RFECV)
# recursive feature elimination with cross-validation (RFECV)
def clf_rfecv(df, cate_2_cols, rank_num=1):
    # NOTE: the passed-in column list is replaced by all non-id/label columns
    cate_2_cols = [col for col in df.columns if col not in ['id', 'label']]
    X = df[:40000][cate_2_cols]
    y = df[:40000]['label']
    print(X.shape)
    print(y.shape)
    # RFECV
    clf_rfecv = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        learning_rate=0.1,
        n_estimators=154,
        max_depth=6,
        num_leaves=30,
        max_bin=25,
        min_data_in_leaf=71,
        bagging_fraction=0.65,
        bagging_freq=0,
        feature_fraction=0.8,
        lambda_l1=0.9,
        lambda_l2=0.9
    )
    from sklearn.model_selection import StratifiedKFold
    from sklearn.feature_selection import RFECV
    rfecv = RFECV(
        estimator=clf_rfecv,    # base estimator
        step=1,                 # number of features removed per iteration
        cv=StratifiedKFold(5),  # cross-validation splitter
        scoring='accuracy',     # scoring metric
        verbose=1,
        n_jobs=12
    ).fit(X, y)
    X_RFECV = rfecv.transform(X)
    print("RFECV feature-selection results --------------------------------------------")
    # number of selected features
    print("Number of selected features: \n" + str(rfecv.n_features_))
    # # ranking of each feature
    # print("Ranking of all features: \n" + str(rfecv.ranking_))
    # # boolean mask of the selected features
    # print(rfecv.support_)
    rfecv_cate_2_col = []
    for i in range(len(cate_2_cols)):
        if(rfecv.ranking_[i] <= rank_num):
            print(cate_2_cols[i])
            rfecv_cate_2_col.append(cate_2_cols[i])
    print(len(rfecv_cate_2_col))
    return rfecv_cate_2_col  # this return was commented out in the original, but the callers below need it
2.3 Find fields whose values barely vary
# fields whose values are almost all the same
def find_weak_filed(df):
    weak_filed = []
    for i in range(len(df.columns)):
        # print('-------------' + str(df.columns[i]) + '---------------')
        else_sum = 0
        for j in range(1, len(df[df.columns[i]].value_counts().index)):
            else_sum = else_sum + df[df.columns[i]].value_counts().values[j]
        # print(else_sum)
        if(else_sum <= 50):
            weak_filed.append(df.columns[i])
    return weak_filed
2.4 Identify single-value, binary, multi-class, and continuous fields
def find_filed_class(df, n=20):
    cate_1_cols = []
    cate_2_cols = []
    cate_cols = []
    num_cols1 = []
    for i in tqdm(range(len(df.columns))):
        if(len(df[df.columns[i]].value_counts().index) == 1):
            cate_1_cols.append(df.columns[i])
        if(len(df[df.columns[i]].value_counts().index) == 2 and df.columns[i] != 'label'):
            cate_2_cols.append(df.columns[i])
        elif(2 < len(df[df.columns[i]].value_counts().index) <= n and df.columns[i] != 'DKLL'):
            cate_cols.append(df.columns[i])
        elif(len(df[df.columns[i]].value_counts().index) > n and df.columns[i] != 'id'):
            num_cols1.append(df.columns[i])
    print(len(cate_1_cols))
    print(len(cate_2_cols))
    print(len(cate_cols))
    print(len(num_cols1))
    return cate_1_cols, cate_2_cols, cate_cols, num_cols1
2.5 Feature selection with GBDT feature importance
# How does GBDT measure feature importance?
# It sums, over all non-leaf nodes, the weighted impurity decrease obtained when the node splits;
# the larger the total decrease, the more important the feature.
# The impurity decrease is simply the gain of that split, so features whose splits bring larger gains are more important.
# Tree-based feature selection: GBDT can also serve as the base model.
# Combining sklearn.feature_selection.SelectFromModel with a GBDT model selects features as follows:

# feature selection with GBDT as the base model
def GBDTselectfea(df, max_num=200):
    cols = [col for col in df.columns if col not in ['id', 'label']]
    X = df[:40000][cols]
    y = df[:40000]['label']
    print(X.shape)
    print(y.shape)
    if(X.shape[1] < max_num):
        max_num = X.shape[1]
    grd = SelectFromModel(GradientBoostingClassifier(), max_features=max_num)
    grd.fit_transform(X, y)
    # print(grd.feature_importances_)
    gbdt_select = []
    gbdt_fea_select = grd.get_support()
    print(gbdt_fea_select)
    for i in range(len(gbdt_fea_select)):
        # print(gbdt_fea_select[i])
        # print(X.columns)
        if(gbdt_fea_select[i] == True):
            print(X.columns[i])
            gbdt_select.append(X.columns[i])
    # print(gbdt_select)
    print(len(gbdt_select))
    return gbdt_select
2.6 Build polynomial features and keep the fields with large correlation coefficients
def polynomial_features(df, poly_num=0.15 ,change=0,degreenum=2): """ poly_num:相关性 change:0原始字段不变,1输出新增字段 degreenum:阶数 """ num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE', 'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN', 'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL', 'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE', 'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE', 'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE', 'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE', 'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE', 'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE', 'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL'] poly_features = df[:40000][num_gen_feats] # Create the polynomial object with specified degree poly_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False) poly_transformer.fit(poly_features) poly_features = poly_transformer.transform(poly_features) # 新特征是否与target有相关性。 poly_features = pd.DataFrame(poly_features , columns = poly_transformer.get_feature_names(input_features = num_gen_feats) ) poly_features['TARGET'] =df[:40000]['label'] poly_corrs = poly_features.corr()['TARGET'].sort_values() po_temp = [] for i in range(len(poly_corrs)): if(abs(poly_corrs[i])>poly_num and poly_corrs.index[i] not in num_cols + gen_feats + ['TARGET']): po_temp.append(poly_corrs.index[i]) print('相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个') if(change == 1): dfpo = df[num_cols + gen_feats] dfpo_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False) dfpo_transformer.fit(dfpo) dfpo = dfpo_transformer.transform(dfpo) dfpo = pd.DataFrame(dfpo , columns = poly_transformer.get_feature_names(input_features = num_gen_feats) ) # 新的字段拼接到df上 df = dfpo[po_temp] return df
2.7 Plot the distribution of continuous fields and of their log transform
def Normal_distribution(df, value_vars, change=0):
    """
    value_vars: fields to inspect
    change: 0 leave unchanged, 1 replace with log, 2 add a new log column
    """
    for i in tqdm(range(len(value_vars))):
        plt.figure(figsize=(16, 5))
        plt.suptitle(str(value_vars[i]) + 'Distribution', fontsize=10)
        plt.subplot(1, 2, 1)
        sub_plot_1 = sns.distplot(df[value_vars[i]])
        sub_plot_1.set_title(str(value_vars[i]) + " Distribution", fontsize=10)
        sub_plot_1.set_xlabel("value")
        sub_plot_1.set_ylabel("Probability", fontsize=10)
        plt.subplot(1, 2, 2)
        sub_plot_2 = sns.distplot(np.log(df[value_vars[i]] + 1))
        sub_plot_2.set_title(str(value_vars[i]) + "(Log) Distribution", fontsize=10)
        sub_plot_2.set_xlabel("value")
        sub_plot_2.set_ylabel("Probability", fontsize=10)
    if(change == 1):
        for i in tqdm(range(len(value_vars))):
            df[value_vars[i]] = np.log(df[value_vars[i]] + 1)
    if(change == 2):
        for i in tqdm(range(len(value_vars))):
            df[str(value_vars[i]) + '_log'] = np.log(df[value_vars[i]] + 1)
        return df
    return df
2.8 Detect and drop columns whose values are completely identical
# check for duplicated rows/columns
def if_field_is_same(df):
    all_df_cols = df.columns
    hight = len(df)
    del_filed = []
    # # drop duplicated rows
    # print('Any duplicated rows: ', any(df.duplicated()))
    # if(any(df.duplicated()) == True):
    #     df.drop_duplicates(inplace=True)
    # check for duplicated columns
    for i in tqdm(range(0, len(all_df_cols) - 1, 1)):
        # print("---------" + str(all_df_cols[i]) + "---------")
        if(all_df_cols[i] not in del_filed):
            for j in range(i + 1, len(all_df_cols), 1):
                # print(all_df_cols[j])
                for k in range(hight):
                    # print(k)
                    if(df[all_df_cols[i]][k] != df[all_df_cols[j]][k]):
                        # print("not_same")
                        break
                    if(k == hight - 1):
                        # print("column " + str(all_df_cols[i]) + " is identical to column " + str(all_df_cols[j]))
                        del_filed.append(all_df_cols[j])
    if(len(del_filed) == 0):
        print('Duplicated columns: none')
    else:
        del_filed = set(del_filed)
        print('Duplicated columns: ' + str(len(del_filed)) + '\nThey are: ' + str(del_filed))
        df = df.drop(del_filed, axis=1)
    return df
2.9 Scatter plots of variable distributions
# value distribution of each variable
def shuzhifenbu(cols, high=40):
    for i in range(len(cols)):
        plt.figure(figsize=(15, high))
        print("value distribution of " + str(cols[i]))
        plt.subplot(len(cols), 1, i + 1)
        plt.title(cols[i])
        x = df[cols[i]]
        y = df.index
        plt.scatter(x, y, s=1)
        plt.show()
    return
2.10 Chi-square binning (ChiMerge)
# Chi-square binning
# compute the chi-square statistic
def chi3(arr):
    '''
    Compute the chi-square value.
    arr: frequency table, a 2-D numpy array.
    '''
    assert(arr.ndim == 2)
    # total frequency of each row
    R_N = arr.sum(axis=1)
    # total frequency of each column
    C_N = arr.sum(axis=0)
    # grand total
    N = arr.sum()
    # expected frequencies C_i * R_j / N
    E = np.ones(arr.shape) * C_N / N
    E = (E.T * R_N).T
    square = (arr - E)**2 / E
    # when the expected frequency is 0 the division is meaningless, so exclude it
    square[E == 0] = 0
    # chi-square value
    v = square.sum()
    return v

# determine the chi-square binning split points
def chiMerge(df, col, target, max_groups=None, threshold=None):
    '''
    Chi-square binning.
    df: pandas DataFrame
    col: name of the (numeric) variable to bin
    target: class label
    max_groups: maximum number of groups
    threshold: chi-square threshold; if max_groups is not given, defaults to the 95% confidence level.
    return: list of the starting values of each group.
    '''
    freq_tab = pd.crosstab(df[col], df[target])
    # convert to a numpy array for the computation
    freq = freq_tab.values
    # initial split points: every distinct value is a split point, each group holds one value.
    # intervals are left-closed, right-open; e.g. cutoffs = [1,2,3] means [1,2), [2,3), [3,3+).
    cutoffs = freq_tab.index.values
    # if no maximum number of groups is given
    if max_groups is None:
        # if no chi-square threshold is given either, set it at the 95% confidence level
        # (degrees of freedom = number of classes - 1)
        if threshold is None:
            # number of classes
            cls_num = freq.shape[-1]
            threshold = chi2.isf(0.05, df=cls_num - 1)
    while True:
        minvalue = None
        minidx = None
        # starting from the first group, compute the chi-square of each pair of adjacent groups
        # and keep track of the current minimum
        for i in range(len(freq) - 1):
            v = chi3(freq[i:i+2])
            if minvalue is None or (minvalue > v):
                # smaller than the current minimum, update it
                minvalue = v
                minidx = i
        # if the minimum chi-square is below the threshold, merge the two adjacent groups and continue
        if (max_groups is not None and max_groups < len(freq)) or (threshold is not None and minvalue < threshold):
            # merge row minidx+1 into row minidx
            tmp = freq[minidx] + freq[minidx+1]
            freq[minidx] = tmp
            # delete row minidx+1
            freq = np.delete(freq, minidx+1, 0)
            # delete the corresponding split point
            cutoffs = np.delete(cutoffs, minidx+1, 0)
        else:
            # the minimum chi-square is not below the threshold any more, stop merging
            break
    return cutoffs

# map values to the resulting groups
def value2group(x, cutoffs):
    '''
    Convert a value to its group.
    x: the value to convert
    cutoffs: starting values of each group
    return: the group of x, e.g. 'group1'. Groups start from group1.
    '''
    # sort the split points in ascending order
    cutoffs = sorted(cutoffs)
    num_groups = len(cutoffs)
    # edge case: smaller than the start of the first group; put it in the first group.
    # outliers should ideally be handled before binning.
    if x < cutoffs[0]:
        return 'group1'
    for i in range(1, num_groups):
        if cutoffs[i-1] <= x < cutoffs[i]:
            return 'group{}'.format(i)
    # the last group, which may also contain some very large outliers
    return 'group{}'.format(num_groups)

# WOE encoding
def calWOE(df, var, target):
    '''
    Compute the WOE encoding.
    param df: pandas DataFrame
    param var: name of the already-binned column, without missing values
    param target: response variable (0, 1)
    return: encoding dictionary
    '''
    eps = 0.000001  # avoid division by zero
    gbi = pd.crosstab(df[var], df[target]) + eps
    gb = df[target].value_counts() + eps
    gbri = gbi / gb
    gbri['woe'] = np.log(gbri[1] / gbri[0])
    return gbri['woe'].to_dict()

# IV computation
def calIV(df, var, target):
    '''
    Compute the IV value.
    param df: pandas DataFrame
    param var: name of the already-binned column, without missing values
    param target: response variable (0, 1)
    return: IV value
    '''
    eps = 0.000001  # avoid division by zero
    gbi = pd.crosstab(df[var], df[target]) + eps
    gb = df[target].value_counts() + eps
    gbri = gbi / gb
    gbri['woe'] = np.log(gbri[1] / gbri[0])
    gbri['iv'] = (gbri[1] - gbri[0]) * gbri['woe']
    return gbri['iv'].sum()
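These binning helpers are defined but never exercised later in this post. A minimal usage sketch is shown below; it assumes the combined df from section 3.1 has been loaded, and the choice of the GRJCJS column, the rounding step, and max_groups=8 are assumptions made purely for illustration:

# Minimal usage sketch (column choice, rounding, and max_groups are illustrative assumptions).
col, target = 'GRJCJS', 'label'
train_part = df[:40000][[col, target]].copy()
train_part[col] = train_part[col].round(-2)  # coarsen the raw values so ChiMerge stays fast

cutoffs = chiMerge(train_part, col, target, max_groups=8)  # starting values of each bin
print('cutoffs:', cutoffs)

# map raw values to bins, then compute the WOE encoding and the IV of the binned column
train_part[col + '_bin'] = train_part[col].apply(value2group, args=(cutoffs,))
woe_map = calWOE(train_part, col + '_bin', target)
train_part[col + '_woe'] = train_part[col + '_bin'].map(woe_map)
print('IV of', col, ':', calIV(train_part, col + '_bin', target))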
2.11 Compute feature_importances_ with LGBMClassifier
# 筛选相关性>0的字段 def important_featrue(pre_train, pre_train_label): svc = LGBMClassifier( boosting_type='dart', #提升树的类型,常用的梯度提升方法包括gbdt、dart、goss、rf。 learning_rate=0.23, #0.05->0.918 0.07->0.924 0.08->0.926 n_estimators=150, #拟合的树的棵树,可以理解为训练的轮数。弱学习器的个数,其中gbdt原理是利用通过梯度不断拟合新的弱学习器,直到达到设定的弱学习器的数量。 max_depth=31, #最大树的深度。每个弱学习器也就是决策树的最大深度。其中,-1表示不限制。 num_leaves=1053, #树的最大叶子数,控制模型复杂性的最重要参数之一。对比在xgboost中,一般为2^(max_depth) subsample=0.2707, #训练样本采样率,行 colsample_bytree=0.95, #训练特征采样率,列 random_state=6, #随机种子数 min_data_in_leaf=124, # 可防止在叶子树中过度拟合,最佳值取决于训练样本和的数量num_leaves reg_alpha= 0.2462, reg_lambda=0.3140, # lambda_l1= 0.89, # 0.1 # lambda_l2=0.69, # 0.2 min_split_gain=0.22, min_child_weight=0.84, metric='auc',#模型度量标准,"rmse"、"auc"、'binary_logloss' n_jobs=12, #并行运行多线程核心数 verbose=-1 ) x_train = pre_train y_train = pre_train_label #fit svc.fit(x_train, y_train) feat_labels = x_train.columns[0:] fold_importance_df = pd.DataFrame() fold_importance_df["importance"] = svc.feature_importances_ fold_importance_df["featrue_name"] = feat_labels importances = fold_importance_df["importance"] useful_featrue=[] for i in tqdm(range(len(fold_importance_df))): if(fold_importance_df['importance'][i]!=0): # print(fold_importance_df['featrue_name'][i]) useful_featrue.append(fold_importance_df['featrue_name'][i]) # print(importance_0) useful_featrue = pd.DataFrame(useful_featrue, columns=['featrue_name']) useful_featrue.to_csv('D:/useful_featrue.csv',index=0) print(len(useful_featrue)) return
2.12 Find columns containing missing values
# check whether any column contains missing values
"""
Input:  df
Output: col_is_null, the columns that contain NaNs; missing, the missing rate of each such column
"""
def pankong(df):
    temp = []
    col_is_null = []
    missing = []  # initialised here so the return also works when there are no NaNs
    j = 0
    temp = df.isnull().any()
    # print(temp)  # whether each column contains NaNs
    colnull = pd.DataFrame(data={'colname': temp.index, 'isnulls': temp.values})
    for i in range(len(colnull['isnulls'])):
        if(colnull['isnulls'][i] == True):
            print(str(colnull['colname'][i]) + "---------" + str(colnull['isnulls'][i]))
            col_is_null.append(colnull['colname'][i])
            j = j + 1
    print("Total columns: " + str(len(colnull)) + "   columns with NaNs: " + str(j))
    if(j > 0):
        missing = df.isnull().sum() / len(df)
        missing = missing[missing > 0]
        missing.sort_values(inplace=True)
        plt.figure(figsize=(20, 8), dpi=80)
        missing.plot.bar()
    return col_is_null, missing
2.13 Select columns whose missing rate is greater than 0.1 and drop them
#select the columns whose missing rate exceeds the given threshold and drop them
def select_missing_rate(df, missing, rate=0.1):
    temp = []
    for i in range(len(missing)):
        if(missing.index[i] != 'label'):
            if(missing.values[i] > rate):
                temp.append(missing.index[i])
    print(temp)
    if('label' in temp):
        temp.remove('label')
    df = df.drop(temp, axis=1)
    return df
# df = select_missing_rate(df,missing,rate=0.1)
2.14 Fill or drop columns with missing values
# fill missing values or drop the affected columns
# (note: fill="mode" prints the column mode but, as in the original code, actually fills with the median)
def fill_kongzhi(df, fill="del"):
    # sub_label_cols = [col for col in df.columns if col not in ['id', 'label']]
    # df = df[sub_label_cols]
    temp = []
    exist_nan = []
    j = 0
    temp = df.isnull().any()
    # print(temp)  # whether each column contains NaNs
    colnull = pd.DataFrame(data={'colname': temp.index, 'isnulls': temp.values})
    for i in range(len(colnull['isnulls'])):
        if(colnull['isnulls'][i] == True):
            print(str(colnull['colname'][i]) + "---------" + str(colnull['isnulls'][i]))
            if(colnull['colname'][i] != 'label'):
                exist_nan.append(colnull['colname'][i])
            j = j + 1
    print("Total columns: " + str(len(colnull)) + "   columns with NaNs: " + str(j))
    if(fill == "mode"):
        print(len(exist_nan))
        for j in range(len(exist_nan)):
            print(str(exist_nan[j]) + " --- mode: " + str(df[exist_nan[j]].mode()))
            df[exist_nan[j]].fillna(df[exist_nan[j]].median(), inplace=True)  # fills with the median, not the mode
    if(fill == "del"):
        df = df.drop(exist_nan, axis=1)
    return df
3. Data preparation and simple feature engineering
3.1 Load the data
train_df = pd.read_csv('J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/train.csv')
test_df = pd.read_csv('J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/test.csv')
submit = pd.read_csv('J:\BaiduNetdiskDownload\公积金逾期预测-数据\公积金逾期预测-数据/submit.csv')
train_df.shape, test_df.shape, submit.shape
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()
train_df_label = train_df_copy['label'] #保存label
# train_df_copy=train_df_copy.drop(['label'],axis=1)
#concatenate the training and test sets
df = pd.concat([train_df_copy, test_df_copy], axis = 0).reset_index(drop = True)
# df = pd.concat((train_df_copy, test_df_copy), axis=0)
print(df.shape)
3.2 Split the raw variables into continuous, multi-class, and binary variables
train = train_df_copy
# categorical variables
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
# continuous variables
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
# train[num_cols]
# train[cate_cols]
# train['XUELI'].value_counts()
3.3 Inspect the value distributions of the raw variables
shuzhifenbu(cate_cols,25)
shuzhifenbu(cate_2_cols,15)
shuzhifenbu(num_cols)
#Fields whose value distributions clearly differ between the training and test sets: DKLL, ZHIWU, DWSSHY, HYZK (a quick numerical cross-check is sketched below)
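The observation above comes from eyeballing the scatter plots. As a purely illustrative cross-check that is not part of the original workflow, a two-sample Kolmogorov-Smirnov test per numeric column can quantify how different the train and test distributions are:

# Rough sketch: quantify train/test distribution differences with a two-sample KS test.
from scipy.stats import ks_2samp

train_part, test_part = df[:40000], df[40000:]
for col in cate_2_cols + cate_cols + num_cols:
    if not np.issubdtype(df[col].dtype, np.number):
        continue  # the KS test below only applies to numeric columns
    stat, p = ks_2samp(train_part[col].dropna(), test_part[col].dropna())
    if p < 0.01:  # arbitrary cut-off for "clearly different"
        print(f'{col}: KS statistic = {stat:.3f}, p-value = {p:.3g}')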
3.4 Inspect outliers
Outlier detection method 1: standard deviation (the 3-sigma rule)
In statistics, if a data distribution is approximately normal, about 68% of the values lie within one standard deviation of the mean, about 95% within two standard deviations, and about 99.7% within three.
# Outliers are not removed for now!
# def find_outliers_by_3segama(data, fea):
#     data_std = np.std(data[fea])
#     data_mean = np.mean(data[fea])
#     outliers_cut_off = data_std * 3
#     lower_rule = data_mean - outliers_cut_off
#     upper_rule = data_mean + outliers_cut_off
#     data[fea+'_outliers'] = data[fea].apply(lambda x: str('异常值') if x > upper_rule or x < lower_rule else '正常值')
#     return data
# for fea in num_cols:
#     data_train = find_outliers_by_3segama(train, fea)
#     print(train[fea+'_outliers'].value_counts())
#     print(train.groupby(fea+'_outliers')['label'].sum())
#     print('*'*10)
# # delete the outliers
# for fea in num_cols:
#     train = train[train[fea+'_outliers'] == '正常值']
#     train = train.reset_index(drop=True)
#     print(train)
# numerical_fea = list(df.select_dtypes(exclude=['object']).columns)
# category_fea = list(filter(lambda x: x not in numerical_fea,list(df.columns)))
# print(numerical_fea)
# print(category_fea)
This section only inspected the outliers; nothing was modified.
Outlier detection method 2: boxplot / IQR rule (not done here; a sketch follows)
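Although the boxplot route was not pursued, a minimal sketch of the usual 1.5*IQR rule is shown below. It only counts suspicious values per continuous column and does not modify the data; the helper name count_iqr_outliers and the 1.5 multiplier are illustrative assumptions:

# Sketch only: count potential outliers per continuous column with the 1.5*IQR boxplot rule.
def count_iqr_outliers(data, cols, k=1.5):
    for fea in cols:
        q1, q3 = data[fea].quantile(0.25), data[fea].quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - k * iqr, q3 + k * iqr
        n_out = ((data[fea] < lower) | (data[fea] > upper)).sum()
        print(f'{fea}: {n_out} values outside [{lower:.2f}, {upper:.2f}]')

count_iqr_outliers(train, num_cols)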
3.5 Date of birth (CSNY)
# Convert the date of birth into an age: 'CSY' will hold the birth month,
# CSNY will hold the binned age, and 'age' the age itself.
# time.gmtime(Unix timestamp) does the conversion (time is imported above).
def transform_csny_to_age(i):
    # print(i)
    if(len(str(i)) > 10):
        i = i / 1000  # millisecond timestamps -> seconds
    a = time.gmtime(int(i))
    # print("year:" + str(a[0]) + " " + "month:" + str(a[1]))
    age = 2020 - a[0]
    # print(age)
    return age

def transform_csny_to_month(i):
    if(len(str(i)) > 10):
        i = i / 1000
    a = time.gmtime(int(i))
    # print("year:" + str(a[0]) + " " + "month:" + str(a[1]))
    month = a[1]
    return month

# birth month
df['CSY'] = df['CSNY']
df['CSY'] = df['CSY'].transform(transform_csny_to_month)
# age
# df['CSN'] = df['CSNY']
df['CSNY'] = df['CSNY'].transform(transform_csny_to_age)
sns.distplot(df['CSY'][df['CSY'] > 0])
print(df['CSY'].value_counts())
def get_age(df, col='age'):
    df[col + "_genFeat1"] = (df['age'] > 23).astype(int)
    df[col + "_genFeat2"] = (df['age'] > 28).astype(int)
    df[col + "_genFeat3"] = (df['age'] > 32).astype(int)
    df[col + "_genFeat4"] = (df['age'] > 36).astype(int)
    df[col + "_genFeat5"] = (df['age'] > 43).astype(int)
    df[col + "_genFeat6"] = (df['age'] > 50).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 7)]

df['age'] = df['CSNY']
df, genFeats1 = get_age(df, col='age')
sns.distplot(df['age'][df['age'] > 0])

# bin the age (the original comment mentions grouping by age and gender into 6 groups,
# but the code below only bins the age into 7 groups)
# print(set(train_test_data_copy["年龄"]))
def transform_age(x_age):
    # print(x_age)
    if x_age < 23:
        return 1
    elif 23 <= x_age < 28:
        return 2
    elif 28 <= x_age < 32:
        return 3
    elif 32 <= x_age < 36:
        return 4
    elif 36 <= x_age < 43:
        return 5
    elif 43 <= x_age < 50:
        return 6
    elif x_age >= 50:
        return 7

df['CSNY'] = df['CSNY'].transform(transform_age)
print(df['CSNY'].value_counts())
3.6 Loan balance (DKYE) and loan issuance amount (DKFFE)
def get_daikuanYE(df,col): df[col + '_genFeat1'] = (df[col] > 100000).astype(int) df[col + '_genFeat2'] = (df[col] > 120000).astype(int) df[col + '_genFeat3'] = (df[col] > 140000).astype(int) df[col + '_genFeat4'] = (df[col] > 180000).astype(int) df[col + '_genFeat5'] = (df[col] > 220000).astype(int) df[col + '_genFeat6'] = (df[col] > 260000).astype(int) df[col + '_genFeat7'] = (df[col] > 300000).astype(int) return df, [col + f'_genFeat{i}' for i in range(1, 8)] df, genFeats2 = get_daikuanYE(df, col = 'DKYE') def get_daikuanFFE(df,col): df[col + '_genFeat1'] = (df[col] > 100000).astype(int) df[col + '_genFeat2'] = (df[col] > 120000).astype(int) df[col + '_genFeat3'] = (df[col] > 140000).astype(int) df[col + '_genFeat4'] = (df[col] > 180000).astype(int) df[col + '_genFeat5'] = (df[col] > 220000).astype(int) df[col + '_genFeat6'] = (df[col] > 260000).astype(int) df[col + '_genFeat7'] = (df[col] > 300000).astype(int) return df, [col + f'_genFeat{i}' for i in range(1, 8)] df, genFeats3 = get_daikuanFFE(df, col = 'DKFFE') plt.figure(figsize = (8, 2)) plt.subplot(1,2,1) sns.distplot(df['DKYE'][df['label'] == 1]) plt.subplot(1,2,2) sns.distplot(df['DKFFE'][df['label'] == 1])
# 小额贷款(MicroCredit)是以个人或家庭为核心的经营类贷款, # 其主要的服务对象为广大工商个体户、小作坊、小业主。 # 贷款的金额一般为20万元以下,1000元以上。 def transform_dkye(dkye): if 0<=dkye<1000: return 1 elif 1000<=dkye<50000: return 2 elif 50000<=dkye<100000: return 3 elif 100000<=dkye<150000: return 4 elif 150000<=dkye<200000: return 5 elif 200000<=dkye<250000: return 6 elif 250000<=dkye<300000: return 7 elif dkye>=300000: return 8 df['DKYE_class'] = df['DKYE'] df['DKYE_class'] = df['DKYE_class'].transform(transform_dkye) def transform_dkffe(dkye): if 0<=dkye<1000: return 1 elif 1000<=dkye<50000: return 2 elif 50000<=dkye<100000: return 3 elif 100000<=dkye<150000: return 4 elif 150000<=dkye<200000: return 5 elif 200000<=dkye<250000: return 6 elif 250000<=dkye<300000: return 7 elif dkye>=300000: return 8 df['DKFFE_class'] = df['DKFFE'] df['DKFFE_class'] = df['DKFFE_class'].transform(transform_dkffe) print(df['DKYE_class'].value_counts(), df['DKFFE_class'].value_counts())
3.7 Personal monthly contribution (GRYJCE)
def get_GRYJCE(df,col): df[col + '_genFeat1'] = (df[col] > 400).astype(int) df[col + '_genFeat2'] = (df[col] > 600).astype(int) df[col + '_genFeat3'] = (df[col] > 800).astype(int) df[col + '_genFeat4'] = (df[col] > 1000).astype(int) df[col + '_genFeat5'] = (df[col] > 1200).astype(int) df[col + '_genFeat6'] = (df[col] > 1400).astype(int) df[col + '_genFeat7'] = (df[col] > 1600).astype(int) return df, [col + f'_genFeat{i}' for i in range(1, 8)] df, genFeats4 = get_GRYJCE(df, col = 'GRYJCE') plt.figure(figsize = (8, 2)) plt.subplot(1,2,1) sns.distplot(df['GRYJCE'][df['label'] == 1]) def transform_GRYJCE(dkye): if dkye<=400: return 1 elif 400<dkye<=600: return 2 elif 600<dkye<=800: return 3 elif 800<dkye<=1000: return 4 elif 1000<dkye<=1200: return 5 elif 1200<dkye<=1400: return 6 elif 1400<dkye<=1600: return 7 elif dkye>1600: return 8 df['GRYJCE_class'] = df['GRYJCE'] df['GRYJCE_class'] = df['GRYJCE_class'].transform(transform_GRYJCE) print(df['GRYJCE_class'].value_counts())
3.8 Personal contribution base (GRJCJS)
def get_GRYJCE(df,col): df[col + '_genFeat1'] = (df[col] > 2000).astype(int) df[col + '_genFeat2'] = (df[col] > 4000).astype(int) df[col + '_genFeat3'] = (df[col] > 6000).astype(int) df[col + '_genFeat4'] = (df[col] > 8000).astype(int) df[col + '_genFeat5'] = (df[col] > 1200).astype(int) return df, [col + f'_genFeat{i}' for i in range(1, 6)] df, genFeats5 = get_GRYJCE(df, col = 'GRJCJS') plt.figure(figsize = (8, 2)) plt.subplot(1,2,1) sns.distplot(df['GRJCJS'][df['label'] == 1]) def transform_GRJCJS(dkye): # print(x_age) if 0<=dkye<2000: return 1 elif 2000<=dkye<4000: return 2 elif 4000<=dkye<6000: return 3 elif 6000<=dkye<8000: return 4 elif 8000<=dkye<12000: return 5 elif dkye>=12000: return 6 df['GRJCJS_class'] = df['GRJCJS'] df['GRJCJS_class'] = df['GRJCJS_class'].transform(transform_GRJCJS) print(df['GRJCJS_class'].value_counts())
3.9 Personal account balance (GRZHYE) and previous-year carry-over balance (GRZHSNJZYE)
def get_GRZHYE(df,col): df[col + '_genFeat1'] = (df[col] > 2000).astype(int) df[col + '_genFeat2'] = (df[col] > 4000).astype(int) df[col + '_genFeat3'] = (df[col] > 8000).astype(int) df[col + '_genFeat4'] = (df[col] > 12000).astype(int) df[col + '_genFeat5'] = (df[col] > 20000).astype(int) return df, [col + f'_genFeat{i}' for i in range(1, 6)] df, genFeats6 = get_GRZHYE(df, col = 'GRZHYE') df, genFeats7 = get_GRZHYE(df, col = 'GRZHSNJZYE') plt.figure(figsize = (8, 2)) plt.subplot(1,2,1) sns.distplot(df['GRZHYE'][df['label'] == 1]) plt.subplot(1,2,2) sns.distplot(df['GRZHSNJZYE'][df['label'] == 1]) def transform_GRZHYE(dkye): # print(x_age) if 0<=dkye<2000: return 1 elif 2000<=dkye<4000: return 2 elif 4000<=dkye<8000: return 3 elif 8000<=dkye<12000: return 4 elif 12000<=dkye<20000: return 5 elif dkye>=20000: return 6 df['GRZHYE_class'] = df['GRZHYE'] df['GRZHYE_class'] = df['GRZHYE_class'].transform(transform_GRZHYE) df['GRZHSNJZYE_class'] = df['GRZHSNJZYE'] df['GRZHSNJZYE_class'] = df['GRZHSNJZYE_class'].transform(transform_GRZHYE) print(df['GRZHYE_class'].value_counts(), df['GRZHSNJZYE_class'].value_counts())
3.10 Remove the perturbation in the loan interest rate (DKLL)
# remove the perturbation in DKLL
dkll = test_df_copy['DKLL'].value_counts()
dkll_value = pd.DataFrame(data={'colname': dkll.index, 'value': dkll.values})
dkll_value[:6]
temp_dkll_value = dkll_value[:6]['colname']
print(temp_dkll_value)

# rows of df whose DKLL equals one of the six most frequent interest rates form the training set
dkll_index = []
for i in tqdm(range(len(df))):
    for j in range(len(temp_dkll_value)):
        if (df['DKLL'][i] == temp_dkll_value[j]):
            dkll_index.append(i)
print(len(dkll_index))
# print(dkll_index)

# rows with index in 40000-54999 whose DKLL is not one of the six common rates form the test set,
# whose true rate will be predicted
test_all_index = list(range(40000, 55000))
test_index = [i for i in test_all_index if i not in dkll_index]
# print(test_index)
print(len(test_index))

# DKLL training set
tarin_df_dkll = []
tarin_df_dkll = df.loc[dkll_index]
print(tarin_df_dkll)
# DKLL test set
test_df_dkll = []
test_df_dkll = df.loc[test_index]
print(test_df_dkll)
pankong(tarin_df_dkll)
dkll_cols = [col for col in tarin_df_dkll.columns if col not in ['DKLL','label','id']] X = tarin_df_dkll[dkll_cols] Y = pd.get_dummies(tarin_df_dkll['DKLL']) print(Y) #决策树 # from sklearn import tree # clf = tree.DecisionTreeClassifier(criterion='entropy') # 随机森林 # from sklearn.ensemble import RandomForestClassifier # clf = RandomForestClassifier(n_estimators=200) # # 导入KNN 分类器 from sklearn.neighbors import KNeighborsClassifier clf = KNeighborsClassifier() clf.fit(X,Y) test_df_dkll = test_df_dkll[dkll_cols] res = clf.predict(test_df_dkll) print(len(res)) res_temp = [] for i in range(len(res)): # print(res[i]) if(res[i][0]==1): res_temp.append(2.292) elif(res[i][1]==1): res_temp.append(2.521) elif(res[i][2]==1): res_temp.append(2.708) elif(res[i][3]==1): res_temp.append(2.979) elif(res[i][4]==1): res_temp.append(3.250) elif(res[i][5]==1): res_temp.append(3.575) else: res_temp.append(2.708) # res_temp.append(1.111111111111111111) print(len(res_temp)) print(res_temp) test_df_dkll['DKLL'] = res_temp for i in (test_index): df.at[i,'DKLL'] = test_df_dkll['DKLL'][i] #使用at来改变df # df['DKLL'] plt.figure(figsize=(15,5)) print("DKLL的数值分布") plt.title('DKLL') x = df['DKLL'] y = df.index plt.scatter(x, y , s=1) plt.show() df['DKLL_CLASS']=df['DKLL']
4. Feature engineering
# The newly generated features could also be grouped into the three classes below and used to generate even more new features
# categorical variables
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
# continuous variables
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
old_fea=[]
old_fea = [col for col in df.columns if col not in ['id', 'label']]
print(old_fea)
4.1 Construct business features
# 数据脱敏bai处理就是对敏感数据du进行变形处zhi理,其目的是保护隐私dao数据zhuan等信息的安全, # 例如机构和企业收集的个人身份信息、手机号码、银行卡信息等敏感数据。 #个人月缴存额,单位月缴存额---新建相关字段 df['YEAR_GRYJCE'] = df['GRYJCE']*12 #一年的总个人缴存额 df['MONTH_GRYJCE_DWYJCE'] = df['GRYJCE'] + df['DWYJCE'] #一个月的总缴存额 df['YEAR_GRYJCE_DWYJCE'] = (df['GRYJCE'] + df['DWYJCE'])*12 #一年的总缴存额 #贷款余额,贷款发放额---新建相关字段 df['DKYE_TO_DKFFE'] = df['DKYE'] / df['DKFFE'] #已还本金占比 df['DKFFE_SUB_DKYE'] = df['DKFFE'] - df['DKYE'] #贷款未还本金 df['DKFFE_SUB_DKYE_TO_DKFFE'] = (df['DKFFE'] - df['DKYE'])/ df['DKFFE'] #未还本金占比 df['WEIHUAN_TO_YIHUAN'] = df['DKFFE_SUB_DKYE']/df['DKYE'] #未还比已还 # df['YIHUAN_TO_WEIHUAN'] = df['DKYE']/df['DKFFE_SUB_DKYE'] #已还比未还 df['REAL_DKLL'] = df['DKLL']/100 df['DKFFE_SUB_DKYE_DKLL'] = (df['DKFFE'] - df['DKYE'])*df['REAL_DKLL'] #贷款未还本金*利率=未还利息 df['DKFFE_SUB_DKYE_1_DKLL'] = (df['DKFFE'] - df['DKYE'])*(1+df['REAL_DKLL']) #贷款未还本金*利率=未还本息和 df['DKYE_DKLL'] = df['DKYE']*df['REAL_DKLL'] #贷款已还本金*利率=已还利息 df['DKYE_1_DKLL'] = df['DKYE']*(1+df['REAL_DKLL']) #贷款已还本金*1+利率=已还本息和 df['DKFFE_DKLL'] = df['DKFFE']*df['REAL_DKLL'] #贷款总利息 df['DKFFE_1_DKLL'] = df['DKFFE']*(1+df['REAL_DKLL']) #贷款总本息和 df['DKFFE_SUB_DKYE_TO_DKFFE_1_DKLL'] = df['DKFFE_SUB_DKYE_1_DKLL'] / df['DKFFE_1_DKLL'] #未还本息和/贷款总本息和 df['DKYE_TO_DKFFE_1_DKLL'] = df['DKYE_1_DKLL']/ df['DKFFE_1_DKLL'] #已还本息和/贷款总本息和 df['DKFFE_SUB_DKYE_DKLL_TO_DKFFE_DKLL'] = df['DKFFE_SUB_DKYE_DKLL']/ df['DKFFE_DKLL'] #未还利息/贷款总利息 df['DKYE_DKLL_TO_DKFFE_DKLL'] = df['DKYE_DKLL']/ df['DKFFE_DKLL'] #已还利息/贷款总利息 # 个人账户当年归集余额 = 汇缴+补缴+结息+转入-提取额 #个人账户当年归结余额,个人账户上年转结余额,个人账户余额---新建相关字段 df['GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE'] = df['GRZHDNGJYE'] - df['YEAR_GRYJCE_DWYJCE'] #个人账户当年归结余额 - 一年的总缴存额 df['GRZHDNGJYE_SUB_YEAR_GRYJCE'] = df['GRZHDNGJYE'] - df['YEAR_GRYJCE'] #个人账户当年归结余额 - 一年的总个人缴存额 df['GRZHDNGJYE_SUB_GRZHSNJZYE'] = df['GRZHDNGJYE'] + df['GRZHSNJZYE'] #账户余额(暂当做未脱敏的数据) df['JIEXI'] = (df['GRYJCE'] + df['DWYJCE'])*12*0.015 #一年的结息额 (结息按1.5%) df['BUJIAO_ZHUANRU_SUB_TIQVE']=df['GRZHDNGJYE']-df['YEAR_GRYJCE_DWYJCE']-df['JIEXI']#补缴+转入-提取额=个人账户当年归集余额-汇缴-结息 df['GRYJCE_TO_GRZHYE'] = df['GRYJCE']/df['GRZHYE'] #个人月缴存额/个人账户余额 df['YEAR_GRYJCE_TO_GRZHYE'] = df['YEAR_GRYJCE']/df['GRZHYE'] #一年的总个人缴存额/个人账户余额 df['MONTH_GRYJCE_DWYJCE_TO_GRZHYE'] = df['MONTH_GRYJCE_DWYJCE']/df['GRZHYE'] #一个月的总缴存额/个人账户余额 df['GRZHDNGJYE_TO_GRZHYE'] = df['GRZHDNGJYE']/df['GRZHYE'] #个人账户当年归结余额/个人账户余额 df['GRZHSNJZYE_TO_GRZHYE'] = df['GRZHSNJZYE']/df['GRZHYE'] #个人账户上年转结余额/个人账户余额 df['BUJIAO_ZHUANRU_SUB_TIQVE'] = df['BUJIAO_ZHUANRU_SUB_TIQVE']/df['GRZHYE'] #(补缴+转入-提取额)/个人账户余额 df['JIEXI_TO_YEAR_GRYJCE_DWYJCE'] = df['JIEXI']/df['YEAR_GRYJCE_DWYJCE'] #一年的结息额/一年的总缴存额 df['JIEXI_TO_GRZHDNGJYE'] = df['JIEXI']/df['GRZHDNGJYE'] #一年的结息额/个人账户当年归结余额 # 个人缴款基数---新建相关字段 df['GJJJKBL'] = df['GRYJCE'] / df['GRJCJS'] #公积金缴款比例 # df['GRJCJS_TO_DKFFE_SUB_DKYE'] = df['GRJCJS']/df['DKFFE_SUB_DKYE'] #个人缴款基数/贷款未还本金 df['GRJCJS_TO_DKYE'] = df['GRJCJS']/df['DKYE'] #个人缴款基数/已还本金 df['GRJCJS_TO_DKFFE'] = df['GRJCJS']/df['DKFFE'] #个人缴款基数/贷款发放额 df['GRJCJS_TO_GRZHDNGJYE'] = df['GRJCJS']/df['GRZHDNGJYE'] #个人缴款基数/个人账户当年归结余额 df['GRJCJS_TO_GRZHSNJZYE'] = df['GRJCJS']/df['GRZHSNJZYE'] #个人缴款基数/个人账户上年转结余额 df['GRJCJS_TO_GRZHYE'] = df['GRJCJS']/df['GRZHYE'] #个人缴款基数/个人账户余额 # 暂不清楚是否是噪声的字段 df['DKYE_DIV_GRYJCE_ADD_DWYJCE'] = df['DKYE'] / ((df['GRYJCE'] + df['DWYJCE'])*12) df['GRYJCE_ADD_DWYJCE_TO_DKYE'] = (df['GRYJCE'] + df['DWYJCE']) / df['DKYE'] df['GRZHYE_diff_GRZHDNGJYE'] = df['GRZHYE'] - df['GRZHDNGJYE'] df['GRZHYE_diff_GRZHSNJZYE'] = df['GRZHYE'] - df['GRZHSNJZYE'] # 'YIHUAN_TO_WEIHUAN','GRJCJS_TO_DKFFE_SUB_DKYE' gen_feats = ['YEAR_GRYJCE', 
'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE', 'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN', 'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL', 'DKFFE_1_DKLL','DKFFE_SUB_DKYE_TO_DKFFE_1_DKLL','DKYE_TO_DKFFE_1_DKLL', 'DKFFE_SUB_DKYE_DKLL_TO_DKFFE_DKLL','DKYE_DKLL_TO_DKFFE_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE', 'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE', 'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE', 'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_YEAR_GRYJCE_DWYJCE','JIEXI_TO_GRZHDNGJYE', 'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE', 'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE', 'GRZHYE_diff_GRZHSNJZYE'] #若有两个字段是一样的要只保留一个 #对于有正负数的字段要新建表示正负的字段
#round float columns to 4 decimal places
for i in range(len(df.columns)):
    # print(df.columns[i])
    # print(df[df.columns[i]].dtype)
    if(df.columns[i] != 'label'):
        if(df[df.columns[i]].dtype == 'float64'):
            df[df.columns[i]] = df[df.columns[i]].apply(lambda x: round(x, 4))
print(df)
_,missing = pankong(df)
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)
no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)
# save the data processed so far to local disk
df.to_csv('D:/df_little_change.csv',index = False)
df = pd.read_csv('D:/df_little_change.csv')
print(df.shape)
print(df)
4.2 Categorical features: count, count ratio, one-hot encoding, etc.
for f in tqdm(cate_cols):
    # map the categorical values to 0, 1, 2, ... (map applies the dictionary to every element of the Series)
    df[f] = df[f].map(dict(zip(df[f].unique(), range(df[f].nunique()))))
    # df[f + '_count'] holds how often each value of the column appears
    df[f + '_count'] = df[f].map(df[f].value_counts())
    # one-hot encode the column with get_dummies
    df = pd.concat([df, pd.get_dummies(df[f], prefix=f"{f}")], axis=1)

# combine pairs of categorical columns
cate_cols_combine = [[cate_cols[i], cate_cols[j]] for i in range(len(cate_cols)) \
                     for j in range(i + 1, len(cate_cols))]
for f1, f2 in tqdm(cate_cols_combine):
    # number of rows sharing each (f1, f2) value combination
    df['{}_{}_count'.format(f1, f2)] = df.groupby([f1, f2])['id'].transform('count')
    df['{}_in_{}_prop'.format(f1, f2)] = df['{}_{}_count'.format(f1, f2)] / df[f2 + '_count']
    df['{}_in_{}_prop'.format(f2, f1)] = df['{}_{}_count'.format(f1, f2)] / df[f1 + '_count']
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)
no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)
df=if_field_is_same(df)
print(df.shape)
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)
4.3 Features derived from single discrete features
# Create features based on anonymised prefix groups
prefix = cate_2_cols
for i, p in enumerate(prefix):
    print(i, p)
    # column_set holds the columns whose names start with 'XINGBIE', 'ZHIWU', 'XUELI'
    column_set = [x for x in df.columns.tolist() if x.startswith(prefix[i])]

    # NA count
    df[p + "_group_nan_sum"] = df[column_set].isnull().sum(axis=1) / df[column_set].shape[1]

    # sum / mean if numeric
    numeric_cols = [x for x in column_set if df[x].dtype != object]
    if numeric_cols:
        df[p + "_group_sum"] = df[column_set].sum(axis=1)
        df[p + "_group_mean"] = df[column_set].mean(axis=1)
    # zero count
    df[p + "_group_0_count"] = (df[column_set] == 0).astype(int).sum(axis=1) / (
        df[column_set].shape[1] - df[p + "_group_nan_sum"])
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)
no_cate_1_cols = [col for col in df.columns if col not in cate_1_cols]
df=df[no_cate_1_cols]
print(df.shape)
df=if_field_is_same(df)
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df)
#relation_cate_2_cols stores the selected binary fields with relatively large correlation
relation_cate_2_cols = relation(df[cate_2_cols+['label']], poly_num=0.05)
# print(relation_cate_2_cols)
Check which fields have almost constant values and therefore contribute little
weak_filed = find_weak_filed(df[cate_cols+cate_2_cols])
print(len(weak_filed))
print(weak_filed)
Feature selection
rfecv_cate_2_col=clf_rfecv(df,cate_2_cols)
Take the union of relation_cate_2_cols and rfecv_cate_2_col --> select_cate_2_col
# union of relation_cate_2_cols and rfecv_cate_2_col
select_cate_2_col=list(set(relation_cate_2_cols).union(set(rfecv_cate_2_col)))
print(len(select_cate_2_col))
print(select_cate_2_col)
# save the binary-type data locally
df[select_cate_2_col].to_csv('D:/rizhao_select_cate_2_col.csv',index = False)
_,missing = pankong(df)
df = select_missing_rate(df,missing,rate=0.001)
4.4 Keep only the multi-class and continuous data in df
cate_1_cols,cate_2_cols,cate_cols,num_cols1 = find_filed_class(df,)
# save the multi-class and continuous data locally
df[cate_cols+num_cols1+['label']].to_csv('D:/rizhao_cate_cols_num_cols1.csv',index = False)
cate_cols_num_cols1_df = pd.read_csv('D:/rizhao_cate_cols_num_cols1.csv')
print(cate_cols_num_cols1_df.shape)
print(cate_cols_num_cols1_df)
Filter the multi-class fields
cate_1_cols, cate_2_cols, cate_cols, num_cols1 = find_filed_class(cate_cols_num_cols1_df, 20)
df = cate_cols_num_cols1_df

# correlation coefficients
relation_cate_cols = relation(df[cate_cols + ['label']], poly_num=0.05)
rfecv_cate_cols = clf_rfecv(df, cate_cols)

# union of relation_cate_cols and rfecv_cate_cols
select_cate_col = []
select_cate_col = list(set(relation_cate_cols).union(set(rfecv_cate_cols)))
select_cate_col = list(set(select_cate_col).union(set(['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT'])))
print(len(select_cate_col))
print(select_cate_col)

# save the multi-class data locally
df[select_cate_col].to_csv('D:/rizhao_select_cate_col.csv', index=False)
# save the data from the previous step locally
df[select_cate_col + num_cols1 + ['label']].to_csv('D:/rizhao_select_cate_col_num_cols1.csv', index=False)
4.5 Cross categorical features with numeric features
select_cate_col_num_cols1 = pd.read_csv('D:/rizhao_cate_cols_num_cols1.csv')
print(select_cate_col_num_cols1.shape)
print(select_cate_col_num_cols1)
df_select_cate_col = pd.read_csv('D:/rizhao_select_cate_col.csv')
select_cate_col = df_select_cate_col.columns
print(len(select_cate_col))
_,_,cate_cols,num_cols1 = find_filed_class(select_cate_col_num_cols1,20)
num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
select_cate_col_num_cols1[select_cate_col]
First round of multi-class x numeric crosses
select_cate_col_num_cols1['label'] = df['label']
relation_cate_cols = []
rfecv_cate_col = []
i = 0
for f1 in tqdm(select_cate_col):
    temp_cate_cols = []
    g = select_cate_col_num_cols1.groupby(f1)
    # print(g)
    for f2 in num_gen_feats:
        for stat in ['sum', 'mean', 'std']:
            # the 'sum', 'mean', 'std' of f2 within each category of f1
            select_cate_col_num_cols1['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_cate_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # correlation coefficients
    relation_cate_cols.extend(relation(select_cate_col_num_cols1[temp_cate_cols + ['label']], poly_num=0.1))
print(len(relation_cate_cols))
Second round of multi-class x numeric crosses
for f1 in tqdm(select_cate_col):
    temp_cate_cols = []
    g = select_cate_col_num_cols1.groupby(f1)
    # print(g)
    for f2 in num_gen_feats:
        for stat in ['max', 'min', 'var', 'count']:
            # the 'max', 'min', 'var', 'count' of f2 within each category of f1
            select_cate_col_num_cols1['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_cate_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # correlation coefficients
    relation_cate_cols.extend(relation(select_cate_col_num_cols1[temp_cate_cols + ['label']], poly_num=0.1))
print(len(relation_cate_cols))
select_cate_col_num_cols1[relation_cate_cols]
# save the data from the previous step locally
select_cate_col_num_cols1[relation_cate_cols+['label']].to_csv('D:/df_relation_cate_cols.csv',index = False)
df_relation_cate_cols = pd.read_csv('D:/df_relation_cate_cols.csv')
print(df_relation_cate_cols.shape)
print(df_relation_cate_cols)
_,missing = pankong(df_relation_cate_cols)
df_relation_cate_cols = select_missing_rate(df_relation_cate_cols,missing,rate=0.001)
df_relation_cate_cols = fill_kongzhi(df_relation_cate_cols,fill="mode")
_,missing = pankong(df_relation_cate_cols)
cate_1_cols,_,_,_ = find_filed_class(df_relation_cate_cols,20)
no_cate_1_cols = [col for col in df_relation_cate_cols.columns if col not in cate_1_cols]
df_relation_cate_cols=df_relation_cate_cols[no_cate_1_cols]
print(df_relation_cate_cols.shape)
gbdt_select_temp_df2 = []
for i in tqdm(range(int(len(df_relation_cate_cols.columns) / 400 + 1))):
    temp_col = []
    temp_col.extend(df_relation_cate_cols.columns[i * 400:i * 400 + 400])
    # print(temp_col)
    # print(len(temp_col))
    # print(i)
    if(i < int(len(df_relation_cate_cols.columns) / 400)):
        gbdt_select_temp_df2.extend(GBDTselectfea(df_relation_cate_cols[temp_col + ['label']], max_num=150))
    elif(i == int(len(df_relation_cate_cols.columns) / 400)):
        gbdt_select_temp_df2.extend(GBDTselectfea(df_relation_cate_cols[temp_col], max_num=150))
    print(len(gbdt_select_temp_df2))
print(len(gbdt_select_temp_df2))
print(gbdt_select_temp_df2)
gbdt_select_cate_num_mix = df_relation_cate_cols[gbdt_select_temp_df2+['label']]
gbdt_select_cate_num_mix=if_field_is_same(gbdt_select_cate_num_mix)
print(gbdt_select_cate_num_mix.shape)
# save the data from the previous step locally
gbdt_select_cate_num_mix.to_csv('D:/gbdt_select_temp_df2.csv',index = False)
gbdt_select_cate_num_mix= pd.read_csv('D:/gbdt_select_temp_df2.csv')
print(gbdt_select_cate_num_mix.shape)
print(gbdt_select_cate_num_mix)
4.6 Cross numeric features with numeric features
num_gen_feats = ['YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'DKYE_TO_DKFFE', 'DKFFE_SUB_DKYE', 'DKFFE_SUB_DKYE_TO_DKFFE', 'WEIHUAN_TO_YIHUAN',
'REAL_DKLL','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL','DKYE_DKLL','DKYE_1_DKLL','DKFFE_DKLL',
'DKFFE_1_DKLL','GRZHDNGJYE_SUB_YEAR_GRYJCE_DWYJCE',
'GRZHDNGJYE_SUB_YEAR_GRYJCE','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','BUJIAO_ZHUANRU_SUB_TIQVE',
'GRYJCE_TO_GRZHYE','YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','JIEXI_TO_GRZHDNGJYE',
'GJJJKBL','GRJCJS_TO_DKYE','GRJCJS_TO_DKFFE','GRJCJS_TO_GRZHDNGJYE','GRJCJS_TO_GRZHSNJZYE',
'GRJCJS_TO_GRZHYE','DKYE_DIV_GRYJCE_ADD_DWYJCE','GRYJCE_ADD_DWYJCE_TO_DKYE','GRZHYE_diff_GRZHDNGJYE',
'GRZHYE_diff_GRZHSNJZYE','GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
df[num_gen_feats]
# NOTE: most of the NaNs that appear later come from this step
relation_num_cols = []
i=0
for f1 in tqdm(num_gen_feats):
    temp_num_cols = []
    g = df.groupby(f1)
    # print(g)
    for f2 in num_gen_feats:
        for stat in ['sum', 'mean', 'std']:
            # the 'sum', 'mean', 'std' of f2 within each group of f1
            df['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
            temp_num_cols.append('{}_{}_{}'.format(f1, f2, stat))
    # correlation coefficients
    relation_num_cols.extend(relation(df[temp_num_cols + ['label']], poly_num=0.05))
print(relation_num_cols)
print(len(relation_num_cols))
# save the data from the previous step locally
df[relation_num_cols + ['label']].to_csv('D:/df_relation_num_cols.csv', index=False)
df_relation_num_cols = pd.read_csv('D:/df_relation_num_cols.csv')
print(df_relation_num_cols.shape)
print(df_relation_num_cols)
_, missing = pankong(df_relation_num_cols)
df_relation_num_cols = select_missing_rate(df_relation_num_cols, missing, rate=0.001)
df_relation_num_cols = fill_kongzhi(df_relation_num_cols, fill="mode")
_, missing = pankong(df_relation_num_cols)
cate_1_cols, _, _, _ = find_filed_class(df_relation_num_cols, 20)
no_cate_1_cols = [col for col in df_relation_num_cols.columns if col not in cate_1_cols]
df_relation_num_cols = df_relation_num_cols[no_cate_1_cols]
print(df_relation_num_cols.shape)
gbdt_select_temp_df3 = []
for i in tqdm(range(int(len(df_relation_num_cols.columns) / 400 + 1))):
    temp_col = []
    temp_col.extend(df_relation_num_cols.columns[i * 400:i * 400 + 400])
    # print(temp_col)
    # print(len(temp_col))
    # print(i)
    if(i < int(len(df_relation_num_cols.columns) / 400)):
        gbdt_select_temp_df3.extend(GBDTselectfea(df_relation_num_cols[temp_col + ['label']], max_num=150))
    elif(i == int(len(df_relation_num_cols.columns) / 400)):
        gbdt_select_temp_df3.extend(GBDTselectfea(df_relation_num_cols[temp_col], max_num=150))
    print(len(gbdt_select_temp_df3))
print(len(gbdt_select_temp_df3))
print(gbdt_select_temp_df3)
gbdt_select_num_num_mix = df_relation_num_cols[gbdt_select_temp_df3+['label']]
gbdt_select_num_num_mix=if_field_is_same(gbdt_select_num_num_mix)
print(gbdt_select_num_num_mix.shape)
# save the data from the previous step locally
gbdt_select_num_num_mix.to_csv('D:/gbdt_select_num_num_mix.csv',index = False)
gbdt_select_num_num_mix= pd.read_csv('D:/gbdt_select_num_num_mix.csv')
print(gbdt_select_num_num_mix.shape)
print(gbdt_select_num_num_mix)
4.7 Polynomial features
num_gen_feats = ['MONTH_GRYJCE_DWYJCE','DKFFE_SUB_DKYE','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL',
'DKYE_DKLL','DKFFE_DKLL','DKFFE_1_DKLL','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','GRYJCE_TO_GRZHYE',
'YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE',
'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','GRZHYE_diff_GRZHDNGJYE','REAL_DKLL',
'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL']
df[num_gen_feats]
def polynomial_features111(df, poly_num=0.15 ,change=0,degreenum=2): """ poly_num:相关性 change:0原始字段不变,1输出新增字段 degreenum:阶数 """ num_gen_feats = ['MONTH_GRYJCE_DWYJCE','DKFFE_SUB_DKYE','DKFFE_SUB_DKYE_DKLL','DKFFE_SUB_DKYE_1_DKLL', 'DKYE_DKLL','DKFFE_DKLL','DKFFE_1_DKLL','GRZHDNGJYE_SUB_GRZHSNJZYE','JIEXI','GRYJCE_TO_GRZHYE', 'YEAR_GRYJCE_TO_GRZHYE','MONTH_GRYJCE_DWYJCE_TO_GRZHYE','GRZHDNGJYE_TO_GRZHYE', 'GRZHSNJZYE_TO_GRZHYE','BUJIAO_ZHUANRU_SUB_TIQVE','GRZHYE_diff_GRZHDNGJYE','REAL_DKLL', 'GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE','DKFFE', 'DKYE', 'DKLL'] poly_features = df[:40000][num_gen_feats] # Create the polynomial object with specified degree poly_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False) poly_transformer.fit(poly_features) poly_features = poly_transformer.transform(poly_features) # 新特征是否与target有相关性。 poly_features = pd.DataFrame(poly_features , columns = poly_transformer.get_feature_names(input_features = num_gen_feats) ) poly_features['TARGET'] =df[:40000]['label'] poly_corrs = poly_features.corr()['TARGET'].sort_values() po_temp = [] for i in range(len(poly_corrs)): if(abs(poly_corrs[i])>poly_num and poly_corrs.index[i] not in num_gen_feats + ['TARGET']): po_temp.append(poly_corrs.index[i]) print('相关性>'+str(poly_num)+'的字段为:\n'+str(po_temp)+'\n共有'+str(len(po_temp))+'个') if(change == 1): dfpo = df[num_gen_feats] dfpo_transformer = PolynomialFeatures(degree=degreenum, interaction_only=False, include_bias=False) dfpo_transformer.fit(dfpo) dfpo = dfpo_transformer.transform(dfpo) dfpo = pd.DataFrame(dfpo , columns = poly_transformer.get_feature_names(input_features = num_gen_feats) ) # 新的字段拼接到df上 df = dfpo[po_temp] return df,po_temp
# Make a new dataframe for polynomial features
df_poly,poly_field = polynomial_features111(df[num_gen_feats+['label']],poly_num=0.01 ,change=1,degreenum=2)
print(df_poly.shape)
print(len(poly_field))
gbdt_poly_df=[]
df_poly['label']=df['label']
gbdt_poly_df.extend(GBDTselectfea(df_poly[poly_field+['label']],max_num=100))
# save the data from the previous step locally
df_poly[gbdt_poly_df].to_csv('D:/df_gbdt_poly_fea.csv',index = False)
df_gbdt_poly_fea= pd.read_csv('D:/df_gbdt_poly_fea.csv')
print(df_gbdt_poly_fea.shape)
print(df_gbdt_poly_fea)
4.8 Log analysis of continuous variables
1. Look at the distribution of each numeric variable and check whether it is roughly normal; variables that are not can be log-transformed and inspected again.
2. If a batch of variables is to be standardised in one go, the variables that have already been normalised must first be set aside.
3. Why normalise: in some cases (near-)normal data lets a model converge faster, and some models assume normality (e.g. GMM, KNN). It is enough to keep the data from being too skewed, since heavy skew can hurt the model's predictions.
value_vars = ['GRZHYE','GRJCJS', 'GRYJCE', 'YEAR_GRYJCE', 'MONTH_GRYJCE_DWYJCE', 'YEAR_GRYJCE_DWYJCE',
'JIEXI','DKYE_DIV_GRYJCE_ADD_DWYJCE','GJJJKBL']
df = Normal_distribution(df, value_vars, 0)
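As a quick numeric companion to these plots (a sketch, not part of the original notebook), one can compare the skewness of each column in value_vars before and after np.log1p to see which columns the log transform actually helps:

# Sketch: compare skewness before/after log1p for the candidate columns.
from scipy.stats import skew

for col in value_vars:
    raw = df[col].dropna()
    print(f'{col}: skew = {skew(raw):.2f}, skew(log1p) = {skew(np.log1p(raw.clip(lower=0))):.2f}')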
4.9 Concatenate all of the features selected above
# raw fields plus some new fields
print("----------------- raw fields and some new fields --------------------")
df_little_change = pd.read_csv('D:/df_little_change.csv')
print(df_little_change.shape)
print(df_little_change)

# binary-type data
print("----------------- binary-type data --------------------")
rizhao_select_cate_2_col = pd.read_csv('D:/rizhao_select_cate_2_col.csv')
rizhao_select_cate_2_col['id'] = df_little_change['id']
print(rizhao_select_cate_2_col.shape)
print(rizhao_select_cate_2_col)

# multi-class data
print("----------------- multi-class data --------------------")
rizhao_select_cate_col = pd.read_csv('D:/rizhao_select_cate_col.csv')
rizhao_select_cate_col['id'] = df_little_change['id']
print(rizhao_select_cate_col.shape)
print(rizhao_select_cate_col)

# category x numeric crosses
print("----------------- category x numeric crosses --------------------")
gbdt_select_temp_df2 = pd.read_csv('D:/gbdt_select_temp_df2.csv')
gbdt_select_temp_df2['id'] = df_little_change['id']
print(gbdt_select_temp_df2.shape)
print(gbdt_select_temp_df2)

# numeric x numeric crosses
print("----------------- numeric x numeric crosses --------------------")
gbdt_select_num_num_mix = pd.read_csv('D:/gbdt_select_num_num_mix.csv')
gbdt_select_num_num_mix['id'] = df_little_change['id']
print(gbdt_select_num_num_mix.shape)
print(gbdt_select_num_num_mix)

# polynomial features
print("----------------- polynomial features --------------------")
df_gbdt_poly_fea = pd.read_csv('D:/df_gbdt_poly_fea.csv')
df_gbdt_poly_fea['id'] = df_little_change['id']
print(df_gbdt_poly_fea.shape)
print(df_gbdt_poly_fea)
df = pd.merge(df_little_change,rizhao_select_cate_2_col ,on='id')
print(df.shape)
df = pd.merge(df,rizhao_select_cate_col ,on='id')
print(df.shape)
df = pd.merge(df,gbdt_select_temp_df2 ,on='id')
print(df.shape)
df = pd.merge(df,gbdt_select_num_num_mix ,on='id')
print(df.shape)
df = pd.merge(df,df_gbdt_poly_fea ,on='id')
print(df.shape)
_,missing = pankong(df)
df = fill_kongzhi(df)
_,missing = pankong(df)
print(len(df.columns))
print(len(set(df.columns)))
df=if_field_is_same(df)
df.shape
print(len(df.columns))
print(len(set(df.columns)))
col_temp = []
for i in range(len(df.columns)):
    print(df.columns[i])
    if(df.columns[i] not in col_temp):
        if(" " in df.columns[i]):
            col_temp.append(df.columns[i].replace(" ", "_*_"))
        else:
            col_temp.append(df.columns[i])
print(len(col_temp))
print(col_temp)
df.columns = col_temp
for i in range(len(df.columns)):
    print(df.columns[i])
# save the data from the previous step locally
df.to_csv('D:/df_concat.csv',index = False)
5. Hyperparameter tuning
Method 1:
Step 1: learning rate and number of iterations
import pandas as pd import lightgbm as lgb # from sklearn.cross_validation import train_test_split from sklearn.model_selection import train_test_split cols = [col for col in df.columns if col not in ['label','id']] X=df[:40000][cols] y=df[:40000]['label'] print(X.shape) print(y.shape) X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2) params = { 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'nthread':12, 'learning_rate':0.1, 'num_leaves':32, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.8, } data_train = lgb.Dataset(X_train, y_train) cv_results = lgb.cv(params, data_train, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, metrics='auc',early_stopping_rounds=50,seed=0) print('best n_estimators:', len(cv_results['auc-mean'])) print('best cv score:', pd.Series(cv_results['auc-mean']).max())
Step 2: determine max_depth and num_leaves
from sklearn.model_selection import GridSearchCV params_test1={'max_depth': range(3,8,1), 'num_leaves':range(5, 100, 5)} gsearch1 = GridSearchCV( estimator=lgb.LGBMClassifier( boosting_type='gbdt',objective='binary',metrics='auc', learning_rate=0.1, n_estimators=154, max_depth=6, bagging_fraction = 0.8, feature_fraction = 0.8), param_grid = params_test1, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch1.fit(X_train,y_train) gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_
Step 3: determine min_data_in_leaf and max_bin
params_test2={'max_bin': range(5,256,10), 'min_data_in_leaf':range(1,102,10)} gsearch2 = GridSearchCV( estimator = lgb.LGBMClassifier( boosting_type='gbdt',objective='binary', metrics='auc', learning_rate=0.1, n_estimators=154, max_depth=6, num_leaves=30, bagging_fraction = 0.8, feature_fraction = 0.8), param_grid = params_test2, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch2.fit(X_train,y_train) gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_
Step 4: determine feature_fraction, bagging_fraction, and bagging_freq
params_test3={'feature_fraction': [0.65,0.7,0.75,0.8,0.85,0.9,1.0], 'bagging_fraction': [0.65,0.7,0.75,0.8,0.85,0.9,1.0], 'bagging_freq': range(0,101,10)} gsearch3 = GridSearchCV( estimator = lgb.LGBMClassifier( boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1, n_estimators=154, max_depth=6, num_leaves=30, max_bin=25, min_data_in_leaf=71), param_grid = params_test3, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch3.fit(X_train,y_train) gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_
Step 5: determine lambda_l1 and lambda_l2
# params_test4={'lambda_l1': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0], # 'lambda_l2': [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]} params_test4={'lambda_l1': [0.8,0.85,0.9,0.95], 'lambda_l2': [0.8,0.85,0.9,0.95]} gsearch4 = GridSearchCV( estimator = lgb.LGBMClassifier( boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1, n_estimators=154, max_depth=6, num_leaves=30, max_bin=25, min_data_in_leaf=71, bagging_fraction=0.65, bagging_freq=0, feature_fraction= 0.8), param_grid = params_test4, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch4.fit(X_train,y_train) gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_
Step 6: determine min_split_gain
params_test5={'min_split_gain':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]} gsearch5 = GridSearchCV( estimator = lgb.LGBMClassifier( boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1, n_estimators=154, max_depth=6, num_leaves=30, max_bin=25, min_data_in_leaf=71, bagging_fraction=0.65, bagging_freq= 0, feature_fraction= 0.8, lambda_l1=0.9, lambda_l2=0.9), param_grid = params_test5, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch5.fit(X_train,y_train) gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_
# subsample params_test6={'subsample':[0.0,0.1,0.2,0.3,0.4]} gsearch6 = GridSearchCV( estimator = lgb.LGBMClassifier( boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1, n_estimators=154, max_depth=6, num_leaves=30, max_bin=25, min_data_in_leaf=71, bagging_fraction=0.65, bagging_freq= 0, feature_fraction= 0.8, lambda_l1=0.9, lambda_l2=0.9, min_split_gain=0), param_grid = params_test6, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch6.fit(X_train,y_train) gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_
# colsample_bytree params_test7={'colsample_bytree':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]} gsearch7 = GridSearchCV( estimator = lgb.LGBMClassifier( boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1, n_estimators=154, max_depth=6, num_leaves=30, max_bin=25, min_data_in_leaf=71, bagging_fraction=0.65, bagging_freq= 0, feature_fraction= 0.8, lambda_l1=0.9, lambda_l2=0.9, min_split_gain=0, subsample=0), param_grid = params_test7, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch7.fit(X_train,y_train) gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_
# min_child_weight params_test8={'min_child_weight':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]} gsearch8 = GridSearchCV( estimator = lgb.LGBMClassifier( boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1, n_estimators=154, max_depth=6, num_leaves=30, max_bin=25, min_data_in_leaf=71, bagging_fraction=0.65, bagging_freq= 0, feature_fraction= 0.8, lambda_l1=0.9, lambda_l2=0.9, min_split_gain=0, subsample=0, colsample_bytree=0), param_grid = params_test8, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch8.fit(X_train,y_train) gsearch8.cv_results_, gsearch8.best_params_, gsearch8.best_score_
I have little tuning experience, so once the hyperparameters above were fixed I was unsure how to adjust the learning rate and the number of iterations; I simply wrote a loop to find roughly good values.
# subsample_freq params_test9={'learning_rate':[0.02,0.03,0.04,0.05,0.06,0.07,0.08], 'n_estimators':[1000,2000,5000,8000,10000,20000,30000]} gsearch9 = GridSearchCV( estimator = lgb.LGBMClassifier( boosting_type='gbdt', objective='binary', metrics='auc', max_depth=6, num_leaves=30, max_bin=25, min_data_in_leaf=71, bagging_fraction=0.65, bagging_freq= 0, feature_fraction= 0.8, lambda_l1=0.9, lambda_l2=0.9, min_split_gain=0, subsample=0, colsample_bytree=0, min_child_weight=0), param_grid = params_test9, scoring='roc_auc', cv=5, n_jobs=-1 ) gsearch9.fit(X_train,y_train) gsearch9.cv_results_, gsearch9.best_params_, gsearch9.best_score_
Step 7: lower the learning rate, increase the number of iterations, and validate the model
auc_list=[] tpr_list=[] for j in [0.018,0.019,0.02,0.21,0.22,0.023,0.24,0.025]: auc_list=[] tpr_list=[] for i in [5000,6000,7000,8000,10000,12000,15000,18000,20000,25000]: model=lgb.LGBMClassifier( learning_rate=j, n_estimators=i, max_depth=6, num_leaves=30, max_bin=25, min_data_in_leaf=71, bagging_fraction=0.65, bagging_freq= 0, feature_fraction= 0.8, lambda_l1=0.9, lambda_l2=0.9, min_split_gain=0, subsample=0, colsample_bytree=0, min_child_weight=0 ) model.fit(X_train,y_train) y_pre=model.predict_proba(X_test)[:, 1] print("---------------------------------------------------") print("learning_rate:"+str(j)+" "+"n_estimators:"+str(i)) auc=round(roc_auc_score(y_test,y_pre), 6) tpr=round(tpr_weight_funtion(y_test,y_pre), 6) if(auc in auc_list and tpr in tpr_list): print("---break---") break auc_list.append(auc) tpr_list.append(tpr) print("auc:",auc) print("tpr:",tpr) print("---------------------------------------------------") # 0.06-10000-0.470364-0.941146 # 0.06-20000-0.470364-0.941146 # 0.05-20000-0.476182-0.941146
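Note that the loop above relies on tpr_weight_funtion, which is never defined in this post. It is presumably the competition's weighted-TPR metric; the definition below is an assumption borrowed from the convention used in several similar risk-control competitions (TPR read off at FPR thresholds 0.001/0.005/0.01 with weights 0.4/0.3/0.3), not taken from this post:

# Assumed definition of the competition metric (not given in the original post):
# weighted TPR at fixed FPR thresholds, a convention used in several risk-control contests.
def tpr_weight_funtion(y_true, y_predict):
    fpr, tpr, _ = roc_curve(y_true, y_predict)
    res = pd.DataFrame({'fpr': fpr, 'tpr': tpr})
    tpr1 = res[res.fpr <= 0.001]['tpr'].max()
    tpr2 = res[res.fpr <= 0.005]['tpr'].max()
    tpr3 = res[res.fpr <= 0.01]['tpr'].max()
    return 0.4 * tpr1 + 0.3 * tpr2 + 0.3 * tpr3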
Method 2:
import pandas as pd
import lightgbm as lgb
from sklearn import metrics

cols = [col for col in df.columns if col not in ['label', 'id']]
X = df[:40000][cols]
y = df[:40000]['label']
print(X.shape)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

### Convert the data into LightGBM Datasets
print('数据转换')
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)

### Initial parameters (the cross-validated ones are added step by step below)
print('设置参数')
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'nthread': 4,
    'learning_rate': 0.1
}

### Cross-validated tuning
print('交叉验证')
max_auc = 0.0
best_params = {}

# Step 1: improve accuracy (tree structure)
print("调参1:提高准确率")
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        cv_results = lgb.cv(
            params,
            lgb_train,
            seed=1,
            nfold=5,
            metrics=['auc'],
            early_stopping_rounds=10,
            verbose_eval=True
        )
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# Step 2: reduce overfitting (binning and leaf size)
print("调参2:降低过拟合")
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        cv_results = lgb.cv(
            params,
            lgb_train,
            seed=1,
            nfold=5,
            metrics=['auc'],
            early_stopping_rounds=10,
            verbose_eval=True
        )
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['max_bin'] = max_bin
            best_params['min_data_in_leaf'] = min_data_in_leaf
if 'max_bin' in best_params and 'min_data_in_leaf' in best_params:
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['max_bin'] = best_params['max_bin']

# Step 3: reduce overfitting (row/column sampling)
print("调参3:降低过拟合")
for feature_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0, 50, 5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            cv_results = lgb.cv(
                params,
                lgb_train,
                seed=1,
                nfold=5,
                metrics=['auc'],
                early_stopping_rounds=10,
                verbose_eval=True
            )
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq
if 'feature_fraction' in best_params and 'bagging_fraction' in best_params and 'bagging_freq' in best_params:
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

# Step 4: reduce overfitting (L1/L2 regularization)
print("调参4:降低过拟合")
for lambda_l1 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
    for lambda_l2 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.4, 0.6, 0.7, 0.9, 1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
            params,
            lgb_train,
            seed=1,
            nfold=5,
            metrics=['auc'],
            early_stopping_rounds=10,
            verbose_eval=True
        )
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' in best_params and 'lambda_l2' in best_params:
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

# Step 5: reduce overfitting (minimum split gain)
print("调参5:降低过拟合2")
for min_split_gain in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    params['min_split_gain'] = min_split_gain
    cv_results = lgb.cv(
        params,
        lgb_train,
        seed=1,
        nfold=5,
        metrics=['auc'],
        early_stopping_rounds=10,
        verbose_eval=True
    )
    mean_auc = pd.Series(cv_results['auc-mean']).max()
    boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
    if mean_auc >= max_auc:
        max_auc = mean_auc
        best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params:
    params['min_split_gain'] = best_params['min_split_gain']

print(best_params)
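When the search finishes, the tuned values have already been merged back into `params`, so a final booster can be fitted directly. Below is a minimal sketch under that assumption; the round count of 1000 and the early-stopping window are illustrative values of mine, not outputs of the search, and it follows the same pre-4.0 LightGBM API used above.

# Fit a final booster with the tuned parameters (sketch)
final_params = dict(params)          # params now contains the tuned values
gbm = lgb.train(
    final_params,
    lgb_train,
    num_boost_round=1000,            # illustrative; reuse the best round count observed in lgb.cv if recorded
    valid_sets=[lgb_eval],
    early_stopping_rounds=50,
    verbose_eval=100
)
# Evaluate on the hold-out split created earlier
val_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print('hold-out AUC:', metrics.roc_auc_score(y_test, val_pred))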
Method 3: Bayesian optimization
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer, accuracy_score
from sklearn.metrics import f1_score
from bayes_opt import BayesianOptimization

answers = []
mean_score = 0
mean_f1_score = 0
n_folds = 5
sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1024)
cols = [col for col in df.columns if col not in ['label', 'id']]
# X = df[:40000][cols]
# y = df[:40000]['label']

for tr, te in sk.split(df[:40000][cols], df[:40000]['label']):
    X = df[:40000][cols].iloc[tr]
    y = df[:40000]['label'].iloc[tr]
    print(X.shape)
    print(y.shape)

    # Objective to optimize: mean 5-fold CV accuracy of an LGBMClassifier
    def rf_cv(n_estimators, learning_rate):
        val = cross_val_score(
            LGBMClassifier(
                learning_rate=min(learning_rate, 0.15),
                n_estimators=int(n_estimators),
                # Other parameters that could be searched as well:
                # boosting_type='dart',                          # boosting type: gbdt / dart / goss / rf
                # learning_rate=min(learning_rate, 0.4),         # 0.05 -> 0.918, 0.07 -> 0.924, 0.08 -> 0.926
                # max_depth=int(max_depth),                      # max depth of each tree, -1 = unlimited
                # num_leaves=int(num_leaves),                    # max leaves per tree (~2^max_depth in xgboost terms)
                # subsample=min(subsample, 0.9),                 # row sampling rate
                # colsample_bytree=min(colsample_bytree, 0.9),   # column sampling rate
                # random_state=int(random_state),                # random seed
                # min_data_in_leaf=int(min_data_in_leaf),        # guards against overfitting; best value depends on num_leaves and sample size
                # reg_alpha=min(reg_alpha, 0.999),
                # reg_lambda=min(reg_lambda, 0.999),
                # lambda_l1=0.1,
                # lambda_l2=0.2,
                # min_split_gain=min(min_split_gain, 0.9),
                # min_child_weight=min(min_child_weight, 0.9),
                # metric='auc',                                  # e.g. 'rmse', 'auc', 'binary_logloss'
                n_jobs=6,        # number of parallel threads
                verbose=-1
            ),
            X, y, scoring="accuracy", cv=5
        ).mean()
        return val

    # Bayesian optimization over the chosen search space
    rf_bo = BayesianOptimization(rf_cv, {
        "n_estimators": (1000, 20000),
        "learning_rate": (0.001, 0.1)
        # "colsample_bytree": (0.85, 0.97),
        # "min_data_in_leaf": (100, 2000),
        # "subsample": (0.7, 0.9),
        # "max_depth": (25, 40),
        # "num_leaves": (31, 35),
        # "reg_alpha": (0.2, 0.5),
        # "reg_lambda": (0.3, 0.5),
        # "lambda_l1": (0.6, 0.95),
        # "lambda_l2": (0.5, 0.8),
        # "random_state": (0, 1024),
        # "min_split_gain": (0.2, 0.6),
        # "min_child_weight": (0.6, 0.9)
    })
# Start the optimization
num_iter = 100
init_points = 5
rf_bo.maximize(init_points=init_points, n_iter=num_iter)
rf_bo.max  # show the best result found
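Once `maximize` returns, `rf_bo.max` holds the best objective value and the corresponding parameter dict. A minimal sketch of feeding those values back into a classifier follows; the cast back to `int` and the refit on the full 40,000 training rows are my own glue, not part of the original pipeline.

# Refit a model with the parameters found by the Bayesian search (sketch)
best = rf_bo.max['params']                       # e.g. {'n_estimators': ..., 'learning_rate': ...}
best_clf = LGBMClassifier(
    n_estimators=int(best['n_estimators']),      # bayes_opt returns floats, cast back to int
    learning_rate=min(best['learning_rate'], 0.15),
    n_jobs=6,
    verbose=-1
)
best_clf.fit(df[:40000][cols], df[:40000]['label'])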
After the steps above, the data has 55,000 rows and 1,971 columns. The training results after tuning were not very good, and even fell below an earlier run that used the raw data plus Bayesian optimization.
Therefore, feature selection is performed here once more.
# GBDT-based feature selection, done in chunks of 400 columns at a time
gbdt_col = []
for i in tqdm(range(int(len(df.columns) / 400 + 1))):
    temp_col = []
    temp_col.extend(df.columns[i * 400:i * 400 + 400])
    if i < int(len(df.columns) / 400):
        gbdt_col.extend(GBDTselectfea(df[temp_col + ['label']], max_num=300))
    elif i == int(len(df.columns) / 400):
        # last (shorter) chunk
        gbdt_col.extend(GBDTselectfea(df[temp_col], max_num=250))
print(len(gbdt_col))
print(gbdt_col)
# Recursive feature elimination with cross-validation, also in chunks (200 columns at a time)
rfecv_col = []
for i in tqdm(range(int(len(df.columns) / 200 + 1))):
    temp_col = []
    temp_col.extend(df.columns[i * 200:i * 200 + 200])
    # clf_rfecv (section 2.2) is assumed to return the list of selected columns
    rfecv_col.extend(clf_rfecv(df[:40000][temp_col + ['label']], temp_col + ['label'], 5))
print(len(rfecv_col))
print(rfecv_col)
gbdt_col.extend(['id', 'label'])  # extend() mutates in place; keep id and label in the selected set
gbdt_rfecv_col=list(set(rfecv_col).union(set(gbdt_col)))
print(len(gbdt_rfecv_col))
df=df[gbdt_rfecv_col]
print(df.shape)
print(df)
# Save the data processed in the previous step to disk
df.to_csv('D:/df_rfecv.csv', index=False)

# Reload the saved data (e.g. when resuming from a new session)
df = pd.read_csv('D:/df_rfecv.csv')
print(df.shape)
print(df)
6. Training
oof = np.zeros(train_df.shape[0])
# feat_imp_df = pd.DataFrame({'feat': cols, 'imp': 0})
test_df['prob'] = 0

clf = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.015,
    n_estimators=6500,
    # metrics='auc',
    max_depth=6,
    num_leaves=30,
    max_bin=25,
    min_data_in_leaf=71,
    bagging_fraction=0.65,
    bagging_freq=0,
    feature_fraction=0.8,
    lambda_l1=0.9,
    lambda_l2=0.9,
    min_split_gain=0,
    metric=None,
    n_jobs=6,        # number of parallel threads
    verbose=-1
)

val_aucs = []
seeds = [1023, 2048, 2098]
for seed in seeds:
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        print('--------------------- {} fold ---------------------'.format(i))
        t = time.time()
        trn_x, trn_y = train_df[cols].iloc[trn_idx].reset_index(drop=True), train_df['label'].values[trn_idx]
        val_x, val_y = train_df[cols].iloc[val_idx].reset_index(drop=True), train_df['label'].values[val_idx]
        clf.fit(
            trn_x, trn_y,
            eval_set=[(val_x, val_y)],
            # categorical_feature=cate_cols,
            eval_metric='auc',
            early_stopping_rounds=200,
            verbose=200
        )
        # feat_imp_df['imp'] += clf.feature_importances_ / skf.n_splits
        oof[val_idx] = clf.predict_proba(val_x)[:, 1]
        test_df['prob'] += clf.predict_proba(test_df[cols])[:, 1] / skf.n_splits / len(seeds)
    cv_auc = roc_auc_score(train_df['label'], oof)
    val_aucs.append(cv_auc)
    print('\ncv_auc: ', cv_auc)
print(val_aucs, np.mean(val_aucs))
Evaluation metric: TPR
def tpr_weight_funtion(y_true, y_predict):
    # Competition metric: a weighted sum of the true positive rate at
    # false positive rates of 0.001, 0.005 and 0.01.
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer - 0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer - 0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer - 0.01).idxmin()]
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

tpr = round(tpr_weight_funtion(train_df['label'], oof), 6)
tpr, round(np.mean(val_aucs), 5)
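Since the TPR at a fixed FPR is just a point on the ROC curve, the metric can be cross-checked against sklearn's `roc_curve`. The small sketch below is only a sanity check; the linear interpolation and the helper name are mine, so its value will be close to, but not exactly, the nearest-point lookup above.

from sklearn.metrics import roc_curve

def tpr_weight_from_roc(y_true, y_predict):
    # Interpolate TPR at FPR = 0.001 / 0.005 / 0.01 on the ROC curve
    fpr, tpr_curve, _ = roc_curve(y_true, y_predict)
    tr1 = np.interp(0.001, fpr, tpr_curve)
    tr2 = np.interp(0.005, fpr, tpr_curve)
    tr3 = np.interp(0.01, fpr, tpr_curve)
    return 0.4 * tr1 + 0.3 * tr2 + 0.3 * tr3

print(tpr_weight_from_roc(train_df['label'], oof))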
# print(test_df)
submit['id'] = test_df['id']
submit['label'] = test_df['prob']
submit.to_csv('D:/submit12.csv', index=False)
submit.head()
This is my second time entering a competition. Looking back, I knew almost nothing the first time; this time I understood the relevant techniques much better. Going forward I want to keep working hard, strengthen my grasp of the fundamentals, follow competitions of all kinds, and connect them with my own research direction. Many of the ideas here are my own, and I am not certain of their correctness or the theory behind them, nor whether combining all these methods actually yields the best result. Comments and corrections from more experienced readers are very welcome.