赞
踩
一、IV计算代码
def cal_iv(df, label_col, feat_cols, bin=10, min_unique=10):
    """Compute the Information Value (IV) of each feature against a 0/1 label.

    For every feature the rows with a missing feature or label value are
    dropped, the remaining rows are sorted by the feature and cut into
    consecutive runs of ``int(n_rows / bin)`` rows.  When ``n_rows`` is not a
    multiple of that run length the remainder forms extra groups, so there can
    be more than ``bin`` groups.  A group without any positive (label == 1)
    row is merged into the following groups until a positive shows up.

    Args:
        df: pandas DataFrame holding the feature columns and the label column.
        label_col: name of the label column; rows are counted as positive
            when the label equals 1 and as negative when it equals 0.
        feat_cols: iterable of feature column names to evaluate.
        bin: target number of equal-frequency groups (the name shadows the
            builtin but is kept because callers pass it by keyword).
        min_unique: features with fewer distinct non-null values than this
            are skipped.

    Returns:
        DataFrame with the columns ``feature`` and ``iv`` (rounded to three
        decimals), sorted by ``iv`` in descending order.  The frame is empty,
        but still has both columns, when no feature qualifies.
    """
    import math  # local import: the surrounding file never imports math

    eps = 1e-13  # keeps the logarithm finite when a bin has no negatives
    results = []
    for col in feat_cols:
        # dropna() returns a new frame; the original discarded that result,
        # so rows with missing values were never actually removed.
        data = df[[col, label_col]].dropna()
        if data[col].nunique() < min_unique:
            continue

        labels = data.sort_values(by=col, ascending=True)[label_col].tolist()
        count_0 = float(sum(1 for value in labels if value == 0))
        count_1 = float(sum(1 for value in labels if value == 1))
        if count_0 == 0 or count_1 == 0:
            # Only one class present: nothing to separate.  The original
            # yielded 0 without positives and divided by zero without
            # negatives; report 0.0 in both cases.
            results.append({'feature': col, 'iv': 0.0})
            continue

        n_rows = len(labels)
        # At least one row per group, so bin > n_rows no longer divides by 0.
        step = max(int(n_rows / bin), 1)

        ivs = []
        pending = []  # labels of the groups merged into the current bin
        for group_no, start in enumerate(range(0, n_rows, step)):
            chunk = labels[start:start + step]
            pending.extend(chunk)
            # Earlier merged groups had no positives, so the positives of the
            # current group are the positives of the whole merged bin.
            yi = sum(1 for value in chunk if value == 1) + eps
            if yi < 1:
                continue
            ni = sum(1 for value in pending if value == 0) + eps
            iv = (yi / count_1 - ni / count_0) * math.log((yi / count_1) / (ni / count_0))
            if iv > 1:
                # Diagnostic output for bins with an unusually large share.
                print('group', group_no, len(pending), sum(pending) / len(pending))
            ivs.append(iv)
            pending = []
        # NOTE: trailing groups without any positive stay in `pending` and do
        # not contribute to the IV; this matches the original behaviour.
        results.append({'feature': col, 'iv': round(sum(ivs), 3)})

    # Passing `columns` keeps the result well-formed when `results` is empty.
    df_re = pd.DataFrame(results, columns=['feature', 'iv'])
    return df_re.sort_values(by='iv', ascending=False)
方法调用:
# Example call: df_temp is the data set, Y the name of the overdue-label
# column and feas_list the features whose IV should be computed.
cal_iv(df_temp, Y, feas_list, bin=10)
df_temp:数据集;Y:是否逾期的标签列;feas_list:需要计算 IV 的变量列表
2、按月份和不同 Y 标签分别计算 IV
def iv_distr_v2(df, flag, feas=feas_list,
                label_cols=('mob1_15', 'mob1_30', 'mob2_30', 'mob3_30')):
    """Compute feature IVs separately for every label column and `flag` value.

    Args:
        df: pandas DataFrame containing the label columns, the `flag` column
            and the feature columns.
        flag: name of the column whose distinct values split the data (the
            value is written to the ``month`` column of the result).
        feas: feature column names handed to ``cal_iv``.
        label_cols: label columns to loop over; rows where the label is null
            are excluded for that label.

    Returns:
        The ``cal_iv`` tables of all (label, flag value) pairs stacked into
        one DataFrame with the extra columns ``month`` and ``Y``; an empty
        DataFrame when there is nothing to stack.
    """
    parts = []
    # Loop over the different labels (Y).
    for label in label_cols:
        print(label)
        labelled = df[df[label].notnull()]
        for value in labelled[flag].unique():
            subset = labelled.loc[labelled[flag] == value]
            print(labelled.shape, subset.shape)
            # Use the `feas` argument; the original ignored it and always
            # read the module-level feas_list.
            iv_table = cal_iv(subset, label, feas, bin=10)
            iv_table['month'] = value
            iv_table['Y'] = label
            parts.append(iv_table)

    if not parts:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the result every iteration.
    return pd.concat(parts, axis=0)
调用代码:
# Example call: split the data by the 'month' column and collect the IV
# tables returned by iv_distr_v2 into iv_all.
iv_all = iv_distr_v2(df, flag= 'month',feas=feas_list, )
二、KS、AUC 计算代码(只对“分值越大、逾期风险越高”的分数有效)
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score,roc_curve
## Compute AUC
def cal_auc(df, y_true, y_prob):
    """Return the ROC AUC of column `y_prob` against label column `y_true`.

    Args:
        df: table indexed by column name (e.g. a pandas DataFrame).
        y_true: name of the label column.
        y_prob: name of the score column.

    Returns:
        The value of ``roc_auc_score``, or ``np.nan`` when it cannot be
        computed (for example a missing column or a single-class label).
    """
    try:
        return roc_auc_score(df[y_true], df[y_prob])
    except Exception:
        # Best-effort metric: report NaN on failure, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except hid them).
        return np.nan
## Compute KS
def cal_ks(df, y_true, y_prob):
    """Return the KS statistic of column `y_prob` against label `y_true`.

    KS is taken as the largest value of ``tpr - fpr`` along the ROC curve,
    so it is only meaningful when higher scores indicate the positive class.

    Args:
        df: table indexed by column name (e.g. a pandas DataFrame).
        y_true: name of the label column.
        y_prob: name of the score column.

    Returns:
        The KS value, or ``np.nan`` when it cannot be computed.
    """
    try:
        fpr, tpr, _thresholds = roc_curve(df[y_true], df[y_prob])
        return max(tpr - fpr)
    except Exception:
        # Best-effort metric: report NaN on failure, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except hid them).
        return np.nan
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。