赞
踩
基于处理好的数据--data.csv
# Load the preprocessed dataset (GBK-encoded CSV).
data = pd.read_csv('data.csv', encoding='gbk')
# Split the frame: every column except 'status' is a feature, 'status' is the target.
X_cl = data.drop(columns='status')
y = data['status']
# Information value (IV) of one feature against a binary target.
def cal_iv(x, y, n_bins=6, null_value=np.nan):
    """Return the information value (IV) of feature ``x`` for binary target ``y``.

    ``x`` and ``y`` are paired by position. Rows whose feature value is
    missing are dropped from both before counting. Numeric features with
    more than ``n_bins`` distinct values are cut into quantile bins; every
    other feature is grouped by its raw values.

    IV = sum over bins of (p0 - p1) * ln(p0 / p1), where p0 / p1 are the
    shares of each target class that fall into the bin. Bins in which either
    class has no rows are skipped, because their term is undefined (log of
    zero); a feature whose bins are all one-sided therefore scores 0.

    Args:
        x: pandas Series holding the feature values.
        y: Target labels, same length as ``x``; must contain exactly two
            distinct classes.
        n_bins: Number of quantile bins used for numeric features.
        null_value: Extra marker treated as missing, in addition to NaN.

    Returns:
        The IV as a float. 0.0 when ``x`` and ``y`` differ in length or when
        ``x`` has at most one distinct non-missing value.

    Raises:
        ValueError: If ``y`` does not contain exactly two classes.
    """
    # Mismatched inputs cannot be paired row by row; keep the historical
    # "score 0" result instead of raising.
    if len(x) != len(y):
        return 0.0

    labels = np.asarray(y)

    # NaN never compares equal to itself, so `x != nan` filters nothing;
    # isna() is needed to actually detect missing values.
    missing = x.isna()
    if not pd.isna(null_value):
        missing = missing | (x == null_value)
    keep = ~missing.to_numpy()
    x = x[keep]
    kept_labels = labels[keep]  # drop the same rows from the target

    # A feature with zero or one distinct value carries no information.
    if x.nunique() <= 1:
        return 0.0

    classes = pd.Series(labels).value_counts().index
    if len(classes) != 2:
        raise ValueError(
            "cal_iv expects a binary target, got %d classes" % len(classes)
        )

    # Bin numeric features (integers included) that have too many values.
    # labels=False yields plain integer bin ids, which is all IV needs.
    if pd.api.types.is_numeric_dtype(x) and x.nunique() > n_bins:
        x = pd.qcut(x, q=n_bins, labels=False, duplicates='drop')

    # Rows = bins, columns = the two target classes.
    counts = pd.crosstab(x.to_numpy(), kept_labels)
    counts = counts.reindex(columns=classes, fill_value=0)

    # Share of each class that falls into each bin.
    shares = counts / counts.sum()
    p0 = shares[classes[0]]
    p1 = shares[classes[1]]

    # Only bins populated by both classes have a finite contribution.
    usable = (p0 > 0) & (p1 > 0)
    p0 = p0[usable]
    p1 = p1[usable]
    return float(((p0 - p1) * np.log(p0 / p1)).sum())
# Score every feature column with cal_iv against y, highest IV first.
fea_iv = X_cl.apply(cal_iv, axis=0, args=(y,)).sort_values(ascending=False)
# Keep the names of the features whose IV is above 0.05.
imp_fea_iv = fea_iv.loc[fea_iv > 0.05].index
imp_fea_iv
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest with default settings on all feature columns.
rf = RandomForestClassifier().fit(X_cl, y)
# Rank the features by the forest's importance scores, highest first.
rf_impc = pd.Series(rf.feature_importances_, index=X_cl.columns).sort_values(ascending=False)
# Keep the 15 highest-ranked features.
imp_fea_rf = rf_impc.head(15).index
# Union of the IV-selected and forest-selected features. The list comes from
# a set, so its order is whatever the set iteration yields.
imp_fea = list(set(imp_fea_iv).union(imp_fea_rf))
X_imp = X_cl.loc[:, imp_fea]
X_imp.shape
问题:这段算法是从百度上找来的,整个流程已经跑通,但我不太理解其中的原理,短时间内也难以弄懂,希望能提供一些讲解。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。