Consumer Profiling: Intelligent Credit Scoring
(消费者人群画像——信用智能评分)
Competition page: https://www.datafountain.cn/competitions/337/datasets
Competition overview
The 2019 Digital China Innovation Contest (DCIC 2019) was jointly hosted by the Office of the Digital Fujian Construction Leading Group, the Fujian Provincial Department of Industry and Information Technology, the Fuzhou Municipal People's Government, the China Center for Information Industry Development, the Digital China Research Institute, and the China Internet Investment Fund, with Wang Qinmin, Vice Chairman of the 12th CPPCC National Committee, serving as general advisor. As a key part of the Second Digital China Summit, the contest offered algorithm problems in three tracks: big data, artificial intelligence, and industrial internet. It aimed to tackle pain points in the development of the digital economy, promote deep integration of new-generation information technology with traditional industries, and support the construction of Digital China.
The data provided covers several aspects of each user: identity attributes, spending power, social connections, location trajectory, and app-usage preferences. The fields are described below:
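(The original field table was not preserved in this copy of the post, most likely because it was an image. The partial list below is reconstructed from the column names actually referenced in the code that follows, so it is not the official or complete schema.)

- 用户编码 (user id); 信用分 (credit score, the prediction target, train set only)
- Identity: 用户年龄 (age), 用户网龄(月) (months as a subscriber), 是否大学生客户 (college-student flag), 是否黑名单客户 (blacklist flag)
- Spending power: 缴费用户最近一次缴费金额(元) (latest top-up amount), 用户近6个月平均消费值(元) (6-month average spend), 用户账单当月总费用(元) (current-month bill), 用户当月账户余额(元) (current-month balance)
- Social connections: 当月通话交往圈人数 (size of the monthly call circle)
- Location trajectory: 近三个月月均商场出现次数 (average monthly mall visits), 当月是否到过福州山姆会员店 / 当月是否逛过福州仓山万达 (visits to two upscale Fuzhou venues), 当月是否看电影 / 当月是否体育场馆消费 / 当月是否景点游览 (movie / stadium / sightseeing flags)
- App behavior: 当月网购类 / 金融理财类 / 视频播放类 / 物流快递类 / 飞机类 / 火车类 / 旅游资讯类应用使用次数 (monthly usage counts per app category)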
# coding: utf-8
import time
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"] = ["SimHei"]   # render Chinese text in plots
plt.rcParams['axes.unicode_minus'] = False     # render minus signs correctly
plt.rcParams['font.family'] = ['sans-serif']
import seaborn as sns
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_columns', 100)      # show up to 100 columns

# 1. Load the data
data_path = 'myself_train_by_BSL_2_28/data/'
train_data = pd.read_csv(data_path + 'train_dataset.csv', header=0, error_bad_lines=False)
test_data = pd.read_csv(data_path + 'test_dataset.csv', header=0, error_bad_lines=False)
sample_sub = pd.read_csv(data_path + 'submit_example.csv')

# 2. Inspect the features
# print(train_data.head())
# print(train_data.describe())
# print(train_data.columns)
# train_data.info()   # check for missing values and row counts

# 3. Correlation of each numeric column with the credit score -- key step 1
# dtype 'object' marks non-numeric columns (e.g. the user-id string); they are excluded here.
x_cols = [col for col in train_data.columns
          if col not in ["信用分"] and train_data[col].dtype != 'object']

labels = []
values = []
for col in x_cols:
    labels.append(col)
    values.append(np.corrcoef(train_data[col].values, train_data["信用分"].values)[0, 1])
corr_df = pd.DataFrame({"col_labels": labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values')   # ascending order

# Horizontal bar chart of the correlations
ind = np.arange(len(labels))
width = 0.5
fig, ax = plt.subplots(figsize=(12, 60))
rects = ax.barh(ind, np.array(corr_df.corr_values.values), color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_xlabel('Correlation coefficient')
ax.set_title("Correlation coefficient of the variables")
# plt.show()

# Density-curve and histogram helpers
def plot_kde(data):
    plt.figure(figsize=(8, 6))
    data.plot(kind='kde')

def plot_his(data):
    plt.figure(figsize=(8, 6))
    sns.distplot(data.values, bins=50, kde=False)

plot_kde(train_data['信用分'])
# plt.show()
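As an aside, np.corrcoef returns the full 2x2 correlation matrix for a pair of series, which is why the loop above indexes [0, 1]. A minimal sketch with made-up numbers:

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])   # toy feature values
y = np.array([2.1, 3.9, 6.2, 8.1])   # toy target values
corr_matrix = np.corrcoef(x, y)      # [[1.0, r], [r, 1.0]]
r = corr_matrix[0, 1]                # Pearson coefficient between x and y
print(round(r, 4))                   # close to 1.0: x and y are nearly linear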
# 4. Feature engineering
# Top-up amounts that are whole multiples of 10 vs. arbitrary decimals may
# correspond to different payment channels.
def produce_offline_feature(train_data):
    train_data['不同充值途径'] = 0
    mask = (train_data['缴费用户最近一次缴费金额(元)'] % 10 == 0) & \
           (train_data['缴费用户最近一次缴费金额(元)'] != 0)
    train_data.loc[mask, '不同充值途径'] = 1
    return train_data

train_data = produce_offline_feature(train_data)
test_data = produce_offline_feature(test_data)

# Feature importance shows the current-month fee and the six-month average fee
# both rank high, so their ratio (current month / six-month average) gives a
# stability signal.
def produce_fee_rate(train_data):
    train_data['当前费用稳定性'] = train_data['用户账单当月总费用(元)'] / (train_data['用户近6个月平均消费值(元)'] + 1)
    # current-month fee / current-month account balance
    train_data['用户余额比例'] = train_data['用户账单当月总费用(元)'] / (train_data['用户当月账户余额(元)'] + 1)
    return train_data

train_data = produce_fee_rate(train_data)
test_data = produce_fee_rate(test_data)

def get_features(data):
    # Treat age 0 as missing and replace it with the mode
    data.loc[data['用户年龄'] == 0, '用户年龄'] = data['用户年龄'].mode()[0]
    data['缴费金额是否能覆盖当月账单'] = data['缴费用户最近一次缴费金额(元)'] - data['用户近6个月平均消费值(元)']
    data['当月账单是否超过平均消费额'] = data['用户账单当月总费用(元)'] - data['用户近6个月平均消费值(元)']

    # Age bucketing helper (defined but not applied in this pipeline)
    def map_age(x):
        if x <= 18:
            return 1
        elif x <= 30:
            return 2
        elif x <= 35:
            return 3
        elif x <= 45:
            return 4
        else:
            return 5

    data['是否大学生_黑名单'] = data['是否大学生客户'] + data['是否黑名单客户']
    data['是否去过高档商场'] = data['当月是否到过福州山姆会员店'] + data['当月是否逛过福州仓山万达']
    data['是否去过高档商场'] = data['是否去过高档商场'].map(lambda x: 1 if x >= 1 else 0)
    # Pairwise and higher-order interactions of the lifestyle flags
    data['是否_商场_电影'] = data['是否去过高档商场'] * data['当月是否看电影']
    data['是否_商场_体育馆'] = data['是否去过高档商场'] * data['当月是否体育场馆消费']
    data['是否_商场_旅游'] = data['是否去过高档商场'] * data['当月是否景点游览']
    data['是否_电影_体育馆'] = data['当月是否看电影'] * data['当月是否体育场馆消费']
    data['是否_电影_旅游'] = data['当月是否看电影'] * data['当月是否景点游览']
    data['是否_旅游_体育馆'] = data['当月是否景点游览'] * data['当月是否体育场馆消费']
    data['是否_商场_旅游_体育馆'] = data['是否去过高档商场'] * data['当月是否景点游览'] * data['当月是否体育场馆消费']
    data['是否_商场_电影_体育馆'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否体育场馆消费']
    data['是否_商场_电影_旅游'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否景点游览']
    data['是否_体育馆_电影_旅游'] = data['当月是否体育场馆消费'] * data['当月是否看电影'] * data['当月是否景点游览']
    data['是否_商场_体育馆_电影_旅游'] = data['是否去过高档商场'] * data['当月是否体育场馆消费'] * data['当月是否看电影'] * data['当月是否景点游览']

    discretize_features = ['交通类应用使用次数', '当月物流快递类应用使用次数', '当月飞机类应用使用次数',
                           '当月火车类应用使用次数', '当月旅游资讯类应用使用次数']
    data['交通类应用使用次数'] = data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数']
    data['6个月平均占比总费用'] = data['用户近6个月平均消费值(元)'] / (data['用户账单当月总费用(元)'] + 1)

    # Bucket heavy-tailed usage counts into ordinal levels
    def map_discretize(x):
        if x == 0:
            return 0
        elif x <= 5:
            return 1
        elif x <= 15:
            return 2
        elif x <= 50:
            return 3
        elif x <= 100:
            return 4
        else:
            return 5

    for col in discretize_features:
        data[col] = data[col].map(map_discretize)
    return data

train_data = get_features(train_data)
test_data = get_features(test_data)

def base_process(data):
    transform_value_feature = ['用户年龄', '用户网龄(月)', '当月通话交往圈人数', '近三个月月均商场出现次数',
                               '当月网购类应用使用次数', '当月物流快递类应用使用次数', '当月金融理财类应用使用总次数',
                               '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',
                               '当月旅游资讯类应用使用次数']
    user_fea = ['缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)', '用户账单当月总费用(元)', '用户当月账户余额(元)']
    log_features = ['当月网购类应用使用次数', '当月金融理财类应用使用总次数', '当月物流快递类应用使用次数', '当月视频播放类应用使用次数']

    # Clip outliers to the [0.1%, 99.9%] percentile range computed on the
    # training set. (When this runs on test_data, the fee and count columns of
    # train_data have already been log-transformed by the first call, so the
    # thresholds differ between the two calls.)
    for col in transform_value_feature + user_fea + log_features:
        ulimit = np.percentile(train_data[col].values, 99.9)
        llimit = np.percentile(train_data[col].values, 0.1)
        data.loc[data[col] > ulimit, col] = ulimit
        data.loc[data[col] < llimit, col] = llimit

    # Log-transform the heavy-tailed features
    for col in user_fea + log_features:
        data[col] = data[col].map(np.log1p)
    return data

train_data = base_process(train_data)
test_data = base_process(test_data)
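A quick illustration (toy values, not from the dataset) of why base_process applies np.log1p to the heavy-tailed fee and count features: it compresses large values while keeping zeros at zero, so a handful of extreme users no longer dominate the feature scale.

import numpy as np

counts = np.array([0, 3, 50, 1200])   # made-up app-usage counts
print(np.log1p(counts))               # approx [0.0, 1.386, 3.932, 7.091]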
# 5. Feature-importance display helper
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[['feature', 'importance']].groupby('feature').mean() \
        .sort_values(by='importance', ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x='importance', y='feature',
                data=best_features.sort_values(by='importance', ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()

# 6. LightGBM training and prediction
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 31,
    'verbose': -1,
    'max_depth': -1,
    'reg_alpha': 2.2,
    'reg_lambda': 1.4,
    'nthread': 8
}

from sklearn.model_selection import KFold

cv_pred_all = 0
en_amount = 3   # ensemble over 3 different CV seeds
oof_lgb1 = np.zeros(len(train_data))
prediction_lgb1 = np.zeros(len(test_data))

for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data["信用分"]
    # 5-fold CV; shuffle the rows, random_state fixes the seed
    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    kf = kfold.split(train_data, train_label)   # yields (train_indices, valid_indices) per fold

    train_data_use = train_data.drop(['用户编码', '信用分'], axis=1)
    test_data_use = test_data.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test_data.shape[0])
    valid_best_l2_all = 0   # accumulates validation MAE (despite the l2 in the name)
    feature_importance_df = pd.DataFrame()
    count = 0

    for i, (train_fold, validate) in enumerate(kf):
        print('fold:', i, 'training')
        X_train, X_validate, label_train, label_validate = \
            train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \
            train_label[train_fold], train_label[validate]
        # Wrap in lgb.Dataset for training
        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid,
                        verbose_eval=-1, early_stopping_rounds=250)
        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)
        valid_best_l2_all += bst.best_score['valid_0']['l1']
        oof_lgb1[validate] = bst.predict(X_validate, num_iteration=bst.best_iteration)
        prediction_lgb1 += bst.predict(test_data_use, num_iteration=bst.best_iteration) / kfold.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = list(X_train.columns)
        fold_importance_df['importance'] = bst.feature_importance(importance_type='split',
                                                                  iteration=bst.best_iteration)
        fold_importance_df['fold'] = count + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        count += 1

    cv_pred /= NFOLDS
    valid_best_l2_all /= NFOLDS
    cv_pred_all += cv_pred

cv_pred_all /= en_amount
prediction_lgb1 /= en_amount
# valid_best_l2_all is reset per seed, so this reflects only the last seed's folds
print('cv score for valid is:', 1 / (1 + valid_best_l2_all))

# Inspect feature importance after training
display_importances(feature_importance_df)
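The "cv score" printed above maps the validation MAE into (0, 1] via score = 1 / (1 + MAE), so a lower MAE gives a score closer to 1. Whether this exactly matches the official leaderboard formula is an assumption here; it is simply the mapping the print statement implies. A tiny helper:

def competition_score(mae):
    # score = 1 / (1 + MAE): MAE of 0 -> 1.0, larger MAE -> smaller score
    return 1.0 / (1.0 + mae)

print(competition_score(14.5))   # an MAE of 14.5 gives roughly 0.0645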
# 7. XGBoost training and prediction
import xgboost as xgb
from sklearn.model_selection import KFold

xgb_params = {
    'eta': 0.005,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'silent': True,
    'nthread': 8
}

cv_pred_allxgb = 0
en_amount = 3
oof_xgb1 = np.zeros(len(train_data))
prediction_xgb1 = np.zeros(len(test_data))

for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data['信用分']
    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    kf = kfold.split(train_data, train_label)

    train_data_use = train_data.drop(['用户编码', '信用分'], axis=1)
    test_data_use = test_data.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test_data.shape[0])
    count = 0

    for i, (train_fold, validate) in enumerate(kf):
        print("fold:", i, 'training')
        X_train, X_validate, label_train, label_validate = \
            train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \
            train_label[train_fold], train_label[validate]
        dtrain = xgb.DMatrix(X_train, label_train)
        dvalid = xgb.DMatrix(X_validate, label_validate)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid_data')]
        bst = xgb.train(dtrain=dtrain, num_boost_round=10000, evals=watchlist,
                        early_stopping_rounds=100, verbose_eval=300, params=xgb_params)
        cv_pred += bst.predict(xgb.DMatrix(test_data_use), ntree_limit=bst.best_ntree_limit)
        oof_xgb1[validate] = bst.predict(xgb.DMatrix(X_validate), ntree_limit=bst.best_ntree_limit)
        prediction_xgb1 += bst.predict(xgb.DMatrix(test_data_use), ntree_limit=bst.best_ntree_limit) / kfold.n_splits
        count += 1

    cv_pred /= NFOLDS
    cv_pred_allxgb += cv_pred

cv_pred_allxgb /= en_amount
prediction_xgb1 /= en_amount

# 8. Bayesian ridge stacking
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error

# Stack the out-of-fold predictions of LightGBM and XGBoost as meta-features
train_stack = np.vstack([oof_lgb1, oof_xgb1]).transpose()
test_stack = np.vstack([prediction_lgb1, prediction_xgb1]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2019)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])
target = train_data['信用分']

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)
    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10   # 5 splits x 2 repeats = 10 fits

print('stacking MAE:', mean_absolute_error(target.values, oof_stack))

test_data_sub1 = test_data[['用户编码']].copy()
test_data_sub1['score'] = predictions
test_data_sub1.columns = ['id', 'score']
test_data_sub1['score1'] = cv_pred_all   # LightGBM-only prediction, kept for comparison
test_data_sub1['score'] = test_data_sub1['score'].apply(lambda x: int(np.round(x)))
print("test_data_sub1.head():", test_data_sub1.head())
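The script prints the head of the submission frame but never writes it to disk, and the sample_sub frame loaded at the top goes unused. A minimal sketch for saving the file, assuming the required format is the two columns id and score as in submit_example.csv (check sample_sub.columns before relying on this):

submission = test_data_sub1[['id', 'score']]   # drop the auxiliary score1 column
submission.to_csv('submit.csv', index=False)   # 'submit.csv' is a hypothetical filename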