赞
踩
代码简介:三种树模型lgb、xgb、cat(CatBoostRegressor)
链接:https://github.com/datawhalechina/team-learning-data-mining/blob/master/FinancialRiskControl/baseline.md
代码:
- #! /usr/bin/env python
- # -*- coding:utf-8 -*-
- #====#====#====#====
- '''
- '''
- #====#====#====#====
- #---导入包
- import pandas as pd
- import os
- import gc
- import lightgbm as lgb
- import xgboost as xgb
- from catboost import CatBoostRegressor
- from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
- from sklearn.preprocessing import MinMaxScaler
- import math
- import numpy as np
- from tqdm import tqdm
- from sklearn.model_selection import StratifiedKFold, KFold
- from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
- import matplotlib.pyplot as plt
- import time
- import warnings
- warnings.filterwarnings('ignore')
-
# --- Load data
train = pd.read_csv('finance2/train.csv')
testA = pd.read_csv('finance2/testA.csv')
# Quick look at the first rows of the training set.
print(train.head())
# Stack train and test so every preprocessing step below is applied to both.
data = pd.concat([train, testA], axis=0, ignore_index=True)

# --- Preprocessing
# Several columns cannot be fed to the models as-is (grade, subGrade,
# employmentLength, issueDate, earliesCreditLine) and are converted first.
print(sorted(data['grade'].unique()))
print(sorted(data['subGrade'].unique()))
# Notebook-style inspection: the result is only displayed in an interactive
# session and has no effect when this runs as a script.
data['employmentLength'].value_counts(dropna=False).sort_index()

# -- Normalise the two irregular employmentLength spellings.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: under pandas
# copy-on-write the in-place call modifies a temporary object and leaves
# `data` unchanged.
data['employmentLength'] = data['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)
-
def employmentLength_to_int(s):
    """Convert an employment-length label such as ``'3 years'`` to ``np.int8``.

    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged so they stay missing in the resulting column.

    Besides the normalised ``'<n> years'`` form, the raw spellings
    ``'10+ years'`` (-> 10) and ``'< 1 year'`` (-> 0) are accepted, so the
    function does not depend on an earlier string replacement having run.

    Raises:
        ValueError: if the leading token is not an integer.
    """
    if pd.isnull(s):
        return s
    text = str(s).strip()
    # "< 1 year" means less than one full year of employment.
    if text.startswith('<'):
        return np.int8(0)
    # Drop a trailing "+" ("10+") before converting the leading token.
    return np.int8(int(text.split()[0].rstrip('+')))
-
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
# Notebook-style inspection (no effect when run as a script).
data['employmentLength'].value_counts(dropna=False).sort_index()

# -- earliesCreditLine: keep only the last four characters, as an integer.
data['earliesCreditLine'].sample(5)
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['earliesCreditLine'].describe()
print(data.head())

# -- Categorical features
# A subset of the categorical columns; print the cardinality of each.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]
for f in cate_features:
    print(f, '类型数:', data[f].nunique())

# Columns with more than two levels that are not high-dimensional/sparse
# are one-hot encoded (first level dropped).
one_hot_columns = ['grade', 'subGrade', 'homeOwnership', 'verificationStatus',
                   'purpose', 'regionCode']
data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)

# High-cardinality columns are replaced by two derived numeric columns:
# the size of the row's group and the rank of its id within the group.
# NOTE(review): rows whose key is missing may get a NaN rank, which the
# integer cast cannot represent - confirm these columns have no NaN.
for f in ['employmentTitle', 'postCode', 'title']:
    ids_by_level = data.groupby([f])['id']
    data[f+'_cnts'] = ids_by_level.transform('count')
    data[f+'_rank'] = ids_by_level.rank(ascending=False).astype(int)
    del data[f]

# --- Train / test preparation
# Everything except the identifier, the raw date and the target is a feature.
excluded_columns = ['id', 'issueDate', 'isDefault']
features = [f for f in data.columns if f not in excluded_columns]
# Rows with a target value are training rows; rows without one are test rows.
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)
x_train, x_test = train[features], test[features]
y_train = train['isDefault']
- #---模型训练
- #直接构建了一个函数,可以调用三种树模型,方便快捷
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train one boosted-tree model with 5-fold CV and predict the test set.

    Args:
        clf: what to train with, matching ``clf_name``: the ``lightgbm``
            module for "lgb", the ``xgboost`` module for "xgb", or a
            CatBoost estimator class for "cat".
        train_x: training features (DataFrame; indexed with ``.iloc``).
        train_y: training labels, aligned positionally with ``train_x``.
        test_x: test features with the same columns as ``train_x``.
        clf_name: one of "lgb", "xgb" or "cat".

    Returns:
        A tuple ``(oof_pred, test_pred_mean)``: the out-of-fold predictions
        for every training row, and the test predictions averaged over the
        folds.

    Raises:
        ValueError: if ``clf_name`` is not one of the supported names.

    NOTE(review): the ``verbose_eval`` / ``early_stopping_rounds`` keyword
    arguments, the ``silent`` parameter and ``ntree_limit`` depend on the
    installed LightGBM / XGBoost versions - confirm against the environment.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("unsupported clf_name: %r" % (clf_name,))

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices, so take the labels positionally too;
    # indexing a Series by label only works when its index is 0..n-1.
    labels = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred_mean = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # NOTE(review): 'nthread' and 'n_jobs' look like two spellings
            # of the same thread-count setting - confirm which should win.
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000,
                              valid_sets=[train_matrix, valid_matrix],
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)

            params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.04,
                'tree_method': 'exact',
                'seed': 2020,
                'nthread': 36,
                "silent": True,
            }

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=50000,
                              evals=watchlist, verbose_eval=200,
                              early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat"
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10,
                      'bootstrap_type': 'Bernoulli', 'od_type': 'Iter',
                      'od_wait': 50, 'random_seed': 11,
                      'allow_writing_files': False}

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof_pred[valid_index] = val_pred
        # Accumulate each fold's share of the average. Plain assignment here
        # would keep only the last fold's prediction divided by the number
        # of folds.
        test_pred_mean += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_mean
def lgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the lightgbm module; return its (train, test) predictions."""
    return cv_model(lgb, x_train, y_train, x_test, "lgb")
-
def xgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the xgboost module; return its (train, test) predictions."""
    return cv_model(xgb, x_train, y_train, x_test, "xgb")
-
def cat_model(x_train, y_train, x_test):
    """Run ``cv_model`` with ``CatBoostRegressor``; return its (train, test) predictions."""
    return cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
-
# Train the three models; each call returns (out-of-fold, test) predictions.
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
cat_train, cat_test = cat_model(x_train, y_train, x_test)

# Blend: equal-weight combination of the LightGBM and XGBoost test predictions.
# NOTE(review): the CatBoost predictions are computed above but not used in
# the blend - confirm whether that is intentional.
rh_test = lgb_test*0.5 + xgb_test*0.5
testA['isDefault'] = rh_test

# Write the submission with just the id and the predicted target.
submission_columns = ['id', 'isDefault']
testA[submission_columns].to_csv('test_sub.csv', index=False)

【最后的运行结果】
代码简介:Baseline-LGBM
链接:https://tianchi.aliyun.com/forum/postDetail?spm=5176.12586969.1002.21.3b306856mDlndD&postId=128654
代码:
- import pandas as pd
-
- import numpy as np
-
- from category_encoders.target_encoder import TargetEncoder
-
- from sklearn.model_selection import KFold
-
- from sklearn.metrics import auc, roc_curve
-
- from lightgbm import LGBMRegressor
-
-
-
# Load the data, using the id column as the index.
train = pd.read_csv('finance2/train.csv', index_col='id')
test = pd.read_csv('finance2/testA.csv', index_col='id')

# Split the target off the training frame, then align the test columns
# (names and order) with the remaining training columns.
target = train.pop('isDefault')
test = test[train.columns]

# Non-numeric columns: these are the ones passed to the target encoder.
s = train.apply(lambda x: x.dtype)
is_object_column = s == 'object'
tecols = s[is_object_column].index.tolist()
-
-
-
- # 模型
-
def makelgb():
    """Return a new, unfitted ``LGBMRegressor`` with the baseline hyper-parameters."""
    hyper_params = {
        'num_leaves': 30,
        'max_depth': 5,
        'learning_rate': .02,
        'n_estimators': 1000,
        'subsample_for_bin': 5000,
        'min_child_samples': 200,
        'colsample_bytree': .2,
        'reg_alpha': .1,
        'reg_lambda': .1,
    }
    return LGBMRegressor(**hyper_params)
-
-
-
# Local validation: 10-fold CV, fitting the target encoder and the model on
# the training part of each fold only and scoring AUC on the held-out part.
kf = KFold(n_splits=10, shuffle=True, random_state=100)
devscore = []
for tidx, didx in kf.split(train.index):
    # Training part (tf/tt) and held-out part (df/dt) of this fold.
    tf, tt = train.iloc[tidx], target.iloc[tidx]
    df, dt = train.iloc[didx], target.iloc[didx]

    # The encoder is fitted on the training part and only applied to the
    # held-out part.
    te = TargetEncoder(cols=tecols)
    tf = te.fit_transform(tf, tt)
    df = te.transform(df)

    lgbr = makelgb()
    lgbr.fit(tf, tt)
    pre = lgbr.predict(df)

    # AUC computed as the area under the ROC curve of the held-out predictions.
    fpr, tpr, thresholds = roc_curve(dt, pre)
    score = auc(fpr, tpr)
    devscore.append(score)

print(np.mean(devscore))
-
-
-
# Refit on the whole training set, predict the test set, write the result.
te = TargetEncoder(cols=tecols)
tf = te.fit_transform(train, target)
df = te.transform(test)

lgbr = makelgb()
lgbr.fit(tf, target)
pre = lgbr.predict(df)

# reset_index() turns the id index back into a regular column of the CSV.
submission = pd.Series(pre, name='isDefault', index=test.index).reset_index()
submission.to_csv('submit.csv', index=False)

【遇到问题:安装category-encoders包】
【代码结果】
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。