
【day6-baseline】def xgbmodel(x_train, x_test, y_train, y_test, params)


 

1. The official code

Code overview: three tree models, lgb, xgb and cat (CatBoostRegressor), driven by a single cv_model helper.

Link: https://github.com/datawhalechina/team-learning-data-mining/blob/master/FinancialRiskControl/baseline.md

Code:

#! /usr/bin/env python
# -*- coding:utf-8 -*-
# --- imports
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

# --- load the data
train = pd.read_csv('finance2/train.csv')
testA = pd.read_csv('finance2/testA.csv')
# inspect the data
print(train.head())
data = pd.concat([train, testA], axis=0, ignore_index=True)

# --- preprocessing
# Several columns cannot be used for training as-is (grade, subGrade,
# employmentLength, issueDate, earliesCreditLine) and need preprocessing.
print(sorted(data['grade'].unique()))
print(sorted(data['subGrade'].unique()))
data['employmentLength'].value_counts(dropna=False).sort_index()

# first convert employmentLength to a numeric value
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])

data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts(dropna=False).sort_index()

# preprocess earliesCreditLine: keep only the year
data['earliesCreditLine'].sample(5)
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['earliesCreditLine'].describe()
print(data.head())

# --- categorical feature handling
# some of the categorical features
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus',
                 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus',
                 'title', 'policyCode']
for f in cate_features:
    print(f, 'number of categories:', data[f].nunique())

# more than 2 categories and not high-cardinality sparse: one-hot encode
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus',
                                     'purpose', 'regionCode'], drop_first=True)

# high-cardinality categorical features are converted to count/rank features
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    del data[f]

# --- prepare the train/test data
features = [f for f in data.columns if f not in ['id', 'issueDate', 'isDefault']]
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)
x_train = train[features]
x_test = test[features]
y_train = train['isDefault']

# --- model training
# One helper function that can run any of the three tree models.
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        train[valid_index] = val_pred
        test += test_pred / kf.n_splits  # accumulate the fold average (a plain `=` here would keep only the last fold)
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
cat_train, cat_test = cat_model(x_train, y_train, x_test)

# blend the lgb and xgb test predictions and write the submission file
rh_test = lgb_test*0.5 + xgb_test*0.5
testA['isDefault'] = rh_test
testA[['id','isDefault']].to_csv('test_sub.csv', index=False)
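
The 0.5/0.5 blend at the end is hard-coded. As a minimal sketch (my own addition, not part of the official baseline), the out-of-fold predictions that cv_model already returns can be used to search for a blend weight. It reuses np, roc_auc_score, y_train, lgb_train, xgb_train, lgb_test and xgb_test from the script above:

# hypothetical weight search over the out-of-fold predictions (not in the baseline)
best_w, best_auc = 0.5, 0.0
for w in np.arange(0, 1.01, 0.05):
    oof_blend = lgb_train * w + xgb_train * (1 - w)  # blended OOF prediction
    score = roc_auc_score(y_train, oof_blend)
    if score > best_auc:
        best_w, best_auc = w, score
print('best lgb weight:', best_w, 'OOF AUC:', best_auc)
rh_test = lgb_test * best_w + xgb_test * (1 - best_w)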

【The final run output】

2. Code from a top player on the forum

Code overview: Baseline-LGBM

  1. Manually dropped several n2 columns that look like duplicates;
  2. No business/domain knowledge introduced;
  3. All non-numeric fields are target-encoded directly (a minimal sketch of target encoding follows this list);
  4. Uses LGBMRegressor with some casually chosen parameters;
  5. Local 10-fold mean AUC of 0.7317; 0.7291 on the leaderboard.
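
To make point 3 concrete, here is a tiny standalone sketch of what target encoding does (the toy data is made up): each category level is replaced by a smoothed mean of the target for that level.

import pandas as pd
from category_encoders.target_encoder import TargetEncoder

# toy data: 'grade' is a non-numeric column, y is the default flag
X = pd.DataFrame({'grade': ['A', 'A', 'B', 'B', 'C']})
y = pd.Series([0, 1, 1, 1, 0])

te = TargetEncoder(cols=['grade'])
# each grade maps to roughly its mean default rate (A ~0.5, B ~1.0, C ~0.0),
# shrunk toward the global mean 0.6 by the encoder's smoothing
print(te.fit_transform(X, y))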

 

Link: https://tianchi.aliyun.com/forum/postDetail?spm=5176.12586969.1002.21.3b306856mDlndD&postId=128654

Code:

import pandas as pd
import numpy as np
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import auc, roc_curve
from lightgbm import LGBMRegressor

# load the data
train = pd.read_csv('finance2/train.csv', index_col='id')
test = pd.read_csv('finance2/testA.csv', index_col='id')
target = train.pop('isDefault')
test = test[train.columns]

# non-numeric columns (to be target-encoded)
s = train.apply(lambda x: x.dtype)
tecols = s[s == 'object'].index.tolist()

# model
def makelgb():
    lgbr = LGBMRegressor(num_leaves=30,
                         max_depth=5,
                         learning_rate=.02,
                         n_estimators=1000,
                         subsample_for_bin=5000,
                         min_child_samples=200,
                         colsample_bytree=.2,
                         reg_alpha=.1,
                         reg_lambda=.1)
    return lgbr

# local validation
kf = KFold(n_splits=10, shuffle=True, random_state=100)
devscore = []
for tidx, didx in kf.split(train.index):
    tf = train.iloc[tidx]
    df = train.iloc[didx]
    tt = target.iloc[tidx]
    dt = target.iloc[didx]
    # fit the target encoder on the training fold only, then apply to the dev fold
    te = TargetEncoder(cols=tecols)
    tf = te.fit_transform(tf, tt)
    df = te.transform(df)
    lgbr = makelgb()
    lgbr.fit(tf, tt)
    pre = lgbr.predict(df)
    fpr, tpr, thresholds = roc_curve(dt, pre)
    score = auc(fpr, tpr)
    devscore.append(score)
print(np.mean(devscore))

# retrain on the full train set, predict test and write the submission
lgbr = makelgb()
te = TargetEncoder(cols=tecols)
tf = te.fit_transform(train, target)
df = te.transform(test)
lgbr.fit(tf, target)
pre = lgbr.predict(df)
pd.Series(pre, name='isDefault', index=test.index).reset_index().to_csv('submit.csv', index=False)
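
Side note: the roc_curve/auc pair inside the loop computes the same value that sklearn's single-call helper returns, so the fold score could equivalently be written as:

from sklearn.metrics import roc_auc_score
score = roc_auc_score(dt, pre)  # identical to auc(fpr, tpr) above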

【Issue encountered: installing the category-encoders package】
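
If the import fails, the package can normally be installed straight from PyPI (assuming a standard pip setup):

pip install category_encoders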

【Code output】
