当前位置:   article > 正文

lightGBM使用案例_lightgbm示例

lightgbm示例

转载自原文

xgboost:https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

lightgbm的两种使用方式

lightgbm

  1. #!/usr/bin/env python2
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Sat Mar 31 21:19:09 2018
  5. @author: hello4720
  6. """
  7. import numpy as np
  8. import pandas as pd
  9. import lightgbm as lgb
  10. from sklearn import metrics
  11. from sklearn.model_selection import train_test_split
  12. ### 读取数据
  13. print("载入数据")
  14. dataset1 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data1.csv')
  15. dataset2 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data2.csv')
  16. dataset3 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data3.csv')
  17. dataset4 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data4.csv')
  18. dataset5 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv')
  19. dataset1.drop_duplicates(inplace=True)
  20. dataset2.drop_duplicates(inplace=True)
  21. dataset3.drop_duplicates(inplace=True)
  22. dataset4.drop_duplicates(inplace=True)
  23. dataset5.drop_duplicates(inplace=True)
  24. ### 数据合并
  25. print("数据合并")
  26. trains = pd.concat([dataset1,dataset2],axis=0)
  27. trains = pd.concat([trains,dataset3],axis=0)
  28. trains = pd.concat([trains,dataset4],axis=0)
  29. online_test = dataset5
  30. ### 数据拆分
  31. print("数据拆分")
  32. train_xy,offline_test = train_test_split(trains, test_size = 0.2,random_state=21)
  33. train,val = train_test_split(train_xy, test_size = 0.2,random_state=21)
  34. print("训练集")
  35. y = train.is_trade # 训练集标签
  36. X = train.drop(['instance_id','is_trade'],axis=1) # 训练集特征矩阵
  37. print("验证集")
  38. val_y = val.is_trade # 验证集标签
  39. val_X = val.drop(['instance_id','is_trade'],axis=1) # 验证集特征矩阵
  40. print("测试集")
  41. offline_test_X=offline_test.drop(['instance_id','is_trade'],axis=1) # 线下测试特征矩阵
  42. online_test_X=online_test.drop(['instance_id'],axis=1) # 线上测试特征矩阵
  43. ### 数据转换
  44. lgb_train = lgb.Dataset(X, y, free_raw_data=False)
  45. lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train,free_raw_data=False)
  46. ### 开始训练
  47. print('设置参数')
  48. params = {
  49. 'boosting_type': 'gbdt',
  50. 'boosting': 'dart',
  51. 'objective': 'binary',
  52. 'metric': 'binary_logloss',
  53. 'learning_rate': 0.01,
  54. 'num_leaves':25,
  55. 'max_depth':3,
  56. 'max_bin':10,
  57. 'min_data_in_leaf':8,
  58. 'feature_fraction': 0.6,
  59. 'bagging_fraction': 1,
  60. 'bagging_freq':0,
  61. 'lambda_l1': 0,
  62. 'lambda_l2': 0,
  63. 'min_split_gain': 0
  64. }
  65. print("开始训练")
  66. gbm = lgb.train(params, # 参数字典
  67. lgb_train, # 训练集
  68. num_boost_round=2000, # 迭代次数
  69. valid_sets=lgb_eval, # 验证集
  70. early_stopping_rounds=30) # 早停系数
  71. ### 线下预测
  72. print ("线下预测")
  73. preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration) # 输出概率
  74. offline=offline_test[['instance_id','is_trade']]
  75. offline['preds']=preds_offline
  76. offline.is_trade = offline['is_trade'].astype(np.float64)
  77. print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))
  78. ### 线上预测
  79. print("线上预测")
  80. preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration) # 输出概率
  81. online=online_test[['instance_id']]
  82. online['preds']=preds_online
  83. online.rename(columns={'preds':'predicted_score'},inplace=True)
  84. online.to_csv("./data/20180405.txt",index=None,sep=' ')
  85. ### 保存模型
  86. from sklearn.externals import joblib
  87. joblib.dump(gbm,'gbm.pkl')
  88. ### 特征选择
  89. df = pd.DataFrame(X.columns.tolist(), columns=['feature'])
  90. df['importance']=list(gbm.feature_importance())
  91. df = df.sort_values(by='importance',ascending=False)
  92. df.to_csv("./data/feature_score_20180405.csv",index=None,encoding='gbk')

调参案例

  lightgbm使用leaf_wise tree生长策略,leaf_wise_tree的优点是收敛速度快,缺点是容易过拟合。

# lightgbm关键参数

image

  1. # -*- coding: utf-8 -*-
  2. """
  3. # 作者:wanglei5205
  4. # 邮箱:wanglei5205@126.com
  5. # 博客:http://cnblogs.com/wanglei5205
  6. # github:http://github.com/wanglei5205
  7. """
  8. ### 导入模块
  9. import numpy as np
  10. import pandas as pd
  11. import lightgbm as lgb
  12. from sklearn import metrics
  13. ### 载入数据
  14. print('载入数据')
  15. dataset1 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data1.csv')
  16. dataset2 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data2.csv')
  17. dataset3 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data3.csv')
  18. dataset4 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data4.csv')
  19. dataset5 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv')
  20. print('数据去重')
  21. dataset1.drop_duplicates(inplace=True)
  22. dataset2.drop_duplicates(inplace=True)
  23. dataset3.drop_duplicates(inplace=True)
  24. dataset4.drop_duplicates(inplace=True)
  25. dataset5.drop_duplicates(inplace=True)
  26. print('数据合并')
  27. trains = pd.concat([dataset1,dataset2],axis=0)
  28. trains = pd.concat([trains,dataset3],axis=0)
  29. trains = pd.concat([trains,dataset4],axis=0)
  30. online_test = dataset5
  31. ### 数据拆分(训练集+验证集+测试集)
  32. print('数据拆分')
  33. from sklearn.model_selection import train_test_split
  34. train_xy,offline_test = train_test_split(trains,test_size = 0.2,random_state=21)
  35. train,val = train_test_split(train_xy,test_size = 0.2,random_state=21)
  36. # 训练集
  37. y_train = train.is_trade # 训练集标签
  38. X_train = train.drop(['instance_id','is_trade'],axis=1) # 训练集特征矩阵
  39. # 验证集
  40. y_val = val.is_trade # 验证集标签
  41. X_val = val.drop(['instance_id','is_trade'],axis=1) # 验证集特征矩阵
  42. # 测试集
  43. offline_test_X = offline_test.drop(['instance_id','is_trade'],axis=1) # 线下测试特征矩阵
  44. online_test_X = online_test.drop(['instance_id'],axis=1) # 线上测试特征矩阵
  45. ### 数据转换
  46. print('数据转换')
  47. lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
  48. lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train,free_raw_data=False)
  49. ### 设置初始参数--不含交叉验证参数
  50. print('设置参数')
  51. params = {
  52. 'boosting_type': 'gbdt',
  53. 'objective': 'binary',
  54. 'metric': 'binary_logloss',
  55. }
  56. ### 交叉验证(调参)
  57. print('交叉验证')
  58. min_merror = float('Inf')
  59. best_params = {}
  60. # 准确率
  61. print("调参1:提高准确率")
  62. for num_leaves in range(20,200,5):
  63. for max_depth in range(3,8,1):
  64. params['num_leaves'] = num_leaves
  65. params['max_depth'] = max_depth
  66. cv_results = lgb.cv(
  67. params,
  68. lgb_train,
  69. seed=2018,
  70. nfold=3,
  71. metrics=['binary_error'],
  72. early_stopping_rounds=10,
  73. verbose_eval=True
  74. )
  75. mean_merror = pd.Series(cv_results['binary_error-mean']).min()
  76. boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()
  77. if mean_merror < min_merror:
  78. min_merror = mean_merror
  79. best_params['num_leaves'] = num_leaves
  80. best_params['max_depth'] = max_depth
  81. params['num_leaves'] = best_params['num_leaves']
  82. params['max_depth'] = best_params['max_depth']
  83. # 过拟合
  84. print("调参2:降低过拟合")
  85. for max_bin in range(1,255,5):
  86. for min_data_in_leaf in range(10,200,5):
  87. params['max_bin'] = max_bin
  88. params['min_data_in_leaf'] = min_data_in_leaf
  89. cv_results = lgb.cv(
  90. params,
  91. lgb_train,
  92. seed=42,
  93. nfold=3,
  94. metrics=['binary_error'],
  95. early_stopping_rounds=3,
  96. verbose_eval=True
  97. )
  98. mean_merror = pd.Series(cv_results['binary_error-mean']).min()
  99. boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()
  100. if mean_merror < min_merror:
  101. min_merror = mean_merror
  102. best_params['max_bin']= max_bin
  103. best_params['min_data_in_leaf'] = min_data_in_leaf
  104. params['min_data_in_leaf'] = best_params['min_data_in_leaf']
  105. params['max_bin'] = best_params['max_bin']
  106. print("调参3:降低过拟合")
  107. for feature_fraction in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
  108. for bagging_fraction in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
  109. for bagging_freq in range(0,50,5):
  110. params['feature_fraction'] = feature_fraction
  111. params['bagging_fraction'] = bagging_fraction
  112. params['bagging_freq'] = bagging_freq
  113. cv_results = lgb.cv(
  114. params,
  115. lgb_train,
  116. seed=42,
  117. nfold=3,
  118. metrics=['binary_error'],
  119. early_stopping_rounds=3,
  120. verbose_eval=True
  121. )
  122. mean_merror = pd.Series(cv_results['binary_error-mean']).min()
  123. boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()
  124. if mean_merror < min_merror:
  125. min_merror = mean_merror
  126. best_params['feature_fraction'] = feature_fraction
  127. best_params['bagging_fraction'] = bagging_fraction
  128. best_params['bagging_freq'] = bagging_freq
  129. params['feature_fraction'] = best_params['feature_fraction']
  130. params['bagging_fraction'] = best_params['bagging_fraction']
  131. params['bagging_freq'] = best_params['bagging_freq']
  132. print("调参4:降低过拟合")
  133. for lambda_l1 in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
  134. for lambda_l2 in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
  135. for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
  136. params['lambda_l1'] = lambda_l1
  137. params['lambda_l2'] = lambda_l2
  138. params['min_split_gain'] = min_split_gain
  139. cv_results = lgb.cv(
  140. params,
  141. lgb_train,
  142. seed=42,
  143. nfold=3,
  144. metrics=['binary_error'],
  145. early_stopping_rounds=3,
  146. verbose_eval=True
  147. )
  148. mean_merror = pd.Series(cv_results['binary_error-mean']).min()
  149. boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()
  150. if mean_merror < min_merror:
  151. min_merror = mean_merror
  152. best_params['lambda_l1'] = lambda_l1
  153. best_params['lambda_l2'] = lambda_l2
  154. best_params['min_split_gain'] = min_split_gain
  155. params['lambda_l1'] = best_params['lambda_l1']
  156. params['lambda_l2'] = best_params['lambda_l2']
  157. params['min_split_gain'] = best_params['min_split_gain']
  158. print(best_params)
  159. ### 训练
  160. params['learning_rate']=0.01
  161. lgb.train(
  162. params, # 参数字典
  163. lgb_train, # 训练集
  164. valid_sets=lgb_eval, # 验证集
  165. num_boost_round=2000, # 迭代次数
  166. early_stopping_rounds=50 # 早停次数
  167. )
  168. ### 线下预测
  169. print ("线下预测")
  170. preds_offline = lgb.predict(offline_test_X, num_iteration=lgb.best_iteration) # 输出概率
  171. offline=offline_test[['instance_id','is_trade']]
  172. offline['preds']=preds_offline
  173. offline.is_trade = offline['is_trade'].astype(np.float64)
  174. print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))
  175. ### 线上预测
  176. print("线上预测")
  177. preds_online = lgb.predict(online_test_X, num_iteration=lgb.best_iteration) # 输出概率
  178. online=online_test[['instance_id']]
  179. online['preds']=preds_online
  180. online.rename(columns={'preds':'predicted_score'},inplace=True) # 更改列名
  181. online.to_csv("./data/20180405.txt",index=None,sep=' ') # 保存结果
  182. ### 保存模型
  183. from sklearn.externals import joblib
  184. joblib.dump(lgb,'lgb.pkl')
  185. ### 特征选择
  186. df = pd.DataFrame(X_train.columns.tolist(), columns=['feature'])
  187. df['importance']=list(lgb.feature_importance()) # 特征分数
  188. df = df.sort_values(by='importance',ascending=False) # 特征排序
  189. df.to_csv("./data/feature_score_20180331.csv",index=None,encoding='gbk') # 保存分数

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/盐析白兔/article/detail/258074
推荐阅读
  

闽ICP备14008679号