
Machine Learning Workflow

This article demonstrates how to build a relatively complete machine-learning workflow.

1. First, set up the basic configuration parameters for the project.

# Basic modeling configuration
SCORE_EVA = 'roc_auc'                 # evaluation metric used throughout
random_state_clf = 1                  # random seed passed to the classifiers
n_jobs = 4                            # number of parallel workers
cv_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_split2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
X, y = data_of_features, label        # feature DataFrame and label Series prepared earlier
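For reference, the code in this and the following steps assumes roughly the imports below. This is a minimal sketch of the dependencies used (scikit-learn, XGBoost, scikit-optimize, pandas, NumPy); adjust it to your own environment.

# Shared imports assumed by the code in all steps of this article
import time
import timeit
from collections import Counter

import numpy as np
import pandas as pd

from sklearn import (ensemble, gaussian_process, linear_model, naive_bayes,
                     neighbors, svm, tree, discriminant_analysis, model_selection)
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Integer, Real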

2. Evaluate every ML model with its default parameters to get a sense of each model's baseline ability on the current task, and shortlist the promising ones for hyperparameter tuning.

# Machine Learning Algorithm (MLA) selection and initialization
MLA = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(random_state=random_state_clf),
    ensemble.BaggingClassifier(random_state=random_state_clf),
    ensemble.ExtraTreesClassifier(random_state=random_state_clf),
    ensemble.GradientBoostingClassifier(random_state=random_state_clf),
    ensemble.RandomForestClassifier(random_state=random_state_clf),

    # Gaussian processes
    gaussian_process.GaussianProcessClassifier(random_state=random_state_clf),

    # GLM
    linear_model.LogisticRegressionCV(random_state=random_state_clf),
    linear_model.PassiveAggressiveClassifier(random_state=random_state_clf),
    linear_model.RidgeClassifierCV(),            # RidgeClassifierCV has no random_state parameter
    linear_model.SGDClassifier(random_state=random_state_clf),
    linear_model.Perceptron(random_state=random_state_clf),

    # Naive Bayes (no random_state parameter)
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest neighbors (no random_state parameter)
    neighbors.KNeighborsClassifier(),

    # SVM
    svm.SVC(probability=True, random_state=random_state_clf),
    svm.NuSVC(probability=True, random_state=random_state_clf),
    svm.LinearSVC(random_state=random_state_clf),

    # Trees
    tree.DecisionTreeClassifier(random_state=random_state_clf),
    tree.ExtraTreeClassifier(random_state=random_state_clf),

    # Discriminant analysis (no random_state parameter)
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    # XGBoost
    XGBClassifier(random_state=random_state_clf)
]

MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Metric Mean',
               'MLA Test Metric Mean', 'MLA Test Metric 3*STD', 'MLA Time']
MLA_compare = pd.DataFrame(columns=MLA_columns)

row_index = 0
for alg in MLA:
    # record name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

    # score the model with cross-validation:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    cv_results = model_selection.cross_validate(alg, X, y, cv=cv_split,
                                                return_train_score=True, scoring=SCORE_EVA)
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Metric Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Metric Mean'] = cv_results['test_score'].mean()
    # if this is an unbiased random sample, +/- 3 standard deviations from the mean
    # should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Metric 3*STD'] = cv_results['test_score'].std() * 3  # the worst that can happen
    row_index += 1

MLA_compare = MLA_compare.sort_values(by='MLA Test Metric Mean', ascending=False)  # sort descending
MLA_compare
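To shortlist the promising models for the next step, one simple option is to keep the top rows of MLA_compare by mean test score. The sketch below is an illustration only; the cut-off of five models is an assumption, not part of the original workflow.

# Keep the five best models by mean test score for further tuning (cut-off is arbitrary)
top_models = MLA_compare.head(5)
print(top_models[['MLA Name', 'MLA Test Metric Mean', 'MLA Test Metric 3*STD']])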

3. Take the better-performing models and, combining cross-validation with recursive feature elimination (RFE), run hyperparameter tuning and feature selection at the same time.

# The function runs a cross-validated search using either RFE or SelectKBest for feature
# selection, combined with Bayesian or grid search for the hyperparameters.
def SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, cv_split, SCORE_EVA='roc_auc',
                                   Search_method='Bayes', feature_method='rfe',
                                   bayes_n_iter=10, verbose=0, n_jobs=1):
    if feature_method == 'rfe':
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('feature_selector', RFE(estimator=clf_model)),
                         ('model', clf_model)])
    else:
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('feature_selector', SelectKBest(f_classif)),
                         ('model', clf_model)])
    if Search_method == 'grid':
        grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv_split, verbose=verbose,
                                   scoring=SCORE_EVA, n_jobs=n_jobs)
    else:
        grid_search = BayesSearchCV(pipe, search_spaces=param_grid, verbose=verbose,
                                    scoring=SCORE_EVA, cv=cv_split, n_iter=bayes_n_iter,
                                    n_jobs=n_jobs)
    grid_search.fit(X, y)
    return grid_search


# The function finds the best feature set and hyperparameters through repeated (nested)
# cross-validation; across the outer folds, the features and parameter values that occur most
# often are taken as the final selection. Note that with the Bayesian method the hyperparameter
# result of each outer fold may be unique, so every value can end up with a count of 1.
# X is assumed to be a DataFrame, so X.columns provides the feature names.
def mutil_times_SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, cv_outter=10,
                                               cv_inner=5, SCORE_EVA='roc_auc',
                                               Search_method='Bayes', feature_method='rfe',
                                               bayes_n_iter=2, verbose=0, n_jobs=1):
    start_time = timeit.default_timer()
    inner_cv = StratifiedKFold(n_splits=cv_inner, shuffle=True, random_state=1)
    if cv_outter == 1:
        grid_search_result = SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, inner_cv,
                                                            SCORE_EVA, Search_method, feature_method,
                                                            bayes_n_iter, verbose, n_jobs)
        end_time = timeit.default_timer()
        print(f"Runtime: {(end_time - start_time)/60:.2f} min")
        print("Best score found: ", grid_search_result.best_score_)
        print("Best parameters found: ", grid_search_result.best_params_)
        print("Selected features:",
              np.array(X.columns)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_])
        return grid_search_result
    else:
        outer_cv = StratifiedKFold(n_splits=cv_outter, shuffle=True, random_state=0)
        roc = []
        best_params_history = []
        selected_features_history = []
        # run the hyperparameter optimization once per outer fold
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            search = SearchCV_Feature_and_Parameter(X_train, y_train, clf_model, param_grid, inner_cv,
                                                    SCORE_EVA, Search_method, feature_method,
                                                    bayes_n_iter, verbose, n_jobs)
            best_params_history.append(search.best_params_)
            best_model = search.best_estimator_
            selected_features = best_model.named_steps['feature_selector'].get_support()
            selected_features_history.append(selected_features)
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.predict_proba(X_test)[:, 1]
            roc.append(roc_auc_score(y_test, y_pred_proba))
        best_params_history_df = pd.DataFrame([dict(ordered_dict) for ordered_dict in best_params_history])
        best_params_history_df[SCORE_EVA] = roc
        print(f"{cv_outter}x{cv_inner}-fold cross-validation mean ROC: {np.mean(roc):.4f} "
              f"std: {np.std(roc):.4f}, {[round(meta, 3) for meta in roc]}")
        # for i, selected_features in enumerate(selected_features_history, start=1):
        #     print(f"Features selected in CV round {i}: {np.array(X.columns)[selected_features]}")

        # take the most frequent value of each hyperparameter across the outer folds
        param_names = best_params_history[0].keys()
        overall_best_params = {}
        for param_name in param_names:
            value_counts = Counter([params[param_name] for params in best_params_history])
            most_common_value = value_counts.most_common(1)[0][0]
            overall_best_params[param_name] = most_common_value
        print("Overall best hyperparameters: ", overall_best_params)

        # aggregate the per-fold selections into an overall best feature set
        total_features = X.shape[1]
        feature_selection_counts = np.zeros(total_features)
        # count how often each feature was selected
        for selected_features in selected_features_history:
            feature_selection_counts += selected_features.astype(int)
        # keep a feature if it was selected in more than half of the outer folds
        threshold = len(selected_features_history) // 2
        overall_best_features = feature_selection_counts > threshold
        print("Overall best feature set: ", np.array(X.columns)[overall_best_features])
        end_time = timeit.default_timer()
        print(f"Runtime: {(end_time - start_time)/60:.2f} min")
        return best_params_history_df, selected_features_history, roc


def model_evaluate(X, y, model, n_times, test_size=0.3):
    scores = []
    # repeat random train/test splits several times
    for i in range(n_times):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=i)
        # fit and evaluate; each call to fit re-initializes the model
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        scores.append(score)
    return scores
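The workflow below only exercises the Bayesian-search-plus-RFE path of SearchCV_Feature_and_Parameter. As a usage illustration of the other path (grid search plus SelectKBest), a minimal sketch follows; the estimator choice and parameter values are assumptions for illustration, and the k values assume the data has at least 15 features.

# Hypothetical example: grid search over a SelectKBest + RandomForest pipeline
rf_param_grid = {
    'feature_selector__k': [5, 10, 15],
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 6, None],
}
rf_search = SearchCV_Feature_and_Parameter(
    X, y, ensemble.RandomForestClassifier(random_state=random_state_clf),
    rf_param_grid, cv_split, SCORE_EVA=SCORE_EVA,
    Search_method='grid', feature_method='kbest', n_jobs=n_jobs)
print(rf_search.best_score_, rf_search.best_params_)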
# Build the ML workflow for the better-performing model: RFE feature selection plus
# hyperparameter tuning; this becomes the core model of this study.
grid_n_estimator = Integer(1, 300)
grid_ratio = Real(0.01, 1.0, 'log-uniform')
grid_learn = Real(0.01, 1.0, 'log-uniform')
grid_max_depth = Integer(1, 15)
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

# define the hyperparameter search space
param_grid = {
    # 'feature_selector__k': Integer(5, 15),   # used with SelectKBest instead of RFE
    'feature_selector__n_features_to_select': Integer(5, 15),
    'model__learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'model__max_depth': Integer(1, 50),
    'model__n_estimators': Integer(50, 200),
    'model__random_state': grid_seed
}

clf_model = XGBClassifier(scale_pos_weight=2, objective='binary:logistic', seed=0)
grid_search_result = mutil_times_SearchCV_Feature_and_Parameter(
    X, y, clf_model, param_grid, cv_outter=1, cv_inner=5, SCORE_EVA='roc_auc',
    Search_method='Bayes', feature_method='rfe', bayes_n_iter=10, verbose=0, n_jobs=n_jobs)

# With the optimal feature subset and hyperparameters in hand, evaluate the model's overall
# and generalization performance through repeated random splits; generalization is reflected
# in the std of the scores.
X_best = X[np.array(X.columns)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_]]
X_best = StandardScaler().fit_transform(X_best)
clf_model.set_params(**{k.replace("model__", ""): v
                        for k, v in grid_search_result.best_params_.items()
                        if k.startswith("model__")})
scores = model_evaluate(X_best, y, clf_model, n_times=100, test_size=0.3)
mean_score = round(np.mean(scores), 3)
std_score = round(np.std(scores), 3)
print('Best model', mean_score, std_score)
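The call above only runs the cv_outter=1 branch. If you want the full nested-cross-validation variant instead, the sketch below shows how the same function returns a per-fold history; it follows directly from the function defined earlier rather than from a run reported in the original article, and the fold counts are illustrative.

# Hypothetical example: nested cross-validation (10 outer x 5 inner folds)
best_params_history_df, selected_features_history, roc = mutil_times_SearchCV_Feature_and_Parameter(
    X, y, clf_model, param_grid, cv_outter=10, cv_inner=5, SCORE_EVA='roc_auc',
    Search_method='Bayes', feature_method='rfe', bayes_n_iter=10, verbose=0, n_jobs=n_jobs)
# one row per outer fold: the chosen hyperparameters plus that fold's ROC AUC
print(best_params_history_df)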

4. With feature selection done, use the selected feature subset and Bayesian cross-validated search to tune the hyperparameters of the candidate models.

# Multi-model ensembling on the optimal feature subset
# why choose one model, when you can pick them all with a voting classifier
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
# removed models without the 'predict_proba' attribute required for the voting classifier,
# and models with a 1.0 correlation to another model
vote_est = [
    # Ensemble methods: http://scikit-learn.org/stable/modules/ensemble.html
    ('ada', ensemble.AdaBoostClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('xgb', XGBClassifier())
    # ('bc', ensemble.BaggingClassifier()),
    # ('etc', ensemble.ExtraTreesClassifier()),
    # Gaussian processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
    # ('gpc', gaussian_process.GaussianProcessClassifier()),
    # GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
    # ('lr', linear_model.LogisticRegressionCV()),
    # Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
    # ('bnb', naive_bayes.BernoulliNB()),
    # ('gnb', naive_bayes.GaussianNB()),
    # Nearest neighbors: http://scikit-learn.org/stable/modules/neighbors.html
    # ('knn', neighbors.KNeighborsClassifier()),
    # SVM: http://scikit-learn.org/stable/modules/svm.html
    # ('svc', svm.SVC(probability=True)),
    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
]

# WARNING: running this is very compute-intensive and time-consuming.
# The code is written for experimental/developmental purposes and is not production ready!
# Hyperparameter tuning with GridSearchCV: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
grid_param = [
    [{
        # AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
        'n_estimators': grid_n_estimator,  # default=50
        'learning_rate': grid_learn,       # default=1
        # 'algorithm': ['SAMME', 'SAMME.R'],  # default='SAMME.R'
        'random_state': grid_seed
    }],
    [{
        # RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
        'n_estimators': grid_n_estimator,  # default=10
        'criterion': grid_criterion,       # default='gini'
        'max_depth': grid_max_depth,       # default=None
        'oob_score': [True],               # default=False -- 12/31/17 set to reduce runtime; best found: {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} in 146.35 s
        'random_state': grid_seed
    }],
    [{
        # GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
        # 'loss': ['deviance', 'exponential'],  # default='deviance'
        'learning_rate': [.05],    # default=0.1 -- 12/31/17 set to reduce runtime; best found: {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} in 264.45 s
        'n_estimators': [300],     # default=100 -- 12/31/17 set to reduce runtime (see above)
        # 'criterion': ['friedman_mse', 'mse', 'mae'],  # default='friedman_mse'
        'max_depth': grid_max_depth,  # default=3
        'random_state': grid_seed
    }],
    [{
        # XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
        'learning_rate': grid_learn,       # default=.3
        'max_depth': [1, 2, 4, 6, 8, 10],  # default=2
        'n_estimators': grid_n_estimator,
        'seed': grid_seed
    }],
    # The search spaces below are kept for reference but disabled: the triple-quoted string
    # counts as a single extra list element, and zip() below stops at the shorter list.
    '''
    [{
        # ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
        'n_estimators': grid_n_estimator,  # default=10
        'criterion': grid_criterion,       # default='gini'
        'max_depth': grid_max_depth,       # default=None
        'random_state': grid_seed
    }],
    [{
        # BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
        'n_estimators': grid_n_estimator,  # default=10
        'max_samples': grid_ratio,         # default=1.0
        'random_state': grid_seed
    }],
    [{
        # GaussianProcessClassifier
        'max_iter_predict': grid_n_estimator,  # default=100
        'random_state': grid_seed
    }],
    [{
        # LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
        'fit_intercept': grid_bool,  # default=True
        # 'penalty': ['l1', 'l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # default='lbfgs'
        'random_state': grid_seed
    }],
    [{
        # BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
        'alpha': grid_ratio,  # default=1.0
    }],
    # GaussianNB
    [{}],
    [{
        # KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7],  # default=5
        'weights': ['uniform', 'distance'],    # default='uniform'
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }],
    [{
        # SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
        # http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
        # 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 2, 3, 4, 5],  # default=1.0
        'gamma': grid_ratio,   # default='auto'
        'decision_function_shape': ['ovo', 'ovr'],  # default='ovr'
        'probability': [True],
        'random_state': grid_seed
    }],
    '''
]

start_total = time.perf_counter()  # https://docs.python.org/3/library/time.html#time.perf_counter
for clf, param in zip(vote_est, grid_param):  # https://docs.python.org/3/library/functions.html#zip
    # vote_est is a list of tuples: index 0 is the name, index 1 is the estimator
    start = time.perf_counter()
    # best_search = model_selection.GridSearchCV(estimator=clf[1], param_grid=param, cv=cv_split, scoring=SCORE_EVA)
    best_search = BayesSearchCV(clf[1], search_spaces=param, scoring=SCORE_EVA, cv=cv_split,
                                n_iter=50, n_jobs=16)
    best_search.fit(X_best, y)
    run = time.perf_counter() - start
    best_param = best_search.best_params_
    # set_params mutates the estimator objects inside vote_est, so the tuned settings
    # carry over to the ensembling in step 5
    clf[1].set_params(**best_param)
    # evaluate the tuned model over repeated splits to gauge generalization
    scores = model_evaluate(X_best, y, clf[1], 10, test_size=0.3)
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds, score {:.3f}, std {:.3f}'.format(
        clf[1].__class__.__name__, best_param, run, np.mean(scores), np.std(scores)))

run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total / 60))
print('-' * 10)

5. After each model has been tuned, combine them through multi-model ensembling, using either voting or stacking.

# Multi-model ensembling via voting
# Soft vote, i.e. weighted probabilities, with the tuned hyperparameters
vote = ensemble.VotingClassifier(estimators=vote_est, voting='soft')  # or voting='hard'
vote_cv = model_selection.cross_validate(vote, X_best, y, cv=cv_split, scoring=SCORE_EVA,
                                         return_train_score=True, n_jobs=16)
print("Soft Voting Training w/bin score mean: {:.2f}".format(vote_cv['train_score'].mean() * 100))
print("Soft Voting Test w/bin score mean: {:.2f}".format(vote_cv['test_score'].mean() * 100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(vote_cv['test_score'].std() * 100 * 3))
print('-' * 10)

# Stacking
meta_learner = LogisticRegression()
stacking_model = StackingClassifier(estimators=vote_est, final_estimator=meta_learner)
stacking_cv = model_selection.cross_validate(stacking_model, X_best, y, cv=cv_split, scoring=SCORE_EVA,
                                             return_train_score=True, n_jobs=16)
print("Stacking Training w/bin score mean: {:.2f}".format(stacking_cv['train_score'].mean() * 100))
print("Stacking Test w/bin score mean: {:.2f}".format(stacking_cv['test_score'].mean() * 100))
print("Stacking Test w/bin score 3*std: +/- {:.2f}".format(stacking_cv['test_score'].std() * 100 * 3))
print('-' * 10)
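Soft voting also accepts per-model weights. One option, not used in the original workflow, is to weight each estimator by its repeated-split ROC AUC from step 4. A minimal sketch follows; the model_scores values are placeholder assumptions, one per estimator in vote_est, and would normally come from the scores printed in step 4.

# Hypothetical weighted soft voting: weights proportional to each model's mean ROC AUC
model_scores = [0.81, 0.84, 0.83, 0.85]  # assumed values, in the same order as vote_est
weighted_vote = ensemble.VotingClassifier(estimators=vote_est, voting='soft', weights=model_scores)
weighted_cv = model_selection.cross_validate(weighted_vote, X_best, y, cv=cv_split,
                                             scoring=SCORE_EVA, return_train_score=True)
print("Weighted Soft Voting Test mean: {:.2f}".format(weighted_cv['test_score'].mean() * 100))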

Reference: A Data Science Framework: To Achieve 99% Accuracy | Kaggle
