The purpose of this article is to demonstrate how to build a relatively complete machine learning workflow.
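The snippets below come from a notebook whose import cell is not shown; roughly the following imports are assumed (reconstructed here, so the exact list in the original may differ):
- import time, timeit
- from collections import Counter
- import numpy as np
- import pandas as pd
- from sklearn import (discriminant_analysis, ensemble, gaussian_process, linear_model,
-     model_selection, naive_bayes, neighbors, svm, tree)
- from sklearn.ensemble import StackingClassifier
- from sklearn.feature_selection import RFE, SelectKBest, f_classif
- from sklearn.linear_model import LogisticRegression
- from sklearn.metrics import roc_auc_score
- from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
- from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import StandardScaler
- from skopt import BayesSearchCV
- from skopt.space import Integer, Real
- from xgboost import XGBClassifier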
- # Basic configuration for modeling
- SCORE_EVA = 'roc_auc'
- random_state_clf = 1
- n_jobs= 4
- cv_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
- cv_split2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
- X, y = data_of_features, label
- #Machine Learning Algorithm (MLA) Selection and Initialization
- MLA = [
- #Ensemble Methods
- ensemble.AdaBoostClassifier(random_state=random_state_clf),
- ensemble.BaggingClassifier(random_state=random_state_clf),
- ensemble.ExtraTreesClassifier(random_state=random_state_clf),
- ensemble.GradientBoostingClassifier(random_state=random_state_clf),
- ensemble.RandomForestClassifier(random_state=random_state_clf),
-
- #Gaussian Processes
- gaussian_process.GaussianProcessClassifier(random_state=random_state_clf),
-
- #GLM
- linear_model.LogisticRegressionCV(random_state=random_state_clf),
- linear_model.PassiveAggressiveClassifier(random_state=random_state_clf),
- linear_model.RidgeClassifierCV(),
- linear_model.SGDClassifier(random_state=random_state_clf),
- linear_model.Perceptron(random_state=random_state_clf),
-
- #Naive Bayes
- naive_bayes.BernoulliNB(),
- naive_bayes.GaussianNB(),
-
- #Nearest Neighbor
- neighbors.KNeighborsClassifier(),
-
- #SVM
- svm.SVC(probability=True,random_state=random_state_clf),
- svm.NuSVC(probability=True,random_state=random_state_clf),
- svm.LinearSVC(random_state=random_state_clf),
-
- #Trees
- tree.DecisionTreeClassifier(random_state=random_state_clf),
- tree.ExtraTreeClassifier(random_state=random_state_clf),
-
- #Discriminant Analysis
- discriminant_analysis.LinearDiscriminantAnalysis(),
- discriminant_analysis.QuadraticDiscriminantAnalysis(),
-
- XGBClassifier(random_state=random_state_clf)
- ]
-
- MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Metric Mean', 'MLA Test Metric Mean', 'MLA Test Metric 3*STD' ,'MLA Time']
- MLA_compare = pd.DataFrame(columns = MLA_columns)
- row_index = 0
- for alg in MLA:
-
- #set name and parameters
- MLA_name = alg.__class__.__name__
- MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
- MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
-
- #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
- cv_results = model_selection.cross_validate(alg, X, y, cv = cv_split,return_train_score=True,scoring = SCORE_EVA)
-
- MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
- MLA_compare.loc[row_index, 'MLA Train Metric Mean'] = cv_results['train_score'].mean()
- MLA_compare.loc[row_index, 'MLA Test Metric Mean'] = cv_results['test_score'].mean()
- #if this is an unbiased random sample, then +/- 3 standard deviations (std) from the mean should statistically capture 99.7% of the subsets
- MLA_compare.loc[row_index, 'MLA Test Metric 3*STD'] = cv_results['test_score'].std()*3 #let's know the worst that can happen!
-
- row_index+=1
-
- MLA_compare = MLA_compare.sort_values(by='MLA Test Metric Mean', ascending=False) # sort in descending order
- MLA_compare
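A quick bar chart makes the comparison table easier to scan (a minimal sketch assuming matplotlib/seaborn are available; not part of the original code):
- import matplotlib.pyplot as plt
- import seaborn as sns
- # bar chart of each algorithm's mean test score; the columns are object dtype, so cast to float first
- sns.barplot(x=MLA_compare['MLA Test Metric Mean'].astype(float), y=MLA_compare['MLA Name'])
- plt.xlabel('Mean cross-validated ' + SCORE_EVA)
- plt.show()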
- #This function optimizes features and hyperparameters with cross-validation, using either RFE or SelectKBest for feature selection combined with Bayesian or grid search
- def SearchCV_Feature_and_Parameter(X,y,clf_model,param_grid,cv_split,SCORE_EVA='roc_auc',Search_method ='Bayes',feature_method = 'rfe', bayes_n_iter=10,verbose = 0,n_jobs=1):
- if feature_method == 'rfe':
- pipe = Pipeline( [('scaler', StandardScaler()),('feature_selector', RFE(estimator=clf_model)),('model', clf_model)])
- else:
- pipe = Pipeline( [('scaler', StandardScaler()),('feature_selector', SelectKBest(f_classif)), ('model', clf_model)])
-
- if Search_method =='grid':
- grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv_split, verbose=verbose,scoring = SCORE_EVA,n_jobs=n_jobs)
- else:
- grid_search = BayesSearchCV(pipe, search_spaces=param_grid, verbose=verbose, scoring=SCORE_EVA, cv=cv_split, n_iter=bayes_n_iter,n_jobs=n_jobs)
- grid_search.fit(X, y)
- return grid_search
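As a quick illustration of how this helper can be called (hypothetical parameter ranges; the article's real search space is defined further below):
- # example: grid search over an RFE + random forest pipeline (illustrative ranges only)
- example_grid = {
-     'feature_selector__n_features_to_select': [5, 10],
-     'model__n_estimators': [100, 200],
- }
- rf_search = SearchCV_Feature_and_Parameter(X, y, ensemble.RandomForestClassifier(random_state=0),
-     example_grid, cv_split, SCORE_EVA='roc_auc', Search_method='grid', feature_method='rfe')
- print(rf_search.best_score_, rf_search.best_params_)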
- #This function finds the best feature set and hyperparameters through repeated k-fold cross-validation; the feature subsets and parameter values that occur most often across the runs are taken as the final selection.
- #Note that with the Bayesian method the hyperparameters found by each run may all be unique, in which case every result occurs only once.
-
- def multi_times_SearchCV_Feature_and_Parameter(X,y,clf_model,param_grid,cv_outer=10,cv_inner=5,SCORE_EVA='roc_auc',Search_method ='Bayes',feature_method = 'rfe',bayes_n_iter=2,verbose=0,n_jobs=1):
- start_time = timeit.default_timer()
- inner_cv = StratifiedKFold(n_splits=cv_inner, shuffle=True, random_state=1)
- if cv_outer==1:
- grid_search_result = SearchCV_Feature_and_Parameter(X,y,clf_model,param_grid,inner_cv,SCORE_EVA,Search_method,feature_method,bayes_n_iter,verbose,n_jobs)
- end_time = timeit.default_timer()
- print(f"函数运行时间为 {(end_time - start_time)/60} 分")
- print("Best score found: ", grid_search_result.best_score_)
- print("Best parameters found: ", grid_search_result.best_params_)
- print("Selected features:", np.array(features43)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_])
- return grid_search_result
- else:
- outer_cv = StratifiedKFold(n_splits=cv_outer, shuffle=True, random_state=0)
-
- roc =[]
- best_params_history = []
- selected_features_history = []
- # Run hyperparameter optimization on each outer fold
- for train_idx, test_idx in outer_cv.split(X, y):
- X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
- y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
- search = SearchCV_Feature_and_Parameter(X_train,y_train,clf_model,param_grid,inner_cv,SCORE_EVA,Search_method,feature_method,bayes_n_iter,verbose,n_jobs)
- best_params_history.append(search.best_params_)
- best_model = search.best_estimator_
-
- selected_features = best_model.named_steps['feature_selector'].get_support()
- selected_features_history.append(selected_features)
-
- y_pred = best_model.predict(X_test)
- y_pred_proba = best_model.predict_proba(X_test)[:,1]
- roc.append(roc_auc_score(y_test, y_pred_proba))
-
- best_params_history_df = pd.DataFrame([dict(ordered_dict) for ordered_dict in best_params_history])
- best_params_history_df[SCORE_EVA] = roc
-
- print(f"{cv_split}次{cv_split}折交叉验证平均ROC: {np.mean(roc):.4f} std: {np.std(roc):.4f} ,{[round(meta,3) for meta in roc]}")
- #for i, selected_features in enumerate(selected_features_history, start=1):
- # print(f"第{i}次交叉验证所选择的特征: {np.array(features)[selected_features]}")
- param_names = best_params_history[0].keys()
- overall_best_params = {}
- for param_name in param_names:
- value_counts = Counter([params[param_name] for params in best_params_history])
- most_common_value = value_counts.most_common(1)[0][0]
- overall_best_params[param_name] = most_common_value
- print("整体最佳超参数: ", overall_best_params)
- # 多模型集成+计算整体最佳作为最终超参数
- total_features = X.shape[1]
- feature_selection_counts = np.zeros(total_features)
- # 统计每个特征被选中的次数
- for selected_features in selected_features_history:
- feature_selection_counts += selected_features.astype(int)
- # Set a threshold to determine the overall best feature set
- threshold = len(selected_features_history) // 2
- overall_best_features = feature_selection_counts > threshold
- print("Overall best feature set: ", np.array(features)[overall_best_features])
- end_time = timeit.default_timer()
- print(f"Function runtime: {(end_time - start_time)/60:.2f} minutes")
- return best_params_history_df,selected_features_history,roc
- def model_evaluate(X,y,model,n_times,test_size=0.3):
- scores = []
- # Perform multiple random train/test splits
- for i in range(n_times):
- # Split into training and test sets
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,random_state=i)
- # Train and evaluate the model; each call to fit re-initializes the model
- model.fit(X_train, y_train)
- y_pred = model.predict_proba(X_test)[:,1]
- score = roc_auc_score(y_test, y_pred)
- scores.append(score)
- return scores
- # Build the ML workflow: apply RFE feature selection and hyperparameter tuning to the better-performing model, which serves as the core model of this exercise
- grid_n_estimator = Integer(1, 300)
- grid_ratio = Real(0.01, 1.0, 'log-uniform')
- grid_learn = Real(0.01, 1.0, 'log-uniform')
- grid_max_depth = Integer(1, 15)
- grid_min_samples = [5, 10, .03, .05, .10]
- grid_criterion = ['gini', 'entropy']
- grid_bool = [True, False]
- grid_seed = [0]
-
- # Define the hyperparameter search space
- param_grid = {
- #'feature_selector__k': Integer(5, 15),
- 'feature_selector__n_features_to_select': Integer(5, 15),
- 'model__learning_rate': Real(0.01, 1.0, 'log-uniform'),
- 'model__max_depth': Integer(1, 50),
- 'model__n_estimators': Integer(50, 200),
- 'model__random_state': grid_seed
- }
-
- clf_model = XGBClassifier(scale_pos_weight=2,objective='binary:logistic',seed=0)
- grid_search_result = multi_times_SearchCV_Feature_and_Parameter(X,y,clf_model,param_grid,cv_outer=1,cv_inner=5,SCORE_EVA='roc_auc',Search_method ='Bayes',feature_method = 'rfe',bayes_n_iter=10,verbose=0,n_jobs=n_jobs)
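For completeness, the nested-CV variant of the same call would look like the sketch below (illustrative settings; cv_outer > 1 triggers the outer loop and returns the per-fold history instead of a fitted search object, and is considerably more expensive):
- # example: 10 outer folds, each with a 5-fold inner Bayesian search (not run in this article)
- params_df, feats_history, roc_scores = multi_times_SearchCV_Feature_and_Parameter(
-     X, y, clf_model, param_grid, cv_outer=10, cv_inner=5, SCORE_EVA='roc_auc',
-     Search_method='Bayes', feature_method='rfe', bayes_n_iter=10, n_jobs=n_jobs)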
- # After obtaining the optimal feature subset and hyperparameters, evaluate overall and generalization performance via repeated data splits; generalization is reflected in the std of the scores
- X_best = X[np.array(features43)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_]]
- X_best = StandardScaler().fit_transform(X_best)
- clf_model.set_params(**{k.replace("model__", ""): v for k, v in grid_search_result.best_params_.items() if k.startswith("model__")})
- scores = model_evaluate(X_best,y,clf_model,n_times=100,test_size=0.3)
- mean_score = round(np.mean(scores),3)
- std_score = round(np.std(scores),3)
- print('Best model:',mean_score,std_score)
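Besides the mean and std, the empirical spread of the 100 scores can be summarized directly (a small addition, not in the original code):
- # empirical 95% range of the repeated evaluation scores
- low, high = np.percentile(scores, [2.5, 97.5])
- print(f'95% score range: [{low:.3f}, {high:.3f}]')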
- #Multi-model ensembling based on the optimal feature subset
- #why choose one model, when you can pick them all with voting classifier
- #http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
- #removed models w/o attribute 'predict_proba' required for vote classifier and models with a 1.0 correlation to another model
- vote_est = [
- #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
- ('ada', ensemble.AdaBoostClassifier()),
- ('rfc', ensemble.RandomForestClassifier()),
- ('gbc', ensemble.GradientBoostingClassifier()),
-
- ('xgb', XGBClassifier())
- #('bc', ensemble.BaggingClassifier()),
- #('etc',ensemble.ExtraTreesClassifier()),
- #('gbc', ensemble.GradientBoostingClassifier()),
- #('rfc', ensemble.RandomForestClassifier()),
-
- #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
- #('gpc', gaussian_process.GaussianProcessClassifier()),
-
- #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
- #('lr', linear_model.LogisticRegressionCV()),
-
- #Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
- #('bnb', naive_bayes.BernoulliNB()),
- #('gnb', naive_bayes.GaussianNB()),
-
- #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
- #('knn', neighbors.KNeighborsClassifier()),
-
- #SVM: http://scikit-learn.org/stable/modules/svm.html
- # ('svc', svm.SVC(probability=True)),
-
- #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
- #('xgb', XGBClassifier())
-
- ]
-
-
- #WARNING: Running is very computationally intensive and time expensive.
- #Code is written for experimental/developmental purposes and not production ready!
- #Hyperparameter Tune with GridSearchCV: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- grid_param = [
- [{
- #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
- 'n_estimators': grid_n_estimator, #default=50
- 'learning_rate': grid_learn, #default=1
- #'algorithm': ['SAMME', 'SAMME.R'], #default='SAMME.R'
- 'random_state': grid_seed
- }],
-
- [{
- #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
- 'n_estimators': grid_n_estimator, #default=10
- 'criterion': grid_criterion, #default=”gini”
- 'max_depth': grid_max_depth, #default=None
- 'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
- 'random_state': grid_seed
- }],
-
- [{
- #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
- #'loss': ['deviance', 'exponential'], #default=’deviance’
- 'learning_rate': [.05], #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
- 'n_estimators': [300], #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
- #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”
- 'max_depth': grid_max_depth, #default=3
- 'random_state': grid_seed
- }],
-
- [{
- #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
- 'learning_rate': grid_learn, #default: .3
- 'max_depth': [1,2,4,6,8,10], #default=6
- 'n_estimators': grid_n_estimator,
- 'seed': grid_seed
- }] ,
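- # NOTE: the grids below are deliberately disabled by wrapping them in a string literal; they
- # correspond to the estimators commented out in vote_est, and since zip() stops at the shorter
- # of vote_est and grid_param, the string element is never paired with an estimator.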
- '''
- [{
- #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
- 'n_estimators': grid_n_estimator, #default=10
- 'criterion': grid_criterion, #default=”gini”
- 'max_depth': grid_max_depth, #default=None
- 'random_state': grid_seed
- }],
-
-
- [{
- #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
- 'n_estimators': grid_n_estimator, #default=10
- 'max_samples': grid_ratio, #default=1.0
- 'random_state': grid_seed
- }],
-
-
- [{
- #GaussianProcessClassifier
- 'max_iter_predict': grid_n_estimator, #default: 100
- 'random_state': grid_seed
- }],
-
-
- [{
- #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
- 'fit_intercept': grid_bool, #default: True
- #'penalty': ['l1','l2'],
- 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs
- 'random_state': grid_seed
- }],
-
-
- [{
- #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
- 'alpha': grid_ratio, #default: 1.0
- }],
-
-
- #GaussianNB -
- [{}],
-
- [{
- #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
- 'n_neighbors': [1,2,3,4,5,6,7], #default: 5
- 'weights': ['uniform', 'distance'], #default = ‘uniform’
- 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
- }],
-
-
- [{
- #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
- #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
- #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
- 'C': [1,2,3,4,5], #default=1.0
- 'gamma': grid_ratio, #default: auto
- 'decision_function_shape': ['ovo', 'ovr'], #default:ovr
- 'probability': [True],
- 'random_state': grid_seed
- }],
- '''
-
- ]
-
- start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter
- for clf, param in zip (vote_est, grid_param): #https://docs.python.org/3/library/functions.html#zip
-
- #print(clf[0],clf[1]) #vote_est is a list of tuples, index 0 is the name and index 1 is the algorithm
- #print(param)
-
- start = time.perf_counter()
- #best_search = model_selection.GridSearchCV(estimator = clf[1], param_grid = param, cv = cv_split, scoring = SCORE_EVA)
- best_search = BayesSearchCV(clf[1], search_spaces=param, scoring=SCORE_EVA, cv=cv_split, n_iter=50,n_jobs=16)
- best_search.fit(X_best, y)
- run = time.perf_counter() - start
-
- best_param = best_search.best_params_
- clf[1].set_params(**best_param)
- #Evaluate the model over multiple splits to assess generalization
- scores = model_evaluate(X_best, y,clf[1],10,test_size=0.3)
- print('The best parameter for {} is {} with a runtime of {:.2f} seconds, scoring is {:.3f}, std: {:.3f}'.format(clf[1].__class__.__name__, best_param, run,np.mean(scores),np.std(scores)))
-
- run_total = time.perf_counter() - start_total
- print('Total optimization time was {:.2f} minutes.'.format(run_total/60))
- print('-'*10)
- #Multi-model ensembling via voting
- #Soft Vote or weighted probabilities w/Tuned Hyperparameters
- vote = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft') #voting = 'hard'
- vote_cv = model_selection.cross_validate(vote, X_best, y, cv = cv_split,scoring = SCORE_EVA,return_train_score=True,n_jobs=16)
- print("Soft Voting Training w/bin score mean: {:.2f}". format(vote_cv['train_score'].mean()*100))
- print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_cv['test_score'].mean()*100))
- print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_cv['test_score'].std()*100*3))
- print('-'*10)
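If some base models deserve more influence than others, VotingClassifier also accepts per-estimator weights (illustrative weights below; the original code uses the unweighted default):
- # optional: weight the four estimators unequally in the soft vote (illustrative values)
- vote_weighted = ensemble.VotingClassifier(estimators=vote_est, voting='soft', weights=[1, 1, 2, 2])
- vote_weighted_cv = model_selection.cross_validate(vote_weighted, X_best, y, cv=cv_split,
-     scoring=SCORE_EVA, return_train_score=True, n_jobs=16)
- print("Weighted Soft Voting Test score mean: {:.2f}".format(vote_weighted_cv['test_score'].mean()*100))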
-
- # Stacking
- meta_learner = LogisticRegression()
- stacking_model = StackingClassifier(estimators=vote_est, final_estimator=meta_learner)
- stacking_cv = model_selection.cross_validate(stacking_model, X_best, y, cv = cv_split,scoring = SCORE_EVA,return_train_score=True,n_jobs=16)
- print("Stacking Training w/bin score mean: {:.2f}". format(stacking_cv['train_score'].mean()*100))
- print("Stacking Test w/bin score mean: {:.2f}". format(stacking_cv['test_score'].mean()*100))
- print("Stacking Test w/bin score 3*std: +/- {:.2f}". format(stacking_cv['test_score'].std()*100*3))
- print('-'*10)
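Finally, once an ensembling strategy is chosen, the model can be refit on all available data and used to score new samples (a minimal sketch; new_samples is a hypothetical array preprocessed with the same feature subset and scaler as X_best):
- # fit the chosen ensemble on the full data set and score new, preprocessed samples
- stacking_model.fit(X_best, y)
- # new_samples: hypothetical array with the same columns/scaling as X_best
- # new_probs = stacking_model.predict_proba(new_samples)[:, 1]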
Reference: A Data Science Framework: To Achieve 99% Accuracy | Kaggle