Author: He Baisheng, School of Economics and Management, Harbin Institute of Technology (Weihai), Quantitative Finance track.
The MLAT series records my coursework in blog form. My assigned task is 12 gradient_boosting; this post follows the previous one, create dataset, and applies several boosting methods to that dataset.
- import sys, os
- import warnings
- from time import time
- from itertools import product
- import joblib
- from pathlib import Path
- import numpy as np
- import pandas as pd
-
- import matplotlib.pyplot as plt
- from matplotlib.ticker import FuncFormatter
- from mpl_toolkits.mplot3d import Axes3D
- import seaborn as sns
-
- from xgboost import XGBClassifier
- from lightgbm import LGBMClassifier
- from catboost import CatBoostClassifier
- from sklearn.model_selection import cross_validate
- from sklearn.dummy import DummyClassifier
- from sklearn.tree import DecisionTreeClassifier
- # needed for HistGradientBoostingClassifier
- from sklearn.experimental import enable_hist_gradient_boosting
- from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
- from sklearn.inspection import partial_dependence, plot_partial_dependence
- from sklearn.metrics import roc_auc_score
Other settings
- results_path = Path(r'E:/machine learning for algorithmic trading','results', 'baseline')
-
- warnings.filterwarnings('ignore')
- sns.set_style("whitegrid")
- idx = pd.IndexSlice
- np.random.seed(42)
-
- DATA_STORE = r'E:/machine learning for algorithmic trading/wiki.h5'
The dataset used here is the one built in the previous post; in the original book's GitHub repository it belongs to Chapter 4.
- def get_data(start='2000', end='2018', task='classification', holding_period=1, dropna=False):
-
- idx = pd.IndexSlice
- target = f'target_{holding_period}m'
-
- with pd.HDFStore(DATA_STORE) as store:
- df = store['engineered_features']
-
-
- if start is not None and end is not None:
- df = df.loc[idx[:, start: end], :]
- if dropna:
- df = df.dropna()
-
- y = (df[target]>0).astype(int)
- # y is 1 when the target return is positive and 0 otherwise, so the classification task is to predict return direction
-
- X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)
-
- return y, X
1. startswith() checks whether a string begins with 'target'; it also accepts beg and end parameters, which default to beg=0 and end=len(string).
2. factorize() encodes sector as integer categories (e.g., construction=0, manufacturing=1). With sort=True the codes follow the sorted order of the labels; otherwise the first label seen gets code 0 and each new label gets the next code. The return value is a tuple (codes, uniques).
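A quick illustration of note 2 (my own snippet, not part of the original notebook): without sort the codes follow the order of first appearance, with sort=True they follow the sorted labels.
- sectors = pd.Series(['Manufacturing', 'Construction', 'Manufacturing', 'Finance'])
- codes, labels = pd.factorize(sectors)                            # order of first appearance
- codes_sorted, labels_sorted = pd.factorize(sectors, sort=True)   # sorted label order
- print(codes, list(labels))                # [0 1 0 2] ['Manufacturing', 'Construction', 'Finance']
- print(codes_sorted, list(labels_sorted))  # [2 0 2 1] ['Construction', 'Finance', 'Manufacturing']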
- cat_cols = ['year', 'month', 'age', 'msize', 'sector']
- def factorize_cats(df, cats=['sector']):
- cat_cols = ['year', 'month', 'age', 'msize'] + cats
- for cat in cats:
- df[cat] = pd.factorize(df[cat])[0]
-
- df.loc[:, cat_cols] = df.loc[:, cat_cols].fillna(-1).astype(int)
- return df
3. One-hot encoding turns categorical variables into a form that machine learning algorithms can consume directly; get_dummies returns a DataFrame of indicator (dummy) columns.
Notes:
# When using get_dummies, watch out for multicollinearity: if you dummy-encode gender, for example, you must drop one column, because either column fully determines the other (see the small example below).
# Besides one-hot encoding and factorize, a mapping function such as map() can also encode categories.
# When encoding, consider whether the categories have an ordering: red vs. yellow does not, but age does.
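A minimal example of the multicollinearity point (my own snippet, not from the original notebook): for a binary variable such as gender, one dummy column fully determines the other, so drop_first=True keeps only one of them.
- gender = pd.DataFrame({'gender': ['male', 'female', 'female', 'male']})
- print(pd.get_dummies(gender))                   # two perfectly collinear indicator columns
- print(pd.get_dummies(gender, drop_first=True))  # a single column carries the same information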
- def get_one_hot_data(df, cols=cat_cols[:-1]):
- df = pd.get_dummies(df,
- columns=cols + ['sector'],
- # the columns argument lists which columns get dummy-encoded
- prefix=cols + [''],
- prefix_sep=['_'] * len(cols) + ['']
- # get_dummies adds a default prefix; the empty prefix here keeps 'sector' from being prepended to the sector dummies
- )
- return df.rename(columns={c: c.replace('.0', '') for c in df.columns})
The holdout set is used to estimate the generalization error after cross-validation.
- def get_holdout_set(target, features, period=6):
- idx = pd.IndexSlice
- label = target.name
- dates = np.sort(target.index.get_level_values('date').unique())  # use the target argument rather than the global y
- cv_start, cv_end = dates[0], dates[-period - 2]
- holdout_start, holdout_end = dates[-period - 1], dates[-1]
-
- # most of the data is used for cross-validation; the last seven monthly dates are held out as the test set
-
- df = features.join(target.to_frame())
- train = df.loc[idx[:, cv_start: cv_end], :]
- y_train, X_train = train[label], train.drop(label, axis=1)
-
- test = df.loc[idx[:, holdout_start: holdout_end], :]
- y_test, X_test = test[label], test.drop(label, axis=1)
- return y_train, X_train, y_test, X_test
- y, features = get_data()
- X_dummies = get_one_hot_data(features)
- X_factors = factorize_cats(features)
-
- y_clean, features_clean = get_data(dropna=True)
- X_dummies_clean = get_one_hot_data(features_clean)
- X_factors_clean = factorize_cats(features_clean)
- # the 'clean' versions drop the rows where lagged returns are NaN
Cross-validation: the idea is straightforward; here 12 one-step-ahead time-series splits are used.
- class OneStepTimeSeriesSplit:
- """Generates tuples of train_idx, test_idx pairs
- Assumes the index contains a level labeled 'date'"""
-
- def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
- self.n_splits = n_splits
- self.test_period_length = test_period_length
- self.shuffle = shuffle
-
- @staticmethod
- def chunks(l, n):
- for i in range(0, len(l), n):
- yield l[i:i + n]
-
- def split(self, X, y=None, groups=None):
- unique_dates = (X.index
- .get_level_values('date')
- .unique()
- .sort_values(ascending=False)
- [:self.n_splits*self.test_period_length])
-
- dates = X.reset_index()[['date']]
- for test_date in self.chunks(unique_dates, self.test_period_length):
- train_idx = dates[dates.date < min(test_date)].index
- test_idx = dates[dates.date.isin(test_date)].index
- if self.shuffle:
- train_idx = np.random.permutation(train_idx)  # reassign the permuted copy; shuffling list(train_idx) in place would have no effect
- yield train_idx, test_idx
-
- # yield returns a value on each pass through the loop, turning split() into a generator
-
- def get_n_splits(self, X, y, groups=None):
- return self.n_splits
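To see what the splitter yields, here is a toy panel with two tickers and five monthly dates (an illustration of my own, not part of the original notebook): each split tests on one date and trains on all strictly earlier dates.
- toy_dates = pd.date_range('2020-01-31', periods=5, freq='M')
- toy_index = pd.MultiIndex.from_product([['AAA', 'BBB'], toy_dates], names=['ticker', 'date'])
- toy = pd.DataFrame({'x': np.arange(10)}, index=toy_index)
-
- toy_cv = OneStepTimeSeriesSplit(n_splits=2, test_period_length=1)
- for train_idx, test_idx in toy_cv.split(toy):
-     test_date = toy.iloc[test_idx].index.get_level_values('date')[0]
-     last_train = toy.iloc[train_idx].index.get_level_values('date').max()
-     print('test date:', test_date.date(), '| last train date:', last_train.date())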
Instantiate the class
- cv = OneStepTimeSeriesSplit(n_splits=12,
- test_period_length=1,
- shuffle=False)
-
- run_time = {}
Evaluation metrics for the cross-validation
- metrics = {'balanced_accuracy': 'Accuracy' ,
- 'roc_auc': 'AUC',
- 'neg_log_loss': 'Log Loss',
- 'f1_weighted': 'F1',
- 'precision_weighted': 'Precision',
- 'recall_weighted': 'Recall'
- }
-
- def run_cv(clf, X=X_dummies, y=y, metrics=metrics, cv=cv, fit_params=None, n_jobs=-1):
- start = time()
- # scores is a dict keyed by metric name, with an array of per-fold values for each
- scores = cross_validate(estimator=clf,
- X=X,
- y=y,
- scoring=list(metrics.keys()),
- cv=cv,
- return_train_score=True,
- n_jobs=n_jobs,
- verbose=1,
- fit_params=fit_params)
-
- duration = time() - start
- return scores, duration
Result-processing functions: one builds the metrics DataFrame, the other plots it.
4. melt() is the inverse of pivot() and another way to reshape (un-pivot) data. It is very handy but hard to explain in one sentence; a short example follows.
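A quick sketch of melt (a toy example of mine, not from the original notebook): wide columns become (variable, value) pairs in long format, which is what stack_results below relies on.
- wide = pd.DataFrame({'train_auc': [0.61, 0.62], 'test_auc': [0.55, 0.54]})
- long = pd.melt(wide, value_name='Value')  # columns: 'variable' and 'Value', one row per original cell
- print(long)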
- def stack_results(scores):
-
- # build a MultiIndex from tuples
- columns = pd.MultiIndex.from_tuples(
- [tuple(m.split('_', 1)) for m in scores.keys()],
- names=['Dataset', 'Metric'])
- data = np.array(list(scores.values())).T
- df = (pd.DataFrame(data=data,
- columns=columns)
- .iloc[:, 2:])
- results = pd.melt(df, value_name='Value')
- results.Metric = results.Metric.apply(lambda x: metrics.get(x))
- results.Dataset = results.Dataset.str.capitalize()
- return results
-
-
- def plot_result(df, model=None, fname=None):
- m = list(metrics.values())
-
- # catplot draws figures from categorical data
- g = sns.catplot(x='Dataset',
- y='Value',
- hue='Dataset',
- col='Metric',
- data=df,
- col_order=m,
- order=['Train', 'Test'],
- kind="box",
- col_wrap=3,
- sharey=False,
- height=4, aspect=1.2)
- #aspect*height = width
-
- df = df.groupby(['Metric', 'Dataset']).Value.mean().unstack().loc[m]
-
- # iterate over the subplots
- for i, ax in enumerate(g.axes.flat):
- s = f"Train: {df.loc[m[i], 'Train'] :>7.4f}\nTest: {df.loc[m[i], 'Test'] :>7.4f}"
- # format the means with a fixed number of decimal places
-
- # ax.text adds a small text box with the train/test means to each panel
- ax.text(0.05, 0.85, s, fontsize=10,transform=ax.transAxes,
- bbox=dict(facecolor='white', edgecolor='grey', boxstyle='round,pad=0.5'))
- g.fig.suptitle(model, fontsize=16)
- g.fig.subplots_adjust(top=.9)
- if fname:
- g.savefig(fname, dpi=300);
-
- # transform=ax.transAxes interprets (0.05, 0.85) in axes coordinates (fractions of each panel) rather than data coordinates, which is why omitting it changes the result so much
The workflow from here on is repetitive: build each model, run cross_validate, then plot, and finally compare the algorithms. The models are 'Baseline' (dummy_result), 'Random Forest', 'AdaBoost', 'Gradient Booster', 'XGBoost', 'LightGBM Dummies', and 'LightGBM Factors'.
5. DummyClassifier is a classifier that predicts using simple rules. It is typically used as a baseline against which the real classifiers are compared.
6. After training a model we need to persist it to disk; joblib is used here to save and reload results.
- dummy_clf = DummyClassifier(strategy='stratified',
- random_state=42)
-
- algo = 'dummy_clf'
-
- fname = results_path / f'{algo}.joblib'
- if not Path(fname).exists():
- dummy_cv_result, run_time[algo] = run_cv(dummy_clf)
- joblib.dump(dummy_cv_result, fname)
- else:
- dummy_cv_result = joblib.load(fname)
-
-
- dummy_result = stack_results(dummy_cv_result)
- dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
- plot_result(dummy_result, model='Dummy Classifier')
7. Parameter explanations are given as comments inside the code block.
- rf_clf = RandomForestClassifier(n_estimators=100, # number of trees in the forest
- criterion='gini',
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
-
- # min_samples_split is the minimum number of samples required to split a node. For example, with min_samples_split=6, a node holding only 4 samples will not be split (regardless of impurity).
-
- # min_samples_leaf is the minimum number of samples a leaf node must contain.
- # With min_samples_leaf=3, a node of 5 samples will not be split into leaves of size 2 and 3, because the smaller leaf would fall below the minimum of 3.
- min_weight_fraction_leaf=0.0,
- max_features='auto',
- # 'auto' (the default) means sqrt(n_features): at each split a random subset of features equal to the square root of the total is considered
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- min_impurity_split=None,
- bootstrap=True,
- # whether to draw bootstrap samples (sampling with replacement)
- oob_score=True,
- # whether to use out-of-bag samples to estimate generalization accuracy
- n_jobs=-1,
- random_state=42,
-
- # integer, default 0: 0 logs nothing, 1 logs periodically, values above 1 log more frequently
- verbose=1)
-
- # n_jobs sets the parallelism; the default is None (i.e. 1), and -1 dispatches work to all CPUs
Question: the random forest uses the clean data; what is the difference? Presumably that sklearn's RandomForestClassifier cannot handle NaNs natively, so the rows with missing lagged returns have to be dropped first.
- algo = 'random_forest'
- fname = results_path / f'{algo}.joblib'
- if not Path(fname).exists():
- rf_cv_result, run_time[algo] = run_cv(rf_clf, y=y_clean, X=X_dummies_clean)
- joblib.dump(rf_cv_result, fname)
- else:
- rf_cv_result = joblib.load(fname)
-
- rf_result = stack_results(rf_cv_result)
- rf_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
8. On the large gap between train and test performance.
# Note that the random forest fits the training set almost perfectly, creating a large gap between train and test scores. I first suspected the test sets were too small, but enlarging the test portion of the cross-validation changed nothing noticeable and the train score stayed at 1. This question remains open.
- plot_result(rf_result, model='Random Forest')
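One way to probe this gap (a sketch of my own, not something the original notebook runs): cap the tree complexity and rerun the cross-validation. If the perfect train scores come from fully grown trees memorizing the training folds, the train score should drop well below 1 with a depth-limited forest.
- shallow_rf = RandomForestClassifier(n_estimators=100,
-                                     max_depth=10,         # cap tree depth instead of growing fully
-                                     min_samples_leaf=50,  # require larger leaves
-                                     n_jobs=-1,
-                                     random_state=42)
- shallow_cv_result, _ = run_cv(shallow_rf, y=y_clean, X=X_dummies_clean)
- plot_result(stack_results(shallow_cv_result), model='Random Forest (depth-limited)')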
9. Parameter explanations are given as comments inside the code block.
- # the base estimator is a decision stump (a depth-1 tree)
- base_estimator = DecisionTreeClassifier(criterion='gini',
- splitter='best',
- max_depth=1,
- min_samples_split=2,
- min_samples_leaf=20,
- min_weight_fraction_leaf=0.0,
- max_features=None,
- random_state=None,
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- min_impurity_split=None,
- class_weight=None)
-
- # splitter: 'best' or 'random'. 'best' searches all split points of a feature for the optimal split and suits moderately sized samples;
- # 'random' searches a random subset of split points for a locally optimal split and suits very large samples. The default is 'best'.
-
- # min_weight_fraction_leaf: the minimum weighted fraction of samples a leaf must hold, default 0,
- # i.e. sample weights are ignored. If a candidate leaf falls below this value, the leaf and its sibling are pruned (removed and splitting stops).
- # Consider it when many samples have missing values or the class distribution is heavily skewed.
10. To keep AdaBoost from overfitting, a regularization term can be added: a shrinkage coefficient ν on each weak learner's contribution, i.e. the learning rate, with range (0, 1]. A larger value reaches the same fit in fewer iterations with fewer weak learners; a smaller value needs more iterations and more weak learners to reach a given misclassification level.
- ada_clf = AdaBoostClassifier(base_estimator=base_estimator,
- # n_estimators effectively controls the number of boosting stages
- n_estimators=100,
-
- learning_rate=1.0,
- algorithm='SAMME.R',
- random_state=42)
-
-
11. 'SAMME' reweights the weak learners using their discrete class predictions on the sample set, while 'SAMME.R' uses the predicted class probabilities; the default is 'SAMME.R'.
- algo = 'adaboost'
- fname = results_path / f'{algo}.joblib'
- if not Path(fname).exists():
- ada_cv_result, run_time[algo] = run_cv(ada_clf, y=y_clean, X=X_dummies_clean)
- joblib.dump(ada_cv_result, fname)
- else:
- ada_cv_result = joblib.load(fname)
-
- ada_result = stack_results(ada_cv_result)
- ada_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
- plot_result(ada_result, model='AdaBoost')
The following HistGradientBoostingClassifier initialization code illustrates the key tuning parameters that we previously introduced, in addition to those that we are familiar with from looking at standalone decision tree models.
This estimator is much faster than GradientBoostingClassifier for big datasets (n_samples >= 10 000).
This estimator has native support for missing values (NaNs). During training, the tree grower learns at each split point whether samples with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child consequently. If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples.
- gb_clf = HistGradientBoostingClassifier(loss='binary_crossentropy',
- learning_rate=0.1,
-
- # in AdaBoost, n_estimators controls the number of boosting stages;
- # HistGradientBoostingClassifier uses max_iter for the same purpose
- max_iter=100,
- min_samples_leaf=20,
- max_depth=None,
- random_state=None,
- max_leaf_nodes=31, # opt value depends on feature interaction
- warm_start=False,
-
- verbose=0,
- tol=0.0001)
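As a quick check of the native NaN handling described above, here is a minimal sketch of my own on synthetic data (not part of the original notebook); it fits without any imputation step.
- rng = np.random.RandomState(0)
- X_demo = rng.normal(size=(100, 3))
- X_demo[::10, 0] = np.nan                 # inject missing values into the first feature
- y_demo = (X_demo[:, 1] > 0).astype(int)
- hgb_demo = HistGradientBoostingClassifier(max_iter=20).fit(X_demo, y_demo)
- print(hgb_demo.score(X_demo, y_demo))    # trains directly on data containing NaNs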
Build the model, then cross-validate
- algo = 'sklearn_gbm'
-
- fname = results_path / f'{algo}.joblib'
- if not Path(fname).exists():
- gb_cv_result, run_time[algo] = run_cv(gb_clf, y=y_clean, X=X_dummies_clean)
- joblib.dump(gb_cv_result, fname)
- else:
- gb_cv_result = joblib.load(fname)
-
- gb_result = stack_results(gb_cv_result)
- gb_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
- plot_result(gb_result, model='Gradient Boosting Classifier')
12. Partial dependence is similar in spirit to a marginal effect in statistics: hold the other variables fixed, vary the target feature, and observe how the model's fitted output changes.
Before starting, we drop the time columns (year and month) so the model does not become overly dependent on the time period.
13. product(A, B) returns tuples from the Cartesian product of A and B; product([0, 1], repeat=2) is shorthand for product([0, 1], [0, 1]).
14. '{:.0%}'.format(y) is one of the flexible uses of string formatting and worth knowing.
15. When we look at the relationship between a single feature and the target, the partial dependence plot is 2D; looking at two features at once yields a 3D surface.
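Two quick illustrations of notes 13 and 14 (mine, not from the original notebook), since both appear in the plotting code below:
- print(list(product([0, 1], repeat=2)))  # [(0, 0), (0, 1), (1, 0), (1, 1)]
- print('{:.0%}'.format(0.1234))          # '12%'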
- X_ = X_factors_clean.drop(['year', 'month'], axis=1)
-
- fname = results_path / f'{algo}_model.joblib'
- if not Path(fname).exists():
- gb_clf.fit(y=y_clean, X=X_)
- joblib.dump(gb_clf, fname)
- else:
- gb_clf = joblib.load(fname)
-
-
- gb_clf.score(X=X_, y=y_clean)
- >>>0.5889181460403748
-
- y_score = gb_clf.predict_proba(X_)[:, 1]
- roc_auc_score(y_score=y_score, y_true=y_clean)
- >>>0.6183261924270361
Plotting
- fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
-
- plot_partial_dependence(
- estimator=gb_clf,
- X=X_,
- features=['return_12m', 'return_6m', 'CMA', ('return_12m', 'return_6m')],
- percentiles=(0.05, 0.95),
- n_jobs=-1,
- n_cols=2,
- response_method='decision_function',
- grid_resolution=250,
- ax=axes)
-
- for i, j in product([0, 1], repeat=2):
- if i!=1 or j!= 0:
- axes[i][j].xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
-
- axes[1][1].yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
-
- axes[0][0].set_ylabel('Partial Dependence')
- axes[1][0].set_ylabel('Partial Dependence')
- axes[0][0].set_xlabel('12-Month Return')
- axes[0][1].set_xlabel('6-Month Return')
- axes[1][0].set_xlabel('Conservative Minus Aggressive')
-
- axes[1][1].set_xlabel('12-Month Return')
- axes[1][1].set_ylabel('6-Month Return')
- fig.suptitle('Partial Dependence Plots', fontsize=16)
- fig.tight_layout()
- fig.subplots_adjust(top=.95)
3D
- targets = ['return_12m', 'return_6m']
- pdp, axes = partial_dependence(estimator=gb_clf,
- features=targets,
- X=X_,
- grid_resolution=100)
-
- XX, YY = np.meshgrid(axes[0], axes[1])
- Z = pdp[0].reshape(list(map(np.size, axes))).T
-
- fig = plt.figure(figsize=(14, 8))
- ax = Axes3D(fig)
- surface = ax.plot_surface(XX, YY, Z,
- rstride=1,
- cstride=1,
- cmap=plt.cm.BuPu,
- edgecolor='k')
- ax.set_xlabel('12-Month Return')
- ax.set_ylabel('6-Month Return')
- ax.set_zlabel('Partial Dependence')
- ax.view_init(elev=22, azim=30)
- ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
- ax.xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
-
- # fig.colorbar(surface)
- fig.suptitle('Partial Dependence by 6- and 12-month Returns', fontsize=16)
- fig.tight_layout()
16. ax.view_init() changes the viewing angle of the 3D plot (the camera position): azim rotates the view about the z-axis (azimuth), elev sets the elevation angle.
17. grid_resolution is the number of values evaluated along each axis; larger values mean more points on the plot. The default is 100, and it generally should not be set too high, or the jaggedness of the surface becomes noticeable.
18. meshgrid() builds coordinate matrices from the input coordinate vectors, as in the short demo below.
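A short demo of meshgrid (my own, not from the original notebook): two 1-D coordinate vectors become the pair of coordinate matrices used for the surface plot above.
- xs = np.array([1, 2, 3])
- ys = np.array([10, 20])
- GX, GY = np.meshgrid(xs, ys)
- print(GX.shape, GY.shape)  # (2, 3) (2, 3)
- print(GX)                  # rows repeat xs: [[1 2 3], [1 2 3]]
- print(GY)                  # columns repeat ys: [[10 10 10], [20 20 20]]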
- xgb_clf = XGBClassifier(max_depth=3,
- learning_rate=0.1,
- n_estimators=100, # Number of boosted trees to fit.
- silent=True, # Whether to print messages while running
- objective='binary:logistic', # Task and objective or custom objective function
- booster='gbtree', # Select booster: gbtree, gblinear or dart
-
- n_jobs=-1,
- gamma=0, # Min loss reduction for further splits
- min_child_weight=1, # Min sum of sample weight(hessian) needed
- max_delta_step=0, # Max delta step for each tree's weight estimation
- subsample=1, # Subsample ratio of training samples
- colsample_bytree=1, # Subsample ratio of cols for each tree
- colsample_bylevel=1, # Subsample ratio of cols for each split
- reg_alpha=0, # L1 regularization term on weights
- reg_lambda=1, # L2 regularization term on weights
- scale_pos_weight=1, # Balancing class weights
- base_score=0.5, # Initial prediction score; global bias
- random_state=42) # random seed
Build the model, then cross-validate
- algo = 'xgboost'
- fname = results_path / f'{algo}.joblib'
- if not Path(fname).exists():
- xgb_cv_result, run_time[algo] = run_cv(xgb_clf)
- joblib.dump(xgb_cv_result, fname)
- else:
- xgb_cv_result = joblib.load(fname)
-
- xgb_result = stack_results(xgb_cv_result)
- xgb_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
- plot_result(xgb_result, model='XGBoost')
Feature importance analysis
- xgb_clf.fit(X=X_dummies, y=y)  # cross_validate fits clones, so fit once on the full sample before reading feature importances
- fi = pd.Series(xgb_clf.feature_importances_,
- index=X_dummies.columns)
-
- fi.nlargest(25).sort_values().plot.barh(figsize=(10, 5),
- title='Feature Importance')
- sns.despine()
- plt.tight_layout();
- lgb_clf = LGBMClassifier(boosting_type='gbdt',
- objective='binary', # learning task
- metric='auc',
- num_leaves=31, # Maximum tree leaves for base learners.
- max_depth=-1, # Maximum tree depth for base learners, -1 means no limit.
- learning_rate=0.1, # Adaptive lr via callback override in .fit() method
- n_estimators=100, # Number of boosted trees to fit
- subsample_for_bin=200000, # Number of samples for constructing bins.
- class_weight=None, # dict, 'balanced' or None
- min_split_gain=0.0, # Minimum loss reduction for further split
- min_child_weight=0.001, # Minimum sum of instance weight(hessian)
- min_child_samples=20, # Minimum number of data need in a child(leaf)
- subsample=1.0, # Subsample ratio of training samples
- subsample_freq=0, # Frequency of subsampling, <=0: disabled
- colsample_bytree=1.0, # Subsampling ratio of features
- reg_alpha=0.0,
- reg_lambda=0.0,
- random_state=42, # Random number seed; default: C++ seed
- n_jobs=-1, # Number of parallel threads.
- silent=False,
- importance_type='gain', # default: 'split' or 'gain'
- )
Cross-validation
- algo = 'lgb_factors'
-
- fname = results_path / f'{algo}.joblib'
- if not Path(fname).exists():
- lgb_factor_cv_result, run_time[algo] = run_cv(lgb_clf, X=X_factors, fit_params={'categorical_feature': cat_cols})
- joblib.dump(lgb_factor_cv_result, fname)
- else:
- lgb_factor_cv_result = joblib.load(fname)
-
- lgb_factor_result = stack_results(lgb_factor_cv_result)
- lgb_factor_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
- plot_result(lgb_factor_result, model='Light GBM | Factors')
- algo = 'lgb_dummies'
- fname = results_path / f'{algo}.joblib'
- if not Path(fname).exists():
- lgb_dummy_cv_result, run_time[algo] = run_cv(lgb_clf)
- joblib.dump(lgb_dummy_cv_result, fname)
- else:
- lgb_dummy_cv_result = joblib.load(fname)
-
- lgb_dummy_result = stack_results(lgb_dummy_cv_result)
- lgb_dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
- plot_result(lgb_dummy_result, model='Light GBM | Dummies')
- results = {'Baseline': dummy_result,
- 'Random Forest': rf_result,
- 'AdaBoost': ada_result,
- 'Gradient Booster': gb_result,
- 'XGBoost': xgb_result,
- 'LightGBM Dummies': lgb_dummy_result,
- 'LightGBM Factors': lgb_factor_result}
-
- df = pd.DataFrame()
- for model, result in results.items():
- df = pd.concat([df, result.groupby(['Metric', 'Dataset']
- ).Value.mean().unstack()['Test'].to_frame(model)], axis=1)
-
- df.T.sort_values('AUC', ascending=False)