赞
踩
1、损失函数通常有0-1损失,平方损失,绝对值损失和对数损失
2、通常用测试集对模型评估,评估的方法有留出法,交叉验证法,留一法,自助法
#留出法:直接将数据分为互斥的三部分(也可以是两部分,此时训练集也是验证集),在训练集上训练模型,在验证集上选择模型,最后用测试集上的误差作为泛化误差的估计,数据集划分尽量保持数据分布一致,可以用分层采样来保持比例,通常采用多次随机划分,取平均值作为留出法的评估结果
#交叉验证法:K折交叉验证将数据集分为K个大小相同互不相交的子集,每次用其中一个子集作为测试集,其余K-1个子集作为训练集,求K次误差的均值作为泛化误差的估计
# 留一法:是K=N时交叉验证集的一个特例,每次用一个样本测试,结果较准确,数据集大的时候计算量大
# 自助法:有放回的重复独立采样,总的数据集中约有63.2%的样本会出现在采样集中,数据集较小时使用合适
3、评估指标:
# 1.混淆矩阵
# 2.P-R曲线(查准率/精确率、查全率/召回率)以及precision和recall调和平均f1
# 3.ROC图(AUC值,真正率-假正率)AR=2AUC-1
# 4.准确率accuracy(类别不平衡时不适合使用此评估指标)
# 5.区分能力KS值等
-
- #损失函数
- from sklearn.metrics import zero_one_loss
- from sklearn.metrics import log_loss
- y_true = [1,1,1,1,1,1,0,0,0,0]
- y_pred = [0,1,0,1,1,0,1,0,1,0]
- print('zero_one_loss<fraction>',zero_one_loss(y_true, y_pred, normalize=True)) #normalize=True返回误分类样本的比例,否则返回数量
- print('zero_one_loss<num>',zero_one_loss(y_true, y_pred, normalize=False)) #还有参数sample_weight,默认每个样本权重为1
-
- y_true = [1,1,1,0,0,0]
- y_pred_prob =[[0.1,0.9],
- [0.2,0.8],
- [0.3,0.7],
- [0.7,0.3],
- [0.8,0.2],
- [0.9,0.1]] #最后一个代表预测为类别0的概率为0.9,预测为1的概率为0.1
- print('log_loss average', log_loss(y_true, y_pred_prob, normalize=True))#normalize为true返回对数损失均值
- print('log_loss total', log_loss(y_true, y_pred_prob, normalize=False))#为false返回对数误差的总和
-
-
- #数据集切分,对于类别不平衡数据集,采用带有分层抽样设置的切分方法更优
- import numpy as np
- from sklearn.model_selection import train_test_split
- from sklearn.model_selection import KFold,StratifiedKFold
- from sklearn.model_selection import StratifiedShuffleSplit
- from sklearn.model_selection import LeaveOneOut
- from sklearn.model_selection import cross_val_score
- x = [[1,2,3,4],
- [11,12,13,14],
- [21,22,23,24],
- [31,32,33,34],
- [41,42,43,44],
- [51,52,53,54],
- [61,62,63,64],
- [71,72,73,74]]
- y = [1,1,0,0,1,1,0,0]
- x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.4, random_state=0)
- print('x_train',x_train)
- print('x_test',x_test)
- print('y_train',y_train)
- print('y_test',y_test)
- #stratify参数为一个数组或者None,如果不是None,将分层采样,采样的标记数组由此参数指定
- x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.4, stratify=y, random_state=0)
- print('stratify x_train',x_train)
- print('stratify x_test',x_test)
- print('stratify y_train',y_train)
- print('stratify y_test',y_test)
- #从运行结果可看出,分层采样之后,训练集和测试集的类别比例与原始几乎相同
-
- #K折交叉
- x = np.array(x)
- y = np.array(y)
- #shuffle为False,切分之前不混洗数据(按顺序切分为4部分),若为True,切分之前混洗数据(随机分为4部分)
- folder = KFold(n_splits=4, shuffle=False, random_state=42)
- for train_index,test_index in folder.split(x,y):
- x_train,x_test = x[train_index],x[test_index]
- y_test,y_train = y[test_index],y[train_index]
- print('x_train,x_test',x_train,x_test)
- print('y_train,y_test',y_train,y_test)
-
- #StratifiedKFold为分层采样的K折交叉切分,参数与KFold同
- stra_folder = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
- for train_index,test_index in stra_folder.split(x,y):
- x_train
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。