赞
踩
在文章开始之前首先我们先来了解一下分类器是什么?
在这里以鸢尾花为例,在鸢尾花中我们可以根据花萼长度、花萼宽度、花瓣长度和花瓣宽度来将鸢尾花分为三类,分别为山鸢尾、杂色鸢尾和维吉尼亚鸢尾。
在上述描述中,花萼长度、花萼宽度、花瓣长度和花瓣宽度均为花的特征数据,山鸢尾、杂色鸢尾和维吉尼亚鸢尾均为花的标签数据。而机器学习分类器的目的就是按照某种训练方式将通过花的特征数据学习到不同种类花的特征,然后对花进行分类。
在python中有一些必要的库(包)需要导入,里面有许多封装好的方便的函数接口,调用起来很好用。
from sklearn import svm, tree, neighbors, neural_network
import numpy as np
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import KFold
import warnings
from sklearn import datasets
首先第一步先导入数据,这里在 sklearn 中已经封装好了,只需要导入就可以了。接下来就是对数据的归一化(最小-最大规范化)处理,下面是归一化的公式。
$x' = \dfrac{x - \min(x)}{\max(x) - \min(x)}$
接下来对数据进行特征选择,这里采用的是 sklearn 内置的最小方差特征选择方法(VarianceThreshold):将所有特征中,方差低于给定阈值(默认阈值为 0,即取值恒定不变)的特征列筛除。
# Suppress warnings (e.g. sklearn convergence warnings) so the
# per-fold output below stays readable.
warnings.filterwarnings("ignore")
# Load the built-in iris dataset: 150 samples, 4 features, 3 classes.
iris = datasets.load_iris()
# Feature matrix and integer class labels.
x1 = iris.data
y = iris.target
print('特征选择前,特征数为:%d' % len(x1[0]))
x = np.array(x1)
y = np.array(y)
# Shuffle samples and labels with the same random permutation so the
# sequential KFold splits below contain a random class mix per fold.
index = np.random.permutation(len(x))
x = x[index]
y = y[index]
# Min-max normalisation per feature column: x' = (x - min) / (max - min).
for col in range(x.shape[1]):
    smin = x[:, col].min()
    smax = x[:, col].max()
    # Guard against a constant column: the original code divided by
    # (smax - smin) unconditionally, filling the column with NaN when
    # the feature is constant.
    if smax > smin:
        x[:, col] = (x[:, col] - smin) / (smax - smin)
# Feature selection: VarianceThreshold with its default threshold (0.0)
# removes only zero-variance (constant) feature columns.
select = VarianceThreshold()
x = select.fit_transform(x)
print('特征选择后,特征数为:%d' % len(x[0]))
数据处理好了之后就能进行模型的搭建了。我在这里选取了多个模型,都是 sklearn 内置的,包括 SVM、决策树、KNN、人工神经网络、高斯过程分类器、Logistic 回归和随机森林;最后还设置了集成学习模型,就是将以上所有用过的模型以一种特定的结合方式一起训练(这里采用 Voting 投票法进行集成)。训练过程采用 10 折交叉验证(如下图所示)。这里的交叉验证指的是将数据集划分为 K 份(几折交叉验证就分为几份),每轮取其中一份作为测试集,其余的作为训练集,重复 K 轮,每轮换一份。
Voting 集成学习则是指投票法,以投票的方式结合几个弱学习器。投票法又分为软投票和硬投票,这里采用的是硬投票,下面简单介绍一下硬投票机制。如下图所示,硬投票就是指同时将数据输入几个模型中,最终选取分类结果时遵循"少数服从多数"的原则。
# 10-fold cross validation: every fold holds out one tenth of the
# samples as the test set and trains on the remaining nine tenths.
kf = KFold(n_splits=10)
fold = 0  # completed-fold counter (was 'iter', which shadows the builtin)
# The seven base classifiers; the Voting ensemble is built from them.
model = [
    # SVM, degree-5 polynomial kernel, one-vs-one multiclass strategy
    svm.SVC(C=2, kernel='poly', degree=5, decision_function_shape='ovo'),
    # Decision tree
    tree.DecisionTreeClassifier(),
    # k-nearest neighbours with k = 10
    neighbors.KNeighborsClassifier(10),
    # Multi-layer perceptron (artificial neural network)
    neural_network.MLPClassifier(alpha=1, max_iter=1000),
    # Gaussian process classifier with an RBF kernel
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    # Multinomial logistic regression
    LogisticRegression(multi_class='multinomial'),
    # Random forest
    RandomForestClassifier(),
]
model_name = [
    'SVM',
    '决策树',
    'KNN',
    'MLP',
    '高斯过程分类器',
    'Logistic',
    'RandomForest',
    'Voting',
]
model_num = len(model_name)
# Running sums of accuracy / precision / recall / F1 per model;
# they are averaged over the folds afterwards.
auc = [0.0] * model_num
precision = [0.0] * model_num
recall = [0.0] * model_num
f1_scores = [0.0] * model_num
# Labels must be integers; cast once here instead of on every fold.
y = y.astype('int')


def _report_and_accumulate(name, idx, y_true, y_pred):
    """Print accuracy/precision/recall/F1 for one model on one fold and
    add each metric to the running totals at position idx."""
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro')
    rec = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    print(y_pred)
    print(y_true)
    print('%s的准确率:%.5f' % (name, acc))
    print('%s的精准率:%.5f' % (name, prec))
    print('%s的召回率:%.5f' % (name, rec))
    print('%s的F1: %.5f' % (name, f1))
    auc[idx] += acc
    precision[idx] += prec
    recall[idx] += rec
    f1_scores[idx] += f1


for train_index, test_index in kf.split(x):
    print('*****************第%d折*****************' % fold)
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit and score each base classifier on this fold.
    for i in range(model_num - 1):
        clf = model[i]
        clf.fit(x_train, y_train.ravel())
        _report_and_accumulate(model_name[i], i, y_test, clf.predict(x_test))
    # Hard-voting ensemble: each base model (cloned and refit inside
    # VotingClassifier) casts one vote per sample; the majority class wins.
    estimators = [(model_name[j], model[j]) for j in range(model_num - 1)]
    ensemble = VotingClassifier(estimators=estimators, voting='hard')
    ensemble.fit(x_train, y_train.ravel())
    _report_and_accumulate('Voting', model_num - 1, y_test,
                           ensemble.predict(x_test))
    fold += 1
针对每一种模型,在 sklearn 中都已经封装好了,只需要调整参数即可;每一种参数都可以进行调整,具体的方法可以自行查找。
最后通过准确率、精准率、召回率和F1-score指标来检验模型效果。至于具体计算公式,我们首先来了解一下二分类的混淆矩阵。
这里有
$accuracy = \dfrac{TP+TN}{TP+TN+FP+FN}$,$precision = \dfrac{TP}{TP+FP}$,$recall = \dfrac{TP}{TP+FN}$,$F1 = \dfrac{2TP}{2TP+FP+FN}$
针对多分类在计算时,就是把计算每一类时把其当成1,其他的都算为0,然后计算指标,这样把每一类都计算一遍,取平均数作为最终指标。
最后放上代码合集
from sklearn import svm, tree, neighbors, neural_network
import numpy as np
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import KFold
import warnings
from sklearn import datasets
# Suppress warnings (e.g. sklearn convergence warnings) so the
# per-fold output below stays readable.
warnings.filterwarnings("ignore")
# Load the built-in iris dataset: 150 samples, 4 features, 3 classes.
iris = datasets.load_iris()
# Feature matrix and integer class labels.
x1 = iris.data
y = iris.target
print('特征选择前,特征数为:%d' % len(x1[0]))
x = np.array(x1)
y = np.array(y)
# Shuffle samples and labels with the same random permutation so the
# sequential KFold splits below contain a random class mix per fold.
index = np.random.permutation(len(x))
x = x[index]
y = y[index]
# Min-max normalisation per feature column: x' = (x - min) / (max - min).
for col in range(x.shape[1]):
    smin = x[:, col].min()
    smax = x[:, col].max()
    # Guard against a constant column: the original code divided by
    # (smax - smin) unconditionally, filling the column with NaN when
    # the feature is constant.
    if smax > smin:
        x[:, col] = (x[:, col] - smin) / (smax - smin)
# Feature selection: VarianceThreshold with its default threshold (0.0)
# removes only zero-variance (constant) feature columns.
select = VarianceThreshold()
x = select.fit_transform(x)
print('特征选择后,特征数为:%d' % len(x[0]))
# 10-fold cross validation: every fold holds out one tenth of the
# samples as the test set and trains on the remaining nine tenths.
kf = KFold(n_splits=10)
fold = 0  # completed-fold counter (was 'iter', which shadows the builtin)
# The seven base classifiers; the Voting ensemble is built from them.
model = [
    # SVM, degree-5 polynomial kernel, one-vs-one multiclass strategy
    svm.SVC(C=2, kernel='poly', degree=5, decision_function_shape='ovo'),
    # Decision tree
    tree.DecisionTreeClassifier(),
    # k-nearest neighbours with k = 10
    neighbors.KNeighborsClassifier(10),
    # Multi-layer perceptron (artificial neural network)
    neural_network.MLPClassifier(alpha=1, max_iter=1000),
    # Gaussian process classifier with an RBF kernel
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    # Multinomial logistic regression
    LogisticRegression(multi_class='multinomial'),
    # Random forest
    RandomForestClassifier(),
]
model_name = [
    'SVM',
    '决策树',
    'KNN',
    'MLP',
    '高斯过程分类器',
    'Logistic',
    'RandomForest',
    'Voting',
]
model_num = len(model_name)
# Running sums of accuracy / precision / recall / F1 per model;
# they are averaged over the folds afterwards.
auc = [0.0] * model_num
precision = [0.0] * model_num
recall = [0.0] * model_num
f1_scores = [0.0] * model_num
# Labels must be integers; cast once here instead of on every fold.
y = y.astype('int')


def _report_and_accumulate(name, idx, y_true, y_pred):
    """Print accuracy/precision/recall/F1 for one model on one fold and
    add each metric to the running totals at position idx."""
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro')
    rec = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    print(y_pred)
    print(y_true)
    print('%s的准确率:%.5f' % (name, acc))
    print('%s的精准率:%.5f' % (name, prec))
    print('%s的召回率:%.5f' % (name, rec))
    print('%s的F1: %.5f' % (name, f1))
    auc[idx] += acc
    precision[idx] += prec
    recall[idx] += rec
    f1_scores[idx] += f1


for train_index, test_index in kf.split(x):
    print('*****************第%d折*****************' % fold)
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit and score each base classifier on this fold.
    for i in range(model_num - 1):
        clf = model[i]
        clf.fit(x_train, y_train.ravel())
        _report_and_accumulate(model_name[i], i, y_test, clf.predict(x_test))
    # Hard-voting ensemble: each base model (cloned and refit inside
    # VotingClassifier) casts one vote per sample; the majority class wins.
    estimators = [(model_name[j], model[j]) for j in range(model_num - 1)]
    ensemble = VotingClassifier(estimators=estimators, voting='hard')
    ensemble.fit(x_train, y_train.ravel())
    _report_and_accumulate('Voting', model_num - 1, y_test,
                           ensemble.predict(x_test))
    fold += 1
print('********************************************************************')
# Average every accumulated metric over the number of completed folds.
for i in range(model_num):
    auc[i] /= fold
    precision[i] /= fold
    recall[i] /= fold
    f1_scores[i] /= fold
for i in range(model_num):
    print('%s的准确率: %.5f | %s的精准率: %.5f | %s的召回率: %.5f | %s的F1:%.5f' % (model_name[i], auc[i], model_name[i], precision[i], model_name[i], recall[i], model_name[i], f1_scores[i]))
运行效果如图所示
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。