赞
踩
构建一个模型,根据鸢尾花的花萼和花瓣大小将其分为三种不同的品种。
总共包含150行数据
每一行数据由 4 个特征值及一个目标值组成。
4 个特征值分别为:萼片长度、萼片宽度、花瓣长度、花瓣宽度
目标值为三种不同类别的鸢尾花,分别为: Iris Setosa、Iris Versicolour、Iris Virginica
numpy:python第三方库,用于科学计算
matplotlib:python第三方库,主要用于进行可视化
sklearn:python的重要机器学习库,其中封装了大量的机器学习算法,如:分类、回归、降维以及聚类
import numpy as np
from matplotlib import colors
from sklearn import svm
from sklearn.svm import SVC
from sklearn import model_selection
import matplotlib.pyplot as plt
import matplotlib as mpl
# 若提示缺少 matplotlib,可使用以下命令通过 pip 安装:
# !pip install matplotlib
Looking in indexes: https://mirror.baidu.com/pypi/simple/
Requirement already satisfied: matplotlib in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (2.2.3)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib) (2.4.2)
Requirement already satisfied: six>=1.10 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib) (1.15.0)
Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib) (2.8.0)
Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib) (1.1.0)
Requirement already satisfied: pytz in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib) (2019.3)
Requirement already satisfied: numpy>=1.7.1 in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from matplotlib) (1.16.4)
Requirement already satisfied: setuptools in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib) (41.4.0)
(1)从指定路径下加载数据
(2)对加载的数据进行数据分割,x_train,x_test,y_train,y_test分别表示训练集特征、训练集标签、测试集特征、测试集标签
#*************将字符串转为整型,便于数据加载***********************
def iris_type(s):
    """Map an iris species label to its integer class id.

    Intended as a ``converters`` callback for ``np.loadtxt``. Depending on
    the NumPy version and the ``encoding`` argument, such a callback is
    handed either ``bytes`` or ``str``, so both are accepted here.

    Args:
        s: Species label, e.g. ``b'Iris-setosa'`` or ``'Iris-setosa'``.
            Surrounding whitespace is ignored.

    Returns:
        0 for Iris-setosa, 1 for Iris-versicolor, 2 for Iris-virginica.

    Raises:
        KeyError: If the label is not one of the three known species.
    """
    it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    # Normalise to bytes so a single lookup table serves both input types.
    if isinstance(s, str):
        s = s.encode('utf-8')
    return it[s.strip()]
#加载数据
# Location of the iris data file (comma-separated, species name in the last column).
data_path = '/home/aistudio/data/data5420/iris.data'

# Parse every field as float; the species name in column index 4 is turned
# into a numeric class id by iris_type. The result is a 2-D array.
data = np.loadtxt(data_path, dtype=float, delimiter=',', converters={4: iris_type})

# Columns 0-3 are the features; everything from column 4 on is the label.
# The label slice is kept 2-D (one column), as the downstream code expects.
x = data[:, :4]
y = data[:, 4:]

# Keep only two features so the decision regions can be drawn in a plane.
# NOTE: 1:3 selects columns 1 and 2 (sepal width, petal length), not the
# first two columns.
x = x[:, 1:3]

# Hold out 34% of the samples for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.34,
)
C越大,相当于惩罚松弛变量,希望松弛变量接近0,即对误分类的惩罚增大,趋向于对训练集全分对的情况,这样对训练集测试时准确率很高,但泛化能力弱。 C值小,对误分类的惩罚减小,允许容错,将他们当成噪声点,泛化能力较强。
kernel='linear' 时,为线性核
decision_function_shape='ovr' 时,为 one v rest,即一个类别与其他类别进行划分。
decision_function_shape='ovo' 时,为 one v one,即将类别两两之间进行划分,用二分类的方法模拟多分类的结果。
#**********************SVM分类器构建*************************
def classifier():
    """Build the unfitted support vector classifier used in this walkthrough.

    Returns:
        An ``svm.SVC`` configured with a linear kernel, ``C=4.5`` and a
        one-vs-rest decision function shape.
    """
    svc_params = {
        'C': 4.5,                          # penalty on the error term
        'kernel': 'linear',                # 'rbf' would select the Gaussian kernel
        'decision_function_shape': 'ovr',  # one-vs-rest shaped decision values
    }
    # RBF alternative kept from the original notes:
    #   svm.SVC(C=0.8, kernel='rbf', gamma=50, decision_function_shape='ovr')
    return svm.SVC(**svc_params)
# 2. Define the model: build the SVM classifier
clf = classifier()
#***********************训练模型*****************************
def train(clf,x_train,y_train):
    """Fit ``clf`` on the training split.

    Args:
        clf: Estimator exposing ``fit(X, y)``.
        x_train: Training feature matrix, passed to ``fit`` unchanged.
        y_train: Training labels. They are flattened to 1-D before fitting,
            so a column vector or a nested list is accepted.

    Returns:
        None. The call is made for its effect on ``clf``.
    """
    # np.ravel works on any array-like, unlike the ndarray-only .ravel() method.
    clf.fit(x_train, np.ravel(y_train))
# 3. Train the SVM model on the training split
train(clf,x_train,y_train)
#**************并判断a b是否相等,计算acc的均值*************
def show_accuracy(a, b, tip):
    """Print the fraction of positions at which two label arrays agree.

    Args:
        a: Array of labels (e.g. predictions); flattened before comparing.
        b: Array of labels (e.g. ground truth); flattened before comparing.
        tip: Text printed in front of the accuracy figure.
    """
    flat_a = a.ravel()
    flat_b = b.ravel()
    hits = flat_a == flat_b
    # The mean of the boolean hit mask is the share of matching labels.
    hit_rate = np.mean(hits)
    print('%s Accuracy:%.3f' % (tip, hit_rate))
def print_accuracy(clf,x_train,y_train,x_test,y_test):
    """Report how a fitted classifier does on the training and test splits.

    Prints, in order: the estimator's own ``score`` on each split, the
    accuracy recomputed from ``predict`` via ``show_accuracy`` on each
    split, and the ``decision_function`` values for the training features.

    Args:
        clf: Fitted estimator exposing ``score``, ``predict`` and
            ``decision_function``.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
    """
    # Accuracy as reported by the estimator itself.
    train_score = clf.score(x_train, y_train)
    print('trianing prediction:%.3f' % (train_score,))
    test_score = clf.score(x_test, y_test)
    print('test data prediction:%.3f' % (test_score,))

    # Same figure recomputed by comparing predictions with the true labels.
    splits = (
        (x_train, y_train, 'traing data'),
        (x_test, y_test, 'testing data'),
    )
    for features, labels, tip in splits:
        show_accuracy(clf.predict(features), labels, tip)

    # Raw decision values for every training sample.
    print('decision_function:\n', clf.decision_function(x_train))
# 4. Evaluate the model on the training and test splits
print_accuracy(clf,x_train,y_train,x_test,y_test)
trianing prediction:0.949
test data prediction:0.981
traing dataAccuracy:0.949
testing data Accuracy:0.981
decision_function:
[[-0.28511551 0.75134361 2.29949102] [-0.26974562 1.14329711
2.25906204] [-0.28533774 0.7765415 2.296209 ] [-0.27610295 1.01515872 2.27563096] [-0.25846862 2.23714 1.16559583] [-0.23697606 2.2604873 0.82415842] [-0.29010407 0.74461096
2.30299551] [-0.2312083 2.27098667 0.7748048 ] [ 2.23847258 1.3083452 -0.3123076 ] [-0.26712727 1.21250938 2.2315817 ] [-0.26246778 1.22680441 2.20386853] [ 2.23911297 1.30919283
-0.31294081] [ 2.24717505 1.30982316 -0.31377967] [-0.27785033 0.89056789 2.28202443] [ 2.24690829 1.3094283 -0.31349321] [-0.25866887 1.23016767 2.18408193] [-0.2616156 1.20963165
2.22046128] [-0.23882917 2.26354069 0.8148892 ] [-0.24868798 2.25708785 0.89914189] [-0.27377792 0.97401576 2.27466412] [-0.2556862 2.23899147 1.14390451] [ 2.24636978 1.30859741
-0.31289434] [-0.23967907 2.27463577 0.7735167 ] [-0.23850813 2.26680301 0.80026683] [-0.27265412 1.09823536 2.26765863] [-0.18706845 2.29075241 0.717563 ] [-0.27879139 0.83407467
2.28640248] [-0.23292763 2.27567155 0.76297874] [ 2.24840202 1.3090801 -0.31333567] [ 2.23660532 1.30871466 -0.31248168] [ 2.25478814 1.30931776 -0.31386716] [-0.27377792 0.97401576 2.27466412] [ 2.24346907 1.30970985 -0.31351513] [-0.26728393 1.20130133 2.23869247] [-0.2714871 1.15974394 2.25875329] [-0.26399581 1.20642893 2.22853633] [ 2.24480305 1.3089595
-0.31305889] [ 2.23116706 1.30769913 -0.31149848] [-0.2332875 2.27323691 0.77035264] [ 2.23660532 1.30871466 -0.31248168] [-0.28344712 0.80198332 2.29240959] [-0.23850813 2.26680301
0.80026683] [-0.28816489 0.75199776 2.30092725] [-0.26638405 1.18993017 2.2421202 ] [-0.25330869 2.25932553 0.91567442] [-0.26670367 1.15374145 2.25294845] [-0.27817922 0.81878095
2.28726104] [-0.18933212 2.28655396 0.7240306 ] [ 2.25898113 1.30949297 -0.31424761] [-0.22180668 2.27914197 0.74675214] [ 2.22754249 1.30930452 -0.31253004] [-0.22428169 2.28085009 0.74428392] [-0.2573104 2.23124206 1.17607022] [ 2.23847258 1.3083452 -0.3123076 ] [-0.25846862 2.23714 1.16559583] [ 2.24057836 1.30840873 -0.31245742] [ 2.22220171 1.3065231 -0.31023733] [-0.27097863 1.0670203 2.26789817] [-0.26124517 1.2279496 2.19788287] [-0.2779604 0.86013373 2.2838974 ] [ 2.24508294 1.30936986 -0.31335793] [ 2.24840202 1.3090801 -0.31333567] [-0.25268898 2.24077301 1.11576984] [ 2.23660532 1.30871466 -0.31248168] [-0.26728393 1.20130133 2.23869247] [ 2.23153963 1.30815264 -0.31184494] [-0.26638405 1.18993017 2.2421202 ] [-0.25353745 2.25526631 0.97177439] [ 2.24317883 1.30931114 -0.3132208 ] [-0.26654425 1.17386312 2.24787597] [ 2.25107516 1.31068018 -0.31457189] [-0.225545 2.27432218 0.76040048] [ 2.23693935 1.30913324 -0.31279787] [-0.21779973 2.28323389 0.73657323] [-0.21683537 2.28652883 0.73003149] [ 2.24865966 1.30948645 -0.31362667] [ 2.23911297 1.30919283 -0.31294081] [-0.18176273 2.29093656 0.71669389] [ 2.26244793 1.31143358 -0.31571836] [-0.2312083 2.27098667 0.7748048 ] [-0.25399108 2.24564741 1.09519758] [ 2.2428867 1.30889874
-0.3129176 ] [-0.25085893 2.2471885 1.04896913] [-0.2336448 2.27058761 0.77894929] [-0.25786136 2.2533556 1.06641552] [-0.24919825 2.2479389 1.01840075] [-0.26728393 1.20130133
2.23869247] [ 2.2336055 1.30776599 -0.31166002] [ 2.22899783 1.30808779 -0.31168617] [-0.27762889 0.99334782 2.2778178 ] [-0.22512716 2.2766714 0.75433278] [-0.27785033 0.89056789
2.28202443] [-0.19343185 2.28783229 0.72275308] [-0.26622309 1.20305596 2.23553311] [-0.27175995 1.10374143 2.26615264] [-0.26479267 1.2244377 2.21438166] [-0.27365058 1.04339035
2.27200737] [ 2.23660532 1.30871466 -0.31248168]]
def draw(clf, x):
    """Plot the classifier's decision regions over a 2-D feature space.

    A 200x200 grid spanning the range of the two feature columns is
    classified and drawn as a coloured background; the samples in ``x`` are
    drawn on top, and the test samples are marked with larger markers.

    Besides its arguments, this function reads the module-level names ``y``
    (labels matching the rows of ``x``) and ``x_test`` (test features).

    Args:
        clf: Fitted estimator exposing ``decision_function`` and ``predict``.
        x: 2-D array whose first two columns are the plotted features.
    """
    iris_feature = 'sepal length', 'sepal width', 'petal length', 'petal width'
    # The script builds x from columns 1:3 of the raw data, i.e. sepal width
    # and petal length, so the axes must be labelled with those two names.
    x_label, y_label = iris_feature[1], iris_feature[2]

    # Value range of each plotted feature.
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()

    # Grid of sample points covering that range, flattened to one
    # (x1, x2) pair per row.
    x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)
    print('grid_test:\n', grid_test)

    # Decision values for every grid point.
    z = clf.decision_function(grid_test)
    print('the distance to decision plane:\n', z)

    # Predicted class for every grid point, reshaped back onto the grid.
    grid_hat = clf.predict(grid_test)
    print('grid_hat:\n', grid_hat)
    grid_hat = grid_hat.reshape(x1.shape)

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'b', 'r'])

    # Background: the predicted class regions.
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    # All samples, coloured by their true class.
    plt.scatter(x[:, 0], x[:, 1], c=np.squeeze(y), edgecolor='k', s=50, cmap=cm_dark)
    # Test samples, drawn larger and on top.
    plt.scatter(x_test[:, 0], x_test[:, 1], s=120, facecolor='none', zorder=10)

    plt.xlabel(x_label, fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title('svm in iris data classification', fontsize=30)
    plt.grid()
    plt.show()
# 5. Use the model: draw its decision regions over the selected features
draw(clf,x)
grid_test: [[2. 1. ] [2. 1.02964824] [2.
1.05929648] … [4.4 6.84070352] [4.4 6.87035176] [4.4 6.9 ]] the distance to decision plane: [[
2.23603876 1.31062179 -0.3138434 ] [ 2.2345731 1.31048229 -0.31368203] [ 2.2330626 1.31034107 -0.31351797] … [-0.28340112 0.74683883 2.29943538] [-0.28377855 0.7451476 2.2999127 ] [-0.28415031 0.74352625 2.30037676]] grid_hat: [0. 0. 0. … 2. 2.
2.]
欢迎大家加我微信交流讨论
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。