赞
踩
- # -*- coding: utf-8 -*-
-
- from logisticRegression import *
- from numpy import *
- import operator
-
- #知道了Iris共有三种类别Iris-setosa,Iris-versicolor和Iris-virginica
def loadDataSet(filename):
    """Load a comma-separated data file into feature rows and labels.

    Each non-blank line is expected to contain numeric feature values
    followed by a class label in the last column.  The number of features
    is taken from the first non-blank line and applied to every row.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple.  ``dataMat`` is a list of rows of
        the form ``[1, f1, ..., fn]`` (the leading 1 is the constant x0
        term) and ``labelMat`` is the list of label strings in file order.

    Raises:
        IOError: If the file cannot be opened.
        ValueError: If a feature field cannot be parsed as a float.
        IndexError: If a row has fewer fields than the first row.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # Read the file once inside a context manager so the handle is always
    # closed (the file used to be opened twice and never closed).
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines, e.g. a trailing empty line at end of
                # file, which would otherwise fail in float('').
                continue
            curLine = stripped.split(',')
            if numFeat is None:
                # Every column except the last (the label) is a feature.
                numFeat = len(curLine) - 1
            lineArr = [float(curLine[i]) for i in range(numFeat)]
            dataMat.append([1] + lineArr)  # prepend x0 = 1
            labelMat.append(curLine[-1])
    return dataMat, labelMat
-
# Shared configuration for the one-vs-rest classifiers.
voteResult = [0, 0, 0]  # one vote slot per class
categorylabels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']  # class labels
opts = {'alpha': 0.01, 'maxIter': 100, 'optimizeType': 'smoothStocGradDescent'}

# Training: fit one binary classifier per class (that class vs. the rest).
dataMat, labelMat = loadDataSet('train.txt')
# Convert the feature rows to a matrix once; the conversion does not depend
# on the class being trained, so it does not belong inside the loop.
dataMat = mat(dataMat)

weight1 = []  # weight1[i] is the logisticRegression result for categorylabels[i]
for i in range(len(categorylabels)):
    # Recode the string labels as 1 for the current class and 0 otherwise,
    # then shape them as a column matrix with one row per sample.
    labelMat1 = [1 if label == categorylabels[i] else 0 for label in labelMat]
    labelMat1 = mat(labelMat1).T
    weight1.append(logisticRegression(dataMat, labelMat1, opts))
-
# Testing: score every test sample with each one-vs-rest classifier.
dataMat, labelMat = loadDataSet('test.txt')
dataMat = mat(dataMat)

numClasses = len(categorylabels)
h = [0] * len(labelMat)  # h[j] becomes the predicted class index of sample j

for j in range(len(labelMat)):
    voteResult = [0] * numClasses
    for i in range(numClasses):
        # Output of classifier i for sample j, kept in its own variable
        # instead of temporarily overwriting h[j].
        prob = float(sigmoid(dataMat[j] * weight1[i]))
        if 0.5 < prob <= 1:
            # With so few classes ties are likely, so the probability is
            # added to the vote to break them.
            voteResult[i] = voteResult[i] + 1 + prob
        elif 0 <= prob <= 0.5:
            voteResult[i] = voteResult[i] - 1 + prob
        else:
            # Reached only when prob is outside [0, 1] (e.g. NaN).
            print('Properbility wrong!')
    h[j] = voteResult.index(max(voteResult))
print(h)

# Map each true label name to its class index (0, 1 or 2).  list.index
# raises ValueError for an unknown label instead of silently skipping it,
# which would leave labelMat2 shorter than h and misalign the comparison.
labelMat2 = [categorylabels.index(label) for label in labelMat]

# Accuracy: count mismatches between predicted and true class indices.
error = 0.0
for predicted, actual in zip(h, labelMat2):
    if predicted != actual:
        error = error + 1

pro = 1 - error / len(labelMat)  # fraction of correctly classified samples
print(pro)
</pre><pre class="python" name="code">
没有优化的情况下的准确率:(原文此处的结果截图缺失)
2. 第二种多分类方法为“所有对所有”(All-versus-all,AVA),也就是每次针对一对类别学习一个分类器(one-vs-one)。假定有 M 类,那么要构建 M(M-1)/2 个二元分类器,每一个分类器都只使用它所要区分的两个类的元组来训练。为了对未知元组分类,所有的分类器投票表决,该元组被指派到得票数最多的类。一般来说,“所有对所有”优于“一对所有”:它解决了类别不平衡的问题,但会占用更多的空间。下面的程序主要修改了训练过程:
- # -*- coding: utf-8 -*-
-
- from logisticRegression import *
- from numpy import *
- import operator
-
- #知道了Iris共有三种类别Iris-setosa,Iris-versicolor和Iris-virginica
def loadDataSet(filename):
    """Load a comma-separated data file into feature rows and labels.

    Each non-blank line is expected to contain numeric feature values
    followed by a class label in the last column.  The number of features
    is taken from the first non-blank line and applied to every row.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple.  ``dataMat`` is a list of rows of
        the form ``[1, f1, ..., fn]`` (the leading 1 is the constant x0
        term) and ``labelMat`` is the list of label strings in file order.

    Raises:
        IOError: If the file cannot be opened.
        ValueError: If a feature field cannot be parsed as a float.
        IndexError: If a row has fewer fields than the first row.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # Read the file once inside a context manager so the handle is always
    # closed (the file used to be opened twice and never closed).
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines, e.g. a trailing empty line at end of
                # file, which would otherwise fail in float('').
                continue
            curLine = stripped.split(',')
            if numFeat is None:
                # Every column except the last (the label) is a feature.
                numFeat = len(curLine) - 1
            lineArr = [float(curLine[i]) for i in range(numFeat)]
            dataMat.append([1] + lineArr)  # prepend x0 = 1
            labelMat.append(curLine[-1])
    return dataMat, labelMat
-
# Shared configuration for the all-versus-all (pairwise) classifiers.
voteResult = [0, 0, 0]  # one vote slot per class
categorylabels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']  # class labels
opts = {'alpha': 0.01, 'maxIter': 50, 'optimizeType': 'smoothStocGradDescent'}

# Training: fit one binary classifier per pair of classes.
dataMat, labelMat = loadDataSet('train.txt')
dataMat = mat(dataMat)

numClasses = len(categorylabels)
weight1 = []  # weight1[i] separates class i (label 1) from class (i + 1) % 3 (label 0)
for i in range(numClasses):
    positive = categorylabels[i]
    negative = categorylabels[(i + 1) % numClasses]
    # Select the rows of the two classes by their labels rather than by
    # fixed index ranges, so the file need not be sorted by class nor
    # contain exactly 40 samples per class.
    rows = [k for k, label in enumerate(labelMat)
            if label == positive or label == negative]
    # 0/1 labels as a column matrix with one row per selected sample.
    # NOTE(review): assumes logisticRegression expects a column matrix of
    # labels rather than a plain list -- confirm against its definition.
    labelMat1 = mat([1 if labelMat[k] == positive else 0 for k in rows]).T
    weight1.append(logisticRegression(dataMat[rows, :], labelMat1, opts))
-
# Testing: let every pairwise classifier vote on each test sample.
dataMat, labelMat = loadDataSet('test.txt')
dataMat = mat(dataMat)

numClasses = len(categorylabels)
h = [0] * len(labelMat)  # h[j] becomes the predicted class index of sample j

for j in range(len(labelMat)):
    voteResult = [0] * numClasses
    for i in range(numClasses):
        # Classifier i votes for class i when its output is above 0.5 and
        # for class (i + 1) % numClasses otherwise; the modulo makes the
        # last classifier wrap around to class 0.
        prob = float(sigmoid(dataMat[j] * weight1[i]))
        if 0.5 < prob <= 1:
            voteResult[i] = voteResult[i] + 1
        elif 0 <= prob <= 0.5:
            voteResult[(i + 1) % numClasses] = voteResult[(i + 1) % numClasses] + 1
        else:
            # Reached only when prob is outside [0, 1] (e.g. NaN).
            print('Properbility wrong!')
    # On a tie, index() returns the lowest class index among the leaders.
    h[j] = voteResult.index(max(voteResult))

print(h)

# Map each true label name to its class index (0, 1 or 2).  list.index
# raises ValueError for an unknown label instead of silently skipping it,
# which would leave labelMat2 shorter than h and misalign the comparison.
labelMat2 = [categorylabels.index(label) for label in labelMat]

# Accuracy: count mismatches between predicted and true class indices.
error = 0.0
for predicted, actual in zip(h, labelMat2):
    if predicted != actual:
        error = error + 1

pro = 1 - error / len(labelMat)  # fraction of correctly classified samples
print(pro)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。