赞
踩
原始数据:行为特征(基因),列为样本(两类,B为与癌症无关记-1类,A为癌症有关记1类)34262*140
2.数据预处理
import numpy as np import random from sklearn.decomposition import PCA def loadData(file): ##输入为 行为特征,列为样本 dataMat = [] f=open(file) for line in f.readlines(): dataMat.append(line.split(',')[1::]) dataMat=dataMat[1::] m=len(dataMat) ##特征数 n=len(dataMat[0]) ##样本数 # print(m,n) list2=[] for i in range(n): list1 = [] for j in range(m): # print(j) list1.append(float(dataMat[j][i])) list2.append(list1) labelMat = [1 for i in range(len(dataMat[0]))] for i in range(len(labelMat)): if i % 2 == 0: labelMat[i] = -1 # print(labelMat) return list2,labelMat ##已经变为行为样本,列为特征 def devideData(dataMat,labelMat): m,n=np.shape(dataMat) train=[] test=[] train_label=[] test_label=[] label1=[] ##存类别为1的样本 label2=[] ##存类别为-1的样本 for i in range(m): if labelMat[i]==1: label1.append(dataMat[i]) else: label2.append(dataMat[i]) # print(type(label1)) num1=len(label1) num2=len(label2) for i in range(int(0.8*num1)): r=random.randint(0,len(label1)-1) train.append(label1[r]) label1.pop(r) train_label.append(1) for j in range(int(0.8*num2)): r=random.randint(0,len(label2)-1) train.append(label2[r]) label2.pop(r) train_label.append(-1) for i in range(len(label1)): test.append(label1[i]) test_label.append(1) for j in range(len(label2)): test.append(label2[j]) test_label.append(-1) return train,train_label,test,test_label if __name__=='__main__': file = r'C:\Users\admin\Desktop\testsets_HCC.csv' ##输出的dataMat是一个行为样本,列为特征的列表 140*34262 dataMat,labelMat=loadData(file) ##主成成分分析,调n_components测试 pca=PCA(n_components=140) ##ValueError: n_components='mle' is only supported if n_samples >= n_features newdataMat=pca.fit_transform(dataMat) train,train_label,test,test_label=devideData(newdataMat,labelMat) for i in range(1): print(type(train[i]))
3.基本模型
(1)simple SVM(SMO算法的外循环简略)
import random import numpy as np import matplotlib.pyplot as plt #########比赛用的时候一定要复制粘贴一份!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! def dataloader(filename): dataMat=[] labelMat=[] f=open(filename) for line in f.readlines(): res=line.strip().split('\t') dataMat.append([float(res[0]),float(res[1])]) labelMat.append(float(res[2])) return dataMat,labelMat def select_int(i,m): ##i是第一个a的下标,m是a的总数目 j=i while j==i: j=int(random.uniform(0,m)) return j ##调整a的大小 def clipalpha(aj,h,l): if aj>h: aj=h if aj<l: aj=l return aj def simple_SOM(dataMat,labelMat,C,toler,maxinter): dataMatrix=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(dataMatrix) alphas=np.mat(np.zeros((m,1))) b=0 iter=0 while iter<maxinter: alphapairschanged=0 for i in range(m): fxi=float((np.multiply(alphas,labelMatrix).transpose())*(dataMatrix*dataMatrix[i,:].transpose()))+b Ei=fxi-float(labelMat[i]) if ((labelMat[i]*Ei<-toler) and (alphas[i]<C)) or ((labelMat[i]*Ei>toler) and (alphas[i]>0)): j=select_int(i,m) fxj=float((np.multiply(alphas,labelMatrix).transpose())*(dataMatrix*dataMatrix[j,:].transpose()))+b # print('fxj:',fxj) # print('b:',b) Ej=fxj-float(labelMat[j]) alphaIold=alphas[i].copy() alphaJold = alphas[j].copy() if labelMat[i]!=labelMat[j]: L=max(0,alphas[j]-alphas[i]) H=min(C,C+alphas[j]-alphas[i]) else: L=max(0,alphas[j]+alphas[i]-C) H=min(C,alphas[j]+alphas[i]) if L==H: print('L==H') continue nn=2*dataMatrix[i,:]*dataMatrix[j,:].transpose()-dataMatrix[i,:]*dataMatrix[i,:].transpose()-dataMatrix[j,:]*dataMatrix[j,:].transpose() if nn>=0: print('nn>=0') continue # print(Ei,Ej,nn) alphas[j]-=labelMat[j]*(Ei-Ej)/nn # print(alphas[j]) alphas[j]=clipalpha(alphas[j],H,L) if abs(alphas[j]-alphaJold)<0.00001: print('J not moving enough') alphas[i]+=labelMat[j]*labelMat[i]*(alphaJold-alphas[j]) b1=b-Ei-labelMat[i]*dataMatrix[i,:]*dataMatrix[i,:].transpose()*(alphas[i]-alphaIold)-labelMat[j]*dataMatrix[i,:]*dataMatrix[j,:].transpose()*(alphas[j]-alphaJold) b2=b-Ej-labelMat[i]*dataMatrix[i,:]*dataMatrix[j,:].transpose()*(alphas[i]-alphaIold)-labelMat[j]*dataMatrix[j,:]*dataMatrix[j,:].transpose()*(alphas[j]-alphaJold) if alphas[i]>0 and C>alphas[i]: b=b1 elif alphas[j]>0 and C>alphas[j]: b=b2 else: b=(b1+b2)/2.0 alphapairschanged+=1 print('inter:%d i:%d,pairs changed:%d'%(iter,i,alphapairschanged)) if alphapairschanged==0: iter+=1 else: iter=0 print('interaction number:%d'%(iter)) return b,alphas ##计算超平面的w def calWs(alphas,dataMat,labelMat): X=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(X) w=np.zeros((n,1)) for i in range(m): w+=np.multiply(alphas[i]*labelMatrix[i],X[i,:].transpose()) return w def Plot_final(dataMat,labelMat,b0,slope): dataArray=np.array(dataMat) n,m=np.shape(dataArray) x1=[] y1=[] x2=[] y2=[] for i in range(n): if int(labelMat[i])==1: x1.append(dataArray[i,0]) y1.append(dataArray[i,1]) else: x2.append(dataArray[i,0]) y2.append(dataArray[i,1]) fig=plt.figure() ax=fig.add_subplot(111) ax.scatter(x1,y1,s=30,c='red',marker='s') ax.scatter(x2,y2,s=30,c='green') x=np.arange(-2.0, 10.0, 0.1) y=slope*x+b0 ax.plot(x, y) plt.xlabel('x1') plt.ylabel('x2') plt.show() if __name__=='__main__': filename=r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch06\testSet.txt' dataMat,labelMat=dataloader(filename) b,alphas=simple_SOM(dataMat,labelMat,0.6,0.001,40) # m,n=np.shape(b) # print(b) label1=[] label2=[] for i in range(100): if alphas[i]>0 and labelMat[i]==1: label1.append(dataMat[i]) if alphas[i]>0 and labelMat[i]==-1: label2.append(dataMat[i]) print('支持向量1: ', label1) print('支持向量2:', label2) slope=(label2[1][1]-label2[0][1])/(label2[1][0]-label2[0][0]) b1=label2[0][1]-slope*label2[0][0] b2=label1[0][1]-slope*label1[0][0] b0=(b1+b2)/2 ##算出来的b效果并不是很好,不如b0 Plot_final(dataMat,labelMat,b0,slope) ws = calWs(alphas, dataMat, labelMat) print(ws) ##进行分类预测,用第一个测试点测试,若算出来的值小于0则预测为-1类 guess = np.mat(dataMat)[0] * np.mat(ws) + b print(guess, labelMat[0])
(2)外循环完整版本
import random import numpy as np import matplotlib.pyplot as plt ##########此方法比simple版本快,但是鲁棒性不太好 #########比赛用的时候一定要复制粘贴一份!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! def dataloader(filename): dataMat=[] labelMat=[] f=open(filename) for line in f.readlines(): res=line.strip().split('\t') dataMat.append([float(res[0]),float(res[1])]) labelMat.append(float(res[2])) return dataMat,labelMat def select_int(i,m): ##i是第一个a的下标,m是a的总数目 j=i while j==i: j=int(random.uniform(0,m)) return j ##调整a的大小 def clipalpha(aj,h,l): if aj>h: aj=h if aj<l: aj=l return aj class altStruct: def __init__(self,dataMat,labelMat,C,toler): self.X=dataMat self.L=np.mat(labelMat).transpose() self.LL=labelMat self.C=C self.T=toler self.m=np.shape(self.X)[0] self.alphas=np.mat(np.zeros((self.m,1))) self.b=0 self.echae=np.mat(np.zeros((self.m,2))) def getEk(self,oS,k): fxk=float(np.multiply(oS.alphas,oS.L).transpose()*oS.X*oS.X[k,:].transpose())+oS.b Ek=fxk-float(oS.L[k]) return Ek ## 内循环中的启发式方法,寻找a2 def selectJ(self,i,oS,Ei): ##寻找步长最大的来增加查找速度 maxK=-1 maxdeltaE=0 Ej=0 ValidEcacheList=np.nonzero(self.echae[:,0].A)[0] ##获取非零的E所在的对应的行索引 if len(ValidEcacheList)>1: for k in ValidEcacheList: if k==i: continue Ek=oS.getEk(oS,k) deltaE=abs(Ek-Ei) if deltaE>maxdeltaE: maxdeltaE=deltaE maxK=k Ej=Ek return maxK,Ej else: j=select_int(i,oS.m) Ej=oS.getEk(oS,j) return j,Ej def updateEk(self,oS,k): Ek=oS.getEk(oS,k) oS.echae[k]=[1,Ek] ##内循环 def inner(i,oS): Ei=oS.getEk(oS,i) if ((oS.LL[i]*Ei<-oS.T) and (oS.alphas[i]<oS.C)) or ((oS.LL[i]*Ei>oS.T) and (oS.alphas[i]>0)): j,Ej=oS.selectJ(i,oS,Ei) alphaIold=oS.alphas[i].copy() alphaJold=oS.alphas[j].copy() if oS.LL[i] != oS.LL[j]: L = max(0, oS.alphas[j] - oS.alphas[i]) H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) else: L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) H = min(oS.C, oS.alphas[j] + oS.alphas[i]) if L==H: print('L==H') return 0 nn=2*oS.X[i,:]*oS.X[j,:].transpose()-oS.X[i,:]*oS.X[i,:].transpose()-oS.X[j,:]*oS.X[j,:].transpose() if nn >= 0: print('nn>=0') return 0 print(oS.alphas[j]) print(oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn) oS.alphas[j]-=oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn oS.alphas[j]=clipalpha(oS.alphas[j],H,L) oS.updateEk(oS,j) if abs(oS.alphas[j] - alphaJold) < 0.00001: print('J not moving enough') return 0 oS.alphas[i] += oS.LL[j] * oS.LL[i] * (alphaJold - oS.alphas[j]) oS.updateEk(oS,i) b1 = oS.b - Ei - oS.LL[i] * oS.X[i, :] * oS.X[i, :].transpose() * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.X[i, :] * oS.X[j, :].transpose() * (oS.alphas[j] - alphaJold) b2 = oS.b - Ej - oS.LL[i] * oS.X[i, :] * oS.X[j, :].transpose() * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.X[j, :] * oS.X[j, :].transpose() * (oS.alphas[j] - alphaJold) if oS.alphas[i] > 0 and oS.C > oS.alphas[i]: oS.b = b1 elif oS.alphas[j] > 0 and oS.C > oS.alphas[j]: oS.b = b2 else: oS.b = (b1 + b2) / 2.0 return 1 else: return 0 ##外循环,寻找a1 def Outcircle(dataMat,labelMat,C,toler,maxIter): oS=altStruct(np.mat(dataMat),labelMat,C,toler) iter=0 entireSet=True alphapairsChanged=0 ##main body while (iter<maxIter) and ((alphapairsChanged>0) or (entireSet)): ##当超过迭代次数或没有a更新时退出 alphapairsChanged=0 ##遍历所有值 if entireSet: for i in range(oS.m): alphapairsChanged+=inner(i,oS) print('fullset,iter:%d i:%d, pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##遍历在间隔边界上的支持向量,即满足KKT条件的0<ai<C的, (遍历非‘边界’值) else: nonBoundIs=np.nonzero((oS.alphas.A>0)*(oS.alphas.A<C))[0] for i in nonBoundIs: alphapairsChanged+=inner(i,oS) print('nonbound,iter:%d i:%d,pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##使得遍历全值和遍历非‘边界’值交叉进行 if entireSet: entireSet=False elif alphapairsChanged==0: entireSet=True print('interaction number:%d'%(iter)) ##迭代次数iter return oS.b,oS.alphas ##计算超平面的w def calWs(alphas,dataMat,labelMat): X=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(X) w=np.zeros((n,1)) for i in range(m): w+=np.multiply(alphas[i]*labelMatrix[i],X[i,:].transpose()) return w def Plot_final(dataMat,labelMat,b0,slope): dataArray=np.array(dataMat) n,m=np.shape(dataArray) x1=[] y1=[] x2=[] y2=[] for i in range(n): if int(labelMat[i])==1: x1.append(dataArray[i,0]) y1.append(dataArray[i,1]) else: x2.append(dataArray[i,0]) y2.append(dataArray[i,1]) fig=plt.figure() ax=fig.add_subplot(111) ax.scatter(x1,y1,s=30,c='red',marker='s') ax.scatter(x2,y2,s=30,c='green') x=np.arange(-2.0, 10.0, 0.1) y=slope*x+b0 ax.plot(x, y) plt.xlabel('x1') plt.ylabel('x2') plt.show() if __name__=='__main__': filename=r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch06\testSet.txt' dataMat,labelMat=dataloader(filename) b,alphas=Outcircle(dataMat,labelMat,0.6,0.001,40) # print(type(b)) label1=[] label2=[] for i in range(100): if alphas[i]>0 and labelMat[i]==1: label1.append(dataMat[i]) if alphas[i]>0 and labelMat[i]==-1: label2.append(dataMat[i]) print('支持向量1: ', label1) print('支持向量2:',label2) slope=(label2[1][1]-label2[0][1])/(label2[1][0]-label2[0][0]) b1=label2[0][1]-slope*label2[0][0] b2=label1[0][1]-slope*label1[0][0] b0=(b1+b2)/2 Plot_final(dataMat,labelMat,b0,slope) ws=calWs(alphas,dataMat,labelMat) print('分隔面的法向量:',ws) ##进行分类预测,用第一个测试点测试,若算出来的值小于0则预测为-1类 guess=np.mat(dataMat)[0]*np.mat(ws)+b print('one of the guess sample :',guess,'its real label:',labelMat[0]) m, n = np.shape(dataMat) errorcount = 0 for i in range(m): predict =np.mat(dataMat)[0]*np.mat(ws)+b if np.sign(predict) != np.sign(labelMat[i]): errorcount += 1 print('training error rate is:', float(errorcount / m))
(3)使用高斯核函数版本
import random import numpy as np import matplotlib.pyplot as plt ##########此方法比simple版本快,但是鲁棒性不太好 #########比赛用的时候一定要复制粘贴一份!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! def dataloader(filename): dataMat=[] labelMat=[] f=open(filename) for line in f.readlines(): res=line.strip().split('\t') dataMat.append([float(res[0]),float(res[1])]) labelMat.append(float(res[2])) return dataMat,labelMat def select_int(i,m): ##i是第一个a的下标,m是a的总数目 j=i while j==i: j=int(random.uniform(0,m)) return j ##调整a的大小 def clipalpha(aj,h,l): if aj>h: aj=h if aj<l: aj=l return aj class altStruct: def __init__(self,dataMat,labelMat,C,toler,kTup): self.X=dataMat self.L=np.mat(labelMat).transpose() self.LL=labelMat self.C=C self.T=toler self.m=np.shape(self.X)[0] self.alphas=np.mat(np.zeros((self.m,1))) self.b=0 self.echae=np.mat(np.zeros((self.m,2))) self.K=np.mat(np.zeros((self.m,self.m))) for i in range(self.m): self.K[:,i]=kernelTrans(self.X,self.X[i,:],kTup) def getEk(self,oS,k): fxk=float(np.multiply(oS.alphas,oS.L).transpose()*oS.K[:,k]+oS.b) Ek=fxk-float(oS.L[k]) return Ek ## 内循环中的启发式方法,寻找a2 def selectJ(self,i,oS,Ei): ##寻找步长最大的来增加查找速度 maxK=-1 maxdeltaE=0 Ej=0 ValidEcacheList=np.nonzero(self.echae[:,0].A)[0] ##获取非零的E所在的对应的行索引 if len(ValidEcacheList)>1: for k in ValidEcacheList: if k==i: continue Ek=oS.getEk(oS,k) deltaE=abs(Ek-Ei) if deltaE>maxdeltaE: maxdeltaE=deltaE maxK=k Ej=Ek return maxK,Ej else: j=select_int(i,oS.m) Ej=oS.getEk(oS,j) return j,Ej def updateEk(self,oS,k): Ek=oS.getEk(oS,k) oS.echae[k]=[1,Ek] ##内循环 def inner(i,oS): Ei=oS.getEk(oS,i) if ((oS.LL[i]*Ei<-oS.T) and (oS.alphas[i]<oS.C)) or ((oS.LL[i]*Ei>oS.T) and (oS.alphas[i]>0)): j,Ej=oS.selectJ(i,oS,Ei) alphaIold=oS.alphas[i].copy() alphaJold=oS.alphas[j].copy() if oS.LL[i] != oS.LL[j]: L = max(0, oS.alphas[j] - oS.alphas[i]) H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) else: L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) H = min(oS.C, oS.alphas[j] + oS.alphas[i]) if L==H: print('L==H') return 0 nn=2*oS.K[i,j]-oS.K[i,i]-oS.K[j,j] if nn >= 0: print('nn>=0') return 0 print(oS.alphas[j]) print(oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn) oS.alphas[j]-=oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn oS.alphas[j]=clipalpha(oS.alphas[j],H,L) oS.updateEk(oS,j) if abs(oS.alphas[j] - alphaJold) < 0.00001: print('J not moving enough') return 0 oS.alphas[i] += oS.LL[j] * oS.LL[i] * (alphaJold - oS.alphas[j]) oS.updateEk(oS,i) b1 = oS.b - Ei - oS.LL[i] * oS.K[i,i] * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[i,j]* (oS.alphas[j] - alphaJold) b2 = oS.b - Ej - oS.LL[i] * oS.K[i,j]* (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[j,j]* (oS.alphas[j] - alphaJold) if oS.alphas[i] > 0 and oS.C > oS.alphas[i]: oS.b = b1 elif oS.alphas[j] > 0 and oS.C > oS.alphas[j]: oS.b = b2 else: oS.b = (b1 + b2) / 2.0 return 1 else: return 0 ##外循环,寻找a1 def newOutcircle(dataMat,labelMat,C,toler,maxIter,kTup): oS=altStruct(np.mat(dataMat),labelMat,C,toler,kTup) iter=0 entireSet=True alphapairsChanged=0 ##main body while (iter<maxIter) and ((alphapairsChanged>0) or (entireSet)): ##当超过迭代次数或没有a更新时退出 alphapairsChanged=0 ##遍历所有值 if entireSet: for i in range(oS.m): alphapairsChanged+=inner(i,oS) print('fullset,iter:%d i:%d, pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##遍历在间隔边界上的支持向量,即满足KKT条件的0<ai<C的, (遍历非‘边界’值) else: nonBoundIs=np.nonzero((oS.alphas.A>0)*(oS.alphas.A<C))[0] for i in nonBoundIs: alphapairsChanged+=inner(i,oS) print('nonbound,iter:%d i:%d,pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##使得遍历全值和遍历非‘边界’值交叉进行 if entireSet: entireSet=False elif alphapairsChanged==0: entireSet=True print('interaction number:%d'%(iter)) ##迭代次数iter return oS.b,oS.alphas ##计算超平面的w def calWs(alphas,dataMat,labelMat): X=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(X) w=np.zeros((n,1)) for i in range(m): w+=np.multiply(alphas[i]*labelMatrix[i],X[i,:].transpose()) return w def kernelTrans(X,A,kTup): m,n=np.shape(X) K=np.mat(np.zeros((m,1))) if kTup[0]=='lin': K=X*A.transpose() ##高斯核函数 elif kTup[0]=='rbf': for j in range(m): deltaRow=X[j,:]-A K[j]=deltaRow*deltaRow.transpose() K=np.exp(K/(-1*kTup[1]**2)) else: raise NameError('Houston we have a problem that kernel is not recoginized') return K def Plot_final(dataMat,labelMat,b0,slope): dataArray=np.array(dataMat) n,m=np.shape(dataArray) x1=[] y1=[] x2=[] y2=[] for i in range(n): if int(labelMat[i])==1: x1.append(dataArray[i,0]) y1.append(dataArray[i,1]) else: x2.append(dataArray[i,0]) y2.append(dataArray[i,1]) fig=plt.figure() ax=fig.add_subplot(111) ax.scatter(x1,y1,s=30,c='red',marker='s') ax.scatter(x2,y2,s=30,c='green') x=np.arange(-2.0, 10.0, 0.1) y=slope*x+b0 ax.plot(x, y) plt.xlabel('x1') plt.ylabel('x2') plt.show() if __name__=='__main__': filename=r'F:\机器学习实战源代码和数据\machinelearninginaction\Ch06\testSet.txt' dataMat,labelMat=dataloader(filename) b,alphas=newOutcircle(dataMat,labelMat,0.6,0.001,40,kTup=('rbf',1.3)) ##C的调参很重要 # print(type(b)) # label1=[] # label2=[] # for i in range(100): # if alphas[i]>0 and labelMat[i]==1: # label1.append(dataMat[i]) # if alphas[i]>0 and labelMat[i]==-1: # label2.append(dataMat[i]) # print('支持向量1: ', label1) # print('支持向量2:',label2) # # slope=(label2[1][1]-label2[0][1])/(label2[1][0]-label2[0][0]) # b1=label2[0][1]-slope*label2[0][0] # b2=label1[0][1]-slope*label1[0][0] # b0=(b1+b2)/2 # Plot_final(dataMat,labelMat,b0,slope) ws=calWs(alphas,dataMat,labelMat) print(ws) ##进行分类预测,用第一个测试点测试,若算出来的值小于0则预测为-1类 # guess=np.mat(dataMat)[0]*np.mat(ws)+b # print(guess,labelMat[0]) ##计算误差 dataM=np.mat(dataMat) labelM=np.mat(labelMat).transpose() svInd=np.nonzero(alphas.A>0)[0] sVs=dataM[svInd] labelSV=labelM[svInd] m,n=np.shape(dataM) errorcount=0 for i in range(m): eva=kernelTrans(sVs,dataM[i,:],('rbf',1.3)) predict=eva.transpose()*np.multiply(labelSV,alphas[svInd])+b print('predict,real label:',predict,labelMat[i]) if np.sign(predict)!=np.sign(labelMat[i]): errorcount+=1 # print(errorcount,m) print('training error rate is:',float(errorcount/m))
4.实验(实际测试)
(1)不使用核函数的实战,测试集精确度:30%左右
import random import numpy as np import matplotlib.pyplot as plt import pre_processHCC import SVM_kernel ##########此方法比simple版本快,但是鲁棒性不太好 #########比赛用的时候一定要复制粘贴一份!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ## 没有用核函数,C也没有改,精准度很低 def dataloader(filename): dataMat=[] labelMat=[] f=open(filename) for line in f.readlines(): res=line.strip().split('\t') dataMat.append([float(res[0]),float(res[1])]) labelMat.append(float(res[2])) return dataMat,labelMat def select_int(i,m): ##i是第一个a的下标,m是a的总数目 j=i while j==i: j=int(random.uniform(0,m)) return j ##调整a的大小 def clipalpha(aj,h,l): if aj>h: aj=h if aj<l: aj=l return aj class altStruct: def __init__(self,dataMat,labelMat,C,toler): self.X=dataMat self.L=np.mat(labelMat).transpose() self.LL=labelMat self.C=C self.T=toler self.m=np.shape(self.X)[0] self.alphas=np.mat(np.zeros((self.m,1))) self.b=0 self.echae=np.mat(np.zeros((self.m,2))) def getEk(self,oS,k): fxk=float(np.multiply(oS.alphas,oS.L).transpose()*oS.X*oS.X[k,:].transpose())+oS.b Ek=fxk-float(oS.L[k]) return Ek ## 内循环中的启发式方法,寻找a2 def selectJ(self,i,oS,Ei): ##寻找步长最大的来增加查找速度 maxK=-1 maxdeltaE=0 Ej=0 ValidEcacheList=np.nonzero(self.echae[:,0].A)[0] ##获取非零的E所在的对应的行索引 if len(ValidEcacheList)>1: for k in ValidEcacheList: if k==i: continue Ek=oS.getEk(oS,k) deltaE=abs(Ek-Ei) if deltaE>maxdeltaE: maxdeltaE=deltaE maxK=k Ej=Ek return maxK,Ej else: j=select_int(i,oS.m) Ej=oS.getEk(oS,j) return j,Ej def updateEk(self,oS,k): Ek=oS.getEk(oS,k) oS.echae[k]=[1,Ek] ##内循环 def inner(i,oS): Ei=oS.getEk(oS,i) if ((oS.LL[i]*Ei<-oS.T) and (oS.alphas[i]<oS.C)) or ((oS.LL[i]*Ei>oS.T) and (oS.alphas[i]>0)): j,Ej=oS.selectJ(i,oS,Ei) alphaIold=oS.alphas[i].copy() alphaJold=oS.alphas[j].copy() if oS.LL[i] != oS.LL[j]: L = max(0, oS.alphas[j] - oS.alphas[i]) H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) else: L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) H = min(oS.C, oS.alphas[j] + oS.alphas[i]) if L==H: print('L==H') return 0 nn=2*oS.X[i,:]*oS.X[j,:].transpose()-oS.X[i,:]*oS.X[i,:].transpose()-oS.X[j,:]*oS.X[j,:].transpose() if nn >= 0: print('nn>=0') return 0 print(oS.alphas[j]) print(oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn) oS.alphas[j]-=oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn oS.alphas[j]=clipalpha(oS.alphas[j],H,L) oS.updateEk(oS,j) if abs(oS.alphas[j] - alphaJold) < 0.00001: print('J not moving enough') return 0 oS.alphas[i] += oS.LL[j] * oS.LL[i] * (alphaJold - oS.alphas[j]) oS.updateEk(oS,i) b1 = oS.b - Ei - oS.LL[i] * oS.X[i, :] * oS.X[i, :].transpose() * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.X[i, :] * oS.X[j, :].transpose() * (oS.alphas[j] - alphaJold) b2 = oS.b - Ej - oS.LL[i] * oS.X[i, :] * oS.X[j, :].transpose() * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.X[j, :] * oS.X[j, :].transpose() * (oS.alphas[j] - alphaJold) if oS.alphas[i] > 0 and oS.C > oS.alphas[i]: oS.b = b1 elif oS.alphas[j] > 0 and oS.C > oS.alphas[j]: oS.b = b2 else: oS.b = (b1 + b2) / 2.0 return 1 else: return 0 ##外循环,寻找a1 def Outcircle(dataMat,labelMat,C,toler,maxIter): oS=altStruct(np.mat(dataMat),labelMat,C,toler) iter=0 entireSet=True alphapairsChanged=0 ##main body while (iter<maxIter) and ((alphapairsChanged>0) or (entireSet)): ##当超过迭代次数或没有a更新时退出 alphapairsChanged=0 ##遍历所有值 if entireSet: for i in range(oS.m): alphapairsChanged+=inner(i,oS) print('fullset,iter:%d i:%d, pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##遍历在间隔边界上的支持向量,即满足KKT条件的0<ai<C的, (遍历非‘边界’值) else: nonBoundIs=np.nonzero((oS.alphas.A>0)*(oS.alphas.A<C))[0] for i in nonBoundIs: alphapairsChanged+=inner(i,oS) print('nonbound,iter:%d i:%d,pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##使得遍历全值和遍历非‘边界’值交叉进行 if entireSet: entireSet=False elif alphapairsChanged==0: entireSet=True print('interaction number:%d'%(iter)) ##迭代次数iter return oS.b,oS.alphas ##计算超平面的w def calWs(alphas,dataMat,labelMat): X=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(X) w=np.zeros((n,1)) for i in range(m): w+=np.multiply(alphas[i]*labelMatrix[i],X[i,:].transpose()) return w def Plot_final(dataMat,labelMat,b0,slope): dataArray=np.array(dataMat) n,m=np.shape(dataArray) x1=[] y1=[] x2=[] y2=[] for i in range(n): if int(labelMat[i])==1: x1.append(dataArray[i,0]) y1.append(dataArray[i,1]) else: x2.append(dataArray[i,0]) y2.append(dataArray[i,1]) fig=plt.figure() ax=fig.add_subplot(111) ax.scatter(x1,y1,s=30,c='red',marker='s') ax.scatter(x2,y2,s=30,c='green') x=np.arange(-2.0, 10.0, 0.1) y=slope*x+b0 ax.plot(x, y) plt.xlabel('x1') plt.ylabel('x2') plt.show() if __name__=='__main__': file = r'C:\Users\admin\Desktop\testsets_HCC.csv' dataMat, labelMat = pre_processHCC.loadData(file) b,alphas=Outcircle(dataMat,labelMat,0.6,0.001,40) # print(type(b)) ##打印出支持向量 # label1=[] # label2=[] # for i in range(100): # if alphas[i]>0 and labelMat[i]==1: # label1.append(dataMat[i]) # if alphas[i]>0 and labelMat[i]==-1: # label2.append(dataMat[i]) # print('支持向量1: ', label1) # print('支持向量2:',label2) # slope=(label2[1][1]-label2[0][1])/(label2[1][0]-label2[0][0]) # b1=label2[0][1]-slope*label2[0][0] # b2=label1[0][1]-slope*label1[0][0] # b0=(b1+b2)/2 # Plot_final(dataMat,labelMat,b0,slope) ws=calWs(alphas,dataMat,labelMat) # print(ws) ##进行分类预测,用第一个测试点测试,若算出来的值小于0则预测为-1类 for sample in range(10): guess=np.mat(dataMat)[sample]*np.mat(ws)+b print(guess,labelMat[sample])
(2)使用核函数,精确度65%~70%
import random import numpy as np import matplotlib.pyplot as plt import pre_processHCC import SVM_kernel from sklearn.decomposition import PCA ##########此方法比simple版本快 #########比赛用的时候一定要复制粘贴一份!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ## 用了核函数同时也改了C的大小,没有用PCA,预测时候没有将数据进行核函数转变 def dataloader(filename): dataMat=[] labelMat=[] f=open(filename) for line in f.readlines(): res=line.strip().split('\t') dataMat.append([float(res[0]),float(res[1])]) labelMat.append(float(res[2])) return dataMat,labelMat def select_int(i,m): ##i是第一个a的下标,m是a的总数目 j=i while j==i: j=int(random.uniform(0,m)) return j ##调整a的大小 def clipalpha(aj,h,l): if aj>h: aj=h if aj<l: aj=l return aj class altStruct: def __init__(self,dataMat,labelMat,C,toler,kTup): self.X=dataMat self.L=np.mat(labelMat).transpose() self.LL=labelMat self.C=C self.T=toler self.m=np.shape(self.X)[0] self.alphas=np.mat(np.zeros((self.m,1))) self.b=0 self.echae=np.mat(np.zeros((self.m,2))) self.K=np.mat(np.zeros((self.m,self.m))) for i in range(self.m): self.K[:,i]=kernelTrans(self.X,self.X[i,:],kTup) def getEk(self,oS,k): fxk=float(np.multiply(oS.alphas,oS.L).transpose()*oS.K[:,k]+oS.b) Ek=fxk-float(oS.L[k]) return Ek ## 内循环中的启发式方法,寻找a2 def selectJ(self,i,oS,Ei): ##寻找步长最大的来增加查找速度 maxK=-1 maxdeltaE=0 Ej=0 ValidEcacheList=np.nonzero(self.echae[:,0].A)[0] ##获取非零的E所在的对应的行索引 if len(ValidEcacheList)>1: for k in ValidEcacheList: if k==i: continue Ek=oS.getEk(oS,k) deltaE=abs(Ek-Ei) if deltaE>maxdeltaE: maxdeltaE=deltaE maxK=k Ej=Ek return maxK,Ej else: j=select_int(i,oS.m) Ej=oS.getEk(oS,j) return j,Ej def updateEk(self,oS,k): Ek=oS.getEk(oS,k) oS.echae[k]=[1,Ek] ##内循环 def inner(i,oS): Ei=oS.getEk(oS,i) if ((oS.LL[i]*Ei<-oS.T) and (oS.alphas[i]<oS.C)) or ((oS.LL[i]*Ei>oS.T) and (oS.alphas[i]>0)): j,Ej=oS.selectJ(i,oS,Ei) alphaIold=oS.alphas[i].copy() alphaJold=oS.alphas[j].copy() if oS.LL[i] != oS.LL[j]: L = max(0, oS.alphas[j] - oS.alphas[i]) H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) else: L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) H = min(oS.C, oS.alphas[j] + oS.alphas[i]) if L==H: print('L==H') return 0 nn=2*oS.K[i,j]-oS.K[i,i]-oS.K[j,j] if nn >= 0: print('nn>=0') return 0 print(oS.alphas[j]) print(oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn) oS.alphas[j]-=oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn oS.alphas[j]=clipalpha(oS.alphas[j],H,L) oS.updateEk(oS,j) if abs(oS.alphas[j] - alphaJold) < 0.00001: print('J not moving enough') return 0 oS.alphas[i] += oS.LL[j] * oS.LL[i] * (alphaJold - oS.alphas[j]) oS.updateEk(oS,i) b1 = oS.b - Ei - oS.LL[i] * oS.K[i,i] * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[i,j]* (oS.alphas[j] - alphaJold) b2 = oS.b - Ej - oS.LL[i] * oS.K[i,j]* (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[j,j]* (oS.alphas[j] - alphaJold) if oS.alphas[i] > 0 and oS.C > oS.alphas[i]: oS.b = b1 elif oS.alphas[j] > 0 and oS.C > oS.alphas[j]: oS.b = b2 else: oS.b = (b1 + b2) / 2.0 return 1 else: return 0 ##外循环,寻找a1 def newOutcircle(dataMat,labelMat,C,toler,maxIter,kTup): oS=altStruct(np.mat(dataMat),labelMat,C,toler,kTup) iter=0 entireSet=True alphapairsChanged=0 ##main body while (iter<maxIter) and ((alphapairsChanged>0) or (entireSet)): ##当超过迭代次数或没有a更新时退出 alphapairsChanged=0 ##遍历所有值 if entireSet: for i in range(oS.m): alphapairsChanged+=inner(i,oS) print('fullset,iter:%d i:%d, pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##遍历在间隔边界上的支持向量,即满足KKT条件的0<ai<C的, (遍历非‘边界’值) else: nonBoundIs=np.nonzero((oS.alphas.A>0)*(oS.alphas.A<C))[0] for i in nonBoundIs: alphapairsChanged+=inner(i,oS) print('nonbound,iter:%d i:%d,pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##使得遍历全值和遍历非‘边界’值交叉进行 if entireSet: entireSet=False elif alphapairsChanged==0: entireSet=True print('interaction number:%d'%(iter)) ##迭代次数iter return oS.b,oS.alphas ##计算超平面的w def calWs(alphas,dataMat,labelMat): X=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(X) w=np.zeros((n,1)) for i in range(m): w+=np.multiply(alphas[i]*labelMatrix[i],X[i,:].transpose()) return w def kernelTrans(X,A,kTup): m,n=np.shape(X) K=np.mat(np.zeros((m,1))) if kTup[0]=='lin': K=X*A.transpose() ##高斯核函数 elif kTup[0]=='rbf': for j in range(m): deltaRow=X[j,:]-A K[j]=deltaRow*deltaRow.transpose() K=np.exp(K/(-1*kTup[1]**2)) else: raise NameError('Houston we have a problem that kernel is not recoginized') return K def Plot_final(dataMat,labelMat,b0,slope): dataArray=np.array(dataMat) n,m=np.shape(dataArray) x1=[] y1=[] x2=[] y2=[] for i in range(n): if int(labelMat[i])==1: x1.append(dataArray[i,0]) y1.append(dataArray[i,1]) else: x2.append(dataArray[i,0]) y2.append(dataArray[i,1]) fig=plt.figure() ax=fig.add_subplot(111) ax.scatter(x1,y1,s=30,c='red',marker='s') ax.scatter(x2,y2,s=30,c='green') x=np.arange(-2.0, 10.0, 0.1) y=slope*x+b0 ax.plot(x, y) plt.xlabel('x1') plt.ylabel('x2') plt.show() def calTrain(dataMat,labelMat,alphas,b): dataM = np.mat(dataMat) labelM = np.mat(labelMat).transpose() svInd = np.nonzero(alphas.A > 0)[0] sVs = dataM[svInd] labelSV = labelM[svInd] m, n = np.shape(dataM) errorcount = 0 TP = 0 P = 0 for i in range(m): eva = kernelTrans(sVs, dataM[i, :], ('rbf', 1.3)) predict = eva.transpose() * np.multiply(labelSV, alphas[svInd]) + b print('predict,real label:', predict, labelMat[i]) if np.sign(predict) != np.sign(labelMat[i]): errorcount += 1 if np.sign(predict) == 1 and labelMat[i] == 1: TP += 1 if labelMat[i] == 1: P += 1 # print(errorcount,m) print('training error rate is:', float(errorcount / m)) ##计算查全率recall指标 R = TP / P print('recall rate of class 1:', R) def calTest(test,test_label,alphas,b,train,train_label): dataM = np.mat(test) labelM = np.mat(test_label).transpose() ws = calWs(alphas, train, train_label) errorcount = 0 TP = 0 P = 0 m, n = np.shape(dataM) for i in range(m): # o,p=np.shape(dataM[i]) # e,w=np.shape(ws) # print(o,p,e,w) predict = dataM[i] * np.mat(ws) + b print('predict,real label:', predict, test_label[i]) if np.sign(predict) != np.sign(test_label[i]): errorcount += 1 if np.sign(predict) == 1 and test_label[i] == 1: TP += 1 if test_label[i] == 1: P += 1 # print(errorcount,m) print('test accurate rate is:', 1 - float(errorcount / m)) print('test error rate is:', float(errorcount / m)) ##计算查全率recall指标 R = TP / P print('recall rate of class 1:', R) return 1 - float(errorcount / m) if __name__=='__main__': file = r'C:\Users\admin\Desktop\testsets_HCC.csv' dataMat1, labelMat1 = pre_processHCC.loadData(file) train, train_label, test, test_label = pre_processHCC.devideData(dataMat1, labelMat1) dataMat=train labelMat=train_label b, alphas = newOutcircle(dataMat, labelMat, 100, 0.001, 100,kTup=('rbf',1.3)) ##改C的大小会很大程度改变精确度 # print(type(b)) # label1=[] # label2=[] # for i in range(100): # if alphas[i]>0 and labelMat[i]==1: # label1.append(dataMat[i]) # if alphas[i]>0 and labelMat[i]==-1: # label2.append(dataMat[i]) # print('支持向量1: ', label1) # print('支持向量2:',label2) # # slope=(label2[1][1]-label2[0][1])/(label2[1][0]-label2[0][0]) # b1=label2[0][1]-slope*label2[0][0] # b2=label1[0][1]-slope*label1[0][0] # b0=(b1+b2)/2 # Plot_final(dataMat,labelMat,b0,slope) # ws=calWs(alphas,dataMat,labelMat) # print(ws) ##进行分类预测,用第一个测试点测试,若算出来的值小于0则预测为-1类 # guess=np.mat(dataMat)[0]*np.mat(ws)+b # print(guess,labelMat[0]) ##计算训练集上的误差 calTrain(dataMat,labelMat,alphas,b) ##计算测试集上的误差 calTest(test,test_label,alphas,b,dataMat,labelMat)
(3)同(2),但测试时使用的是核函数版本 (理论上这样才保持一致,才正确)测试集精确度为100%,很诡异
import random import numpy as np import matplotlib.pyplot as plt import pre_processHCC import SVM_kernel from sklearn.decomposition import PCA ##########此方法比simple版本快 #########比赛用的时候一定要复制粘贴一份!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ## 用了核函数同时也改了C的大小,没有用PCA,并且求预测时也用了核函数转变 def dataloader(filename): dataMat=[] labelMat=[] f=open(filename) for line in f.readlines(): res=line.strip().split('\t') dataMat.append([float(res[0]),float(res[1])]) labelMat.append(float(res[2])) return dataMat,labelMat def select_int(i,m): ##i是第一个a的下标,m是a的总数目 j=i while j==i: j=int(random.uniform(0,m)) return j ##调整a的大小 def clipalpha(aj,h,l): if aj>h: aj=h if aj<l: aj=l return aj class altStruct: def __init__(self,dataMat,labelMat,C,toler,kTup): self.X=dataMat self.L=np.mat(labelMat).transpose() self.LL=labelMat self.C=C self.T=toler self.m=np.shape(self.X)[0] self.alphas=np.mat(np.zeros((self.m,1))) self.b=0 self.echae=np.mat(np.zeros((self.m,2))) self.K=np.mat(np.zeros((self.m,self.m))) for i in range(self.m): self.K[:,i]=kernelTrans(self.X,self.X[i,:],kTup) def getEk(self,oS,k): fxk=float(np.multiply(oS.alphas,oS.L).transpose()*oS.K[:,k]+oS.b) Ek=fxk-float(oS.L[k]) return Ek ## 内循环中的启发式方法,寻找a2 def selectJ(self,i,oS,Ei): ##寻找步长最大的来增加查找速度 maxK=-1 maxdeltaE=0 Ej=0 ValidEcacheList=np.nonzero(self.echae[:,0].A)[0] ##获取非零的E所在的对应的行索引 if len(ValidEcacheList)>1: for k in ValidEcacheList: if k==i: continue Ek=oS.getEk(oS,k) deltaE=abs(Ek-Ei) if deltaE>maxdeltaE: maxdeltaE=deltaE maxK=k Ej=Ek return maxK,Ej else: j=select_int(i,oS.m) Ej=oS.getEk(oS,j) return j,Ej def updateEk(self,oS,k): Ek=oS.getEk(oS,k) oS.echae[k]=[1,Ek] ##内循环 def inner(i,oS): Ei=oS.getEk(oS,i) if ((oS.LL[i]*Ei<-oS.T) and (oS.alphas[i]<oS.C)) or ((oS.LL[i]*Ei>oS.T) and (oS.alphas[i]>0)): j,Ej=oS.selectJ(i,oS,Ei) alphaIold=oS.alphas[i].copy() alphaJold=oS.alphas[j].copy() if oS.LL[i] != oS.LL[j]: L = max(0, oS.alphas[j] - oS.alphas[i]) H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) else: L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) H = min(oS.C, oS.alphas[j] + oS.alphas[i]) if L==H: print('L==H') return 0 nn=2*oS.K[i,j]-oS.K[i,i]-oS.K[j,j] if nn >= 0: print('nn>=0') return 0 print(oS.alphas[j]) print(oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn) oS.alphas[j]-=oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn oS.alphas[j]=clipalpha(oS.alphas[j],H,L) oS.updateEk(oS,j) if abs(oS.alphas[j] - alphaJold) < 0.00001: print('J not moving enough') return 0 oS.alphas[i] += oS.LL[j] * oS.LL[i] * (alphaJold - oS.alphas[j]) oS.updateEk(oS,i) b1 = oS.b - Ei - oS.LL[i] * oS.K[i,i] * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[i,j]* (oS.alphas[j] - alphaJold) b2 = oS.b - Ej - oS.LL[i] * oS.K[i,j]* (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[j,j]* (oS.alphas[j] - alphaJold) if oS.alphas[i] > 0 and oS.C > oS.alphas[i]: oS.b = b1 elif oS.alphas[j] > 0 and oS.C > oS.alphas[j]: oS.b = b2 else: oS.b = (b1 + b2) / 2.0 return 1 else: return 0 ##外循环,寻找a1 def newOutcircle(dataMat,labelMat,C,toler,maxIter,kTup): oS=altStruct(np.mat(dataMat),labelMat,C,toler,kTup) iter=0 entireSet=True alphapairsChanged=0 ##main body while (iter<maxIter) and ((alphapairsChanged>0) or (entireSet)): ##当超过迭代次数或没有a更新时退出 alphapairsChanged=0 ##遍历所有值 if entireSet: for i in range(oS.m): alphapairsChanged+=inner(i,oS) print('fullset,iter:%d i:%d, pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##遍历在间隔边界上的支持向量,即满足KKT条件的0<ai<C的, (遍历非‘边界’值) else: nonBoundIs=np.nonzero((oS.alphas.A>0)*(oS.alphas.A<C))[0] for i in nonBoundIs: alphapairsChanged+=inner(i,oS) print('nonbound,iter:%d i:%d,pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##使得遍历全值和遍历非‘边界’值交叉进行 if entireSet: entireSet=False elif alphapairsChanged==0: entireSet=True print('interaction number:%d'%(iter)) ##迭代次数iter return oS.b,oS.alphas ##计算超平面的w def calWs(alphas,dataMat,labelMat): X=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(X) w=np.zeros((n,1)) for i in range(m): w+=np.multiply(alphas[i]*labelMatrix[i],X[i,:].transpose()) return w def kernelTrans(X,A,kTup): m,n=np.shape(X) K=np.mat(np.zeros((m,1))) if kTup[0]=='lin': K=X*A.transpose() ##高斯核函数 elif kTup[0]=='rbf': for j in range(m): deltaRow=X[j,:]-A K[j]=deltaRow*deltaRow.transpose() K=np.exp(K/(-1*kTup[1]**2)) ##m*1 算的是0到m-1个样本与第指定的样本的核函数值 else: raise NameError('Houston we have a problem that kernel is not recoginized') return K def Plot_final(dataMat,labelMat,b0,slope): dataArray=np.array(dataMat) n,m=np.shape(dataArray) x1=[] y1=[] x2=[] y2=[] for i in range(n): if int(labelMat[i])==1: x1.append(dataArray[i,0]) y1.append(dataArray[i,1]) else: x2.append(dataArray[i,0]) y2.append(dataArray[i,1]) fig=plt.figure() ax=fig.add_subplot(111) ax.scatter(x1,y1,s=30,c='red',marker='s') ax.scatter(x2,y2,s=30,c='green') x=np.arange(-2.0, 10.0, 0.1) y=slope*x+b0 ax.plot(x, y) plt.xlabel('x1') plt.ylabel('x2') plt.show() def calTrain(dataMat,labelMat,alphas,b): dataM = np.mat(dataMat) labelM = np.mat(labelMat).transpose() svInd = np.nonzero(alphas.A > 0)[0] sVs = dataM[svInd] labelSV = labelM[svInd] m, n = np.shape(dataM) errorcount = 0 TP = 0 P = 0 for i in range(m): eva = kernelTrans(sVs, dataM[i, :], ('rbf', 1.3)) predict = eva.transpose() * np.multiply(labelSV, alphas[svInd]) + b print('predict,real label:', predict, labelMat[i]) if np.sign(predict) != np.sign(labelMat[i]): errorcount += 1 if np.sign(predict) == 1 and labelMat[i] == 1: TP += 1 if labelMat[i] == 1: P += 1 # print(errorcount,m) print('training error rate is:', float(errorcount / m)) ##计算查全率recall指标 R = TP / P print('recall rate of class 1:', R) def calTest(test,test_label,alphas,b): dataM = np.mat(test) labelM = np.mat(test_label).transpose() errorcount = 0 TP = 0 P = 0 m, n = np.shape(dataM) for j in range(m): # o,p=np.shape(dataM[i]) # e,w=np.shape(ws) # print(o,p,e,w) evas=0 for i in range(m): eva=kernelTrans(dataM[j,:],dataM[i, :], ('rbf', 1.3)) evas+=np.multiply(labelM[i],alphas[i])*eva predict =evas+b print('predict,real label:', predict, test_label[j]) if np.sign(predict) != np.sign(test_label[j]): errorcount += 1 if np.sign(predict) == 1 and test_label[j] == 1: TP += 1 if test_label[j] == 1: P += 1 # print(errorcount,m) print('test accurate rate is:', 1 - float(errorcount / m)) print('test error rate is:', float(errorcount / m)) ##计算查全率recall指标 R = TP / P print('recall rate of class 1:', R) return 1 - float(errorcount / m) if __name__=='__main__': file = r'C:\Users\admin\Desktop\testsets_HCC.csv' dataMat1, labelMat1 = pre_processHCC.loadData(file) train, train_label, test, test_label = pre_processHCC.devideData(dataMat1, labelMat1) dataMat=train labelMat=train_label b, alphas = newOutcircle(dataMat, labelMat, 100, 0.001, 100,kTup=('rbf',1.3)) ##改C的大小会很大程度改变精确度 # print(type(b)) # label1=[] # label2=[] # for i in range(100): # if alphas[i]>0 and labelMat[i]==1: # label1.append(dataMat[i]) # if alphas[i]>0 and labelMat[i]==-1: # label2.append(dataMat[i]) # print('支持向量1: ', label1) # print('支持向量2:',label2) # # slope=(label2[1][1]-label2[0][1])/(label2[1][0]-label2[0][0]) # b1=label2[0][1]-slope*label2[0][0] # b2=label1[0][1]-slope*label1[0][0] # b0=(b1+b2)/2 # Plot_final(dataMat,labelMat,b0,slope) # ws=calWs(alphas,dataMat,labelMat) # print(ws) ##进行分类预测,用第一个测试点测试,若算出来的值小于0则预测为-1类 # guess=np.mat(dataMat)[0]*np.mat(ws)+b # print(guess,labelMat[0]) ##计算训练集上的误差 calTrain(dataMat,labelMat,alphas,b) ##计算测试集上的误差 calTest(test,test_label,alphas,b)
(4)在(2)的基础上使用PCA‘,测试集上精确度达到85%~90%
import random import numpy as np import matplotlib.pyplot as plt import pre_processHCC import SVM_kernel from sklearn.decomposition import PCA ##########此方法比simple版本快 #########比赛用的时候一定要复制粘贴一份!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ## 用了核函数改了C,以及PCA使用,PCA必须用在训练集上所以一下方法存在错误,用在全集的话会用到测试集的先验信息 def dataloader(filename): dataMat=[] labelMat=[] f=open(filename) for line in f.readlines(): res=line.strip().split('\t') dataMat.append([float(res[0]),float(res[1])]) labelMat.append(float(res[2])) return dataMat,labelMat def select_int(i,m): ##i是第一个a的下标,m是a的总数目 j=i while j==i: j=int(random.uniform(0,m)) return j ##调整a的大小 def clipalpha(aj,h,l): if aj>h: aj=h if aj<l: aj=l return aj class altStruct: def __init__(self,dataMat,labelMat,C,toler,kTup): self.X=dataMat self.L=np.mat(labelMat).transpose() self.LL=labelMat self.C=C self.T=toler self.m=np.shape(self.X)[0] self.alphas=np.mat(np.zeros((self.m,1))) self.b=0 self.echae=np.mat(np.zeros((self.m,2))) self.K=np.mat(np.zeros((self.m,self.m))) for i in range(self.m): self.K[:,i]=kernelTrans(self.X,self.X[i,:],kTup) def getEk(self,oS,k): fxk=float(np.multiply(oS.alphas,oS.L).transpose()*oS.K[:,k]+oS.b) Ek=fxk-float(oS.L[k]) return Ek ## 内循环中的启发式方法,寻找a2 def selectJ(self,i,oS,Ei): ##寻找步长最大的来增加查找速度 maxK=-1 maxdeltaE=0 Ej=0 ValidEcacheList=np.nonzero(self.echae[:,0].A)[0] ##获取非零的E所在的对应的行索引 if len(ValidEcacheList)>1: for k in ValidEcacheList: if k==i: continue Ek=oS.getEk(oS,k) deltaE=abs(Ek-Ei) if deltaE>maxdeltaE: maxdeltaE=deltaE maxK=k Ej=Ek return maxK,Ej else: j=select_int(i,oS.m) Ej=oS.getEk(oS,j) return j,Ej def updateEk(self,oS,k): Ek=oS.getEk(oS,k) oS.echae[k]=[1,Ek] ##内循环 def inner(i,oS): Ei=oS.getEk(oS,i) if ((oS.LL[i]*Ei<-oS.T) and (oS.alphas[i]<oS.C)) or ((oS.LL[i]*Ei>oS.T) and (oS.alphas[i]>0)): j,Ej=oS.selectJ(i,oS,Ei) alphaIold=oS.alphas[i].copy() alphaJold=oS.alphas[j].copy() if oS.LL[i] != oS.LL[j]: L = max(0, oS.alphas[j] - oS.alphas[i]) H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) else: L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) H = min(oS.C, oS.alphas[j] + oS.alphas[i]) if L==H: print('L==H') return 0 nn=2*oS.K[i,j]-oS.K[i,i]-oS.K[j,j] if nn >= 0: print('nn>=0') return 0 print(oS.alphas[j]) print(oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn) oS.alphas[j]-=oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn oS.alphas[j]=clipalpha(oS.alphas[j],H,L) oS.updateEk(oS,j) if abs(oS.alphas[j] - alphaJold) < 0.00001: print('J not moving enough') return 0 oS.alphas[i] += oS.LL[j] * oS.LL[i] * (alphaJold - oS.alphas[j]) oS.updateEk(oS,i) b1 = oS.b - Ei - oS.LL[i] * oS.K[i,i] * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[i,j]* (oS.alphas[j] - alphaJold) b2 = oS.b - Ej - oS.LL[i] * oS.K[i,j]* (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[j,j]* (oS.alphas[j] - alphaJold) if oS.alphas[i] > 0 and oS.C > oS.alphas[i]: oS.b = b1 elif oS.alphas[j] > 0 and oS.C > oS.alphas[j]: oS.b = b2 else: oS.b = (b1 + b2) / 2.0 return 1 else: return 0 ##外循环,寻找a1 def newOutcircle(dataMat,labelMat,C,toler,maxIter,kTup): oS=altStruct(np.mat(dataMat),labelMat,C,toler,kTup) iter=0 entireSet=True alphapairsChanged=0 ##main body while (iter<maxIter) and ((alphapairsChanged>0) or (entireSet)): ##当超过迭代次数或没有a更新时退出 alphapairsChanged=0 ##遍历所有值 if entireSet: for i in range(oS.m): alphapairsChanged+=inner(i,oS) print('fullset,iter:%d i:%d, pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##遍历在间隔边界上的支持向量,即满足KKT条件的0<ai<C的, (遍历非‘边界’值) else: nonBoundIs=np.nonzero((oS.alphas.A>0)*(oS.alphas.A<C))[0] for i in nonBoundIs: alphapairsChanged+=inner(i,oS) print('nonbound,iter:%d i:%d,pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##使得遍历全值和遍历非‘边界’值交叉进行 if entireSet: entireSet=False elif alphapairsChanged==0: entireSet=True print('interaction number:%d'%(iter)) ##迭代次数iter return oS.b,oS.alphas ##计算超平面的w def calWs(alphas,dataMat,labelMat): X=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(X) w=np.zeros((n,1)) for i in range(m): w+=np.multiply(alphas[i]*labelMatrix[i],X[i,:].transpose()) return w def kernelTrans(X,A,kTup): m,n=np.shape(X) K=np.mat(np.zeros((m,1))) if kTup[0]=='lin': K=X*A.transpose() ##高斯核函数 elif kTup[0]=='rbf': for j in range(m): deltaRow=X[j,:]-A K[j]=deltaRow*deltaRow.transpose() K=np.exp(K/(-1*kTup[1]**2)) else: raise NameError('Houston we have a problem that kernel is not recoginized') return K def Plot_final(dataMat,labelMat,b0,slope): dataArray=np.array(dataMat) n,m=np.shape(dataArray) x1=[] y1=[] x2=[] y2=[] for i in range(n): if int(labelMat[i])==1: x1.append(dataArray[i,0]) y1.append(dataArray[i,1]) else: x2.append(dataArray[i,0]) y2.append(dataArray[i,1]) fig=plt.figure() ax=fig.add_subplot(111) ax.scatter(x1,y1,s=30,c='red',marker='s') ax.scatter(x2,y2,s=30,c='green') x=np.arange(-2.0, 10.0, 0.1) y=slope*x+b0 ax.plot(x, y) plt.xlabel('x1') plt.ylabel('x2') plt.show() def calTrain(dataMat,labelMat,alphas,b): dataM = np.mat(dataMat) labelM = np.mat(labelMat).transpose() svInd = np.nonzero(alphas.A > 0)[0] sVs = dataM[svInd] labelSV = labelM[svInd] m, n = np.shape(dataM) errorcount = 0 TP = 0 P = 0 for i in range(m): eva = kernelTrans(sVs, dataM[i, :], ('rbf', 1.3)) predict = eva.transpose() * np.multiply(labelSV, alphas[svInd]) + b print('predict,real label:', predict, labelMat[i]) if np.sign(predict) != np.sign(labelMat[i]): errorcount += 1 if np.sign(predict) == 1 and labelMat[i] == 1: TP += 1 if labelMat[i] == 1: P += 1 # print(errorcount,m) print('training error rate is:', float(errorcount / m)) ##计算查全率recall指标 R = TP / P print('recall rate of class 1:', R) def calTest(test,test_label,alphas,b,train,train_label): dataM = np.mat(test) labelM = np.mat(test_label).transpose() ws = calWs(alphas,train,train_label) errorcount = 0 TP = 0 P = 0 m, n = np.shape(dataM) for i in range(m): # o,p=np.shape(dataM[i]) # e,w=np.shape(ws) # print(o,p,e,w) predict = dataM[i] * np.mat(ws) + b print('predict,real label:', predict, test_label[i]) if np.sign(predict) != np.sign(test_label[i]): errorcount += 1 if np.sign(predict) == 1 and test_label[i] == 1: TP += 1 if test_label[i] == 1: P += 1 # print(errorcount,m) print('test accurate rate is:', 1 - float(errorcount / m)) print('test error rate is:', float(errorcount / m)) ##计算查全率recall指标 R = TP / P print('recall rate of class 1:', R) if __name__=='__main__': file = r'C:\Users\admin\Desktop\testsets_HCC.csv' dataMat1, labelMat1 = pre_processHCC.loadData(file) ##主成成分分析 pca = PCA(n_components=140) ##ValueError: n_components='mle' is only supported if n_samples >= n_features newdataMat = pca.fit_transform(dataMat1) train, train_label, test, test_label = pre_processHCC.devideData(newdataMat, labelMat1) dataMat=train labelMat=train_label b, alphas = newOutcircle(dataMat, labelMat, 100, 0.001, 100,kTup=('rbf',1.3)) ##改C的大小会很大程度改变精确度 # print(type(b)) # label1=[] # label2=[] # for i in range(100): # if alphas[i]>0 and labelMat[i]==1: # label1.append(dataMat[i]) # if alphas[i]>0 and labelMat[i]==-1: # label2.append(dataMat[i]) # print('支持向量1: ', label1) # print('支持向量2:',label2) # # slope=(label2[1][1]-label2[0][1])/(label2[1][0]-label2[0][0]) # b1=label2[0][1]-slope*label2[0][0] # b2=label1[0][1]-slope*label1[0][0] # b0=(b1+b2)/2 # Plot_final(dataMat,labelMat,b0,slope) # ws=calWs(alphas,dataMat,labelMat) # print(ws) ##进行分类预测,用第一个测试点测试,若算出来的值小于0则预测为-1类 # guess=np.mat(dataMat)[0]*np.mat(ws)+b # print(guess,labelMat[0]) ##计算训练集上的误差 calTrain(dataMat,labelMat,alphas,b) ##计算测试集上的误差 calTest(test,test_label,alphas,b,dataMat,labelMat)
附加:自己写的PCA 时间复杂度太高,在三万多个特征下用不了
import numpy as np ##此方法simple版本无法用在特征很大的数据集 ##输入要求:行为样本,列为特征 def PCA(dataX): dataX=np.array(dataX) ##进行规范化 m,n=np.shape(dataX) ##m为样本数 avs=np.tile(np.mean(dataX,0),(m,1)) ##扩充到与原来同规模大 # print(avs) stds=np.tile(np.std(dataX,0),(m,1)) # print(stds) adujstX =(dataX-avs)/stds ##这里都是array形式所以/是对每个元素对应除 covX=np.cov(adujstX.transpose()) # print(covX) # print(dataX-avs) # print(covX) featValue,featVec=np.linalg.eig(covX) # print(featValue[1]) index=np.argsort(-featValue) allfeat=0 for i in range(len(featValue)): allfeat+=featValue[i] percent=0 k=0 for i in range(len(featValue)): percent+=featValue[i]/allfeat if percent>0.75: k=i break # print(featVec) selectVec = np.matrix(featVec.T[index[:k]]) ##featVec的列向量是单位特征向量所以必须先转置 # print(selectVec) # print(selectVec.T) finalData=adujstX*selectVec.T return finalData if __name__=='__main__': a=[[1,91,3,4,3,3,3],[4,5,9,2,5,5,5],[2,3,3,5,7,7,7]] print(PCA(a))
(5)使用SVM-RFE特征选择,这里存在瑕疵应该用与核函数下的特征函数计算,但是调用包时用的是线性SVM下的计算方法
首先附上自己写的DQ(核函数下的特征函数计算),由于时间复杂度高没用成
其实就是在(2)的基础上加了一个DQ函数
import random import numpy as np import matplotlib.pyplot as plt import pre_processHCC import SVM_kernel from sklearn.decomposition import PCA ##########此方法比simple版本快 #########比赛用的时候一定要复制粘贴一份!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ## 用了核函数同时也改了C的大小,没有用PCA,使用自己写的SVMRFE算法对特征进行打分排序,注意这里用的不是线性核所以SVM-RFE算法算特征贡献有小改动 ##由于自己写的SVMRFE算法时间复杂度在n**3次方所以电脑带不动,尤其是在三万多个特征上 def dataloader(filename): dataMat=[] labelMat=[] f=open(filename) for line in f.readlines(): res=line.strip().split('\t') dataMat.append([float(res[0]),float(res[1])]) labelMat.append(float(res[2])) return dataMat,labelMat def select_int(i,m): ##i是第一个a的下标,m是a的总数目 j=i while j==i: j=int(random.uniform(0,m)) return j ##调整a的大小 def clipalpha(aj,h,l): if aj>h: aj=h if aj<l: aj=l return aj class altStruct: def __init__(self,dataMat,labelMat,C,toler,kTup): self.X=dataMat self.L=np.mat(labelMat).transpose() self.LL=labelMat self.C=C self.T=toler self.m=np.shape(self.X)[0] self.alphas=np.mat(np.zeros((self.m,1))) self.b=0 self.echae=np.mat(np.zeros((self.m,2))) self.K=np.mat(np.zeros((self.m,self.m))) for i in range(self.m): self.K[:,i]=kernelTrans(self.X,self.X[i,:],kTup) def getEk(self,oS,k): fxk=float(np.multiply(oS.alphas,oS.L).transpose()*oS.K[:,k]+oS.b) Ek=fxk-float(oS.L[k]) return Ek ## 内循环中的启发式方法,寻找a2 def selectJ(self,i,oS,Ei): ##寻找步长最大的来增加查找速度 maxK=-1 maxdeltaE=0 Ej=0 ValidEcacheList=np.nonzero(self.echae[:,0].A)[0] ##获取非零的E所在的对应的行索引 if len(ValidEcacheList)>1: for k in ValidEcacheList: if k==i: continue Ek=oS.getEk(oS,k) deltaE=abs(Ek-Ei) if deltaE>maxdeltaE: maxdeltaE=deltaE maxK=k Ej=Ek return maxK,Ej else: j=select_int(i,oS.m) Ej=oS.getEk(oS,j) return j,Ej def updateEk(self,oS,k): Ek=oS.getEk(oS,k) oS.echae[k]=[1,Ek] ##内循环 def inner(i,oS): Ei=oS.getEk(oS,i) if ((oS.LL[i]*Ei<-oS.T) and (oS.alphas[i]<oS.C)) or ((oS.LL[i]*Ei>oS.T) and (oS.alphas[i]>0)): j,Ej=oS.selectJ(i,oS,Ei) alphaIold=oS.alphas[i].copy() alphaJold=oS.alphas[j].copy() if oS.LL[i] != oS.LL[j]: L = max(0, oS.alphas[j] - oS.alphas[i]) H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) else: L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) H = min(oS.C, oS.alphas[j] + oS.alphas[i]) if L==H: print('L==H') return 0 nn=2*oS.K[i,j]-oS.K[i,i]-oS.K[j,j] if nn >= 0: print('nn>=0') return 0 print(oS.alphas[j]) print(oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn) oS.alphas[j]-=oS.LL[j]*(np.mat(Ei)-np.mat(Ej))/nn oS.alphas[j]=clipalpha(oS.alphas[j],H,L) oS.updateEk(oS,j) if abs(oS.alphas[j] - alphaJold) < 0.00001: print('J not moving enough') return 0 oS.alphas[i] += oS.LL[j] * oS.LL[i] * (alphaJold - oS.alphas[j]) oS.updateEk(oS,i) b1 = oS.b - Ei - oS.LL[i] * oS.K[i,i] * (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[i,j]* (oS.alphas[j] - alphaJold) b2 = oS.b - Ej - oS.LL[i] * oS.K[i,j]* (oS.alphas[i] - alphaIold) - \ oS.LL[j] * oS.K[j,j]* (oS.alphas[j] - alphaJold) if oS.alphas[i] > 0 and oS.C > oS.alphas[i]: oS.b = b1 elif oS.alphas[j] > 0 and oS.C > oS.alphas[j]: oS.b = b2 else: oS.b = (b1 + b2) / 2.0 return 1 else: return 0 ##外循环,寻找a1 def newOutcircle(dataMat,labelMat,C,toler,maxIter,kTup): oS=altStruct(np.mat(dataMat),labelMat,C,toler,kTup) iter=0 entireSet=True alphapairsChanged=0 ##main body while (iter<maxIter) and ((alphapairsChanged>0) or (entireSet)): ##当超过迭代次数或没有a更新时退出 alphapairsChanged=0 ##遍历所有值 if entireSet: for i in range(oS.m): alphapairsChanged+=inner(i,oS) print('fullset,iter:%d i:%d, pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##遍历在间隔边界上的支持向量,即满足KKT条件的0<ai<C的, (遍历非‘边界’值) else: nonBoundIs=np.nonzero((oS.alphas.A>0)*(oS.alphas.A<C))[0] for i in nonBoundIs: alphapairsChanged+=inner(i,oS) print('nonbound,iter:%d i:%d,pairs changed %d'%(iter,i,alphapairsChanged)) iter+=1 ##使得遍历全值和遍历非‘边界’值交叉进行 if entireSet: entireSet=False elif alphapairsChanged==0: entireSet=True print('interaction number:%d'%(iter)) ##迭代次数iter return oS.b,oS.alphas ##计算超平面的w def calWs(alphas,dataMat,labelMat): X=np.mat(dataMat) labelMatrix=np.mat(labelMat).transpose() m,n=np.shape(X) w=np.zeros((n,1)) for i in range(m): w+=np.multiply(alphas[i]*labelMatrix[i],X[i,:].transpose()) return w def kernelTrans(X,A,kTup): m,n=np.shape(X) K=np.mat(np.zeros((m,1))) if kTup[0]=='lin': K=X*A.transpose() ##高斯核函数 elif kTup[0]=='rbf': for j in range(m): deltaRow=X[j,:]-A K[j]=deltaRow*deltaRow.transpose() K=np.exp(K/(-1*kTup[1]**2)) else: raise NameError('Houston we have a problem that kernel is not recoginized') return K def Plot_final(dataMat,labelMat,b0,slope): dataArray=np.array(dataMat) n,m=np.shape(dataArray) x1=[] y1=[] x2=[] y2=[] for i in range(n): if int(labelMat[i])==1: x1.append(dataArray[i,0]) y1.append(dataArray[i,1]) else: x2.append(dataArray[i,0]) y2.append(dataArray[i,1]) fig=plt.figure() ax=fig.add_subplot(111) ax.scatter(x1,y1,s=30,c='red',marker='s') ax.scatter(x2,y2,s=30,c='green') x=np.arange(-2.0, 10.0, 0.1) y=slope*x+b0 ax.plot(x, y) plt.xlabel('x1') plt.ylabel('x2') plt.show() def calTrain(dataMat,labelMat,alphas,b): dataM = np.mat(dataMat) labelM = np.mat(labelMat).transpose() svInd = np.nonzero(alphas.A > 0)[0] sVs = dataM[svInd] labelSV = labelM[svInd] m, n = np.shape(dataM) errorcount = 0 TP = 0 P = 0 for i in range(m): eva = kernelTrans(sVs, dataM[i, :], ('rbf', 1.3)) predict = eva.transpose() * np.multiply(labelSV, alphas[svInd]) + b print('predict,real label:', predict, labelMat[i]) if np.sign(predict) != np.sign(labelMat[i]): errorcount += 1 if np.sign(predict) == 1 and labelMat[i] == 1: TP += 1 if labelMat[i] == 1: P += 1 # print(errorcount,m) print('training error rate is:', float(errorcount / m)) ##计算查全率recall指标 R = TP / P print('recall rate of class 1:', R) def calTest(test,test_label,alphas,b,train,train_label): dataM = np.mat(test) labelM = np.mat(test_label).transpose() ws = calWs(alphas,train,train_label) errorcount = 0 TP = 0 P = 0 m, n = np.shape(dataM) for i in range(m): # o,p=np.shape(dataM[i]) # e,w=np.shape(ws) # print(o,p,e,w) predict = dataM[i] * np.mat(ws) + b print('predict,real label:', predict, test_label[i]) if np.sign(predict) != np.sign(test_label[i]): errorcount += 1 if np.sign(predict) == 1 and test_label[i] == 1: TP += 1 if test_label[i] == 1: P += 1 # print(errorcount,m) print('test accurate rate is:', 1 - float(errorcount / m)) print('test error rate is:', float(errorcount / m)) ##计算查全率recall指标 R = TP / P print('recall rate of class 1:', R) def DQ(k,dataMat,labelMat,alphas): dataM=np.mat(dataMat) newdataMat=dataM.copy() newdataMat=np.delete(newdataMat,k,axis=1) labelM=np.mat(labelMat).transpose() m, n = np.shape(dataM) dq=0 for i in range(m): for j in range(m): kernel=kernelTrans(dataM[i,:],dataM[j,:],kTup=('rbf',1.3))-kernelTrans((newdataMat[i,:]),newdataMat[j,:],kTup=('rbf',1.3)) dq+=alphas[i]*alphas[j]*labelM[i]*labelM[j]*kernel dqs=0.5*dq return dqs if __name__=='__main__': file = r'C:\Users\admin\Desktop\testsets_HCC.csv' dataMat1, labelMat1 = pre_processHCC.loadData(file) train, train_label, test, test_label = pre_processHCC.devideData(dataMat1, labelMat1) dataMat=train labelMat=train_label b, alphas = newOutcircle(dataMat, labelMat, 100, 0.001, 100,kTup=('rbf',1.3)) ##改C的大小会很大程度改变精确度 ##使用SVM-RFE进行特征排序与剔除 m,n=np.shape(dataMat) valueDq=[] for k in range(n): valueDq.append(DQ(k,dataMat,labelMat,alphas)) x=np.array(valueDq) kSort=x.np.argsort() print(kSort) print(valueDq[kSort]) # print(type(b)) # label1=[] # label2=[] # for i in range(100): # if alphas[i]>0 and labelMat[i]==1: # label1.append(dataMat[i]) # if alphas[i]>0 and labelMat[i]==-1: # label2.append(dataMat[i]) # print('支持向量1: ', label1) # print('支持向量2:',label2) # # slope=(label2[1][1]-label2[0][1])/(label2[1][0]-label2[0][0]) # b1=label2[0][1]-slope*label2[0][0] # b2=label1[0][1]-slope*label1[0][0] # b0=(b1+b2)/2 # Plot_final(dataMat,labelMat,b0,slope) # ws=calWs(alphas,dataMat,labelMat) # print(ws) ##进行分类预测,用第一个测试点测试,若算出来的值小于0则预测为-1类 # guess=np.mat(dataMat)[0]*np.mat(ws)+b # print(guess,labelMat[0]) ##计算训练集上的误差 calTrain(dataMat,labelMat,alphas,b) ##计算测试集上的误差 calTest(test,test_label,alphas,b,dataMat,labelMat)
下面是调用RFE包的文件,后面会用到
from sklearn.svm import SVC from sklearn.feature_selection import RFE import matplotlib.pyplot as plt import pre_processHCC import numpy as np def RFEmine(dataMat,labelMat,n): # 1.读取训练数据集 ##输出的dataMat是一个行为样本,列为特征的列表 140*34262 X=np.array(dataMat)[:,:5000] ##选取一万以上就带不动了 # print(X.shape) y=np.array(labelMat) # Create the RFE object and rank each pixel svc = SVC(kernel="linear", C=1) rfe = RFE(estimator=svc, n_features_to_select=n, step=1) ##step是每次移除的特征数 rfe.fit(X, y) return rfe.support_ # print(rfe.ranking_) # # Plot pixel ranking # plt.matshow(ranking) # plt.colorbar() # plt.title("Ranking of pixels with RFE") # plt.show() if __name__=='__main__': file= r'C:\Users\admin\Desktop\testsets_HCC.csv' n=500 RFEmine(file,n)
最终使用特征选择,根据保留的特征数目n为横坐标,纵坐标为精确度进行画图
程序如下:
import matplotlib.pyplot as plt import RFE_test3 import pre_processHCC import experiment2HCC_1 import numpy as np ##运行一次需要很长时间,时间复杂度比较大,特征选择必须在训练集上进行,在全集上进行则会用到测试集的标签信息 ##严格上下面在全集上进行特征选择是错误的 file= r'C:\Users\admin\Desktop\testsets_HCC.csv' dataMat1, labelMat1 = pre_processHCC.loadData(file) y1=[] for n in range(1,10): support=RFE_test3.RFEmine(dataMat1,labelMat1,n) dataMat1=np.array(dataMat1)[:,:5000] ##电脑带不动只能5000开始 dataMat2=np.array(dataMat1)[:,support] train, train_label, test, test_label = pre_processHCC.devideData(dataMat2, labelMat1) dataMat = train labelMat = train_label b, alphas = experiment2HCC_1.newOutcircle(dataMat, labelMat, 100, 0.001, 100, kTup=('rbf', 1.3)) y1.append(experiment2HCC_1.calTest(test,test_label,alphas,b,dataMat,labelMat)) x1=range(1,10) plt.plot(x1,y1,label='Frist line',linewidth=3,color='r',marker='o', markerfacecolor='blue',markersize=12) # plt.plot(x2,y2,label='second line') plt.xlabel('n') plt.ylabel('accuracy') plt.title('Interesting Graph\nCheck it out') plt.legend() plt.show()
结果图:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。