赞
踩
用于聚类的数据集
- %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
-
-
- '''
- 各种聚类数据
- '''
def two_cluster():
    """Load the two-cluster dataset.

    Reads the ``X`` variable from ``cluster_data/two_cluster.mat`` and
    transposes it, so every column of ``X`` becomes one row of the result.

    Returns:
        numpy.ndarray: ``X`` transposed.
    """
    # Use a dedicated name for the path instead of shadowing the function.
    path = u'cluster_data/two_cluster.mat'
    return sio.loadmat(path)['X'].T
def three_cluster():
    """Load the three-cluster dataset.

    Reads the ``X`` variable from ``cluster_data/three_cluster.mat`` and
    transposes it, so every column of ``X`` becomes one row of the result.

    Returns:
        numpy.ndarray: ``X`` transposed.
    """
    path = u'cluster_data/three_cluster.mat'
    return sio.loadmat(path)['X'].T
def five_cluster():
    """Load the five-cluster dataset.

    Reads the ``x`` and ``y`` variables from
    ``cluster_data/five_cluster.mat``, stacks ``y`` underneath ``x`` and
    transposes the result.

    Returns:
        numpy.ndarray: The rows of ``x`` followed by the rows of ``y``,
        transposed so that they become columns.
    """
    path = u'cluster_data/five_cluster.mat'
    mat = sio.loadmat(path)
    x = mat['x']  # per the original author's note: 2 rows, n columns
    y = mat['y']  # per the original author's note: 1 row, n columns
    # Stack vertically first, then transpose.
    return np.vstack((x, y)).T
def spiral():
    """Load a subsampled copy of the spiral dataset.

    Reads the ``spiral`` variable from ``cluster_data/spiral.mat``, keeps
    every third row, and moves column 0 to the end.

    Returns:
        numpy.ndarray: Three columns taken from original columns 1, 2 and 0,
        in that order.
    """
    path = u'cluster_data/spiral.mat'
    sampled = sio.loadmat(path)['spiral'][0::3, :]  # keep every third row
    # Reorder columns to (1, 2, 0); column 0 is presumably the class label.
    return sampled[:, [1, 2, 0]]
def spiral_unbalance():
    """Load a subsampled copy of the unbalanced spiral dataset.

    Reads the ``spiral_unbalance`` variable from
    ``cluster_data/spiral_unbalance.mat``, keeps every third row, and moves
    column 0 to the end.

    Returns:
        numpy.ndarray: Three columns taken from original columns 1, 2 and 0,
        in that order.
    """
    path = u'cluster_data/spiral_unbalance.mat'
    sampled = sio.loadmat(path)['spiral_unbalance'][0::3, :]  # every third row
    # Reorder columns to (1, 2, 0); column 0 is presumably the class label.
    return sampled[:, [1, 2, 0]]
def ThreeCircles():
    """Load a subsampled copy of the three-circles dataset.

    Reads the ``ThreeCircles`` variable from
    ``cluster_data/ThreeCircles.mat``, keeps every third row, and moves
    column 0 to the end.

    Returns:
        numpy.ndarray: Three columns taken from original columns 1, 2 and 0,
        in that order.
    """
    path = u'cluster_data/ThreeCircles.mat'
    sampled = sio.loadmat(path)['ThreeCircles'][0::3, :]  # every third row
    # Reorder columns to (1, 2, 0); column 0 is presumably the class label.
    return sampled[:, [1, 2, 0]]
def Twomoons():
    """Load a subsampled copy of the two-moons dataset.

    Reads the ``Twomoons`` variable from ``cluster_data/Twomoons.mat``,
    keeps every third row, and moves column 0 to the end.

    Unlike the original version, this loader does not draw anything. The
    stray ``plt.scatter`` call plotted onto whatever figure was current,
    which put an extra full-size axes on the ``show_all`` figure. No other
    loader in this file plots.

    Returns:
        numpy.ndarray: Three columns taken from original columns 1, 2 and 0,
        in that order.
    """
    path = u'cluster_data/Twomoons.mat'
    sampled = sio.loadmat(path)['Twomoons'][0::3, :]  # keep every third row
    # Reorder columns to (1, 2, 0); column 0 is presumably the class label.
    return sampled[:, [1, 2, 0]]
def Twomoons1():
    """Load a subsampled copy of the two-moons dataset.

    Reads the ``Twomoons`` variable from ``cluster_data/Twomoons.mat``,
    keeps every third row, and moves column 0 to the end.

    NOTE(review): this reads the same file and variable as ``Twomoons``.
    Confirm whether a separate ``Twomoons1`` data file was intended.

    Returns:
        numpy.ndarray: Three columns taken from original columns 1, 2 and 0,
        in that order.
    """
    path = u'cluster_data/Twomoons.mat'
    sampled = sio.loadmat(path)['Twomoons'][0::3, :]  # keep every third row
    # Reorder columns to (1, 2, 0); column 0 is presumably the class label.
    return sampled[:, [1, 2, 0]]
def test():
    """Print the literal string ``test``; a smoke-test helper."""
    # The call form of print works on both Python 2 and Python 3.
    print('test')
-
-
def show_all():
    """Plot every dataset loader's output on one 2x4 grid of scatter plots.

    Calls each loader, stores the results in the module-level ``data_list``
    and draws one subplot per dataset, using columns 0 and 1 as coordinates
    and column 2 as the colour value.

    ``data_list`` is emptied first, so calling this function again redraws
    the current data instead of re-plotting stale entries while the list
    keeps growing.
    """
    # Reference the functions directly instead of eval()-ing their names.
    loaders = (
        two_cluster,
        three_cluster,
        five_cluster,
        spiral,
        spiral_unbalance,
        ThreeCircles,
        Twomoons,
        Twomoons1,
    )

    plt.figure(figsize=(16, 8))

    # Mutate the shared list in place so outside references stay valid.
    del data_list[:]
    for loader in loaders:
        data_list.append(loader())

    # One subplot per dataset; subplot positions are 1-based.
    for position, data in enumerate(data_list, 1):
        plt.subplot(2, 4, position)
        plt.scatter(data[:, 0], data[:, 1], c=data[:, 2])
-
# Shared list that show_all() fills with the loaded datasets.
data_list = []
show_all()

使用scikit的kmeans进行聚类
- %matplotlib inline
import scipy.io as sio

# Path of the MATLAB file to load.
# NOTE(review): this rebinds the name `two_cluster`, which is also the name of
# a loader function defined earlier in this file -- confirm that is intended.
two_cluster = u'cluster_data/two_cluster.mat'
data = sio.loadmat(two_cluster)
# The call form of print works on both Python 2 and Python 3.
print(data)
- %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np  # used below; this cell never imported it
from sklearn import cluster, datasets

# Rows 0 and 1 of X are plotted as coordinates, row 2 as the colour value.
x = data['X']
cValue = x[2]
plt.scatter(x[0], x[1], c=cValue)

# Transpose X and keep only the first two columns as the KMeans input.
b = np.array(x).T
b = b[:, 0:2]

y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(b)

# Same points, coloured by the predicted cluster instead of row 2.
plt.scatter(x[0], x[1], c=y_pred)
- %matplotlib inline
import scipy.io as sio

# Path of the MATLAB file to load.
two_cluster = u'cluster_data/spiral.mat'
spiral = sio.loadmat(two_cluster)['spiral']
spiral = spiral[0::3, :]  # keep every third row
# %-formatting gives the same "rows cols" output on Python 2 and Python 3.
print('%d %d' % (len(spiral), len(spiral[0])))
cValue = spiral[:, 0]
print(cValue.shape)
# Map each value of column 0 to a colour name.
# Assumes column 0 only holds 0 or 1 -- TODO confirm against the data file.
color = ['b', 'y']
cValue = [color[int(i)] for i in cValue]
plt.scatter(spiral[:, 1], spiral[:, 2], c=cValue)
使用kmeans的聚类结果
from sklearn import cluster, datasets

# Cluster on columns 1 and 2 only; column 0 is not passed to KMeans.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(spiral[:, 1:3])

# Colour the points by the predicted cluster.
plt.scatter(spiral[:, 1], spiral[:, 2], c=y_pred)
使用scipy进行聚类的效果
- # -*- coding: utf8 -*-
- %matplotlib inline
- import scipy.io as sio
- import matplotlib.pyplot as plt
- import scipy.cluster.hierarchy as hcluster
- from sklearn.cluster import AgglomerativeClustering
- import numpy.random as random
- import numpy as np
- import numpy.core.fromnumeric
-
-
def loadData():
    """Load the spiral dataset, print its size and scatter-plot it.

    Reads the ``spiral`` variable from ``cluster_data/spiral.mat``, keeps
    every third row, prints the row and column counts and the shape of
    column 0, then plots columns 1 and 2 coloured by column 0.

    Returns:
        numpy.ndarray: The subsampled array. The original version discarded
        it and returned ``None``; callers that ignore the return value are
        unaffected.
    """
    path = u'cluster_data/spiral.mat'
    spiral = sio.loadmat(path)['spiral'][0::3, :]  # keep every third row
    # %-formatting gives the same "rows cols" output on Python 2 and Python 3.
    print('%d %d' % (len(spiral), len(spiral[0])))
    labels = spiral[:, 0]
    print(labels.shape)
    # Map each value of column 0 to a colour name.
    # Assumes column 0 only holds 0 or 1 -- TODO confirm against the data file.
    color = ['b', 'y']
    point_colors = [color[int(i)] for i in labels]
    plt.scatter(spiral[:, 1], spiral[:, 2], c=point_colors)
    return spiral
-
-
def spiralSample(data=None):
    """Plot the spiral data next to two hierarchical clusterings of it.

    Draws three side-by-side panels: the points coloured by column 0, the
    points coloured by SciPy's ``fclusterdata`` result, and the points
    coloured by scikit-learn's ``AgglomerativeClustering`` result. Both
    clusterings are given columns 1 and 2 only.

    Args:
        data: Array whose columns 1 and 2 are the coordinates and whose
            column 0 colours the first panel. Defaults to the module-level
            ``spiral`` array, which must already exist in that case.
    """
    if data is None:
        # Original behaviour: fall back to the global set by earlier code.
        data = spiral
    points = data[:, 1:3]

    plt.subplot(131)
    plt.title(u'origal data')
    plt.scatter(data[:, 1], data[:, 2], c=data[:, 0])

    # SciPy hierarchical clustering. With criterion='maxclust', t is the
    # maximum number of flat clusters to form, not a distance threshold.
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster
    y_pred = hcluster.fclusterdata(points, criterion='maxclust', t=2)
    plt.subplot(132)
    plt.title(u'use scipy to hierarchy cluster')
    plt.scatter(data[:, 1], data[:, 2], c=y_pred)

    # scikit-learn agglomerative clustering with Ward linkage.
    plt.subplot(133)
    plt.title(u'use scikit to hierarchy cluster')
    y_pred = AgglomerativeClustering(n_clusters=2, linkage='ward').fit_predict(points)
    plt.scatter(data[:, 1], data[:, 2], c=y_pred)
    plt.show()
# Draw the comparison figure; relies on the module-level `spiral` array.
spiralSample()

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。