赞
踩
输入:数据集(观测值矩阵)和簇的数量(也可以直接传入初始质心数组,见例1)
返回 聚类中心坐标(codebook)、观测值与生成的质心之间的平均(非平方)欧氏距离(distortion)
例1:
- import numpy as np
- from scipy.cluster.vq import vq, kmeans, whiten
- import matplotlib.pyplot as plt
-
# Five collinear 2-D observations: (1, 1) through (5, 5).
features = np.array([[v, v] for v in range(1, 6)])

# Normalise the observations with scipy's whiten before clustering.
wf = whiten(features)
print("whiten features: \n", wf)

# Use the first two whitened observations as the initial centroids.
book = wf[:2].copy()

# Passing an array makes kmeans start from exactly these centroids.
# Passing an integer instead, e.g. kmeans(wf, 2), requests that many
# centroids, and the `iter` argument then takes effect as well.
codebook, distortion = kmeans(wf, book)
print("codebook:", codebook)
print("distortion: ", distortion)

# Observations in the default colour, centroids in red.
plt.scatter(*wf.T)
plt.scatter(*codebook.T, c='r')
plt.show()
结果:
- whiten features:
- [[0.70710678 0.70710678]
- [1.41421356 1.41421356]
- [2.12132034 2.12132034]
- [2.82842712 2.82842712]
- [3.53553391 3.53553391]]
- codebook: [[1.06066017 1.06066017]
- [2.82842712 2.82842712]]
- distortion: 0.5999999999999999
例2:
- import numpy as np
- from scipy.cluster.vq import vq, kmeans, whiten
- import matplotlib.pyplot as plt
pts = 5  # number of samples drawn for each of the two groups

# np.random.multivariate_normal draws random samples from a multivariate
# normal distribution; the two groups use different means and covariances.
a = np.random.multivariate_normal(mean=[0, 0],
                                  cov=[[4, 1], [1, 4]],
                                  size=pts)
b = np.random.multivariate_normal(mean=[30, 10],
                                  cov=[[10, 2], [2, 10]],
                                  size=pts)

# Stack both groups into a single (2 * pts, 2) observation matrix.
features = np.concatenate((a, b), axis=0)
print(features.shape)

whitened = whiten(features)
print(whitened)

# kmeans returns only the centroids and the distortion between the
# observations and those centroids.
codebook, distortion = kmeans(whitened, 2)

# Observations in green, centroids in red.
plt.scatter(*whitened.T, c='g')
plt.scatter(*codebook.T, c='r')
plt.show()
结果(样本为随机抽取且未设置随机种子,每次运行的输出都会不同):
- (10, 2)
- [[-0.01741221 -0.49577372]
- [-0.08524789 -0.17768591]
- [ 0.0657376 0.47027214]
- [ 0.27825025 -0.37835465]
- [-0.0079966 0.50196071]
- [ 1.824303 2.55040188]
- [ 2.05886112 2.08174181]
- [ 2.13775252 1.69008105]
- [ 2.26411531 1.2035603 ]
- [ 1.83463368 0.45632992]]
- """Find the optimal number of clusters in a dataset."""
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- from sklearn.cluster import KMeans
- from scipy.cluster.vq import kmeans
- from sklearn.datasets import load_iris
- from scipy.spatial.distance import cdist
- from scipy.spatial.distance import pdist
-
-
def load_dataset():
    """Return the `data` attribute of the object produced by load_iris()."""
    iris = load_iris()
    return iris.data
-
-
def find_clusters(dataset, max_clusters=49):
    """Run k-means for K = 1..max_clusters and collect data for an elbow plot.

    Args:
        dataset: 2-D array of observations, one row per sample.
        max_clusters: Largest number of clusters to try. Defaults to 49.
            The value is capped at the number of rows in `dataset`,
            because k-means cannot pick more initial centroids than there
            are observations.

    Returns:
        A dict with three entries, each indexed by position in 'K':
            'K': range of the cluster counts that were tried.
            'cidx': list of arrays giving, for every observation, the
                index of its nearest centroid.
            'avg_within_sum_sqrd': list of floats, the mean Euclidean
                distance from each observation to its nearest centroid.
                NOTE: despite the key name, the distances are NOT squared.
    """
    n_obs = dataset.shape[0]
    num_clusters = range(1, min(max_clusters, n_obs) + 1)

    cidx = []
    avg_within_sum_sqrd = []
    for k in num_clusters:
        # kmeans returns (centroids, distortion); only centroids are used.
        centroids, _ = kmeans(dataset, k)
        # Row i holds the distances from observation i to every centroid.
        dists = cdist(dataset, centroids, 'euclidean')
        # The column of the smallest distance is the cluster assignment.
        cidx.append(np.argmin(dists, axis=1))
        # Average distance of the observations to their own centroid.
        avg_within_sum_sqrd.append(np.min(dists, axis=1).sum() / n_obs)

    return {'cidx': cidx, 'avg_within_sum_sqrd': avg_within_sum_sqrd,
            'K': num_clusters}
-
-
def plot_elbow_curv(details, kidx=2):
    """Plot the elbow curve and circle the chosen number of clusters.

    Args:
        details: Mapping that provides 'K' (the cluster counts) and
            'avg_within_sum_sqrd' (one score per cluster count), such as
            the dict returned by find_clusters.
        kidx: Index into details['K'] of the point to highlight.
            Defaults to 2, i.e. the third cluster count.

    The figure is only built here; this function does not call plt.show().
    """
    ks = details['K']
    scores = details['avg_within_sum_sqrd']

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Full curve: one starred point per cluster count.
    ax.plot(ks, scores, 'b*-')
    # Hollow red circle around the selected cluster count.
    ax.plot(ks[kidx], scores[kidx],
            marker='o', markersize=12, markeredgewidth=2,
            markeredgecolor='r', markerfacecolor='None')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
-
-
def scatter_plot(dataset, details, kidx=2):
    """Scatter-plot the observations, coloured by assigned cluster.

    Args:
        dataset: 2-D array of observations. Column 2 is drawn on the x-axis
            and column 1 on the y-axis (labelled 'Petal Length' and
            'Sepal Width' respectively).
        details: Mapping that provides 'K' (the cluster counts) and 'cidx'
            (per cluster count, the cluster index of every observation).
        kidx: Index into details['K'] of the clustering to draw.
            Defaults to 2, i.e. the third cluster count.
    """
    n_clusters = details['K'][kidx]
    labels = details['cidx'][kidx]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    clr = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    # One pass per cluster so each cluster gets its own colour and legend entry.
    for i in range(n_clusters):
        # Boolean mask: True for the rows assigned to cluster i.
        ind = (labels == i)
        # Cycle through the palette so more than len(clr) clusters
        # does not raise IndexError.
        ax.scatter(dataset[ind, 2], dataset[ind, 1],
                   s=30, c=clr[i % len(clr)], label='Cluster %d' % i)
    plt.xlabel('Petal Length')
    plt.ylabel('Sepal Width')
    plt.title('Iris Dataset, KMeans clustering with K=%d' % n_clusters)
    plt.legend()
    plt.show()
-
# Locating the elbow has to be done by eye; the kidx value used in the
# plotting functions above is based on this plot. n bounds the cluster count.
def eblow(n):
    """Plot the between-cluster sum of squares for K = 1..n-1.

    Reads the four feature columns from 'data.csv', fits one KMeans model
    per cluster count and shows the resulting curve.
    """
    # Feature columns of the test set.
    feature_cols = ['sepal_length', 'sepal_width',
                    'petal_length', 'petal_width']
    df = pd.read_csv('data.csv', usecols=feature_cols).values

    # Within-cluster sum of squares, one entry per cluster count.
    wcss = []
    for k in range(1, n):
        model = KMeans(n_clusters=k).fit(df)
        # Each centre has one coordinate per feature column.
        centres = model.cluster_centers_
        # Distance of every row to its closest centre.
        nearest = np.min(cdist(df, centres), axis=1)
        wcss.append(sum(nearest ** 2))

    # Sum of squared pairwise distances divided by the number of rows.
    tss = sum(pdist(df) ** 2) / df.shape[0]
    # tss is a NumPy scalar, so subtracting the list yields an array.
    bss = tss - wcss
    plt.plot(bss)
    plt.show()
-
# Run the demo only when executed as a script, so that importing this module
# does not open plot windows or read data.csv.
if __name__ == "__main__":
    dataset = load_dataset()
    details = find_clusters(dataset)
    plot_elbow_curv(details)
    scatter_plot(dataset, details)
    # The elbow point can be read off this final plot.
    eblow(10)
结果:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。