当前位置:   article > 正文

在K-Means算法中使用肘部法寻找最佳聚类数_肘部法确定最佳聚类中心

肘部法确定最佳聚类中心

1. from scipy.cluster.vq import kmeans 函数的介绍:

输入 数据集和簇的数量 

返回 聚类中心坐标(codebook)、观测值与生成的质心之间的平均(非平方)欧氏距离(distortion)

例1:

  import matplotlib.pyplot as plt
  import numpy as np
  from scipy.cluster.vq import vq, kmeans, whiten

  # Five 2-D observations lying on the diagonal y = x.
  features = np.array([[v, v] for v in range(1, 6)])

  # whiten() rescales every feature column by its standard deviation.
  wf = whiten(features)
  print("whiten features: \n", wf)

  # Seed k-means with the first two whitened observations as initial centroids.
  # An integer may be passed instead, e.g. kmeans(wf, 2), to request two
  # centroids; in that case the iter parameter also takes effect.
  book = np.array([wf[0], wf[1]])
  codebook, distortion = kmeans(wf, book)
  print("codebook:", codebook)
  print("distortion: ", distortion)

  # Observations in the default colour, centroids in red.
  obs_x, obs_y = wf[:, 0], wf[:, 1]
  cen_x, cen_y = codebook[:, 0], codebook[:, 1]
  plt.scatter(obs_x, obs_y)
  plt.scatter(cen_x, cen_y, c='r')
  plt.show()

结果:

  1. whiten features:
  2. [[0.70710678 0.70710678]
  3. [1.41421356 1.41421356]
  4. [2.12132034 2.12132034]
  5. [2.82842712 2.82842712]
  6. [3.53553391 3.53553391]]
  7. codebook: [[1.06066017 1.06066017]
  8. [2.82842712 2.82842712]]
  9. distortion: 0.5999999999999999

例2:

  import matplotlib.pyplot as plt
  import numpy as np
  from scipy.cluster.vq import vq, kmeans, whiten

  pts = 5  # number of observations drawn for each of the two groups

  # np.random.multivariate_normal draws random samples from a multivariate
  # normal distribution; the two groups use different means and covariances.
  a = np.random.multivariate_normal(mean=[0, 0],
                                    cov=[[4, 1], [1, 4]],
                                    size=pts)
  b = np.random.multivariate_normal(mean=[30, 10],
                                    cov=[[10, 2], [2, 10]],
                                    size=pts)
  features = np.concatenate((a, b))
  print(features.shape)

  whitened = whiten(features)
  print(whitened)

  # kmeans returns just two values: the centroids (codebook) and the
  # distortion between the observations and those centroids.
  codebook, distortion = kmeans(whitened, 2)

  # Observations in green, centroids in red.
  obs_x, obs_y = whitened[:, 0], whitened[:, 1]
  cen_x, cen_y = codebook[:, 0], codebook[:, 1]
  plt.scatter(obs_x, obs_y, c='g')
  plt.scatter(cen_x, cen_y, c='r')
  plt.show()

结果:

  1. (10, 2)
  2. [[-0.01741221 -0.49577372]
  3. [-0.08524789 -0.17768591]
  4. [ 0.0657376 0.47027214]
  5. [ 0.27825025 -0.37835465]
  6. [-0.0079966 0.50196071]
  7. [ 1.824303 2.55040188]
  8. [ 2.05886112 2.08174181]
  9. [ 2.13775252 1.69008105]
  10. [ 2.26411531 1.2035603 ]
  11. [ 1.83463368 0.45632992]]

2.针对鸢尾花数据集的k-means的肘部法优化的聚类

  1. """Find optimal number of clusters from a Dataset."""
  2. import numpy as np
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5. from sklearn.cluster import KMeans
  6. from scipy.cluster.vq import kmeans
  7. from sklearn.datasets import load_iris
  8. from scipy.spatial.distance import cdist
  9. from scipy.spatial.distance import pdist
  def load_dataset():
      """Return the ``data`` attribute of the object produced by ``load_iris()``."""
      iris = load_iris()
      return iris.data
  def find_clusters(dataset, max_clusters=49):
      """Run k-means for K = 1..max_clusters and collect elbow-curve statistics.

      Args:
          dataset: 2-D array of observations, one row per sample.
          max_clusters: Largest cluster count to try. The default of 49
              reproduces the original ``range(1, 50)`` sweep. The sweep is
              capped at the number of rows in ``dataset`` because asking for
              more centroids than observations is not meaningful.

      Returns:
          A dict with three entries:
          ``'K'`` is the ``range`` of cluster counts that were tried;
          ``'cidx'`` holds, for each K, an array giving the index of the
          nearest centroid for every sample;
          ``'avg_within_sum_sqrd'`` holds, for each K, the mean squared
          Euclidean distance from each sample to its nearest centroid.
      """
      dataset = np.asarray(dataset)
      n_samples = dataset.shape[0]
      num_clusters = range(1, min(max_clusters, n_samples) + 1)

      cidx = []
      avg_within_sum_sqrd = []
      for k in num_clusters:
          # scipy's kmeans returns (codebook, distortion); only the centroid
          # coordinates are needed here.
          centroids, _ = kmeans(dataset, k)
          # Distance matrix: one row per sample, one column per centroid.
          dist = cdist(dataset, centroids, 'euclidean')
          # Column index of the closest centroid is the cluster assignment.
          cidx.append(np.argmin(dist, axis=1))
          nearest = np.min(dist, axis=1)
          # Square the distances before averaging so the value matches its
          # "sum of squares" name (the raw distances were averaged before).
          avg_within_sum_sqrd.append(float(np.mean(nearest ** 2)))

      return {'cidx': cidx, 'avg_within_sum_sqrd': avg_within_sum_sqrd,
              'K': num_clusters}
  def plot_elbow_curv(details, kidx=2):
      """Plot the elbow curve and circle the chosen elbow point.

      Args:
          details: Dict with a ``'K'`` sequence of cluster counts and an
              ``'avg_within_sum_sqrd'`` sequence of matching scores.
          kidx: Position within ``details['K']`` of the point to highlight.
              Defaults to 2, the value that used to be hard-coded.

      The figure is created but not shown; it is left open for a later
      ``plt.show()`` call.
      """
      ks = details['K']
      scores = details['avg_within_sum_sqrd']

      fig = plt.figure()
      ax = fig.add_subplot(111)
      # Full curve as blue stars joined by a line.
      ax.plot(ks, scores, 'b*-')
      # Hollow red circle around the selected elbow point.
      ax.plot(ks[kidx], scores[kidx],
              marker='o', markersize=12, markeredgewidth=2,
              markeredgecolor='r', markerfacecolor='None')
      ax.grid(True)
      ax.set_xlabel('Number of clusters')
      ax.set_ylabel('Average within-cluster sum of squares')
      ax.set_title('Elbow for KMeans clustering')
  def scatter_plot(dataset, details, kidx=2):
      """Scatter-plot the samples coloured by their cluster assignment.

      Args:
          dataset: 2-D array of samples; column index 2 is drawn on the x axis
              and column index 1 on the y axis.
          details: Dict with a ``'K'`` sequence of cluster counts and a
              ``'cidx'`` sequence of per-sample cluster index arrays.
          kidx: Position within ``details['K']`` of the clustering to draw.
              Defaults to 2, the value that used to be hard-coded.
      """
      clr = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
      labels = details['cidx'][kidx]
      n_clusters = details['K'][kidx]

      fig = plt.figure()
      ax = fig.add_subplot(111)
      # One pass per cluster.
      for i in range(n_clusters):
          # Boolean mask: True for the samples assigned to cluster i.
          ind = (labels == i)
          # Boolean indexing keeps only the rows where the mask is True.
          # The colour list is cycled so more than seven clusters do not
          # raise IndexError.
          ax.scatter(dataset[ind, 2], dataset[ind, 1],
                     s=30, c=clr[i % len(clr)], label='Cluster %d' % i)
      plt.xlabel('Petal Length')
      plt.ylabel('Sepal Width')
      plt.title('Iris Dataset, KMeans clustering with K=%d' % n_clusters)
      plt.legend()
      plt.show()
  # The elbow has to be picked by eye from this plot; the kidx value used by
  # the plotting functions is based on it.
  def eblow(n, csv_path='data.csv'):
      """Plot the between-cluster sum of squares for K = 1..n-1.

      Args:
          n: Exclusive upper bound on the number of clusters to try.
          csv_path: CSV file to read. It must contain the columns
              ``sepal_length``, ``sepal_width``, ``petal_length`` and
              ``petal_width``. Defaults to ``'data.csv'``, the path that used
              to be hard-coded.
      """
      # Feature columns to read from the CSV file.
      cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
      data = pd.read_csv(csv_path, usecols=cols).values

      # One fitted model per cluster count from 1 to n-1.
      ks = list(range(1, n))
      kmeans_var = [KMeans(n_clusters=k).fit(data) for k in ks]
      # Centre coordinates: one value per feature column, so four per centre.
      centroids = [model.cluster_centers_ for model in kmeans_var]
      # Distance from every sample to every centre.
      k_euclid = [cdist(data, cent) for cent in centroids]
      # Distance from every sample to its nearest centre.
      dist = [np.min(ke, axis=1) for ke in k_euclid]
      # Within-cluster sum of squares for each K, as an explicit array so the
      # subtraction below does not rely on implicit list coercion.
      wcss = np.array([np.sum(d ** 2) for d in dist])
      # Total sum of squares: pairwise squared distances divided by sample count.
      tss = np.sum(pdist(data) ** 2) / data.shape[0]
      bss = tss - wcss
      # Plot against the real cluster counts, not the 0-based list index.
      plt.plot(ks, bss)
      plt.show()
  # Load the data, sweep the cluster counts, then draw both plots.
  dataset = load_dataset()
  details = find_clusters(dataset)
  plot_elbow_curv(details)
  scatter_plot(dataset, details)
  # The elbow point can be observed on the curve drawn by this call.
  eblow(10)

结果:

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/繁依Fanyi0/article/detail/132336?site
推荐阅读
相关标签
  

闽ICP备14008679号