Computing the distance
The distance between two points is usually measured with the Euclidean distance.
import numpy as np
import matplotlib.pyplot as plt

# Euclidean distance between two sample vectors
def euclDistance(vector1, vector2):
    return np.sqrt(np.sum((vector2 - vector1) ** 2))
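A quick sanity check, using a 3-4-5 right triangle:

print(euclDistance(np.array([0, 0]), np.array([3, 4])))  # prints 5.0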
Initializing the centroids
def initCentroids(data, k):
    numSamples, dim = data.shape
    # k centroids, with the same number of columns as the samples
    centroids = np.zeros((k, dim))
    # Randomly pick k centroids
    for i in range(k):
        # Randomly pick a sample index
        index = int(np.random.uniform(0, numSamples))
        # Use that sample as an initial centroid
        centroids[i, :] = data[index, :]
    return centroids
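One caveat: np.random.uniform can draw the same index more than once, so two initial centroids may coincide. A minimal duplicate-free variant (not part of the original code), using np.random.choice with replace=False:

def initCentroidsNoDup(data, k):
    # Draw k distinct row indices so no two initial centroids coincide
    indices = np.random.choice(data.shape[0], k, replace=False)
    return data[indices, :]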
Implementing the algorithm
# Pass in the dataset and the value of k
def kmeans(data, k):
    # Number of samples
    numSamples = data.shape[0]
    # Per-sample record: column 0 stores the cluster the sample belongs to,
    # column 1 stores the sample's distance (error) to that cluster
    clusterData = np.zeros((numSamples, 2))
    # Flag deciding whether the centroids still need to change
    clusterChanged = True
    # Initialize the centroids
    centroids = initCentroids(data, k)
    while clusterChanged:
        clusterChanged = False
        # Loop over every sample
        for i in range(numSamples):
            # Minimum distance seen so far
            minDist = 100000.0
            # Cluster the sample is assigned to
            minIndex = 0
            # Compute the distance between the sample and every centroid
            for j in range(k):
                distance = euclDistance(centroids[j, :], data[i, :])
                # If this distance is smaller than the minimum, update it
                if distance < minDist:
                    minDist = distance
                    # Record the minimum distance
                    clusterData[i, 1] = minDist
                    # Record the cluster the sample belongs to
                    minIndex = j
            # If the sample's cluster assignment changed
            if clusterData[i, 0] != minIndex:
                # The centroids have to be recomputed
                clusterChanged = True
                # Update the sample's cluster
                clusterData[i, 0] = minIndex
        # Update the centroids
        for j in range(k):
            # Indices of all samples belonging to cluster j
            cluster_index = np.nonzero(clusterData[:, 0] == j)
            # All sample points in cluster j
            pointsInCluster = data[cluster_index]
            # Recompute the centroid as the mean of those points
            centroids[j, :] = np.mean(pointsInCluster, axis=0)
    return centroids, clusterData
Visualizing the result
def showCluster(data, k, centroids, clusterData):
    numSamples, dim = data.shape
    if dim != 2:
        print('dimension of your data is not 2!')
        return 1
    # Use a different color/shape for each cluster
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'dr', '<r', 'pr']
    if k > len(mark):
        print('your k is too large!')
        return 1
    # Plot the sample points
    for i in range(numSamples):
        markIndex = int(clusterData[i, 0])
        plt.plot(data[i, 0], data[i, 1], mark[markIndex])
    # Use a different color/shape for each centroid
    mark = ['*r', '*b', '*g', '*k', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # Plot the centroids
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=20)
    plt.show()
Setting the value of k
k = 4
centroids, clusterData = kmeans(data, k)
# If a cluster ends up empty, np.mean over zero points yields NaN centroids
if np.isnan(centroids).any():
    print('Error')
else:
    print('cluster complete!')
    # Show the result
    showCluster(data, k, centroids, clusterData)
Loading an existing dataset
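The snippets above assume data is already an N×2 NumPy array. A minimal loading sketch, assuming a whitespace-delimited text file kmeans.txt with one two-dimensional sample per row (the filename is hypothetical):

data = np.genfromtxt("kmeans.txt", delimiter=" ")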
The clustering result
1. Addressing K-Means' sensitivity to the choice of initial centroids.
Run the random initialization multiple times, compute the value of the cost function for each fitted model, and take the run with the smallest cost as the final clustering result.
Where the value of k is set, add the following code:
min_loss = 10000
min_loss_centroids = np.array([])
min_loss_clusterData = np.array([])
# Keep the run with the lowest average distance to the centroids
for i in range(50):
    centroids, clusterData = kmeans(data, k)
    loss = sum(clusterData[:, 1]) / data.shape[0]
    if loss < min_loss:
        min_loss = loss
        min_loss_centroids = centroids
        min_loss_clusterData = clusterData
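After the loop, min_loss_centroids and min_loss_clusterData hold the best of the 50 runs; they can be visualized with the same helper as before:

showCluster(data, k, min_loss_centroids, min_loss_clusterData)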
2. The elbow method
list_loss = []
for k in range(2, 10):
    min_loss = 10000
    min_loss_centroids = np.array([])
    min_loss_clusterData = np.array([])
    # For each k, keep the best of 50 random restarts
    for i in range(50):
        centroids, clusterData = kmeans(data, k)
        loss = sum(clusterData[:, 1]) / data.shape[0]
        if loss < min_loss:
            min_loss = loss
            min_loss_centroids = centroids
            min_loss_clusterData = clusterData
    list_loss.append(min_loss)
print(list_loss)
Use the elbow method to choose the value of K: plot the loss against k and pick the k at the "elbow", where increasing k stops producing a large drop in the loss.
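A minimal sketch of plotting that curve with the matplotlib import from above (the axis labels are my own choice):

plt.plot(range(2, 10), list_loss, marker='o')
plt.xlabel('k')
plt.ylabel('average distance to centroid')
plt.show()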