import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
dataset=pd.read_csv('F:\\python\\dataset\\watermelon_4.csv', delimiter=",")
data=dataset.values
data
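A quick check of the loaded array helps catch path or parsing problems early. This is only a supplementary sanity check, not part of the original post; the expected shape of (30, 2) assumes the file holds the standard watermelon 4.0 table of density and sugar-content values, which is what the indexing in the code below expects.

# sanity check on the loaded data (assumption: 30 samples x 2 numeric attributes)
print(data.shape)   # expected (30, 2) for the watermelon 4.0 dataset
print(data.dtype)   # should be a float dtype so the distance computation below works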
import random

# squared Euclidean distance between two sample vectors
def distance(x1, x2):
    return sum((x1 - x2) ** 2)

# K-means implementation: returns K mean vectors and the cluster assignment of every sample
def Kmeans(D, K, maxIter):
    m, n = np.shape(D)
    if K >= m:
        return D  # degenerate case guard (note: returns D alone, not the usual (U, C, iterations) tuple)
    initSet = set()
    curK = K  # number of clusters still to initialize
    while curK > 0:  # randomly pick K distinct samples as the initial mean vectors
        randomInt = random.randint(0, m - 1)  # a random integer in [0, m-1] (here [0, 29])
        if randomInt not in initSet:
            curK -= 1
            initSet.add(randomInt)
    U = D[list(initSet), :]  # the K initial mean vectors
    print('The {} samples randomly chosen as initial mean vectors:\n{}\n'.format(K, U))
    C = np.zeros(m)
    curIter = maxIter
    while curIter > 0:  # iterate until convergence or maxIter rounds
        curIter -= 1
        """
        4: for j = 1, 2, ..., m do
        5:     compute the distance between sample x_j and each mean vector mu_i (1 <= i <= k): d_ji = ||x_j - mu_i||_2
        6:     assign x_j the cluster label of the nearest mean vector: lambda_j = argmin_{i in {1,2,...,k}} d_ji
        7:     put x_j into the corresponding cluster: C_{lambda_j} = C_{lambda_j} ∪ {x_j}
        8: end for
        """
        for i in range(m):  # 4
            p = 0
            minDistance = distance(D[i], U[0])
            for j in range(1, K):  # 5
                if distance(D[i], U[j]) < minDistance:  # 6
                    minDistance = distance(D[i], U[j])
                    p = j
            C[i] = p  # 7
        newU = np.zeros((K, n))
        cnt = np.zeros(K)
        """
        9:  for i = 1, 2, ..., k do
        10:     compute the new mean vector: mu_i' = (1 / |C_i|) * sum_{x in C_i} x
        11:     if mu_i' != mu_i then
        12:         update the current mean vector mu_i to mu_i'
        13:     else
        14:         keep the current mean vector unchanged
        15:     end if
        16: end for
        17: until no mean vector was updated in this round
        """
        for i in range(m):
            newU[int(C[i])] += D[i]
            cnt[int(C[i])] += 1
        # print('newU:{}'.format(newU))
        # print('cnt:{}'.format(cnt))
        changed = 0  # flag: set to 1 if any mean vector is updated this round
        for i in range(K):
            newU[i] /= cnt[i]  # 10 (note: if a cluster ends up empty, cnt[i] is 0 and this division yields nan)
            for j in range(n):  # 11-12
                if U[i, j] != newU[i, j]:
                    U[i, j] = newU[i, j]
                    changed = 1
        if changed == 0:  # 17: converged, no mean vector changed this round
            return U, C, maxIter - curIter
    return U, C, maxIter - curIter

U, C, iter = Kmeans(data, 2, 100)
print('Final mean vectors (cluster centers):\n{}\n'.format(U))
print('Final cluster assignment of each sample:\n{}\n'.format(C))
print('Number of iterations used: {}'.format(iter))
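As an optional cross-check (not part of the original code), the hand-written result can be compared with scikit-learn's KMeans, assuming scikit-learn is installed; the centers should come out close to U, although both runs use random initialization, so label numbering and ordering may differ.

from sklearn.cluster import KMeans   # optional cross-check, assumes scikit-learn is available

sk_km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(data)
print('scikit-learn cluster centers:\n{}'.format(sk_km.cluster_centers_))
print('scikit-learn labels:\n{}'.format(sk_km.labels_))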
f1 = plt.figure(1)
plt.title('watermelon_4')
plt.xlabel('density')
plt.ylabel('ratio')
plt.scatter(data[:,0], data[:,1], marker='o', color='g', s=50)   # samples (green)
plt.scatter(U[:,0], U[:,1], marker='o', color='r', s=100)        # final cluster centers (red)
# plt.xlim(0,1)
# plt.ylim(0,1)
m,n=np.shape(data)
for i in range(m):
    # dashed line from each sample to the center of the cluster it was assigned to
    plt.plot([data[i,0],U[int(C[i]),0]],[data[i,1],U[int(C[i]),1]],"c--",linewidth=0.3)
plt.show()
The farther apart the initial points chosen for k-means are, the more easily the algorithm converges and the better the clustering tends to be.
The quality of a k-means result therefore depends heavily on how the initial samples are selected; a spread-out initialization along these lines is sketched below.
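One common way to act on this observation is a farthest-point initialization (a greedy cousin of k-means++): pick the first center at random, then repeatedly add the sample farthest from the centers chosen so far. The sketch below is only an illustration of that idea, reusing the distance function and data defined above; it is not part of the original post, but it could replace the random initSet selection inside Kmeans.

def farthest_point_init(D, K):
    # pick the first center at random, then greedily add the sample that is
    # farthest (in squared Euclidean distance) from its nearest already-chosen center
    m, _ = np.shape(D)
    centers = [random.randint(0, m - 1)]
    while len(centers) < K:
        dists = [min(distance(D[i], D[c]) for c in centers) for i in range(m)]
        centers.append(int(np.argmax(dists)))
    return D[centers, :]

# example: spread-out initial mean vectors for K = 2
U0 = farthest_point_init(data, 2)
print('Farthest-point initial centers:\n{}'.format(U0))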