赞
踩
(1)4.5,3.4, 2.1, 0.8
(2)5.5,2.4, 7.3, 1.5
(3)7.6, 3.3, 5.7, 5.2
核心指标: $SSE = \sum_{i=1}^{k} \sum_{p \in C_i} |p - m_i|^2$,其中 $C_i$ 为第 $i$ 个簇,$m_i$ 为该簇的质心。
代码
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

# Elbow method: fit K-means for a range of cluster counts and plot the
# within-cluster sum of squared errors (SSE) against k.  The "elbow" of the
# curve is the suggested number of clusters.

# Load the data and preprocess it: the file has no header row, the first four
# columns are numeric features and column 4 is the class label.
a = pd.read_csv('iris.data', sep=',', names=[i for i in range(5)])
a.drop(4, axis=1, inplace=True)      # clustering is unsupervised: drop the label
a = a.dropna(axis=0, how='all')      # dropna returns a new frame; keep the result
a = np.array(a)

# Candidate cluster counts (k stays below 10).
K = range(1, 10)

sse = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(a)
    # Squared Euclidean distance from every sample to every centre; the minimum
    # per row is the squared distance to the nearest centre.  Summing those
    # minima over all samples gives the SSE for this k.
    squared = cdist(a, kmeans.cluster_centers_, 'sqeuclidean')
    sse.append(np.sum(np.min(squared, axis=1)))

plt.plot(K, sse, 'o-')
plt.xlabel('k')
plt.ylabel('SSE')
plt.title('Selecting k with the Elbow Method')
plt.show()
运行结果
通过手肘法,由图得,应选择k = 3
(1).计算测试数据点与训练集的欧氏距离并升序排序;
(2). 在距离最小的前k个训练样本中,选取出现次数最多的类别,该类别即为预测的类别
步骤:
代码
import numpy as np
import pandas as pd
from collections import Counter

# Number of neighbours, chosen with the elbow method.
k = 3


def irisDivided(iris, test_size=30):
    """Randomly split a DataFrame into a test set and a training set.

    Args:
        iris: DataFrame holding the full dataset.
        test_size: number of rows to draw (without replacement) for the
            test set.  Defaults to 30.

    Returns:
        A list ``[testSet, trainSet]``; the training set is every row that
        was not drawn for the test set.
    """
    # Random row positions for the test set.
    indexs = np.random.permutation(len(iris))[:test_size]
    testSet = iris.iloc[indexs]
    # Translate positions to index labels before dropping, so the split is
    # also correct when the frame does not carry the default 0..n-1 index.
    trainSet = iris.drop(iris.index[indexs])
    return [testSet, trainSet]


def knn(trainSet, testSet, trainResults, k=k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        trainSet: 2-D array of training features, one row per sample.
        testSet: one test row whose LAST element is its label; the label is
            stripped before distances are computed.
        trainResults: labels of the training rows, aligned with ``trainSet``.
        k: number of neighbours that vote.  Defaults to the module-level
            ``k`` at definition time.

    Returns:
        The most frequent label among the k nearest training rows, as ``str``.
        On a tie the label met first (i.e. belonging to the closer
        neighbour) wins.
    """
    features = np.asarray(testSet[:-1], dtype=float)
    train = np.asarray(trainSet, dtype=float)
    # Euclidean distance from the test point to every training row at once.
    dist = np.sqrt(((train - features) ** 2).sum(axis=1))
    # Stable sort keeps the original row order among equidistant neighbours.
    nearest = np.argsort(dist, kind="stable")[:k]
    counts = Counter(trainResults[i] for i in nearest)
    return str(counts.most_common(1)[0][0])


def main():
    """Evaluate the classifier on a random 30-row hold-out of iris.data."""
    # Not used by this script; retained from the original listing and imported
    # here so the helpers above can be imported without a plotting backend.
    import matplotlib.pyplot as plt  # noqa: F401

    # Load the data: four feature columns followed by the species label.
    iris = pd.read_csv('iris.data', sep=',', names=[i for i in range(5)])
    iris.columns = ['0', '1', '2', '3', 'species']

    # Split into test and training sets.
    sets = irisDivided(iris)
    trainSet = sets[1].drop(columns=['species']).values
    trainResults = sets[1]['species'].values
    testSets = sets[0].values

    # Count the correct predictions.
    cnt = 0
    for row in testSets:
        ret = knn(trainSet, row, trainResults)
        # row[4] is the true species, ret is the prediction.
        print("predicted : " + repr(ret) + ",actual : " + repr(row[4]))
        if row[4] == ret:
            cnt += 1

    Rate = cnt / len(sets[0]) * 100
    print('成功率为:\n', Rate, '%')


if __name__ == "__main__":
    main()
运行结果
(1)第一次运行
(2)第二次运行
成功率均在90%以上
(1)4.5,3.4, 2.1, 0.8
(2)5.5,2.4, 7.3, 1.5
(3)7.6, 3.3, 5.7, 5.2
步骤:
代码
import numpy as np
import pandas as pd
from collections import Counter

# Number of neighbours, chosen with the elbow method.
k = 3

# Column names given to the four numeric features of iris.data.
FEATURE_COLUMNS = ['0', '1', '2', '3']

# Scatter colour per species; any other label is drawn in red.
SPECIES_COLORS = {'Iris-virginica': 'blue', 'Iris-setosa': 'green'}


def knn(trainSet, testSet, trainResults, k=k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        trainSet: 2-D array of training features, one row per sample.
        testSet: feature values of the sample to classify (no label).
        trainResults: labels of the training rows, aligned with ``trainSet``.
        k: number of neighbours that vote.  Defaults to the module-level
            ``k`` at definition time.

    Returns:
        The most frequent label among the k nearest training rows, as ``str``.
        On a tie the label met first (i.e. belonging to the closer
        neighbour) wins.
    """
    features = np.asarray(testSet, dtype=float)
    train = np.asarray(trainSet, dtype=float)
    # Euclidean distance from the test point to every training row at once.
    dist = np.sqrt(((train - features) ** 2).sum(axis=1))
    # Stable sort keeps the original row order among equidistant neighbours.
    nearest = np.argsort(dist, kind="stable")[:k]
    counts = Counter(trainResults[i] for i in nearest)
    return str(counts.most_common(1)[0][0])


def main():
    """Predict the species of three given points and plot them with the data."""
    # Plotting / projection dependencies are only needed when run as a script.
    import matplotlib.pyplot as plt
    from sklearn import decomposition

    # Load the data: four feature columns followed by the species label.
    iris = pd.read_csv('iris.data', sep=',', names=[i for i in range(5)])
    iris.columns = FEATURE_COLUMNS + ['species']
    trainSet = iris.drop(columns=['species']).values
    trainResults = iris['species'].values

    # Points to classify.
    testPoints = [[4.5, 3.4, 2.1, 0.8],
                  [5.5, 2.4, 7.3, 1.5],
                  [7.6, 3.3, 5.7, 5.2]]
    # Build the frame from the numeric features first and add the label as a
    # separate column, so the features keep their float dtype.
    testSets = pd.DataFrame(testPoints, columns=FEATURE_COLUMNS)
    testSets['species'] = [knn(trainSet, p, trainResults) for p in testPoints]

    # Show the predictions.
    print(testSets)

    # Project the data to 2-D with factor analysis for visualisation.
    fa = decomposition.FactorAnalysis(n_components=2)
    X = fa.fit_transform(iris.iloc[:, :-1].values)

    # One colour per training sample, by species.
    color = [SPECIES_COLORS.get(species, 'red') for species in iris['species']]
    plt.scatter(X[:, 0], X[:, 1], c=color)

    # Project the test points with the model fitted on the training data.
    # Refitting on the three test points would put them in a different
    # coordinate system from the scatter drawn above.
    X_test = fa.transform(testSets[FEATURE_COLUMNS].values)
    plt.scatter(X_test[:, 0], X_test[:, 1], c='black', marker='*', s=50, label='test')
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()
运行结果
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。