赞
踩
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei'] # use a font that can render Chinese characters in plots
plt.rcParams['axes.unicode_minus']=False # render the minus sign correctly when using the CJK font
warnings.filterwarnings("ignore") # suppress warning output
# Load the heart dataset: space-separated, no header row, columns addressed by position.
heart = pd.read_csv('C:\\Users\\91333\\Documents\\semester6\\data science\\6.聚类分析\\heart.dat', header = None, sep = ' ')
# Exploratory plots of selected columns; column 13 appears to be the class label
# (it is used as ground truth for adjusted_rand_score later) — TODO confirm.
sns.stripplot(x=heart.iloc[:,13],y=heart.iloc[:,3],hue=heart.iloc[:,1])
sns.boxplot(x=heart.iloc[:,2],y=heart.iloc[:,4],hue=heart.iloc[:,13])
sns.violinplot(x=heart.iloc[:,5],y=heart.iloc[:,7],hue=heart.iloc[:,13])
# Standardise the continuous columns (zero mean, unit variance) so that scale
# differences do not dominate the distance-based clustering.
# Column 10 is ordinal; since its values are ordered it is treated as continuous here.
Numerical = [0, 3, 4, 7, 9, 10, 11]   # indices of continuous/ordinal columns
Type = [1, 2, 5, 6, 8, 12, 13]        # indices of categorical columns
heart_norm0 = heart.iloc[:, Numerical].apply(lambda x: (x - np.mean(x)) / (np.std(x)))
# BUG FIX: the original `heart_norm = heart` aliased the DataFrame, so the
# assignment below silently overwrote the numeric columns of `heart` itself.
heart_norm = heart.copy()
heart_norm.iloc[:, Numerical] = heart_norm0
Original source: https://blog.csdn.net/littlely_ll/article/details/80042928
(reproduced here with minor modifications).
Note: "prototype" here refers to a cluster prototype (center) in the k-prototypes algorithm.
import numpy as np
import random
from collections import Counter


def dist(x, y):
    """Euclidean distance between two numeric vectors."""
    return np.sqrt(sum((x - y) ** 2))


def sigma(x, y):
    """Number of positions at which the categorical vectors x and y differ."""
    return len(x) - sum(x == y)


def KPrototypes(data, O, C, k, max_iters=10, gamma=0):
    """Cluster mixed numeric/categorical data with the k-prototypes algorithm.

    Parameters
    ----------
    data : array-like of shape (m, n)
        Data matrix; categorical columns are re-encoded as strings internally.
    O : list of int
        Indices of the numeric (continuous/ordinal) columns.
    C : list of int
        Indices of the categorical columns.
    k : int
        Number of clusters.
    max_iters : int, optional
        Number of reassignment passes after the initial assignment.
    gamma : float, optional
        Weight of the categorical mismatch term relative to the Euclidean term.

    Returns
    -------
    list of int
        Cluster label (0..k-1) for each of the m samples.

    Fixes over the original version:
    * ``Counter(C_data[i, j])`` counted the *characters* of the string cell
      (e.g. "1.0" -> {'1':1, '.':1, '0':1}); the whole cell value is now
      counted once.
    * ``[Counter()] * (len(C) - 1)`` created aliased Counter objects that
      in-place ``+=`` mutated together; independent Counters are now built.
    * Numeric columns are cast to float so prototype means are not truncated.
    * Prototype updates guard against a cluster becoming empty.
    """
    data = np.array(data)
    m, _ = data.shape
    seeds = random.sample(range(m), k)       # random rows used as initial prototypes
    O_data = data[:, O].astype(float)        # force float: int dtype would truncate means
    C_data = data[:, C].astype(str)
    O_protos = O_data[seeds, :].copy()
    C_protos = C_data[seeds, :].copy()

    clusterShip = []
    clusterCount = {}
    sumInCluster = {}    # per-cluster running sums, one entry per numeric column
    freqInCluster = {}   # per-cluster Counters, one per categorical column

    def nearest(i):
        """Index of the prototype closest to sample i under the mixed metric."""
        best, best_d = 0, float('inf')
        for j in range(k):
            d = dist(O_data[i, :], O_protos[j, :]) + gamma * sigma(C_data[i, :], C_protos[j, :])
            if d < best_d:
                best_d, best = d, j
        return best

    # ---- initial pass: assign each point and update prototypes incrementally
    for i in range(m):
        cluster = nearest(i)
        clusterShip.append(cluster)
        clusterCount[cluster] = clusterCount.get(cluster, 0) + 1
        if cluster not in sumInCluster:
            # independent accumulators per column (no shared mutable objects)
            sumInCluster[cluster] = [0.0] * len(O)
            freqInCluster[cluster] = [Counter() for _ in range(len(C))]
        for j in range(len(O)):
            sumInCluster[cluster][j] += O_data[i, j]
            O_protos[cluster, j] = sumInCluster[cluster][j] / clusterCount[cluster]
        for j in range(len(C)):
            # count the whole cell value once (Counter(str) would count characters)
            freqInCluster[cluster][j][C_data[i, j]] += 1
            C_protos[cluster, j] = freqInCluster[cluster][j].most_common(1)[0][0]

    # ---- refinement passes: move points between clusters, keep prototypes in sync
    for _ in range(max_iters):
        for i in range(m):
            cluster = nearest(i)
            old = clusterShip[i]
            if old == cluster:
                continue
            clusterShip[i] = cluster
            clusterCount[cluster] += 1
            clusterCount[old] -= 1
            for j in range(len(O)):
                sumInCluster[cluster][j] += O_data[i, j]
                sumInCluster[old][j] -= O_data[i, j]
                O_protos[cluster, j] = sumInCluster[cluster][j] / clusterCount[cluster]
                if clusterCount[old] > 0:   # an emptied cluster keeps its last prototype
                    O_protos[old, j] = sumInCluster[old][j] / clusterCount[old]
            for j in range(len(C)):
                freqInCluster[cluster][j][C_data[i, j]] += 1
                freqInCluster[old][j][C_data[i, j]] -= 1
                C_protos[cluster, j] = freqInCluster[cluster][j].most_common(1)[0][0]
                if clusterCount[old] > 0:
                    C_protos[old, j] = freqInCluster[old][j].most_common(1)[0][0]
    return clusterShip
Call the k-prototypes function over a grid of two parameters — the number of clusters and the categorical-variable weight — and record the evaluation scores for each combination.
# Grid-search k (number of clusters) and gamma (categorical weight), recording
# the adjusted Rand index against the labels in column 13 and the silhouette
# score for each combination.  (The scraped original had lost the loop
# indentation, which made it a SyntaxError.)
ars = []   # adjusted Rand scores: one row per k, one column per gamma
sc = []    # silhouette scores, same layout
for i in range(2, 5):
    ars_sub = []
    sc_sub = []
    for j in [0, 0.5, 1, 1.5, 2]:
        cluster = KPrototypes(data=heart_norm, O=Numerical, C=Type, k=i, gamma=j, max_iters=100)
        ars_sub.append(metrics.adjusted_rand_score(heart.iloc[:, 13], cluster))
        sc_sub.append(metrics.silhouette_score(heart_norm, cluster, metric='euclidean'))
    ars.append(ars_sub)
    sc.append(sc_sub)
ars  # the low values show the clustering matches the labels poorly
sc
from scipy.sparse import issparse
import numpy as np
import pandas as pd


def gower_matrix(data_x, data_y=None, weight=None, cat_features=None):
    """Compute the pairwise Gower distance matrix between rows of data_x and data_y.

    Numeric columns are scaled by their maximum and compared by normalised
    absolute difference; categorical columns contribute 0 when equal and 1
    when different.  Per-feature contributions are averaged using `weight`.

    Parameters
    ----------
    data_x : ndarray or DataFrame of shape (x_rows, n_cols)
    data_y : ndarray or DataFrame of shape (y_rows, n_cols), optional
        Defaults to data_x (symmetric case; only the upper triangle is computed).
    weight : ndarray of shape (n_cols,), optional
        Per-feature weights; defaults to all ones.
    cat_features : array-like of bool/0-1 of shape (n_cols,), optional
        Mask of categorical columns.  Inferred from dtypes when omitted.

    Returns
    -------
    ndarray of float32, shape (x_rows, y_rows)

    Fixes over the original version:
    * `cat_features` is coerced to a *boolean* mask; an int 0/1 list was
      previously used as integer fancy indexing and selected wrong columns.
    * The numeric sub-matrix is cast to float32 up front; integer input made
      `np.divide(..., out=np.zeros_like(Z_num))` fail on dtype casting.
    * No longer shadows the builtins `max`/`min`.
    """
    X = data_x
    Y = data_x if data_y is None else data_y

    # --- input checks
    if not isinstance(X, np.ndarray):
        if not np.array_equal(X.columns, Y.columns):
            raise TypeError("X and Y must have same columns!")
    else:
        if not X.shape[1] == Y.shape[1]:
            raise TypeError("X and Y must have same y-dim!")
    if issparse(X) or issparse(Y):
        raise TypeError("Sparse matrices are not supported!")

    x_n_rows, x_n_cols = X.shape
    y_n_rows, _ = Y.shape

    # --- determine which columns are categorical
    if cat_features is None:
        if not isinstance(X, np.ndarray):
            is_not_number = np.vectorize(lambda t: not np.issubdtype(t, np.number))
            cat_features = is_not_number(X.dtypes)
        else:
            cat_features = np.zeros(x_n_cols, dtype=bool)
            for col in range(x_n_cols):
                if not np.issubdtype(type(X[0, col]), np.number):
                    cat_features[col] = True
    else:
        # bool coercion: a 0/1 integer list must act as a mask, not as indices
        cat_features = np.array(cat_features, dtype=bool)

    if not isinstance(X, np.ndarray):
        X = np.asarray(X)
    if not isinstance(Y, np.ndarray):
        Y = np.asarray(Y)

    Z = np.concatenate((X, Y))
    x_index = range(0, x_n_rows)
    y_index = range(x_n_rows, x_n_rows + y_n_rows)

    # --- numeric part: remember each column's max and its scaled range
    Z_num = Z[:, np.logical_not(cat_features)].astype(np.float32)
    num_cols = Z_num.shape[1]
    num_ranges = np.zeros(num_cols)
    num_max = np.zeros(num_cols)
    for col in range(num_cols):
        col_array = Z_num[:, col]
        col_max = np.nanmax(col_array)
        col_min = np.nanmin(col_array)
        if np.isnan(col_max):
            col_max = 0.0
        if np.isnan(col_min):
            col_min = 0.0
        num_max[col] = col_max
        # range of the column after dividing by its max (normalisation to [0, 1])
        num_ranges[col] = (1 - col_min / col_max) if (col_max != 0) else 0.0

    Z_num = np.divide(Z_num, num_max, out=np.zeros_like(Z_num), where=num_max != 0)
    Z_cat = Z[:, cat_features]

    if weight is None:
        weight = np.ones(Z.shape[1])
    weight_cat = weight[cat_features]
    weight_num = weight[np.logical_not(cat_features)]
    weight_sum = weight.sum()

    out = np.zeros((x_n_rows, y_n_rows), dtype=np.float32)
    X_cat = Z_cat[x_index, ]
    X_num = Z_num[x_index, ]
    Y_cat = Z_cat[y_index, ]
    Y_num = Z_num[y_index, ]

    for i in range(x_n_rows):
        # symmetric case: compute only the upper triangle and mirror it
        j_start = i if x_n_rows == y_n_rows else 0
        res = gower_get(X_cat[i, :], X_num[i, :],
                        Y_cat[j_start:y_n_rows, :], Y_num[j_start:y_n_rows, :],
                        weight_cat, weight_num, weight_sum,
                        cat_features, num_ranges, num_max)
        out[i, j_start:] = res
        if x_n_rows == y_n_rows:
            out[i:, j_start] = res
    return out


def gower_get(xi_cat, xi_num, xj_cat, xj_num,
              feature_weight_cat, feature_weight_num, feature_weight_sum,
              categorical_features, ranges_of_numeric, max_of_numeric):
    """Weighted Gower distance from one row to a batch of rows.

    xi_cat/xi_num are a single row's categorical/numeric features;
    xj_cat/xj_num are the corresponding batches.  Returns a 1-D array of
    distances, each in [0, 1] when all feature weights are 1.
    """
    # categorical columns: 0 where equal, 1 where different, then weighted
    sij_cat = np.where(xi_cat == xj_cat, np.zeros_like(xi_cat), np.ones_like(xi_cat))
    sum_cat = np.multiply(feature_weight_cat, sij_cat).sum(axis=1)
    # numeric columns: absolute difference scaled by the column range, weighted
    abs_delta = np.absolute(xi_num - xj_num)
    sij_num = np.divide(abs_delta, ranges_of_numeric,
                        out=np.zeros_like(abs_delta), where=ranges_of_numeric != 0)
    sum_num = np.multiply(feature_weight_num, sij_num).sum(axis=1)
    sums = np.add(sum_cat, sum_num)
    return np.divide(sums, feature_weight_sum)
import copy

# K-means-style clustering that assigns points by Gower distance and updates
# centers by the column mean.  (The scraped original had lost the while-loop
# indentation, which made it a SyntaxError.)
k = 2                                           # number of clusters
# Boolean mask of categorical columns (True = categorical).  An int 0/1 list
# would be consumed as integer column indices by fancy indexing inside
# gower_matrix and silently pick the wrong columns.
cat = np.array([0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1], dtype=bool)
np.random.seed(126)
x = np.array(heart_norm)
SampleIndex = np.random.randint(0, x.shape[0], size=k)  # k random rows as initial centers
Center = x[SampleIndex, ]                       # current cluster centers
COld = np.zeros(Center.shape)                   # centers from the previous iteration
CLabel = np.zeros(x.shape[0], dtype=np.int32)   # cluster label (0..k-1) per sample
# Gower distance between each center and its previous position (diagonal
# offset k picks the Center[i] vs COld[i] pairs).
Error = gower_matrix(np.append(Center, COld, axis=0), cat_features=cat).diagonal(offset=k)
# Iterate until NO center moves.  The original `np.all(Error != 0)` stopped as
# soon as any single center stalled, which can end before convergence.
while np.any(Error != 0):
    for i in range(x.shape[0]):
        # distances from sample i to every current center: row 0, columns 1..k
        # (generalised from the k=2-only indexing [(0,0),(1,2)] and the
        # hard-coded reshape(1, 14))
        dist = gower_matrix(np.append(x[i].reshape(1, x.shape[1]), Center, axis=0),
                            cat_features=cat)[0, 1:k + 1]
        CLabel[i] = np.argmin(dist)             # assign the nearest center's label
    COld = copy.deepcopy(Center)                # remember centers before updating
    for i in range(k):
        Index = CLabel == i                     # members of cluster i
        if Index.any():                         # guard: keep an empty cluster's center
            Center[i] = np.mean(x[Index], axis=0)
    Error = gower_matrix(np.append(Center, COld, axis=0), cat_features=cat).diagonal(offset=k)
print(metrics.adjusted_rand_score(heart.iloc[:, 13], CLabel))
print(metrics.silhouette_score(heart_norm, CLabel, metric='euclidean'))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。