赞
踩
本篇记录那些各种组合聚类算法
1.DeepCluster
import numpy as np import pandas as pd import matplotlib.pyplot as plt from keras.layers import Dense, Input, Layer, InputSpec from keras.models import Model from tensorflow.keras.optimizers import SGD from sklearn.cluster import KMeans from sklearn.model_selection import train_test_split import keras.backend as K import metrics ''' 深度嵌入聚类模型主要包括: [1] - 一个自动编码器,预训练,以学习无标签数据集的初始压缩后的特征表示. [2] - 在编码器上堆积聚类层(clustering),以分配编码器输出到一个聚类组. 聚类层的权重初始化采用的是基于当前得到的 K-Means 聚类中心. [3] - 聚类模型的训练,以同时改善聚类层和编码器 ''' # ---------------------------------Data preprocessing------------------------------------------ # n_cells=466 n_genes=2000 dataset = pd.read_csv('dataset/darmanis.csv') dataset = dataset.to_numpy().T data = dataset[1:, 2:].astype(np.float32) labels = np.array(dataset[1:, 1:2]).flatten() # It is divided into training set and test set x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.30, random_state=42) # Connect training set and test set x = np.concatenate((x_train, x_test)) y = np.concatenate((y_train, y_test)) x = x.reshape((x.shape[0], -1)) print(x.shape[-1]) print(y.shape) n_clusters = len(np.unique(y)) print(n_clusters) km_pred = KMeans(n_clusters=n_clusters).fit_predict(x) nmi = metrics.nmi(km_pred, y) ari = metrics.ari(km_pred, y) print('Clustering performance of raw data: nmi = %.5f, ari = %.5f' % (nmi, ari)) # -------------------------------Autoencoder training------------------------------- input_sample = Input(shape=(2000,)) # Define the code layer output dimension encoding_dim = 10 batch_size = 512 ''' Keras 使用的是 API 函数模式构建网络,与 Sequential 不同,可以随意在训练过程中提取特定层 ''' # 第一层编码层 encoded_1 = Dense(units=512, activation='relu')(input_sample) # 第二层编码层 encoded_2 = Dense(units=256, activation='relu')(encoded_1) # 第三层编码层 encoded_3 = Dense(units=128, activation='relu')(encoded_2) # 第四层编码 --> 并输出给解码层 encoded_output = Dense(units=encoding_dim)(encoded_3) # encoded_output, [n_samples, 10], is two-dimensional data # 第一层解码层 decoded_1 = Dense(units=128, activation='relu')(encoded_output) # 第二层解码层 decoded_2 = Dense(units=256, activation='relu')(decoded_1) # 第三层解码层 decoded_3 = Dense(units=512, activation='relu')(decoded_2) # 第四层解码层 decoded_output = Dense(units=2000, activation='tanh')(decoded_3) # 构建自动编码模型 autoencoder = Model(inputs=input_sample, outputs=decoded_output) # 构建编码模型结构 二维数据 encoder = Model(inputs=input_sample, outputs=encoded_output) save_dir = './results' autoencoder.compile(optimizer='adam', loss='mse') autoencoder.fit(x, x, batch_size=batch_size, epochs=100) # 保存权重 # autoencoder.save_weights(save_dir + '/ae_weights.h5') # autoencoder.load_weights(save_dir + '/ae_weights.h5') # ----------------------------------------Build clustering model-------------------------------------------------# class ClusteringLayer(Layer): """ Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the sample belonging to each cluster. The probability is calculated with student's t-distribution. # Example ``` model.add(ClusteringLayer(n_clusters=10)) ``` # Arguments n_clusters: number of clusters. weights: list of Numpy array with shape `(n_clusters, n_features)` witch represents the initial cluster centers. alpha: degrees of freedom parameter in Student's t-distribution. Default to 1.0. # Input shape 2D tensor with shape: `(n_samples, n_features)`. # Output shape 2D tensor with shape: `(n_samples, n_clusters)`. """ def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs): if 'input_shape' not in kwargs and 'input_dim' in kwargs: kwargs['input_shape'] = (kwargs.pop('input_dim'),) super(ClusteringLayer, self).__init__(**kwargs) self.n_clusters = n_clusters self.alpha = alpha self.initial_weights = weights self.input_spec = InputSpec(ndim=2) def build(self, input_shape): assert len(input_shape) == 2 input_dim = input_shape[1] self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim)) self.clusters = self.add_weight(shape=(self.n_clusters, input_dim), initializer='glorot_uniform', name='clustering') if self.initial_weights is not None: self.set_weights(self.initial_weights) del self.initial_weights self.built = True def call(self, inputs, **kwargs): """ student t-distribution, as same as used in t-SNE algorithm. Measure the similarity between embedded point z_i and centroid µ_j. q_ij = 1/(1+dist(x_i, µ_j)^2), then normalize it. q_ij can be interpreted as the probability of assigning sample i to cluster j. (i.e., a soft assignment) Arguments: inputs: the variable containing data, shape=(n_samples, n_features) Return: q: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters) """ q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha)) q **= (self.alpha + 1.0) / 2.0 q = K.transpose(K.transpose(q) / K.sum(q, axis=1)) # Make sure each sample's 10 values add up to 1. return q def compute_output_shape(self, input_shape): assert input_shape and len(input_shape) == 2 return input_shape[0], self.n_clusters def get_config(self): config = {'n_clusters': self.n_clusters} base_config = super(ClusteringLayer, self).get_config() return dict(list(base_config.items()) + list(config.items())) clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output) model = Model(inputs=encoder.input, outputs=clustering_layer) model.compile(optimizer=SGD(0.01, 0.9), loss='kld') kmeans = KMeans(n_clusters=n_clusters) y_pred = kmeans.fit_predict(encoder.predict(x)) y_pred_last = np.copy(y_pred) # 给'clustering'层设置权重,权重由kmeans聚类得到的中心点而来 model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_]) # deep clustering # Compute p_i by first raising q_i to the second power and then normalizing by frequency per cluster: # computing an auxiliary target distribution def target_distribution(q): weight = q ** 2 / q.sum(0) return (weight.T / weight.sum(1)).T loss = 0 index = 0 maxiter = 1000 update_interval = 20 index_array = np.arange(x.shape[0]) tol = 0.001 # tolerance threshold to stop training for ite in range(int(maxiter)): if ite % update_interval == 0: q = model.predict(x, verbose=0) # print(q[:10,:]) p = target_distribution(q) # update the auxiliary target distribution p # evaluate the clustering performance y_pred = q.argmax(1) if y is not None: # np.round()对浮点数取近似值,保留5位小数 # acc = np.round(metrics.acc(y, y_pred), 5) nmi = np.round(metrics.nmi(y, y_pred), 5) ari = np.round(metrics.ari(y, y_pred), 5) loss = np.round(loss, 5) print('Iter %d: nmi = %.5f, ari = %.5f' % (ite, nmi, ari), ' ; loss=', loss) # check stop criterion - model convergence delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0] y_pred_last = np.copy(y_pred) if ite > 0 and delta_label < tol: print('delta_label ', delta_label, '< tol ', tol) print('Reached tolerance threshold. Stopping training.') break idx = index_array[index * batch_size: min((index + 1) * batch_size, x.shape[0])] loss = model.train_on_batch(x=x[idx], y=p[idx]) index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0 # model.save_weights(save_dir + '/DEC_model_final.h5') # model.load_weights(save_dir + '/DEC_model_final.h5')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。