
Dimensionality Reduction and Clustering of Single-Cell Sequencing Data with Machine-Learning Algorithms (Part 3)

This installment walks through algorithms that combine deep learning with clustering.
1. DeepCluster (a deep embedded clustering, DEC, model)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Input, Layer, InputSpec
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
# NMI/ARI for external cluster evaluation (replaces the original local metrics.py helper)
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
import tensorflow.keras.backend as K

'''
The deep embedded clustering (DEC) model consists of:

[1] An autoencoder, pretrained to learn an initial compressed feature
    representation of the unlabeled dataset.

[2] A clustering layer stacked on the encoder, which assigns each encoded
    sample to a cluster. The layer's weights are initialized from the
    current K-Means cluster centers.

[3] Joint training of the clustering model, refining the clustering layer
    and the encoder simultaneously.
'''
# ---------------------------------Data preprocessing------------------------------------------
# Darmanis dataset: n_cells = 466, n_genes = 2000
dataset = pd.read_csv('dataset/darmanis.csv')
dataset = dataset.to_numpy().T                  # transpose so that rows are cells
data = dataset[1:, 2:].astype(np.float32)       # expression matrix (header row and id/label columns dropped)
labels = np.array(dataset[1:, 1:2]).flatten()   # cell-type labels
# Split into training and test sets ...
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.30, random_state=42)
# ... then recombine them; DEC trains on the full matrix without labels,
# so the split/concat round-trip effectively just shuffles the rows.
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))
x = x.reshape((x.shape[0], -1))
print(x.shape[-1])              # number of genes (features)
print(y.shape)                  # number of cells
n_clusters = len(np.unique(y))
print(n_clusters)               # number of distinct cell types

km_pred = KMeans(n_clusters=n_clusters).fit_predict(x)
nmi = normalized_mutual_info_score(y, km_pred)
ari = adjusted_rand_score(y, km_pred)
print('Clustering performance of raw data: nmi = %.5f, ari = %.5f' % (nmi, ari))
# -------------------------------Autoencoder training-------------------------------
input_sample = Input(shape=(2000,))
# Define the code layer output dimension
encoding_dim = 10
batch_size = 512
'''
The network is built with the Keras functional API; unlike Sequential models,
this makes it straightforward to pull out intermediate layers (here, the encoder).
'''
# First encoder layer
encoded_1 = Dense(units=512, activation='relu')(input_sample)
# Second encoder layer
encoded_2 = Dense(units=256, activation='relu')(encoded_1)
# Third encoder layer
encoded_3 = Dense(units=128, activation='relu')(encoded_2)
# Fourth encoder layer --> the bottleneck code, fed to the decoder
encoded_output = Dense(units=encoding_dim)(encoded_3)
# encoded_output, [n_samples, 10], is two-dimensional data

# First decoder layer
decoded_1 = Dense(units=128, activation='relu')(encoded_output)
# Second decoder layer
decoded_2 = Dense(units=256, activation='relu')(decoded_1)
# Third decoder layer
decoded_3 = Dense(units=512, activation='relu')(decoded_2)
# Fourth decoder layer, reconstructing the 2000-gene input
decoded_output = Dense(units=2000, activation='tanh')(decoded_3)
# Build the autoencoder model
autoencoder = Model(inputs=input_sample, outputs=decoded_output)
# Build the encoder model (outputs the 10-D embedding)
encoder = Model(inputs=input_sample, outputs=encoded_output)
save_dir = './results'
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(x, x, batch_size=batch_size, epochs=100)
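
# Optional sanity check (an addition, not in the original post): report the
# autoencoder's reconstruction MSE on the full matrix after pretraining.
recon_mse = autoencoder.evaluate(x, x, verbose=0)
print('Autoencoder reconstruction MSE: %.5f' % recon_mse)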


# Optionally save / reload the pretrained autoencoder weights
# autoencoder.save_weights(save_dir + '/ae_weights.h5')
# autoencoder.load_weights(save_dir + '/ae_weights.h5')

# ----------------------------------------Build clustering model-------------------------------------------------#
class ClusteringLayer(Layer):
    """
    Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the
    sample belonging to each cluster. The probability is calculated with student's t-distribution.

    # Example
    ```
        model.add(ClusteringLayer(n_clusters=10))
    ```
    # Arguments
        n_clusters: number of clusters.
        weights: list of Numpy arrays with shape `(n_clusters, n_features)` which represents the initial cluster centers.
        alpha: degrees-of-freedom parameter of the Student's t-distribution. Defaults to 1.0.
    # Input shape
        2D tensor with shape: `(n_samples, n_features)`.
    # Output shape
        2D tensor with shape: `(n_samples, n_clusters)`.
    """

    def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.initial_weights = weights
        self.input_spec = InputSpec(ndim=2)

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
        self.clusters = self.add_weight(shape=(self.n_clusters, input_dim), initializer='glorot_uniform',
                                        name='clustering')
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights
        self.built = True

    def call(self, inputs, **kwargs):
        """ student t-distribution, as same as used in t-SNE algorithm.
         Measure the similarity between embedded point z_i and centroid µ_j.
                 q_ij = 1/(1+dist(x_i, µ_j)^2), then normalize it.
                 q_ij can be interpreted as the probability of assigning sample i to cluster j.
                 (i.e., a soft assignment)
        Arguments:
            inputs: the variable containing data, shape=(n_samples, n_features)
        Return:
            q: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters)
        """
        q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha))
        q **= (self.alpha + 1.0) / 2.0
        q = K.transpose(K.transpose(q) / K.sum(q, axis=1))  # normalize so each sample's soft assignments sum to 1
        return q

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return input_shape[0], self.n_clusters

    def get_config(self):
        config = {'n_clusters': self.n_clusters}
        base_config = super(ClusteringLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
model = Model(inputs=encoder.input, outputs=clustering_layer)
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='kld')
kmeans = KMeans(n_clusters=n_clusters)
y_pred = kmeans.fit_predict(encoder.predict(x))
y_pred_last = np.copy(y_pred)
# Initialize the 'clustering' layer's weights with the K-Means cluster centers
model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])
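
# Added check (not in the original post): score the K-Means initialization on
# the learned embedding before joint training, mirroring the raw-data baseline.
print('K-Means on embedding: nmi = %.5f, ari = %.5f'
      % (normalized_mutual_info_score(y, y_pred), adjusted_rand_score(y, y_pred)))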


# deep clustering
# Compute p_i by first raising q_i to the second power and then normalizing by frequency per cluster:
# computing an auxiliary target distribution
def target_distribution(q):
    weight = q ** 2 / q.sum(0)
    return (weight.T / weight.sum(1)).T
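

# Toy illustration (added, not in the original post): sharpening on hand-picked values.
q_demo = np.array([[0.9, 0.1], [0.6, 0.4]])
p_demo = target_distribution(q_demo)
print(p_demo)  # ~[[0.964, 0.036], [0.429, 0.571]]: confident rows get sharpened,
               # and dividing by per-cluster mass q.sum(0) up-weights rarer clusters.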


loss = 0
index = 0
maxiter = 1000
update_interval = 20
index_array = np.arange(x.shape[0])
tol = 0.001  # tolerance threshold to stop training
for ite in range(int(maxiter)):
    if ite % update_interval == 0:
        q = model.predict(x, verbose=0)
        # print(q[:10,:])
        p = target_distribution(q)  # update the auxiliary target distribution p

        # evaluate the clustering performance
        y_pred = q.argmax(1)
        if y is not None:
            # round scores to five decimal places for logging
            # (the original also reported clustering accuracy via a local metrics.py helper)
            nmi = np.round(normalized_mutual_info_score(y, y_pred), 5)
            ari = np.round(adjusted_rand_score(y, y_pred), 5)
            loss = np.round(loss, 5)
            print('Iter %d: nmi = %.5f, ari = %.5f' % (ite, nmi, ari), ' ; loss=', loss)

        # check stop criterion - model convergence
        delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
        y_pred_last = np.copy(y_pred)
        if ite > 0 and delta_label < tol:
            print('delta_label ', delta_label, '< tol ', tol)
            print('Reached tolerance threshold. Stopping training.')
            break
    idx = index_array[index * batch_size: min((index + 1) * batch_size, x.shape[0])]
    loss = model.train_on_batch(x=x[idx], y=p[idx])
    index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0

# model.save_weights(save_dir + '/DEC_model_final.h5')
# model.load_weights(save_dir + '/DEC_model_final.h5')
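
# ----------------------------------Final evaluation----------------------------------
# Closing sketch (added, not in the original post); reuses model, encoder, x, and y
# from above: harden the final soft assignments and report the scores.
q = model.predict(x, verbose=0)
y_pred = q.argmax(1)
print('Final: nmi = %.5f, ari = %.5f'
      % (normalized_mutual_info_score(y, y_pred), adjusted_rand_score(y, y_pred)))

# The post originally closed with a figure of the learned embedding; the image is
# not recoverable, so the plot below is one assumed way to reproduce it: project
# the 10-D encoder output to 2-D with t-SNE and color points by predicted cluster.
from sklearn.manifold import TSNE

z_2d = TSNE(n_components=2, random_state=0).fit_transform(encoder.predict(x))
plt.scatter(z_2d[:, 0], z_2d[:, 1], c=y_pred, cmap='tab10', s=8)
plt.title('DEC embedding (t-SNE projection)')
plt.show()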
