当前位置:   article > 正文

FCM (Fuzzy C-Means) 聚类算法原理推导及Python源码实现_fcm算法综述

fcm算法综述

本文介绍了FCM算法的公式推导和Python源码实现,并在鸢尾花数据集上做了验证。

源码和笔记已经上传至Github:https://github.com/datamonday/ML-Algorithm-Source-Code/

K-Means 等基于划分的聚类、层次聚类等通常属于硬聚类,即每个样本始终只被分配给单个聚类。相对地,软聚类则为每个样本关联一个向量,向量的各个分量表示该样本属于各个聚类的隶属度(可理解为概率),且各分量之和为 1。模糊C均值(Fuzzy C-Means, FCM)是一种典型的软聚类方法。其原理如下:

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述


聚类中心初始化:
在这里插入图片描述
在这里插入图片描述

聚类结果:
在这里插入图片描述
在这里插入图片描述


# -*- coding: utf-8 -*-
"""
Created on Sat Jun  5 00:02:38 2021

@author: 34123
"""
import os
import pandas as pd
import numpy as np
import random
import operator
import math
from copy import deepcopy
import matplotlib.pyplot as plt
# # 将网格线置于曲线之下
# plt.rcParams['axes.axisbelow'] = False
plt.style.use('fivethirtyeight') # 'ggplot'

from PlotFunctions import plot_random_init_iris_sepal, plot_random_init_iris_petal, plot_cluster_iris_sepal, plot_cluster_iris_petal

from sklearn.datasets import load_iris

def load_iris_data():
    """Load the iris dataset, save it as CSV and return it in several views.

    The dataset is obtained through ``load_iris`` (imported from
    ``sklearn.datasets`` at the top of the file).

    Side effect: writes ``iris_data.csv`` into the current working directory.

    Returns a 4-tuple ``(df_full, df, class_labels, target_dicts)``:
        df_full: DataFrame holding the feature columns plus a trailing
            'label' column.
        df: the feature columns of ``df_full`` only (label column dropped).
        class_labels: list with the values of the 'label' column.
        target_dicts: dict mapping each distinct target value to the
            corresponding entry of the dataset's ``target_names``.
    """
    iris = load_iris()
    feature_matrix = iris['data']
    # Turn the label vector into a column so it can be concatenated
    # next to the feature matrix.
    label_column = iris['target'][:, np.newaxis]

    target_dicts = dict(zip(np.unique(label_column), iris['target_names']))

    # Work on a copy so the list stored inside the loaded dataset is not
    # modified by the append below.
    column_names = iris['feature_names'].copy()
    column_names.append('label')

    df_full = pd.DataFrame(
        data=np.concatenate([feature_matrix, label_column], axis=1),
        columns=column_names,
    )
    # Persist the assembled dataset next to the script's working directory.
    df_full.to_csv(str(os.getcwd()) + '/iris_data.csv', index=None)

    all_columns = list(df_full.columns)
    feature_columns = all_columns[:len(all_columns) - 1]
    class_labels = list(df_full[all_columns[-1]])
    df = df_full[feature_columns]

    return df_full, df, class_labels, target_dicts


# Initialise the membership matrix U
def init_fuzzy_matrix(n_sample, c):
    """
    Build a random initial membership matrix with one row per sample.

    For every sample, c random numbers are drawn and normalised so they sum
    to 1; the row is then hardened: the position of the largest value is set
    to 1 and every other position to 0 (the first maximum wins on ties).
    Each row therefore still sums to 1.
    ----
    param n_sample: number of samples (rows of the matrix)
    param c: number of clusters (columns of the matrix)
    return: list of n_sample lists, each holding c ints that are 0 or 1
    """
    fuzzy_matrix = []

    for _ in range(n_sample):
        # random.random() yields a float in [0, 1); one draw per cluster.
        draws = [random.random() for _ in range(c)]
        total = sum(draws)
        normalised = [value / total for value in draws]
        # Column that receives the full membership for this sample.
        winner = normalised.index(max(normalised))
        fuzzy_matrix.append(
            [1 if col == winner else 0 for col in range(len(normalised))]
        )

    return fuzzy_matrix


# Compute the FCM cluster centres
def cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m):
    """
    Compute every cluster centre as the membership-weighted mean of the samples.

    For cluster j the centre is
        sum_i(u_ij ** m * x_i) / sum_i(u_ij ** m)
    where u_ij is the membership of sample i in cluster j.

    param df: feature DataFrame (no label column); rows are read by position
    param fuzzy_matrix: membership matrix, one row per sample and one column
                        per cluster (expected to have n_sample rows)
    param n_sample: number of samples to use from df
    param c: number of clusters
    param m: weighting exponent (fuzzifier)
    return: list of c centres, each a list with one value per feature
    """
    # zip(*fuzzy_matrix) transposes the matrix: one tuple per cluster that
    # holds the memberships of all samples in that cluster.
    membership_columns = list(zip(*fuzzy_matrix))

    # The rows do not depend on the cluster, so read them from the DataFrame
    # once instead of once per cluster.
    samples = [list(df.iloc[row]) for row in range(n_sample)]

    cluster_centers = []

    for j in range(c):
        # Memberships of all samples in cluster j, raised to the power m.
        weights = [u ** m for u in membership_columns[j]]
        # Denominator of the centre formula.
        denominator = sum(weights)

        # Numerator: every sample scaled by its weight, then summed
        # feature by feature over all samples.
        weighted_samples = [
            [weights[row] * value for value in samples[row]]
            for row in range(n_sample)
        ]
        numerator = [sum(feature_column) for feature_column in zip(*weighted_samples)]

        cluster_centers.append([value / denominator for value in numerator])

    return cluster_centers

# Update the membership matrix, see formula (8) of the derivation
def update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers):
    """
    Recompute the membership of every sample in every cluster, in place.

    For sample i and cluster j the new membership is
        u_ij = 1 / sum_k( (d_ij / d_ik) ** (2 / (m - 1)) )
    where d_ij is the Euclidean distance between sample i and centre j.

    A sample that lies exactly on one or more centres would make the ratio
    0/0; in that case the membership is shared equally between the
    coinciding centres and set to 0 for all other clusters, so the row
    still sums to 1.

    param df: feature DataFrame (no label column); rows are read by position
    param fuzzy_matrix: membership matrix (list of lists), overwritten in place
    param n_sample: number of samples
    param c: number of clusters
    param m: weighting exponent; m == 1 raises ZeroDivisionError
    param cluster_centers: list of c centres, one value per feature
    return: the same fuzzy_matrix object, with updated values
    """
    # Exponent applied to the distance ratios.
    order = float(2 / (m - 1))

    for i in range(n_sample):
        sample = list(df.iloc[i])
        # Euclidean distance from this sample to every centre.
        distances = [
            float(np.linalg.norm(np.array(list(map(operator.sub, sample, cluster_centers[j])))))
            for j in range(c)
        ]

        coinciding = [j for j in range(c) if distances[j] == 0.0]
        if coinciding:
            # The sample sits on at least one centre: dividing by a zero
            # distance would yield NaN, so assign the membership directly.
            share = 1.0 / len(coinciding)
            for j in range(c):
                fuzzy_matrix[i][j] = share if distances[j] == 0.0 else 0.0
            continue

        for j in range(c):
            # Denominator of the update formula.
            denominator = sum(
                math.pow(distances[j] / distances[k], order) for k in range(c)
            )
            fuzzy_matrix[i][j] = float(1 / denominator)

    return fuzzy_matrix


# Derive a hard cluster label for every sample
def get_clusters(fuzzy_matrix, n_sample):
    """
    Assign each sample to the cluster in which its membership is largest.

    When several clusters share the largest membership, the one with the
    highest index is chosen.

    param fuzzy_matrix: membership matrix, one row per sample
    param n_sample: number of leading rows to label
    return: list of n_sample cluster indices
    """
    def _strongest_cluster(memberships):
        # Compare (membership, index) pairs so ties resolve to the larger index.
        return max(
            range(len(memberships)),
            key=lambda col: (memberships[col], col),
        )

    return [_strongest_cluster(fuzzy_matrix[row]) for row in range(n_sample)]


# Fuzzy c-means clustering
def fuzzy_c_means(df, fuzzy_matrix, n_sample, c, m, max_iter, init_method='random'):
    """
    Run fuzzy c-means for exactly max_iter iterations (no convergence test).

    Each iteration computes the cluster centres, updates the membership
    matrix from them, derives hard labels and records those labels. The
    initial centres and the membership matrix of every iteration are printed.

    param df: feature DataFrame (no label column)
    param fuzzy_matrix: ignored - a fresh matrix is always created with
                        init_fuzzy_matrix; kept so existing callers still work
    param n_sample: number of samples
    param c: number of clusters
    param m: weighting exponent (fuzzifier)
    param max_iter: number of iterations to run
    param init_method: how the centres of the first iteration are obtained
            - 'multi_normal': sampled from a multivariate normal with zero
              mean and identity covariance
            - any other value (default 'random'): computed from the randomly
              initialised membership matrix, like in every later iteration
    return: (cluster_centers, cluster_labels, max_iter_cluster_labels) -
            the centres and labels of the last iteration and the list of
            label lists of all iterations; with max_iter <= 0 all three are
            empty lists
    """
    # Number of features per sample.
    n_features = df.shape[-1]
    # Random initial membership matrix (replaces the argument).
    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    current_iter = 0
    init_cluster_centers = []
    cluster_centers = []
    # Defined up front so the return statement also works when the loop
    # body never runs (max_iter <= 0).
    cluster_labels = []
    # Labels of every iteration, in order.
    max_iter_cluster_labels = []

    if init_method == 'multi_normal':
        # Zero mean vector and identity (diagonal) covariance matrix.
        mean = [0] * n_features
        cov = np.identity(n_features)
        for _ in range(0, c):
            init_cluster_centers.append(list(np.random.multivariate_normal(mean, cov)))

    print(init_cluster_centers)

    while current_iter < max_iter:
        if current_iter == 0 and init_method == 'multi_normal':
            # First iteration uses the sampled centres.
            cluster_centers = init_cluster_centers
        else:
            cluster_centers = cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m)
        fuzzy_matrix = update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers)
        cluster_labels = get_clusters(fuzzy_matrix, n_sample)
        max_iter_cluster_labels.append(cluster_labels)

        current_iter += 1

        print('-' * 32)
        print("Fuzzy Matrix U:\n")
        print(np.array(fuzzy_matrix))

    return cluster_centers, cluster_labels, max_iter_cluster_labels


if __name__ == '__main__':
    df_full, df, class_labels, target_dicts = load_iris_data()

    # --- hyper-parameters -------------------------------------------------
    c = 3            # number of clusters; the iris dataset has 3 classes
    max_iter = 20    # iteration cap, guards against looping forever
    n_sample = len(df)  # number of samples
    m = 1.7          # weighting exponent; some papers suggest [1.5, 2.5]

    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    centers, labels, acc = fuzzy_c_means(
        df,
        fuzzy_matrix,
        n_sample,
        c,
        m,
        max_iter,
        init_method='multi_normal',  # options: multi_normal, random
    )

    # Plots built from the data only, then plots of the clustering result.
    for plot_initial in (plot_random_init_iris_sepal, plot_random_init_iris_petal):
        plot_initial(df_full)
    for plot_final in (plot_cluster_iris_sepal, plot_cluster_iris_petal):
        plot_final(df_full, labels, centers)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208
  • 209
  • 210
  • 211
  • 212
  • 213
  • 214
  • 215
  • 216
  • 217
  • 218
  • 219
  • 220
  • 221
  • 222
  • 223
  • 224
  • 225
  • 226
  • 227
  • 228
  • 229
  • 230
  • 231
# -*- coding: utf-8 -*-
"""
Created on Sat Jun  5 00:24:23 2021

@author: 34123
"""
import matplotlib.pyplot as plt
import numpy as np
import random
from scipy.stats import multivariate_normal


def plot_random_init_iris_sepal(df_full):
    """Plot the first two columns of df_full with three random Gaussian contours.

    Three rows are picked at random as means; every Gaussian uses the
    covariance of the whole two-column data. The figure is shown with
    plt.show().

    param df_full: DataFrame whose columns 0 and 1 are plotted
                   (labelled sepal length / sepal width on the axes)
    """
    points = np.array(df_full.iloc[:, 0:2])

    # Three independent random picks of a data row, used as means.
    means = [random.choice(points) for _ in range(3)]
    # All three Gaussians share the covariance of the complete data.
    data_cov = np.cov(np.transpose(points))

    grid_x, grid_y = np.meshgrid(np.linspace(4, 8, 150),
                                 np.linspace(1.5, 4.5, 150))

    densities = [multivariate_normal(mean, data_cov) for mean in means]

    # Stack the grid into an (..., 2) array of (x, y) positions for pdf().
    pos = np.empty(grid_x.shape + (2,))
    pos[:, :, 0] = grid_x
    pos[:, :, 1] = grid_y

    plt.figure(figsize=(10, 10))
    plt.scatter(points[:, 0], points[:, 1], marker='o')
    for density, colour in zip(densities, ("r", "b", "g")):
        plt.contour(grid_x, grid_y, density.pdf(pos), colors=colour, alpha=0.5)
    # Same scale on both axes.
    plt.axis('equal')
    plt.xlabel('Sepal Length', fontsize=16)
    plt.ylabel('Sepal Width', fontsize=16)
    plt.title('Initial Random Clusters(Sepal)', fontsize=22)
    plt.grid()
    plt.show()
    

def plot_random_init_iris_petal(df_full):
    """Plot columns 2 and 3 of df_full with three random Gaussian contours.

    Three rows are picked at random as means; every Gaussian uses the
    covariance of the whole two-column data. The figure is shown with
    plt.show().

    param df_full: DataFrame whose columns 2 and 3 are plotted
                   (labelled petal length / petal width on the axes)
    """
    points = np.array(df_full.iloc[:, 2:4])

    # Three independent random picks of a data row, used as means.
    means = [random.choice(points) for _ in range(3)]
    # All three Gaussians share the covariance of the complete data.
    data_cov = np.cov(np.transpose(points))

    grid_x, grid_y = np.meshgrid(np.linspace(-1, 7, 150),
                                 np.linspace(-1, 4, 150))

    densities = [multivariate_normal(mean, data_cov) for mean in means]

    # Stack the grid into an (..., 2) array of (x, y) positions for pdf().
    pos = np.empty(grid_x.shape + (2,))
    pos[:, :, 0] = grid_x
    pos[:, :, 1] = grid_y

    plt.figure(figsize=(10, 10))
    plt.scatter(points[:, 0], points[:, 1], marker='o')
    for density, colour in zip(densities, ("r", "b", "g")):
        plt.contour(grid_x, grid_y, density.pdf(pos), colors=colour, alpha=0.5)
    # Same scale on both axes.
    plt.axis('equal')
    plt.xlabel('Petal Length', fontsize=16)
    plt.ylabel('Petal Width', fontsize=16)
    plt.title('Initial Random Clusters(Petal)', fontsize=22)
    plt.grid()
    plt.show()
    
    
def plot_cluster_iris_sepal(df_full, labels, centers):
    """Plot columns 0 and 1 of df_full with one Gaussian contour per cluster.

    For each of the row ranges 0-49, 50-99 and 100-end the most frequent
    cluster label is determined. That cluster's centre (features 0 and 1)
    is used as mean, and the covariance of all rows assigned to that cluster
    as covariance of the Gaussian drawn for it. The figure is shown with
    plt.show().

    NOTE(review): the row ranges presumably correspond to the three iris
    species in dataset order - confirm against the caller's data.

    param df_full: DataFrame whose columns 0 and 1 are plotted
    param labels: list of cluster labels, one per row (list.count is used)
    param centers: cluster centres, indexable as centers[cluster][feature]
    """
    # Most frequent cluster label inside each block of rows.
    row_blocks = (labels[0:50], labels[50:100], labels[100:])
    dominant_ids = [max(set(block), key=block.count) for block in row_blocks]

    # Sepal part (features 0 and 1) of each dominant cluster's centre.
    centre_means = [
        np.array([centers[cid][0], centers[cid][1]]) for cid in dominant_ids
    ]

    label_array = np.array(labels)
    sepal_frame = df_full.iloc[:, 0:2]

    # Covariance of the rows assigned to each dominant cluster.
    covariances = []
    for cid in dominant_ids:
        member_rows = list(np.where(label_array == cid)[0])
        members = sepal_frame[sepal_frame.index.isin(member_rows)]
        covariances.append(np.cov(np.transpose(np.array(members))))

    points = np.array(sepal_frame)

    grid_x, grid_y = np.meshgrid(np.linspace(4, 8, 150),
                                 np.linspace(1.5, 4.5, 150))

    densities = [
        multivariate_normal(mean, cov)
        for mean, cov in zip(centre_means, covariances)
    ]

    # Stack the grid into an (..., 2) array of (x, y) positions for pdf().
    pos = np.empty(grid_x.shape + (2,))
    pos[:, :, 0] = grid_x
    pos[:, :, 1] = grid_y

    plt.figure(figsize=(10, 10))
    plt.scatter(points[:, 0], points[:, 1], marker='o')
    for density, colour in zip(densities, ("r", "b", "g")):
        plt.contour(grid_x, grid_y, density.pdf(pos), colors=colour, alpha=0.5)
    plt.axis('equal')
    plt.xlabel('Sepal Length', fontsize=16)
    plt.ylabel('Sepal Width', fontsize=16)
    plt.title('Final Clusters(Sepal)', fontsize=22)
    plt.grid()
    plt.show()
    
    
    
def plot_cluster_iris_petal(df_full, labels, centers):
    """Plot columns 2 and 3 of df_full with one Gaussian contour per cluster.

    For each of the row ranges 0-49, 50-99 and 100-end the most frequent
    cluster label is determined. That cluster's centre (features 2 and 3)
    is used as mean, and the covariance of all rows assigned to that cluster
    as covariance of the Gaussian drawn for it. The figure is shown with
    plt.show().

    NOTE(review): the row ranges presumably correspond to the three iris
    species in dataset order - confirm against the caller's data.

    param df_full: DataFrame whose columns 2 and 3 are plotted
    param labels: list of cluster labels, one per row (list.count is used)
    param centers: cluster centres, indexable as centers[cluster][feature]
    """
    # Most frequent cluster label inside each block of rows.
    row_blocks = (labels[0:50], labels[50:100], labels[100:])
    dominant_ids = [max(set(block), key=block.count) for block in row_blocks]

    label_array = np.array(labels)
    # Row positions assigned to each dominant cluster.
    member_rows = [list(np.where(label_array == cid)[0]) for cid in dominant_ids]

    # Petal part (features 2 and 3) of each dominant cluster's centre.
    centre_means = [
        np.array([centers[cid][2], centers[cid][3]]) for cid in dominant_ids
    ]

    petal_frame = df_full.iloc[:, 2:4]

    # Covariance of the rows assigned to each dominant cluster.
    covariances = []
    for rows in member_rows:
        members = petal_frame[petal_frame.index.isin(rows)]
        covariances.append(np.cov(np.transpose(np.array(members))))

    points = np.array(petal_frame)

    grid_x, grid_y = np.meshgrid(np.linspace(0.5, 7, 150),
                                 np.linspace(-1, 4, 150))

    densities = [
        multivariate_normal(mean, cov)
        for mean, cov in zip(centre_means, covariances)
    ]

    # Stack the grid into an (..., 2) array of (x, y) positions for pdf().
    pos = np.empty(grid_x.shape + (2,))
    pos[:, :, 0] = grid_x
    pos[:, :, 1] = grid_y

    plt.figure(figsize=(10, 10))
    plt.scatter(points[:, 0], points[:, 1], marker='o')
    for density, colour in zip(densities, ("r", "b", "g")):
        plt.contour(grid_x, grid_y, density.pdf(pos), colors=colour, alpha=0.5)
    plt.axis('equal')
    plt.xlabel('Petal Length', fontsize=16)
    plt.ylabel('Petal Width', fontsize=16)
    plt.title('Final Clusters(Petal)', fontsize=22)
    plt.grid()
    plt.show()

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208

Reference:

  1. https://blog.csdn.net/on2way/article/details/47087201
  2. https://www.programmersought.com/article/9666746636/
  3. https://www.kaggle.com/prateekk94/fuzzy-c-means-clustering-on-iris-dataset
  4. https://www.datanovia.com/en/lessons/fuzzy-clustering-essentials/fuzzy-c-means-clustering-algorithm/
  5. https://github.com/theimageprocessingguy/Fuzzy-C-Means-Python
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/我家自动化/article/detail/588350
推荐阅读
相关标签
  

闽ICP备14008679号