赞
踩
本文介绍了FCM算法的公式推导和Python源码实现,并在鸢尾花数据集上做了验证。
源码和笔记已经上传至Github:https://github.com/datamonday/ML-Algorithm-Source-Code/
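基于划分的聚类、层次聚类等都属于硬聚类,即始终将每个样本分配给单个聚类。相对地,软聚类则为每个样本关联一个向量,该向量通常表示样本属于各个聚类的概率(隶属度)。模糊C均值(Fuzzy C-Means, FCM)是一种典型的软聚类方法。其原理如下:
设样本为 $x_i\ (i=1,\dots,n)$,聚类中心为 $v_j\ (j=1,\dots,c)$,隶属度为 $u_{ij}$,加权指数 $m>1$。FCM 在约束 $\sum_{j=1}^{c} u_{ij}=1$ 下最小化目标函数
$$J_m=\sum_{i=1}^{n}\sum_{j=1}^{c}u_{ij}^{m}\,\lVert x_i-v_j\rVert^{2}$$
并交替迭代以下两个更新公式:
$$v_j=\frac{\sum_{i=1}^{n}u_{ij}^{m}\,x_i}{\sum_{i=1}^{n}u_{ij}^{m}},\qquad u_{ij}=\left[\sum_{k=1}^{c}\left(\frac{\lVert x_i-v_j\rVert}{\lVert x_i-v_k\rVert}\right)^{\frac{2}{m-1}}\right]^{-1}$$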
聚类中心初始化:
聚类结果:
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 5 00:02:38 2021
@author: 34123
"""
import os
import pandas as pd
import numpy as np
import random
import operator
import math
from copy import deepcopy
import matplotlib.pyplot as plt
# # 将网格线置于曲线之下
# plt.rcParams['axes.axisbelow'] = False
plt.style.use('fivethirtyeight') # 'ggplot'
from PlotFunctions import plot_random_init_iris_sepal, plot_random_init_iris_petal, plot_cluster_iris_sepal, plot_cluster_iris_petal
from sklearn.datasets import load_iris
def load_iris_data():
    """Load the iris dataset, save it as CSV, and return it in several views.

    As a side effect the combined feature/label table is written to
    ``iris_data.csv`` in the current working directory.

    Returns:
        tuple: ``(df_full, df, class_labels, target_dicts)`` where
        ``df_full`` is a DataFrame of the features plus a trailing
        ``'label'`` column, ``df`` is the same frame without the label
        column, ``class_labels`` is the label column as a list, and
        ``target_dicts`` maps each distinct label value to its class name.
    """
    bunch = load_iris()

    # Turn the labels into a column vector so they can be joined to the features.
    label_column = bunch['target'][:, np.newaxis]
    target_dicts = dict(zip(np.unique(label_column), bunch['target_names']))

    # Copy the name list before appending so the loaded dataset is not modified.
    column_names = bunch['feature_names'].copy()
    column_names.append('label')

    table = np.concatenate([bunch['data'], label_column], axis=1)
    df_full = pd.DataFrame(data=table, columns=column_names)

    # Persist the combined table in the working directory.
    df_full.to_csv(str(os.getcwd()) + '/iris_data.csv', index=None)

    all_columns = list(df_full.columns)
    feature_columns = all_columns[:len(all_columns) - 1]
    class_labels = list(df_full[all_columns[-1]])
    df = df_full[feature_columns]
    return df_full, df, class_labels, target_dicts
# 初始化隶属度矩阵 U
def init_fuzzy_matrix(n_sample, c):
    """Build a random initial membership matrix whose rows are one-hot.

    For every sample, ``c`` random numbers are drawn and normalised to sum to
    one; the position of the largest value is then set to 1 and every other
    position to 0, so each row still sums to 1.

    Args:
        n_sample: Number of samples (rows of the matrix).
        c: Number of clusters (columns of the matrix).

    Returns:
        list: ``n_sample`` rows, each a list of ``c`` integers containing
        exactly one 1.
    """
    membership_rows = []
    for _ in range(n_sample):
        # random.random() yields a float in [0, 1).
        draws = [random.random() for _ in range(c)]
        total = sum(draws)
        scaled = [value / total for value in draws]
        # list.index returns the first position holding the maximum.
        winner = scaled.index(max(scaled))
        membership_rows.append(
            [1 if col == winner else 0 for col in range(len(scaled))]
        )
    return membership_rows
# 计算FCM的聚类中心
def cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m):
    """Compute each cluster centre as a membership-weighted mean of the samples.

    For cluster ``j`` the centre is
    ``sum_i(u_ij ** m * x_i) / sum_i(u_ij ** m)``.

    Args:
        df: Feature DataFrame (no label column); rows are read with ``iloc``.
        fuzzy_matrix: Membership matrix as a sequence of per-sample rows,
            one entry per cluster.
        n_sample: Number of leading rows of ``df`` used in the numerator.
        c: Number of clusters.
        m: Weighting exponent applied to every membership.

    Returns:
        list: ``c`` centres, each a list with one value per feature.

    Raises:
        ZeroDivisionError: If the powered memberships of a cluster sum to
            zero (with plain Python numbers as memberships).
    """
    # Read every sample once; the rows do not change between clusters, so
    # there is no need to go through ``iloc`` again for each cluster.
    samples = [list(df.iloc[i]) for i in range(n_sample)]

    # Transpose the matrix: entry j holds all samples' memberships in cluster j.
    membership_columns = list(zip(*fuzzy_matrix))

    cluster_centers = []
    for j in range(c):
        # Memberships of cluster j raised to the power m.
        weights = [u ** m for u in membership_columns[j]]
        # Denominator of the centre formula.
        denominator = sum(weights)
        # Numerator terms: every sample scaled by its powered membership.
        weighted_samples = [
            [weights[i] * value for value in samples[i]]
            for i in range(n_sample)
        ]
        # Sum the terms feature by feature, then normalise.
        cluster_centers.append(
            [sum(feature_terms) / denominator
             for feature_terms in zip(*weighted_samples)]
        )
    return cluster_centers
# 更新隶属度矩阵,参考公式 (8)
def update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers):
    """Recompute every membership from the distances to the cluster centres.

    For sample ``i`` and cluster ``j`` the new membership is
    ``1 / sum_k((d_ij / d_ik) ** (2 / (m - 1)))`` where ``d`` is the Euclidean
    distance between a sample and a centre. When a sample lies exactly on one
    or more centres the ratio is undefined, so the membership is instead
    split equally among those centres and set to 0 for all others.

    Args:
        df: Feature DataFrame (no label column); rows are read with ``iloc``.
        fuzzy_matrix: Membership matrix as a list of per-sample lists; it is
            overwritten in place.
        n_sample: Number of samples to update.
        c: Number of clusters.
        m: Weighting exponent; ``m == 1`` raises ``ZeroDivisionError``.
        cluster_centers: Sequence of ``c`` centres, one value per feature.

    Returns:
        list: The same ``fuzzy_matrix`` object, updated.
    """
    # Exponent applied to each distance ratio.
    order = float(2 / (m - 1))

    for i in range(n_sample):
        sample = list(df.iloc[i])
        # Euclidean distance from this sample to every centre.
        distances = [
            float(np.linalg.norm(
                np.array(list(map(operator.sub, sample, cluster_centers[j])))
            ))
            for j in range(c)
        ]

        # A zero distance would make the ratios below 0/0 or x/0 and yield
        # NaN/inf memberships, so give the coinciding centres full ownership.
        coincident = [j for j in range(c) if distances[j] == 0.0]
        if coincident:
            share = 1.0 / len(coincident)
            for j in range(c):
                fuzzy_matrix[i][j] = share if j in coincident else 0.0
            continue

        for j in range(c):
            denominator = sum(
                math.pow(distances[j] / distances[k], order) for k in range(c)
            )
            fuzzy_matrix[i][j] = float(1 / denominator)
    return fuzzy_matrix
# Derive each sample's hard cluster label (the cluster with the largest membership)
def get_clusters(fuzzy_matrix, n_sample):
    """Turn the membership matrix into one hard cluster label per sample.

    Args:
        fuzzy_matrix: Membership matrix as a sequence of per-sample rows.
        n_sample: Number of leading rows to label.

    Returns:
        list: For each sample, the index of its largest membership. When
        several memberships tie for the maximum, the highest index wins.
    """
    def strongest(row):
        # Rank by (membership, index) so ties resolve to the larger index.
        best = max(enumerate(row), key=lambda pair: (pair[1], pair[0]))
        return best[0]

    return [strongest(fuzzy_matrix[i]) for i in range(n_sample)]
# 模糊c均值聚类算法
def fuzzy_c_means(df, fuzzy_matrix, n_sample, c, m, max_iter, init_method='random'):
    """Run fuzzy c-means clustering for a fixed number of iterations.

    Each iteration computes the cluster centres, updates the membership
    matrix from them, and records the resulting hard labels. The initial
    centres and the final membership matrix are printed.

    Args:
        df: Feature DataFrame (no label column).
        fuzzy_matrix: Kept for interface compatibility only; the value is not
            read because a fresh matrix is always generated with
            ``init_fuzzy_matrix``.
        n_sample: Number of samples.
        c: Number of clusters.
        m: Weighting exponent.
        max_iter: Number of iterations to run.
        init_method: ``'multi_normal'`` draws the first centres from a
            multivariate normal with zero mean and identity covariance and
            uses them in the first iteration. Any other value, including the
            default ``'random'``, computes the first centres from the
            randomly initialised membership matrix.

    Returns:
        tuple: ``(cluster_centers, cluster_labels, max_iter_cluster_labels)``:
        the last centres, the last labels, and the list of labels from every
        iteration. All three are empty lists when no iteration runs.
    """
    # Number of features per sample.
    n_features = df.shape[-1]
    # The membership matrix is always regenerated; the argument is discarded.
    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    current_iter = 0
    init_cluster_centers = []
    cluster_centers = []
    # Defined up front so the return is valid even if the loop never runs
    # (previously max_iter <= 0 raised UnboundLocalError).
    cluster_labels = []
    # Labels produced by each iteration, in order.
    max_iter_cluster_labels = []

    if init_method == 'multi_normal':
        # Zero mean vector and identity (diagonal) covariance matrix.
        mean = [0] * n_features
        cov = np.identity(n_features)
        for _ in range(c):
            init_cluster_centers.append(
                list(np.random.multivariate_normal(mean, cov))
            )
    print(init_cluster_centers)

    while current_iter < max_iter:
        if current_iter == 0 and init_method == 'multi_normal':
            # First pass: use the sampled centres instead of computing them.
            cluster_centers = init_cluster_centers
        else:
            cluster_centers = cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m)
        fuzzy_matrix = update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers)
        cluster_labels = get_clusters(fuzzy_matrix, n_sample)
        max_iter_cluster_labels.append(cluster_labels)
        current_iter += 1

    # NOTE(review): the flattened source does not show whether this report ran
    # once after the loop or on every iteration; it is emitted once here.
    print('-' * 32)
    print("Fuzzy Matrix U:\n")
    print(np.array(fuzzy_matrix))
    return cluster_centers, cluster_labels, max_iter_cluster_labels
if __name__ == '__main__':
    # Script entry point: cluster the iris data with FCM and plot the results.
    df_full, df, class_labels, target_dicts = load_iris_data()

    c = 3  # number of clusters; the iris dataset has three classes
    max_iter = 20  # iteration cap, prevents an endless loop
    n_sample = len(df)  # number of samples
    m = 1.7  # weighting exponent; some papers suggest the range [1.5, 2.5]

    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    # init_method may be 'multi_normal' or 'random'.
    centers, labels, acc = fuzzy_c_means(
        df, fuzzy_matrix, n_sample, c, m, max_iter, init_method='multi_normal'
    )

    # Plots of the randomly initialised clusters, sepal first, then petal.
    for draw_initial in (plot_random_init_iris_sepal, plot_random_init_iris_petal):
        draw_initial(df_full)
    # Plots of the final clustering, in the same order.
    for draw_final in (plot_cluster_iris_sepal, plot_cluster_iris_petal):
        draw_final(df_full, labels, centers)
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 5 00:24:23 2021
@author: 34123
"""
import matplotlib.pyplot as plt
import numpy as np
import random
from scipy.stats import multivariate_normal
def plot_random_init_iris_sepal(df_full):
    """Plot the sepal features with three randomly centred Gaussian contours.

    The first two columns of ``df_full`` are scattered, and three Gaussians
    are overlaid in red, blue and green. Each is centred on a randomly chosen
    sample and uses the covariance of all sepal data. The figure is shown
    with ``plt.show()``.

    Args:
        df_full: DataFrame whose columns 0 and 1 hold the sepal features.
    """
    sepal_points = np.array(df_full.iloc[:, 0:2])

    # One random sample per Gaussian serves as its mean.
    means = [random.choice(sepal_points) for _ in range(3)]
    # All three Gaussians share the covariance of the whole sepal data.
    shared_cov = np.cov(np.transpose(sepal_points))

    # Evaluation grid covering the plotted region.
    grid_x, grid_y = np.meshgrid(np.linspace(4, 8, 150),
                                 np.linspace(1.5, 4.5, 150))
    gaussians = [multivariate_normal(mean, shared_cov) for mean in means]
    # Stack the grid into (rows, cols, 2) coordinate pairs for pdf().
    grid_points = np.dstack((grid_x, grid_y))

    plt.figure(figsize=(10, 10))
    plt.scatter(sepal_points[:, 0], sepal_points[:, 1], marker='o')
    for gaussian, colour in zip(gaussians, ("r", "b", "g")):
        plt.contour(grid_x, grid_y, gaussian.pdf(grid_points),
                    colors=colour, alpha=0.5)
    # Use the same scale on both axes.
    plt.axis('equal')
    plt.xlabel('Sepal Length', fontsize=16)
    plt.ylabel('Sepal Width', fontsize=16)
    plt.title('Initial Random Clusters(Sepal)', fontsize=22)
    plt.grid()
    plt.show()
def plot_random_init_iris_petal(df_full):
    """Plot the petal features with three randomly centred Gaussian contours.

    Columns 2 and 3 of ``df_full`` are scattered, and three Gaussians are
    overlaid in red, blue and green. Each is centred on a randomly chosen
    sample and uses the covariance of all petal data. The figure is shown
    with ``plt.show()``.

    Args:
        df_full: DataFrame whose columns 2 and 3 hold the petal features.
    """
    petal_points = np.array(df_full.iloc[:, 2:4])

    # One random sample per Gaussian serves as its mean.
    means = [random.choice(petal_points) for _ in range(3)]
    # All three Gaussians share the covariance of the whole petal data.
    shared_cov = np.cov(np.transpose(petal_points))

    # Evaluation grid covering the plotted region.
    grid_x, grid_y = np.meshgrid(np.linspace(-1, 7, 150),
                                 np.linspace(-1, 4, 150))
    gaussians = [multivariate_normal(mean, shared_cov) for mean in means]
    # Stack the grid into (rows, cols, 2) coordinate pairs for pdf().
    grid_points = np.dstack((grid_x, grid_y))

    plt.figure(figsize=(10, 10))
    plt.scatter(petal_points[:, 0], petal_points[:, 1], marker='o')
    for gaussian, colour in zip(gaussians, ("r", "b", "g")):
        plt.contour(grid_x, grid_y, gaussian.pdf(grid_points),
                    colors=colour, alpha=0.5)
    # Use the same scale on both axes.
    plt.axis('equal')
    plt.xlabel('Petal Length', fontsize=16)
    plt.ylabel('Petal Width', fontsize=16)
    plt.title('Initial Random Clusters(Petal)', fontsize=22)
    plt.grid()
    plt.show()
def plot_cluster_iris_sepal(df_full, labels, centers):
    """Plot the sepal features with one Gaussian contour per final cluster.

    The label list is cut into rows 0-49, 50-99 and 100 onward, and the most
    frequent cluster id in each slice is taken as that slice's cluster. For
    each such cluster a Gaussian is drawn (red, blue, green in slice order),
    centred on the first two coordinates of its centre and using the
    covariance of the sepal rows assigned to it. The figure is shown with
    ``plt.show()``.

    Args:
        df_full: DataFrame whose columns 0 and 1 hold the sepal features.
        labels: Per-sample cluster ids; sliced and queried with ``.count``,
            so a list is expected.
        centers: Cluster centres indexed by cluster id, then by feature.
    """
    # Most frequent cluster id inside each of the three row slices.
    slices = (labels[0:50], labels[50:100], labels[100:])
    dominant_ids = [max(set(part), key=part.count) for part in slices]

    # Gaussian means: the sepal coordinates of each dominant cluster's centre.
    means = [np.array([centers[cid][0], centers[cid][1]]) for cid in dominant_ids]

    assigned = np.array(labels)
    sepal_df = df_full.iloc[:, 0:2]
    covariances = []
    for cid in dominant_ids:
        # Row positions labelled with this cluster, matched against the index.
        member_rows = list(np.where(assigned == cid)[0])
        members = sepal_df[sepal_df.index.isin(member_rows)]
        covariances.append(np.cov(np.transpose(np.array(members))))

    sepal_points = np.array(sepal_df)
    # Evaluation grid covering the plotted region.
    grid_x, grid_y = np.meshgrid(np.linspace(4, 8, 150),
                                 np.linspace(1.5, 4.5, 150))
    gaussians = [multivariate_normal(mean, cov)
                 for mean, cov in zip(means, covariances)]
    # Stack the grid into (rows, cols, 2) coordinate pairs for pdf().
    grid_points = np.dstack((grid_x, grid_y))

    plt.figure(figsize=(10, 10))
    plt.scatter(sepal_points[:, 0], sepal_points[:, 1], marker='o')
    for gaussian, colour in zip(gaussians, ("r", "b", "g")):
        plt.contour(grid_x, grid_y, gaussian.pdf(grid_points),
                    colors=colour, alpha=0.5)
    plt.axis('equal')
    plt.xlabel('Sepal Length', fontsize=16)
    plt.ylabel('Sepal Width', fontsize=16)
    plt.title('Final Clusters(Sepal)', fontsize=22)
    plt.grid()
    plt.show()
def plot_cluster_iris_petal(df_full, labels, centers):
    """Plot the petal features with one Gaussian contour per final cluster.

    The label list is cut into rows 0-49, 50-99 and 100 onward, and the most
    frequent cluster id in each slice is taken as that slice's cluster. For
    each such cluster a Gaussian is drawn (red, blue, green in slice order),
    centred on coordinates 2 and 3 of its centre and using the covariance of
    the petal rows assigned to it. The figure is shown with ``plt.show()``.

    Args:
        df_full: DataFrame whose columns 2 and 3 hold the petal features.
        labels: Per-sample cluster ids; sliced and queried with ``.count``,
            so a list is expected.
        centers: Cluster centres indexed by cluster id, then by feature.
    """
    # Most frequent cluster id inside each of the three row slices.
    slices = (labels[0:50], labels[50:100], labels[100:])
    dominant_ids = [max(set(part), key=part.count) for part in slices]

    assigned = np.array(labels)
    # Row positions labelled with each dominant cluster.
    member_rows = [list(np.where(assigned == cid)[0]) for cid in dominant_ids]

    # Gaussian means: the petal coordinates of each dominant cluster's centre.
    means = [np.array([centers[cid][2], centers[cid][3]]) for cid in dominant_ids]

    petal_df = df_full.iloc[:, 2:4]
    covariances = []
    for rows in member_rows:
        members = petal_df[petal_df.index.isin(rows)]
        covariances.append(np.cov(np.transpose(np.array(members))))

    petal_points = np.array(petal_df)
    # Evaluation grid covering the plotted region.
    grid_x, grid_y = np.meshgrid(np.linspace(0.5, 7, 150),
                                 np.linspace(-1, 4, 150))
    gaussians = [multivariate_normal(mean, cov)
                 for mean, cov in zip(means, covariances)]
    # Stack the grid into (rows, cols, 2) coordinate pairs for pdf().
    grid_points = np.dstack((grid_x, grid_y))

    plt.figure(figsize=(10, 10))
    plt.scatter(petal_points[:, 0], petal_points[:, 1], marker='o')
    for gaussian, colour in zip(gaussians, ("r", "b", "g")):
        plt.contour(grid_x, grid_y, gaussian.pdf(grid_points),
                    colors=colour, alpha=0.5)
    plt.axis('equal')
    plt.xlabel('Petal Length', fontsize=16)
    plt.ylabel('Petal Width', fontsize=16)
    plt.title('Final Clusters(Petal)', fontsize=22)
    plt.grid()
    plt.show()
Reference:
- https://blog.csdn.net/on2way/article/details/47087201
- https://www.programmersought.com/article/9666746636/
- https://www.kaggle.com/prateekk94/fuzzy-c-means-clustering-on-iris-dataset
- https://www.datanovia.com/en/lessons/fuzzy-clustering-essentials/fuzzy-c-means-clustering-algorithm/
- https://github.com/theimageprocessingguy/Fuzzy-C-Means-Python
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。