当前位置:   article > 正文

【机器学习】AGNES层次聚类算法_agnes聚类算法 算法实例

agnes聚类算法 算法实例

算法思想:

  1. 初始每个数据都是一个簇;
  2. 计算每两个簇之间的距离,得到簇间距离列表(距离矩阵);
  3. 合并距离最近的两个簇Ci,Cj,并删除Cj,更新距离列表,使簇个数减1;
  4. 重复第3步,直到簇个数等于所需个数k。

数据集来源:西瓜数据集4.0

1.读取文件:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV with the "number" column as the row index, then turn the
# remaining columns into a plain list of per-sample rows.
data = pd.read_csv("./西瓜数据集4.0.csv", index_col='number').values.tolist()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6

2.使用matplotlib.pyplot绘出原始数据

# Scatter-plot the raw samples: column 0 on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
plt.scatter(
    [sample[0] for sample in data],
    [sample[1] for sample in data],
)
plt.show()
  • 1
  • 2
  • 3
  • 4

原始数据

3.计算欧氏距离

def cal_dist(a, b):
    """Return the Euclidean distance between two points.

    Args:
        a: Coordinates of the first point (anything ``np.array`` accepts).
        b: Coordinates of the second point; expected to have the same
            length as ``a``.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    diff = np.array(a) - np.array(b)
    return np.sqrt(np.dot(diff, diff.T))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9

4.使用最小距离法计算簇之间的最小距离

def cal_cluster_min_dist(c1, c2):
    """Return the minimum (single-linkage) distance between two clusters.

    Args:
        c1: First cluster, a sequence of points.
        c2: Second cluster, a sequence of points.

    Returns:
        The smallest ``cal_dist`` value over every pair that takes one point
        from each cluster, or ``float("inf")`` when either cluster is empty.
    """
    # Fall back to infinity rather than a fixed cap such as 1e5, so clusters
    # that are far apart still report their true distance.
    return min(
        (cal_dist(vec1, vec2) for vec1 in c1 for vec2 in c2),
        default=float("inf"),
    )
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13

5.使用平均距离法计算簇之间的平均距离

def cal_cluster_avg_dist(c1, c2):
    """Return the average-linkage distance between two clusters.

    Args:
        c1: First cluster, a sequence of points.
        c2: Second cluster, a sequence of points.

    Returns:
        The mean of ``cal_dist`` over every pair that takes one point from
        each cluster.

    Raises:
        ValueError: If either cluster is empty, because no pair exists.
    """
    pair_count = len(c1) * len(c2)
    if pair_count == 0:
        raise ValueError("cannot average distances with an empty cluster")

    total = sum(cal_dist(vec1, vec2) for vec1 in c1 for vec2 in c2)
    # Dividing by the number of pairs turns the sum into an average; without
    # it, larger clusters look artificially far apart.
    return total / pair_count
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11

6.获取簇之间的距离列表

def get_min_dist_list(data, method):
    """Build the symmetric matrix of pairwise distances between clusters.

    Args:
        data: List of clusters; each cluster is a list of points.
        method: ``"minDist"`` to use ``cal_cluster_min_dist`` or
            ``"avgDist"`` to use ``cal_cluster_avg_dist``.

    Returns:
        A ``len(data)`` x ``len(data)`` list of lists whose entry ``[i][j]``
        is the distance between clusters ``i`` and ``j``. The diagonal is 0.

    Raises:
        ValueError: If ``method`` is not one of the two supported names.
    """
    if method not in ("minDist", "avgDist"):
        # Fail loudly: skipping every assignment would leave an all-zero
        # matrix that looks valid to the caller.
        raise ValueError("unknown linkage method: {!r}".format(method))

    cluster_num = len(data)
    dist_matrix = [[0] * cluster_num for _ in range(cluster_num)]
    for i in range(cluster_num):
        for j in range(i + 1, cluster_num):
            if method == "minDist":
                dist = cal_cluster_min_dist(data[i], data[j])
            else:
                dist = cal_cluster_avg_dist(data[i], data[j])
            # Distances are symmetric, so mirror each value across the diagonal.
            dist_matrix[i][j] = dist_matrix[j][i] = dist

    return dist_matrix


# Backward-compatible alias for the original mixed-case spelling.
get_minDist_list = get_min_dist_list
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21

7.寻找最小值

def find_min(minDistList):
    """Locate the closest pair of distinct clusters in a distance matrix.

    Args:
        minDistList: Square matrix (list of lists) of pairwise cluster
            distances. It is assumed to be symmetric, so only the upper
            triangle is read.

    Returns:
        A tuple ``(min_i, min_j, minDist)`` with ``min_i < min_j``: the
        indices of the two closest clusters and the distance between them.
        Ties go to the first pair in row-major order.

    Raises:
        ValueError: If the matrix describes fewer than two clusters.
    """
    size = len(minDistList)
    if size < 2:
        raise ValueError("need at least two clusters to find a closest pair")

    min_i, min_j, min_dist = 0, 1, minDistList[0][1]
    # Scanning only j > i excludes the zero diagonal by position instead of
    # by value, so distinct clusters at distance 0 (duplicate points) remain
    # valid merge candidates.
    for i in range(size):
        for j in range(i + 1, size):
            if minDistList[i][j] < min_dist:
                min_i, min_j, min_dist = i, j, minDistList[i][j]

    return min_i, min_j, min_dist
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17

8.实现

def AGNES(data, k, method):
    """Cluster ``data`` bottom-up (AGNES) until ``k`` clusters remain.

    Args:
        data: List of points.
        k: Target number of clusters; must be at least 1.
        method: Linkage name passed through to ``get_min_dist_list``
            (``"minDist"`` or ``"avgDist"``).

    Returns:
        A list of clusters, each a list of the original point objects. When
        ``data`` has ``k`` or fewer points, every point is returned as its
        own cluster.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    # Every point starts out as its own single-element cluster.
    clusters = [[point] for point in data]
    while len(clusters) > k:
        # Rebuilt every round: a merge changes the distances, and deleting a
        # cluster shifts the indices of the ones after it.
        dist_matrix = get_min_dist_list(clusters, method)
        i, j, _ = find_min(dist_matrix)
        clusters[i].extend(clusters[j])  # merge cluster j into cluster i
        del clusters[j]

    return clusters
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20

9.程序入口

'''
程序入口
'''
if __name__ == "__main__":
    C_min = AGNES(data, 3, 'minDist')
    C_avg = AGNES(data, 3, 'avgDist')
    fig, ax = plt.subplots(nrows=2, ncols=1)

    # One subplot per linkage; the three clusters are drawn in red, green
    # and blue, in that order.
    panels = (
        (ax[0], C_min, "使用最小距离进行聚类"),
        (ax[1], C_avg, "使用平均距离进行聚类"),
    )
    for axis, clusters, title in panels:
        for idx, color in enumerate('rgb'):
            members = clusters[idx]
            axis.scatter([p[0] for p in members], [p[1] for p in members], c=color)
        axis.set_title(title)

    fig.tight_layout()
    plt.show()

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20

完整代码如下:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV with the "number" column as the row index, then turn the
# remaining columns into a plain list of per-sample rows.
data = pd.read_csv("./西瓜数据集4.0.csv", index_col='number').values.tolist()

# Scatter-plot the raw samples: column 0 on the x-axis, column 1 on the y-axis.
fig, ax = plt.subplots()
plt.scatter(
    [sample[0] for sample in data],
    [sample[1] for sample in data],
)
plt.show()

'''
计算欧氏距离
'''

def calDist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Both arguments are converted with ``np.array`` before subtracting.
    """
    delta = np.array(a) - np.array(b)
    squared_norm = np.dot(delta, delta.T)
    return np.sqrt(squared_norm)


'''
计算簇之间的最小距离
'''


def cal_cluster_min_dist(c1, c2):
    """Return the minimum (single-linkage) distance between two clusters.

    Args:
        c1: First cluster, a sequence of points.
        c2: Second cluster, a sequence of points.

    Returns:
        The smallest ``calDist`` value over every pair that takes one point
        from each cluster, or ``float("inf")`` when either cluster is empty.
    """
    # Fall back to infinity rather than a fixed cap such as 1e5, so clusters
    # that are far apart still report their true distance.
    return min(
        (calDist(vec1, vec2) for vec1 in c1 for vec2 in c2),
        default=float("inf"),
    )


'''
计算簇之间的平均距离
'''


def cal_cluster_avg_dist(c1, c2):
    """Return the average-linkage distance between two clusters.

    Args:
        c1: First cluster, a sequence of points.
        c2: Second cluster, a sequence of points.

    Returns:
        The mean of ``calDist`` over every pair that takes one point from
        each cluster.

    Raises:
        ValueError: If either cluster is empty, because no pair exists.
    """
    pair_count = len(c1) * len(c2)
    if pair_count == 0:
        raise ValueError("cannot average distances with an empty cluster")

    total = sum(calDist(vec1, vec2) for vec1 in c1 for vec2 in c2)
    # Dividing by the number of pairs turns the sum into an average; without
    # it, larger clusters look artificially far apart.
    return total / pair_count


'''
获取最小距离列表
'''


def get_min_dist_list(data, method):
    """Build the symmetric matrix of pairwise distances between clusters.

    Args:
        data: List of clusters; each cluster is a list of points.
        method: ``"minDist"`` to use ``cal_cluster_min_dist`` or
            ``"avgDist"`` to use ``cal_cluster_avg_dist``.

    Returns:
        A ``len(data)`` x ``len(data)`` list of lists whose entry ``[i][j]``
        is the distance between clusters ``i`` and ``j``. The diagonal is 0.

    Raises:
        ValueError: If ``method`` is not one of the two supported names.
    """
    if method not in ("minDist", "avgDist"):
        # Fail loudly: skipping every assignment would leave an all-zero
        # matrix that looks valid to the caller.
        raise ValueError("unknown linkage method: {!r}".format(method))

    cluster_num = len(data)
    dist_matrix = [[0] * cluster_num for _ in range(cluster_num)]
    for i in range(cluster_num):
        for j in range(i + 1, cluster_num):
            if method == "minDist":
                dist = cal_cluster_min_dist(data[i], data[j])
            else:
                dist = cal_cluster_avg_dist(data[i], data[j])
            # Distances are symmetric, so mirror each value across the diagonal.
            dist_matrix[i][j] = dist_matrix[j][i] = dist

    return dist_matrix


'''
寻找距离列表中的最小值,用于合并簇以及删除
'''


def find_min(minDistList):
    """Locate the closest pair of distinct clusters in a distance matrix.

    Args:
        minDistList: Square matrix (list of lists) of pairwise cluster
            distances. It is assumed to be symmetric, so only the upper
            triangle is read.

    Returns:
        A tuple ``(min_i, min_j, minDist)`` with ``min_i < min_j``: the
        indices of the two closest clusters and the distance between them.
        Ties go to the first pair in row-major order.

    Raises:
        ValueError: If the matrix describes fewer than two clusters.
    """
    size = len(minDistList)
    if size < 2:
        raise ValueError("need at least two clusters to find a closest pair")

    min_i, min_j, min_dist = 0, 1, minDistList[0][1]
    # Scanning only j > i excludes the zero diagonal by position instead of
    # by value, so distinct clusters at distance 0 (duplicate points) remain
    # valid merge candidates.
    for i in range(size):
        for j in range(i + 1, size):
            if minDistList[i][j] < min_dist:
                min_i, min_j, min_dist = i, j, minDistList[i][j]

    return min_i, min_j, min_dist


'''
AGNES算法实现
'''


def AGNES(data, k, method):
    """Cluster ``data`` bottom-up (AGNES) until ``k`` clusters remain.

    Args:
        data: List of points.
        k: Target number of clusters; must be at least 1.
        method: Linkage name passed through to ``get_min_dist_list``
            (``"minDist"`` or ``"avgDist"``).

    Returns:
        A list of clusters, each a list of the original point objects. When
        ``data`` has ``k`` or fewer points, every point is returned as its
        own cluster.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    # Every point starts out as its own single-element cluster.
    clusters = [[point] for point in data]
    while len(clusters) > k:
        # Rebuilt every round: a merge changes the distances, and deleting a
        # cluster shifts the indices of the ones after it.
        dist_matrix = get_min_dist_list(clusters, method)
        i, j, _ = find_min(dist_matrix)
        clusters[i].extend(clusters[j])  # merge cluster j into cluster i
        del clusters[j]

    return clusters


'''
程序入口
'''
if __name__ == "__main__":
    C_min = AGNES(data, 3, 'minDist')
    C_avg = AGNES(data, 3, 'avgDist')
    fig, ax = plt.subplots(nrows=2, ncols=1)

    # One subplot per linkage; the three clusters are drawn in red, green
    # and blue, in that order.
    panels = (
        (ax[0], C_min, "使用最小距离进行聚类"),
        (ax[1], C_avg, "使用平均距离进行聚类"),
    )
    for axis, clusters, title in panels:
        for idx, color in enumerate('rgb'):
            members = clusters[idx]
            axis.scatter([p[0] for p in members], [p[1] for p in members], c=color)
        axis.set_title(title)

    fig.tight_layout()
    plt.show()

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145

在这里插入图片描述

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/你好赵伟/article/detail/293266
推荐阅读
相关标签
  

闽ICP备14008679号