当前位置:   article > 正文

【数据挖掘】基于K均值聚类的离群点检测(python实现)_python 用聚类找离群值

python 用聚类找离群值

一、Python代码

  1. '''
  2. Author: Vici__
  3. date: 2020/5/21
  4. '''
  5. import math
  6. import random
  7. import numpy as np
  8. '''
  9. Point类,记录坐标x,y和点的名字id
  10. '''
  11. class Point:
  12. '''
  13. 初始化函数
  14. '''
  15. def __init__(self, x, y):
  16. self.x = x # 横坐标
  17. self.y = y # 纵坐标
  18. '''
  19. 计算两点之间的欧几里得距离
  20. '''
  21. def calc_Euclidean_distance(self, p2):
  22. return math.sqrt((self.x - p2.x) * (self.x - p2.x) + (self.y - p2.y) * (self.y - p2.y))
  23. '''
  24. 1. 获取数据集
  25. '''
  26. def get_dataset():
  27. # 原始数据集以元组形式存放,(横坐标,纵坐标,名字)
  28. datas = [(0, 0), (1, 2), (3, 1), (8, 8), (9, 10), (10, 7), (10, 1)]
  29. dataset = [] # 用于计算两点之间的距离,形式 [point1, point2...]
  30. id_point_dict = {} # 编号和点的映射
  31. temp_list = []
  32. for i in range(len(datas)): # 遍历原始数据集
  33. point = Point(datas[i][0], datas[i][1]) # 利用(横坐标,纵坐标,编号)实例化
  34. id_point_dict[str(i)] = point
  35. dataset.append(point) # 放入dataset中
  36. temp_list.append(point)
  37. return dataset, id_point_dict # [p1, p2], {id: point}
  38. '''
  39. 2. 计算离散因子,找到离散群
  40. '''
  41. def find_discrete(group, id_point_dict, n):
  42. print("2. 计算离散因子,找出离散群")
  43. index_centroids = {}
  44. for k, v in group.items():
  45. xs = [] # 当前簇的所有x坐标
  46. ys = [] # 当前簇的所有y坐标
  47. for i in v: # 遍历当前簇集合
  48. point = id_point_dict[str(i)] # 获取点的x,y坐标
  49. xs.append(point.x)
  50. ys.append(point.y)
  51. x_mean = np.mean(np.array(xs), axis=0) # 计算x均值
  52. y_mean = np.mean(np.array(ys), axis=0) # 计算y均值
  53. index_centroids[k]= (x_mean, y_mean)
  54. dis = {}
  55. OF2 = {}
  56. for k1, v1 in index_centroids.items():
  57. for k2, v2 in index_centroids.items():
  58. if k1 == k2:
  59. continue
  60. dis[(k1, k2)] = ((v1[0]-v2[0])**2 + (v1[1]-v2[1])**2)**0.5
  61. res = None
  62. Max = -1
  63. for k1, v1 in group.items():
  64. OF2[k1] = 0
  65. for k2, v2 in group.items():
  66. if k1 == k2:
  67. continue
  68. OF2[k1] += (len(group[k2]) / n) * dis[(k1, k2)]
  69. if OF2[k1] > Max:
  70. Max = OF2[k1]
  71. res = k1
  72. print("簇心间的距离:")
  73. for k, v in dis.items():
  74. print(group[k[0]], group[k[1]], v)
  75. print("离散因子:")
  76. for k, v in OF2.items():
  77. print(group[k], v)
  78. print("离散群为:")
  79. print(group[res])
  80. '''
  81. 3. KMeans主函数
  82. '''
  83. def KMeans(dataset, k, id_point_dict):
  84. n = len(dataset) # 数据集中点的个数
  85. centroids = random.sample([x for x in range(len(dataset))], k) # 随机选取k个点作为初始质心点
  86. index = n # 用于给新的质心点编号
  87. pre_answer = {} # 上一次的分成的簇的结果,用于和新生成的作比较,判断是否继续执行算法
  88. while True:
  89. answer = {} # 根据质心分的簇
  90. for j in centroids: # 遍历质心,给每一个质心编号定义一个集合
  91. answer[str(j)] = set()
  92. for i in range(n): # 遍历数据集
  93. Min = float("INF") # 用于寻找当前点最接近哪个质心
  94. Min_index = -1
  95. for j in centroids: # 遍历质心
  96. point_i = id_point_dict[str(i)] # 数据集中的点
  97. point_j = id_point_dict[str(j)] # 质心点
  98. dist = point_j.calc_Euclidean_distance(point_i) # 计算两点距离
  99. print(i, j, dist)
  100. if dist < Min: # 寻找当前点最接近哪个质心
  101. Min = dist
  102. Min_index = j
  103. # 得到结果:i这个点是Min_index的小弟
  104. answer[str(Min_index)].add(i) # 放入相应集合中
  105. centroids.clear() # 清除之前的质心点
  106. # 遍历answer计算新的质心
  107. for v in answer.values():
  108. xs = [] # 当前簇的所有x坐标
  109. ys = [] # 当前簇的所有y坐标
  110. for i in v: # 遍历当前簇集合
  111. point = id_point_dict[str(i)] # 获取点的x,y坐标
  112. xs.append(point.x)
  113. ys.append(point.y)
  114. x_mean = np.mean(np.array(xs), axis=0) # 计算x均值
  115. y_mean = np.mean(np.array(ys), axis=0) # 计算y均值
  116. print(x_mean, y_mean)
  117. new_point = Point(x_mean, y_mean) # 定义新质心点
  118. id_point_dict[str(index)] = new_point # 放入编号到点的映射中
  119. centroids.append(index) # 放入质心列表中
  120. index += 1
  121. # 检查是否继续
  122. count = 0
  123. for v1 in answer.values(): # 遍历旧簇组合
  124. for v2 in pre_answer.values(): # 遍历新簇组合
  125. if list(v1) == list(v2): # 如果两个集合相同
  126. count += 1 # 计数
  127. if count == k: # 如果集合相同的个数等于簇的个数,说明answer和pre_answer相同
  128. break # 结束算法即可
  129. pre_answer = answer.copy() # 更新旧结果
  130. # 打印每次循环得到的结果:
  131. for v in answer.values():
  132. print(v)
  133. print("---------------------------------------")
  134. find_discrete(pre_answer, id_point_dict, n)
  135. # 测试
  136. dataset, id_point_dict = get_dataset()
  137. k = 3
  138. KMeans(dataset, k, id_point_dict)

二、测试

数据:

[(0,0),(1,2),(3,1),(8,8),(9,10),(10,7),(10,1)]
k=3

结果:

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/301960
推荐阅读
相关标签
  

闽ICP备14008679号