当前位置:   article > 正文

人工智能-聚类算法(分级聚类)_分级聚类算法

分级聚类算法

题目描述:

在二维平面上按照一定的分布规律生成若干数据点(两个带噪声的同心圆环和两条带噪声的直线,每类 200 个点),为每个点赋予类别标签(1~4),然后将数据保存为 txt 文本文件,每行的格式为“x y 标签”。

生成数据代码:

  1. import random
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. import numpy as np
  5. import matplotlib.pyplot as plt
  6. def genConCircle(filePath, r1, r2, eps):
  7. """
  8. :param filePath:
  9. :param r1:
  10. :param r2:
  11. :param eps:
  12. :return:
  13. """
  14. x1 = np.linspace(-5, 5, num=200)
  15. y1 = 0.5* x1 + [np.random.random() for _ in range(200)]+5
  16. x2 = np.linspace(-5, 5, num=200)
  17. y2 = -0.5 * x2 + [np.random.random() for _ in range(200)]-5.5
  18. def getRandom(r1, eps):
  19. return r1 + eps * r1 * random.random() - 0.5 * eps * r1
  20. with open(filePath, 'w+') as f:
  21. for i in np.arange(0, 2 * np.pi, 0.01 * np.pi):
  22. f.write('{} {} {}\n'.format(getRandom(r1, eps) * np.cos(i), getRandom(r1, eps) * np.sin(i),1))
  23. for i in np.arange(0, 2 * np.pi, 0.01 * np.pi):
  24. f.write('{} {} {}\n'.format(getRandom(r2, eps) * np.cos(i), getRandom(r2, eps) * np.sin(i),2))
  25. for i in range(200):
  26. f.write('{} {} {}\n'.format(x1[i], y1[i],3))
  27. for i in range(200):
  28. f.write('{} {} {}\n'.format(x2[i], y2[i],4))
  29. def draw2DTxt(filePath):
  30. data = np.loadtxt(filePath)
  31. print(data)
  32. x = data[:, 0]
  33. y = data[:, 1]
  34. plt.scatter(x[0:200], y[0:200],c='b')
  35. plt.scatter(x[200:400], y[200:400], c='r')
  36. plt.scatter(x[400:600], y[400:600], c='y')
  37. plt.scatter(x[600:800], y[600:800], c='g')
  38. plt.show()
  39. if __name__ == '__main__':
  40. genConCircle('a.txt', 2.5, 3.5, 0.2)
  41. draw2DTxt('a.txt')

数据分布情况如下:

 

之后对产生的数据进行分级聚类

分级聚类的代码:

  1. import math
  2. import pandas
  3. import matplotlib.pyplot as plt
  4. import numpy as np
  5. def readfile(filename):
  6. """处理文件数据"""
  7. lines = [line for line in open(filename)]
  8. rownames = []
  9. data = []
  10. for line in lines:
  11. p = line.strip()
  12. p=p.split()
  13. rownames.append(p[-1])
  14. data.append([float(x) for x in p[0:2]])
  15. return rownames, data
  16. # 利用皮尔逊相关度作相关性判断
  17. # 传入的参数为两个list
  18. def person(v1, v2):
  19. # 简单求和
  20. sum1 = sum(v1)
  21. sum2 = sum(v2)
  22. # 求平方和
  23. sum1Sq = sum([pow(v, 2) for v in v1])
  24. sum2Sq = sum([pow(v, 2) for v in v2])
  25. # 求乘积之和
  26. pSum = sum([v1[i] * v2[i] for i in range(len(v1))])
  27. # 计算r
  28. num = pSum - (sum1 * sum2 / len(v1))
  29. den = math.sqrt((sum1Sq - pow(sum1, 2) / len(v1)) * (sum2Sq - pow(sum2, 2) / len(v1)))
  30. if den == 0: return 0
  31. # 让相似度越大的两个元素之间的距离变得更小
  32. return 1.0 - num / den
  33. def distEuclid(x, y):
  34. x=np.array(x)
  35. y=np.array(y)
  36. return np.sqrt(np.sum((x - y) ** 2))
  37. # 代表层级数
  38. class bicluster:
  39. """标记类属性"""
  40. def __init__(self, vec, left=None, right=None, distance=0.0, id_number=None,information=None,new_vec=None):
  41. self.left = left
  42. self.right = right
  43. self.vec = vec
  44. self.id_number = id_number
  45. self.distance = distance
  46. self.information=information
  47. self.new_vec=new_vec
  48. # 聚类算法(直到聚为1类才停止)
  49. def hcluster(rows,rowsname, distance=person):
  50. """分级聚类"""
  51. distances = {}
  52. clust=[]
  53. currentclustid = -1
  54. # 最开始的聚类就是数据集中的行 有多少行就有多少类
  55. for i in range(len(rows)):
  56. clust.append(bicluster(rows[i],id_number=i,information=[rowsname[i]],new_vec=[rows[i]]))
  57. while len(clust) > 4: #聚类为三类
  58. lowstpair = (0, 1)
  59. closest = distance(clust[0].vec, clust[1].vec)
  60. new_information=[]
  61. new_vecs=[]
  62. # 遍历每一个配对,寻找最小
  63. for i in range(len(clust)):
  64. for j in range(i + 1, len(clust)):
  65. # 用distances来缓存距离的计算值
  66. if (clust[i].id_number, clust[j].id_number) not in distances:
  67. distances[(clust[i].id_number, clust[j].id_number)] = distEuclid(clust[i].vec, clust[j].vec)
  68. d = distances[(clust[i].id_number, clust[j].id_number)]
  69. if d < closest:
  70. closest = d
  71. lowstpair = (i, j)
  72. ans1=clust[lowstpair[0]].information
  73. ans2=clust[lowstpair[1]].information
  74. new_information=new_information+ans1
  75. new_information=new_information+ans2
  76. ans3 = clust[lowstpair[0]].new_vec
  77. ans4 = clust[lowstpair[1]].new_vec
  78. new_vecs= new_vecs+ ans3
  79. new_vecs = new_vecs + ans4
  80. #print(new_information)
  81. # 计算两个聚类的平均值
  82. mergevec = [(clust[lowstpair[0]].vec[i] + clust[lowstpair[1]].vec[i]) / 2.0 for i in range(len(clust[0].vec))]
  83. # 建立新的聚类
  84. newcluster = bicluster(mergevec, left=clust[lowstpair[0]], right=clust[lowstpair[1]], distance=closest,id_number=currentclustid,information=new_information,new_vec=new_vecs)
  85. #print(newcluster)
  86. # 不在原来集合中的聚类,其id为负数
  87. currentclustid -= 1
  88. # 先删右边的则不会对左边的产生影响
  89. del clust[lowstpair[1]]
  90. del clust[lowstpair[0]]
  91. clust.append(newcluster)
  92. return clust
  93. def showdata(ans):
  94. '''画图的展示'''
  95. x1 = []
  96. y1 = []
  97. for i in range(len(ans[0].new_vec)):
  98. x1.append(ans[0].new_vec[i][0])
  99. y1.append(ans[0].new_vec[i][1])
  100. x2 = []
  101. y2 = []
  102. for i in range(len(ans[1].new_vec)):
  103. x2.append(ans[1].new_vec[i][0])
  104. y2.append(ans[1].new_vec[i][1])
  105. x3 = []
  106. y3 = []
  107. for i in range(len(ans[2].new_vec)):
  108. x3.append(ans[2].new_vec[i][0])
  109. y3.append(ans[2].new_vec[i][1])
  110. x4 = []
  111. y4 = []
  112. for i in range(len(ans[3].new_vec)):
  113. x4.append(ans[3].new_vec[i][0])
  114. y4.append(ans[3].new_vec[i][1])
  115. plt.scatter(x1, y1, c='r', s=20, alpha=0.9)
  116. plt.scatter(x2, y2, c='b', s=20, alpha=0.9)
  117. plt.scatter(x3, y3, c='g', s=20, alpha=0.9)
  118. plt.scatter(x4, y4, c='y', s=20, alpha=0.9)
  119. plt.show()
  120. def get_test(ans,k,total_num):
  121. '''准确率计算函数'''
  122. num=0
  123. for i in range(k):
  124. data=ans[i].information
  125. label=max(data, key=data.count)
  126. num=num+data.count(label)
  127. pass
  128. print("准确率为:")
  129. print(num/total_num)
  130. a,b=readfile('data_test3')
  131. total_num=len(b)
  132. ans=hcluster(b,a, distance=person)
  133. get_test(ans,4,total_num)
  134. showdata(ans)

下面是分级聚类的结果:

可以看到,分级聚类在这组数据上的效果并不是很好。其实这一类数据更适合基于密度的聚类方法,比如密度峰值聚类、均值漂移等,也可以尝试高斯混合模型。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/很楠不爱3/article/detail/360963
推荐阅读
相关标签
  

闽ICP备14008679号