赞
踩
在二维平面上按一定规律生成一些点(两个同心圆环和两条带噪声的直线段),为每个点给定类别标签,然后将数据写入 txt 文档。
- import random
- import numpy as np
- import matplotlib.pyplot as plt
-
- import numpy as np
- import matplotlib.pyplot as plt
-
-
-
-
-
def genConCircle(filePath, r1, r2, eps):
    """Write four labelled 2-D point sets to a whitespace-separated text file.

    Every output line has the form ``x y label``. Labels 1 and 2 are noisy
    rings around the origin with nominal radii ``r1`` and ``r2``; labels 3
    and 4 are two noisy line segments sampled at 200 x positions on [-5, 5].

    :param filePath: path of the text file to create (truncated if present).
    :param r1: nominal radius of the ring written with label 1.
    :param r2: nominal radius of the ring written with label 2.
    :param eps: relative width of the radial jitter; each radius is drawn
        uniformly between ``r * (1 - eps / 2)`` and ``r * (1 + eps / 2)``.
    :return: None.
    """
    samples_per_line = 200
    row_template = '{} {} {}\n'

    # The line noise comes from numpy's global generator and is drawn before
    # any ring sample, which uses the stdlib ``random`` module instead.
    rising_x = np.linspace(-5, 5, num=samples_per_line)
    rising_y = 0.5 * rising_x + [np.random.random() for _ in range(samples_per_line)] + 5
    falling_x = np.linspace(-5, 5, num=samples_per_line)
    falling_y = -0.5 * falling_x + [np.random.random() for _ in range(samples_per_line)] - 5.5

    def jittered(radius):
        # Uniform jitter centred on ``radius`` with total width ``eps * radius``.
        return radius + eps * radius * random.random() - 0.5 * eps * radius

    angles = np.arange(0, 2 * np.pi, 0.01 * np.pi)

    with open(filePath, 'w+') as out:
        # x and y each get their own jittered radius, so a ring point is not
        # guaranteed to sit exactly on a single circle.
        for label, radius in ((1, r1), (2, r2)):
            for theta in angles:
                px = jittered(radius) * np.cos(theta)
                py = jittered(radius) * np.sin(theta)
                out.write(row_template.format(px, py, label))
        for label, xs, ys in ((3, rising_x, rising_y), (4, falling_x, falling_y)):
            for px, py in zip(xs, ys):
                out.write(row_template.format(px, py, label))
-
-
-
-
def draw2DTxt(filePath):
    """Load ``x y label`` rows from *filePath* and scatter-plot them.

    Rows are coloured purely by their position in the file, not by the label
    column: rows 0-199 blue, 200-399 red, 400-599 yellow, 600-799 green.
    The loaded array is printed to stdout before plotting.

    :param filePath: path of a whitespace-separated numeric text file.
    :return: None.
    """
    points = np.loadtxt(filePath)
    print(points)
    xs, ys = points[:, 0], points[:, 1]
    # One scatter call per consecutive block of 200 rows.
    for start, colour in zip(range(0, 800, 200), 'bryg'):
        block = slice(start, start + 200)
        plt.scatter(xs[block], ys[block], c=colour)
    plt.show()
-
-
if __name__ == '__main__':
    # Generate the sample data set, then plot the file that was just written.
    genConCircle(filePath='a.txt', r1=2.5, r2=3.5, eps=0.2)
    draw2DTxt(filePath='a.txt')
- import math
- import pandas
- import matplotlib.pyplot as plt
- import numpy as np
-
-
-
def readfile(filename):
    """Parse a whitespace-separated data file into labels and 2-D points.

    Each non-blank line contributes one entry to both results: the last
    field is kept verbatim as the row label and the first two fields are
    converted to floats. Blank or whitespace-only lines are skipped.

    :param filename: path of the text file to read.
    :return: ``(rownames, data)`` where ``rownames`` is a list of label
        strings and ``data`` is a list of ``[x, y]`` float pairs.
    :raises ValueError: if one of the first two fields is not a number.
    """
    rownames = []
    data = []
    # The context manager closes the handle even if a row fails to parse.
    with open(filename) as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue
            rownames.append(fields[-1])
            data.append([float(value) for value in fields[:2]])
    return rownames, data
-
-
-
def person(v1, v2):
    """Return a Pearson-correlation distance between two numeric sequences.

    The result is ``1 - r`` where ``r`` is the Pearson correlation
    coefficient, so strongly correlated vectors are "close" (near 0) and
    anti-correlated ones are "far" (near 2).

    :param v1: first sequence of numbers.
    :param v2: second sequence of numbers, same length as ``v1``.
    :return: ``1.0 - r``, or ``0`` when the correlation is undefined (empty
        input, or either vector has zero variance).
    :raises ValueError: if the two sequences differ in length.
    """
    n = len(v1)
    if n != len(v2):
        raise ValueError("person() requires vectors of equal length")
    if n == 0:
        return 0

    # Plain sums and sums of squares.
    sum1 = sum(v1)
    sum2 = sum(v2)
    sum1Sq = sum(v ** 2 for v in v1)
    sum2Sq = sum(v ** 2 for v in v2)
    # Sum of element-wise products.
    pSum = sum(a * b for a, b in zip(v1, v2))

    num = pSum - (sum1 * sum2 / n)
    radicand = (sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n)
    # Each factor is a non-negative variance term mathematically; rounding
    # can push the product slightly below zero, which would make sqrt raise.
    if radicand <= 0:
        return 0
    den = math.sqrt(radicand)
    if den == 0:
        return 0
    # Invert so that more similar items end up with a smaller distance.
    return 1.0 - num / den
-
def distEuclid(x, y):
    """Return the Euclidean distance between two equal-length point sequences.

    :param x: first point (any sequence accepted by ``np.array``).
    :param y: second point, same shape as ``x``.
    :return: the square root of the summed squared coordinate differences.
    """
    delta = np.array(x) - np.array(y)
    squared_total = (delta ** 2).sum()
    return np.sqrt(squared_total)
-
# One node of the cluster hierarchy.
class bicluster:
    """Plain record describing a cluster: either one input row or a merge.

    Attributes mirror the constructor arguments one-to-one; nothing is
    computed or validated here.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id_number=None,
                 information=None, new_vec=None):
        # Representative vector of the cluster.
        self.vec = vec
        # Child clusters this node was merged from (None when not supplied).
        self.left, self.right = left, right
        # Distance value recorded for the merge; 0.0 by default.
        self.distance = distance
        # Identifier supplied by the caller.
        self.id_number = id_number
        # Caller-supplied labels and member vectors carried by this cluster.
        self.information = information
        self.new_vec = new_vec
-
def hcluster(rows, rowsname, distance=person, k=4):
    """Agglomerative (bottom-up) clustering that stops once ``k`` clusters remain.

    Every row starts as its own cluster. On each pass the closest pair of
    clusters under ``distance`` is merged into a new cluster whose vector is
    the element-wise mean of the two merged vectors and whose ``information``
    / ``new_vec`` lists are the concatenation of the children's lists.

    The same ``distance`` callable is used for every pair comparison, so the
    running minimum is never compared across two different metrics.

    :param rows: list of numeric vectors, one per item.
    :param rowsname: list of labels, parallel to ``rows``.
    :param distance: callable ``(vec_a, vec_b) -> number``; smaller is closer.
    :param k: number of clusters to stop at (values below 1 are treated as 1).
    :return: list of the remaining ``bicluster`` objects.
    """
    # Initially each row is a cluster of its own, with a non-negative id.
    clust = [
        bicluster(row, id_number=i, information=[rowsname[i]], new_vec=[row])
        for i, row in enumerate(rows)
    ]
    # Cache of pair distances keyed by (id, id); ids are never reused.
    distances = {}
    # Merged clusters get negative ids so they cannot clash with row indices.
    currentclustid = -1
    target = max(k, 1)

    while len(clust) > target:
        lowstpair = (0, 1)
        closest = None
        # Scan every pair and keep the first one with the smallest distance.
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id_number, clust[j].id_number)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                if closest is None or d < closest:
                    closest = d
                    lowstpair = (i, j)

        first, second = clust[lowstpair[0]], clust[lowstpair[1]]
        # The merged vector is the plain average of the two cluster vectors.
        mergevec = [(a + b) / 2.0 for a, b in zip(first.vec, second.vec)]
        newcluster = bicluster(
            mergevec,
            left=first,
            right=second,
            distance=closest,
            id_number=currentclustid,
            information=first.information + second.information,
            new_vec=first.new_vec + second.new_vec,
        )
        currentclustid -= 1
        # Delete the higher index first so the lower index stays valid.
        del clust[lowstpair[1]]
        del clust[lowstpair[0]]
        clust.append(newcluster)
    return clust
-
def showdata(ans):
    """Scatter-plot the member points of each cluster in its own colour.

    Clusters are coloured in order red, blue, green, yellow; the palette
    repeats if more than four clusters are passed, and fewer than four are
    plotted without error.

    :param ans: sequence of clusters, each exposing ``new_vec`` as a list of
        points whose first two entries are the x and y coordinates.
    :return: None.
    """
    palette = ('r', 'b', 'g', 'y')
    for index, cluster in enumerate(ans):
        xs = [point[0] for point in cluster.new_vec]
        ys = [point[1] for point in cluster.new_vec]
        plt.scatter(xs, ys, c=palette[index % len(palette)], s=20, alpha=0.9)
    plt.show()
-
def get_test(ans, k, total_num):
    """Print and return the majority-label accuracy of a clustering.

    For each of the first ``k`` clusters the most frequent label in its
    ``information`` list is taken as the cluster's label, and the members
    carrying that label count as correct. Accuracy is the number of correct
    members divided by ``total_num``.

    :param ans: sequence of clusters, each exposing ``information`` as a
        list of hashable labels.
    :param k: number of leading clusters to score; clusters beyond the end
        of ``ans`` are ignored.
    :param total_num: total number of items, used as the denominator.
    :return: the accuracy as a float (0.0 when ``total_num`` is 0).
    """
    from collections import Counter

    num = 0
    for cluster in ans[:k]:
        labels = cluster.information
        # An empty cluster contributes no correct members.
        if labels:
            num += max(Counter(labels).values())
    accuracy = num / total_num if total_num else 0.0
    print("准确率为:")
    print(accuracy)
    return accuracy
-
-
# Load the labelled points, cluster them, report accuracy and plot the result.
a, b = readfile('data_test3')
total_num = len(b)
# Use Euclidean distance as the metric: the Pearson correlation of two
# 2-element vectors can only be +1, -1 or undefined, so it cannot rank how
# close two 2-D points are.
ans = hcluster(b, a, distance=distEuclid)
get_test(ans, 4, total_num)
showdata(ans)
下面是分级聚类的结果:
可以看到分级聚类的效果并不是很好。其实这一类数据更适合使用基于密度的聚类方法,比如:密度峰值聚类、高斯混合模型、均值漂移等。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。