当前位置:   article > 正文

数据挖掘大作业(一):Kmeans+PAM_pam对部分waveform数据集加20%的高斯噪声

pam对部分waveform数据集加20%的高斯噪声
  • 题目描述
  1. 编程实现K-means算法对waveform数据进行聚类,并对无噪声的图像进行分割;
  2. 编程实现PAM算法对有20%高斯噪声的waveform数据聚类,并对有噪声的图像进行分割。

 

  • 算法描述

(1) K -means

      

(2) PAM

      

  • 结果展示

1.K-means算法对waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)

2.K-means算法对有20%高斯噪声的waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)

3.PAM算法对waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)

4.PAM算法对有20%高斯噪声的waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)

• 图像分割部分

                                 

              无噪声原图                                              有噪声原图

 

5.K-means算法对无噪声的图像进行分割

6.K-means算法对有噪声的图像进行分割

7.PAM算法对无噪声的图像进行分割

8.PAM算法对有噪声的图像进行分割

  • 实验代码
    1. # -*- coding:utf-8 -*-
    2. from numpy import *
    3. import pandas as pd
    4. import matplotlib.pyplot as plt
    5. import random
    6. import numpy as np
    7. from PIL import Image
    def image_gauss_noise(image, sigma=10):
        """Return a copy of ``image`` with additive zero-mean Gaussian noise.

        The noise is added in floating point, rounded to the nearest integer,
        clipped to the 8-bit range [0, 255] and returned as ``uint8``. The input
        array is not modified. Noise is drawn from NumPy's global random state
        (``np.random``), so ``np.random.seed`` makes the result reproducible.

        Args:
            image: Array-like of pixel intensities, e.g. a 2-D grayscale array
                or a 3-D (rows, cols, channels) array.
            sigma: Standard deviation of the noise in intensity levels.
                Defaults to 10.

        Returns:
            A ``numpy.uint8`` array with the same shape as ``image``.
        """
        pixels = np.asarray(image, dtype=np.float64)
        # A single vectorised draw gives every element (and every channel) its
        # own independent noise sample without a per-pixel Python loop.
        noisy = pixels + np.random.normal(0.0, sigma, size=pixels.shape)
        # Round before clipping so the noise is not biased toward zero by
        # integer truncation.
        return np.clip(np.rint(noisy), 0, 255).astype(np.uint8)
    def data_gauss_noise(data, ratio=0.2, sigma=0.1):
        """Add zero-mean Gaussian noise, in place, to a random subset of rows.

        Exactly ``int(m * ratio)`` distinct rows (capped at ``m``) are chosen
        without replacement, and every column of each chosen row receives an
        independent noise sample. Noise comes from NumPy's global random state.

        Args:
            data: 2-D ``numpy`` array or matrix of shape (m, n). It is modified
                in place.
            ratio: Fraction of rows to perturb. Defaults to 0.2 (20%).
            sigma: Standard deviation of the noise. Defaults to 0.1.

        Returns:
            The same ``data`` object, for call-chaining convenience.
        """
        m, n = np.shape(data)
        count = min(m, int(m * ratio))
        if count <= 0:
            return data
        # Sampling without replacement guarantees the requested number of
        # distinct rows; drawing with replacement and de-duplicating would
        # perturb fewer rows than asked for.
        rows = np.random.choice(m, size=count, replace=False)
        noise = np.random.normal(0.0, sigma, size=(count, n))
        # Plain assignment (not ``+=``) so integer-typed inputs are cast on
        # store instead of raising a casting error.
        data[rows, :] = data[rows, :] + noise
        return data
    # Euclidean distance between two vectors.
    def disMea(vecA, vecB):
        """Return the Euclidean (L2) distance between ``vecA`` and ``vecB``.

        Both arguments are converted to floating point first, so unsigned
        integer inputs (e.g. ``uint8`` pixel values) cannot wrap around when
        subtracted.

        Args:
            vecA: Array-like vector (1-D array, row matrix, list, ...).
            vecB: Array-like vector broadcast-compatible with ``vecA``.

        Returns:
            The distance as a NumPy floating-point scalar.
        """
        diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
        # Explicit ``np.`` calls: the result must not depend on whether a star
        # import happens to shadow the built-in ``sum``.
        return np.sqrt(np.sum(np.square(diff)))
    # Pick random initial centroids inside the bounding box of the data.
    def createCent(dataSet, k):
        """Create ``k`` random centroids within the per-column range of the data.

        For every column, centroid coordinates are drawn uniformly from
        ``[column_min, column_max)`` using NumPy's global random state.

        Args:
            dataSet: 2-D array or matrix of shape (m, n) with m >= 1.
            k: Number of centroids to create.

        Returns:
            A ``numpy.matrix`` of shape (k, n) holding the centroids.
        """
        points = np.asarray(dataSet, dtype=float)
        n = points.shape[1]
        lows = points.min(axis=0)
        spans = points.max(axis=0) - lows
        # Drawing an (n, k) block and transposing consumes the random stream
        # column by column, i.e. k values for column 0, then k for column 1, ...
        fractions = np.random.rand(n, k).T
        # np.asmatrix instead of np.mat: the latter was removed in NumPy 2.0.
        return np.asmatrix(lows + spans * fractions)
    def kmeans(dataSet, k, max_iter=10):
        """Cluster the rows of ``dataSet`` with Lloyd's k-means algorithm.

        Initial centroids come from ``createCent``. Each iteration assigns every
        sample to its nearest centroid (Euclidean distance, ties go to the
        lowest cluster index) and then moves each centroid to the mean of its
        members. Iteration stops when no assignment changes or after
        ``max_iter`` rounds.

        Args:
            dataSet: 2-D array or matrix of shape (m, n).
            k: Number of clusters.
            max_iter: Upper bound on the number of iterations. Defaults to 10.

        Returns:
            A tuple ``(centriods, clusterA)`` where ``centriods`` is a (k, n)
            matrix of cluster centres and ``clusterA`` is an (m, 1) float matrix
            holding the cluster index of every sample.
        """
        points = np.asarray(dataSet, dtype=float)
        m = points.shape[0]
        centriods = createCent(dataSet, k)
        # Every sample starts out labelled 0; the first pass counts as a change
        # whenever any sample lands in a different cluster.
        labels = np.zeros(m, dtype=int)
        for _ in range(max_iter):
            centers = np.asarray(centriods, dtype=float)
            # (m, k) table of squared distances; the square root is skipped
            # because it does not change which centroid is nearest.
            sq_dist = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
            nearest = sq_dist.argmin(axis=1)
            changed = bool(np.any(nearest != labels))
            labels = nearest
            for cent in range(k):
                members = points[labels == cent]
                # An empty cluster keeps its previous centroid: averaging zero
                # rows would turn the centroid into NaN.
                if len(members):
                    centriods[cent, :] = members.mean(axis=0)
            if not changed:
                break
        # np.asmatrix instead of np.mat: the latter was removed in NumPy 2.0.
        clusterA = np.asmatrix(labels.reshape(m, 1).astype(float))
        return centriods, clusterA
    def show1(dataSet, k, centriods, clusterA, count):
        """Scatter-plot the clustering and save it as ``Figure_<count>.png``.

        Column 6 of ``dataSet`` is used as the x axis and column 9 as the y
        axis; each cluster gets its own marker style. Points are drawn one
        cluster at a time, so where markers overlap the stacking order follows
        the cluster index rather than the sample order.

        Args:
            dataSet: 2-D array or matrix with at least 10 columns.
            k: Number of clusters (unused; kept for interface compatibility).
            centriods: Cluster centres (unused; kept for interface
                compatibility).
            clusterA: (m, 1) array or matrix with the cluster index of every
                sample.
            count: Value interpolated into the output file name.
        """
        mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
        points = np.asarray(dataSet)
        labels = np.asarray(clusterA)[:, 0].astype(int)
        fig = plt.figure()
        for label in np.unique(labels):
            members = points[labels == label]
            # One plot call per cluster instead of one per sample; marker
            # styles are reused cyclically if there are more clusters than
            # styles.
            plt.plot(members[:, 6], members[:, 9], mark[label % len(mark)])
        plt.savefig("Figure_" + str(count) + ".png")
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    def pearson_distance(vector1, vector2):
        """Return the Euclidean distance between two vectors.

        Despite its name, this function does not compute a Pearson correlation
        distance; it returns the plain Euclidean (L2) distance. The name is
        kept because other code calls it.

        Args:
            vector1: Array-like vector (1-D array, row matrix, list, ...).
            vector2: Array-like vector with the same number of elements.

        Returns:
            A 1-element ``numpy`` float array containing the distance.
        """
        diff = (np.ravel(np.asarray(vector1, dtype=float))
                - np.ravel(np.asarray(vector2, dtype=float)))
        # Wrapped in a length-1 array to keep the return shape that callers
        # of this function have always received.
        return np.array([np.sqrt(np.dot(diff, diff))])
    def totalcost(blogwords, medoids_idx):
        """Assign every sample to its nearest medoid and sum the distances.

        Distances are Euclidean. When a sample is equally close to several
        medoids it goes to the one listed first in ``medoids_idx``.

        Args:
            blogwords: 2-D array or matrix of shape (m, n) holding the samples.
            medoids_idx: Iterable of row indices used as medoids. Repeated
                indices are collapsed, keeping the first occurrence.

        Returns:
            A tuple ``(total_cost, medoids)``: ``total_cost`` is the sum, as a
            float, of each sample's distance to its medoid; ``medoids`` maps
            every medoid index to the list of sample indices assigned to it
            (ascending order).

        Raises:
            ValueError: If there are samples but no medoids.
        """
        points = np.asarray(blogwords, dtype=float)
        size = points.shape[0]
        points = points.reshape(size, -1)
        # dict.fromkeys de-duplicates while preserving order, which fixes both
        # the key order of the result and the tie-breaking order.
        medoid_ids = list(dict.fromkeys(medoids_idx))
        medoids = {idx: [] for idx in medoid_ids}
        if size == 0:
            return 0.0, medoids
        if not medoid_ids:
            raise ValueError("medoids_idx must contain at least one index")
        centers = points[medoid_ids]
        # (m, k) table of sample-to-medoid distances computed in one shot.
        dists = np.sqrt(((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2))
        nearest = dists.argmin(axis=1)
        for i, pos in enumerate(nearest):
            medoids[medoid_ids[pos]].append(i)
        total_cost = float(dists[np.arange(size), nearest].sum())
        return total_cost, medoids
    def PAM(dataSet, k, max_iter=5):
        """Cluster the rows of ``dataSet`` with a PAM-style k-medoids search.

        ``k`` distinct rows are picked at random as initial medoids. In every
        round each medoid is tentatively swapped with each non-medoid member of
        its own cluster; the single swap with the lowest total cost is accepted
        if it strictly improves on the current configuration. The search stops
        when no swap improves the cost or after ``max_iter`` rounds. The
        remaining round count is printed after every accepted swap.

        Args:
            dataSet: 2-D array or matrix of shape (m, n).
            k: Number of clusters; must not exceed the number of rows.
            max_iter: Maximum number of swap rounds. Defaults to 5.

        Returns:
            A tuple ``(centriods, clusterA)`` where ``centriods`` is a (k, n)
            matrix whose row ``i`` is the i-th medoid and ``clusterA`` is an
            (m, 1) float matrix holding, for every sample, the row number of
            its medoid in ``centriods``.
        """
        m, n = np.shape(dataSet)
        # Randomly choose k distinct rows as the initial medoids.
        best_choice = random.sample(range(m), k)
        # Cost and clustering of the initial configuration.
        best_cost, best_res = totalcost(dataSet, best_choice)
        itr = max_iter
        while itr:
            round_cost, round_choice, round_res = best_cost, None, None
            for pos, medoid in enumerate(best_choice):
                # Try every non-medoid member of this medoid's cluster as a
                # replacement for it.
                for item in best_res[medoid]:
                    # Skipping current medoids keeps the k medoids distinct,
                    # even when the data contains duplicate points.
                    if item in best_choice:
                        continue
                    candidate = list(best_choice)
                    candidate[pos] = item
                    cost, clusters = totalcost(dataSet, candidate)
                    # Remember only the best swap of the whole round rather
                    # than greedily taking the first improvement.
                    if cost < round_cost:
                        round_cost, round_choice, round_res = cost, candidate, clusters
            # No swap lowers the cost: the configuration is stable.
            if round_choice is None:
                break
            best_cost, best_choice, best_res = round_cost, round_choice, round_res
            itr -= 1
            print(itr)
        # Build the outputs from the accepted configuration, so the result is
        # never worse than the starting medoids.
        # np.asmatrix instead of np.mat: the latter was removed in NumPy 2.0.
        centriods = np.asmatrix(np.zeros((k, n)))
        clusterA = np.asmatrix(np.zeros((m, 1)))
        for label, medoid in enumerate(best_choice):
            centriods[label, :] = dataSet[medoid, :]
            members = np.asarray(best_res[medoid], dtype=int)
            clusterA[members, 0] = label
        return centriods, clusterA
    def _load_waveform(add_noise=False):
        """Load columns 1..21 of the header-less ``waveform.csv`` as a matrix.

        NOTE(review): the standard UCI waveform file has 21 attributes followed
        by a class label. If this CSV has that layout, the ``1:22`` slice drops
        the first attribute and keeps the label as a feature - confirm against
        the actual file before changing it.
        """
        dataset = pd.read_csv('waveform.csv', header=None)
        # np.asmatrix instead of np.mat: the latter was removed in NumPy 2.0.
        data = np.asmatrix(dataset.values)[:, 1:22]
        if add_noise:
            data = data_gauss_noise(data)
        return data


    def _cluster_waveform(cluster, count, add_noise=False):
        """Cluster the waveform data into 3 groups and plot the result."""
        data = _load_waveform(add_noise)
        myCentroids, clustAssing = cluster(data, 3)
        show1(data, 3, myCentroids, clustAssing, count)


    def _segment_lena(cluster, count, add_noise=False, save_input_as=None):
        """Segment the grayscale ``lena.jpg`` into 3 intensity clusters.

        Every pixel is replaced by the (truncated) intensity of its cluster
        centre and the result is saved as ``Figure_<count>.png``. When
        ``save_input_as`` is given, the grayscale (and optionally noisy) input
        image is saved under that name as well.
        """
        # The context manager closes the source file once it is converted.
        with Image.open('lena.jpg') as source:
            q = source.convert('L')
        if add_noise:
            q = Image.fromarray(image_gauss_noise(np.array(q)))
        if save_input_as is not None:
            q.save(save_input_as)
        pixels = np.array(q)
        rows, cols = pixels.shape
        # One sample per pixel, with the gray level as its only feature.
        Centroids, clustAssing = cluster(pixels.reshape((rows * cols, 1)), 3)
        labels = np.asarray(clustAssing).reshape((rows, cols)).astype(int)
        levels = np.asarray(Centroids, dtype=float)[:, 0]
        # Look up each pixel's cluster centre in one indexing operation instead
        # of calling putpixel once per pixel.
        segmented = levels[labels].astype(np.uint8)
        Image.fromarray(segmented).save("Figure_" + str(count) + ".png")


    def fun1(count):  # waveform + kmeans
        """Run k-means on the waveform data and save the plot."""
        _cluster_waveform(kmeans, count)


    def fun2(count):  # gauss_noise + waveform + kmeans
        """Run k-means on the noisy waveform data and save the plot."""
        _cluster_waveform(kmeans, count, add_noise=True)


    def fun3(count):  # lena + kmeans
        """Segment the clean image with k-means; also saves ``lena_1.png``."""
        _segment_lena(kmeans, count, save_input_as="lena_1.png")


    def fun4(count):  # gauss_noise + lena + kmeans
        """Segment the noisy image with k-means; also saves ``lena_2.png``."""
        _segment_lena(kmeans, count, add_noise=True, save_input_as="lena_2.png")


    def fun5(count):  # waveform + PAM
        """Run PAM on the waveform data and save the plot."""
        _cluster_waveform(PAM, count)


    def fun6(count):  # gauss_noise + waveform + PAM
        """Run PAM on the noisy waveform data and save the plot."""
        _cluster_waveform(PAM, count, add_noise=True)


    def fun7(count):  # lena + PAM
        """Segment the clean image with PAM."""
        _segment_lena(PAM, count)


    def fun8(count):  # gauss_noise + lena + PAM
        """Segment the noisy image with PAM."""
        _segment_lena(PAM, count, add_noise=True)
    if __name__ == '__main__':
        # Run the eight experiments in order, passing each its 1-based number.
        experiments = (fun1, fun2, fun3, fun4, fun5, fun6, fun7, fun8)
        for number, experiment in enumerate(experiments, start=1):
            experiment(number)

     

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/笔触狂放9/article/detail/425383
推荐阅读
相关标签
  

闽ICP备14008679号