当前位置:   article > 正文

层次聚类算法之single-linkage和complete-linkage(C语言实现)_c实现层次聚类算法

c实现层次聚类算法

层次聚类试图在不同层次上对数据集合进行划分, 从而形成树形的聚类结构。数据集的划分可采用“自底向上”的聚合策略,也可以采用“自顶向下”的分拆策略。

AGNES是一种采用自底向上的聚合策略的层次聚合算法,它先将数据集中的每个样本看作是一个初始的聚类簇,然后在算法进行的每一步中找出距离最近的两个聚类来进行合并,该过程不断的重复,直到到达预设的聚类簇的个数。

改算法的关键是如何计算聚类之间的距离, 实际上,每一个聚类是一个样本的集合,因此,只需要采用关于集合的某种距离即可。

最近距离由两个簇的最近的样本来决定, 最大距离由两个簇的最远的样本来决定,由此分别产生的AGNES算法又分别称为single-linkage和complete-linkage算法。


single-linkage算法

  1. #include<stdio.h>
  2. #include<stdlib.h>
  3. #include<math.h>
  4. #include<string.h>
  5. #include<io.h>
  6. int sample_size;
  7. int sample_dimension;
  8. char filename[200];
  9. double** data;
  10. double** distance;
  11. int cluster_count;
  12. void initialization();
  13. void readDataFromFile();
  14. double calculateDistance_BetweenTwoObject(int, int);
  15. void initializationDistanceMatrix();
  16. void AGNES();
  17. void findMinDistance_BetweenClsuter_MIN(int*, int*);
  18. void combineCluster(int, int);
  19. void writeToFile(int);
  20. int main(int argc, char* argv[])
  21. {
  22. if( argc != 4 )
  23. {
  24. printf("This algorithm requires 4 user-specified parameter"
  25. "\n\t\tthe number of sample"
  26. "\n\t\tthe dimension of sample"
  27. "\n\t\tthe filename contain the sample"
  28. "\n\n");
  29. exit(0);
  30. }
  31. sample_size = atoi(argv[1]);
  32. sample_dimension = atoi(argv[2]);
  33. strcat(filename, argv[3]);
  34. initialization();
  35. readDataFromFile();
  36. initializationDistanceMatrix();
  37. AGNES();
  38. return 0;
  39. }
  40. /*
  41. * initialize the dynamic array for storing sample
  42. * */
  43. void initialization()
  44. {
  45. //initializaion the sample data
  46. int i, j;
  47. data = (double**)malloc(sizeof(double*) * (sample_size + 1));
  48. if( !data )
  49. {
  50. printf("data malloc error: 0");
  51. exit(0);
  52. }
  53. for( i = 1; i <= sample_size; i++ )
  54. {
  55. data[i] = (double*)malloc(sizeof(double) * (sample_dimension + 1));
  56. if( !data[i] )
  57. {
  58. printf("data malloc error: %d", i);
  59. exit(0);
  60. }
  61. }
  62. //initialiation the distance data
  63. distance = (double**)malloc(sizeof(double*) * (sample_size + 1));
  64. if( !distance )
  65. {
  66. printf("distance malloc error: 0");
  67. exit(0);
  68. }
  69. for( i = 1; i <= sample_size; i++ )
  70. {
  71. distance[i] = (double*)malloc(sizeof(double) * (sample_size + 1));
  72. if( !distance[i] )
  73. {
  74. printf("distance malloc error: %d", i);
  75. exit(0);
  76. }
  77. }
  78. //the 0th element of all row indicate the clauterID of the object
  79. for( i = 1; i <= sample_size; i++ )
  80. {
  81. distance[i][0] = i;
  82. }
  83. }
  84. /*
  85. * read the sample data from file
  86. * */
  87. void readDataFromFile()
  88. {
  89. FILE* fread;
  90. int i;
  91. int j;
  92. if( NULL == (fread = fopen(filename, "r")))
  93. {
  94. printf("open file(%s) error: ", filename);
  95. exit(0);
  96. }
  97. for( i = 1; i <= sample_size; i++ )
  98. {
  99. for( j = 1; j <= sample_dimension; j++ )
  100. {
  101. if( 1 != fscanf(fread, "%lf ", &data[i][j]))
  102. {
  103. printf("fscanf error: (%d, %d)", i, j);
  104. exit(0);
  105. }
  106. }
  107. }
  108. //test
  109. printf("print the origin data:\n");
  110. for( i = 1; i <= sample_size; i++ )
  111. {
  112. for( j = 1; j <= sample_dimension; j++ )
  113. {
  114. printf("%f\t", data[i][j]);
  115. }
  116. printf("\n");
  117. }
  118. //test END
  119. }
  120. /*
  121. * calculate distance between two objects
  122. * */
  123. double calculateDistance_BetweenTwoObject(int firstID, int secondID)
  124. {
  125. double distance = 0.0;
  126. int i;
  127. for( i = 1; i <= sample_dimension; i++ )
  128. {
  129. distance = distance + pow(data[firstID][i] - data[secondID][i], 2);
  130. }
  131. return sqrt(distance);
  132. }
  133. /*
  134. * calculate initialization distance matrix
  135. * */
  136. void initializationDistanceMatrix()
  137. {
  138. int i, j;
  139. for( i = 1; i <= sample_size; i++ )
  140. {
  141. for( j = i; j <= sample_size; j++ )
  142. {
  143. distance[i][j] = calculateDistance_BetweenTwoObject(i, j);
  144. distance[j][i] = distance[i][j];
  145. }
  146. }
  147. //test
  148. printf("print the origin distance matrix\n");
  149. for( i = 1; i <= sample_size; i++ )
  150. {
  151. for( j = 0; j <= sample_size; j++ )
  152. {
  153. printf("%f ", distance[i][j]);
  154. }
  155. printf("\n");
  156. }
  157. //test END
  158. }
  159. /***************************************************************************************************************
  160. * AGNES
  161. ***************************************************************************************************************/
  162. void AGNES()
  163. {
  164. cluster_count = sample_size;
  165. int cluster_1, cluster_2;
  166. int count = 1;
  167. while( cluster_count > 3 )
  168. {
  169. printf("-------------------%d--------------------\n", count);
  170. findMinDistance_BetweenClsuter_MIN(&cluster_1, &cluster_2);
  171. combineCluster(cluster_1, cluster_2);
  172. cluster_count--;
  173. writeToFile(count++);
  174. }
  175. }
  176. /*
  177. * find two clusters the distance between them is minimun, and store the clusterID in @cluster_1 and @cluster_2, respectively.
  178. * */
  179. void findMinDistance_BetweenClsuter_MIN(int* cluster_1, int* cluster_2)
  180. {
  181. int i, j;
  182. double min_distance;
  183. int flag = 1;
  184. for( i = 1; i <= sample_size; i++ )
  185. {
  186. for( j = i + 1; j <= sample_size; j++ )
  187. {
  188. if( distance[i][0] == distance[j][0] )
  189. {
  190. printf("same cluster!!!! %d and %d\n\n", i , j);
  191. continue;
  192. }
  193. else if( flag == 1 )
  194. {
  195. min_distance = distance[i][j];
  196. *cluster_1 = i;
  197. *cluster_2 = j;
  198. flag = 0;
  199. printf("----get the initial minimum distance is %f(%d, %d)\n", min_distance, i, j);
  200. continue;
  201. }
  202. if( distance[i][j] < min_distance )
  203. {
  204. min_distance = distance[i][j];
  205. *cluster_1 = i;
  206. *cluster_2 = j;
  207. }
  208. }
  209. }
  210. printf("the minimum is %f :(%d, %d)\n", min_distance, *cluster_1, *cluster_2);
  211. }
  212. /*
  213. * combine the two clusters to one
  214. * */
  215. void combineCluster(int cluster_1, int cluster_2)
  216. {
  217. int i;
  218. int buffer;
  219. buffer = distance[cluster_2][0];
  220. for( i = 1; i <= sample_size; i++ )
  221. {
  222. if( distance[i][0] == buffer )
  223. {
  224. distance[i][0] = distance[cluster_1][0];
  225. }
  226. }
  227. //test
  228. printf(" the minimum distance is %f, and the object is %d and %d\n", distance[cluster_1][cluster_2], cluster_1, cluster_2);
  229. int j;
  230. for( i = 1; i <= sample_size; i++ )
  231. {
  232. for( j = 0; j <= sample_size; j++ )
  233. {
  234. printf("%f ", distance[i][j]);
  235. }
  236. printf("\n");
  237. }
  238. }
  239. /*
  240. * write the result of clustering information to file
  241. * */
  242. void writeToFile(int round)
  243. {
  244. int i;
  245. int j;
  246. int* auxiliary = (int*)malloc(sizeof(int) * (sample_size + 1));
  247. if( !auxiliary )
  248. {
  249. printf("auxiliary malloc error");
  250. exit(0);
  251. }
  252. for( i = 1; i <= sample_size; i++ )
  253. auxiliary[i] = 0;
  254. for( i = 1; i <= sample_size; i++ )
  255. {
  256. auxiliary[(int)distance[i][0]]++;
  257. }
  258. int *clusterID = (int*)malloc(sizeof(int) * (sample_size - round + 1));
  259. if( !clusterID )
  260. {
  261. printf("clusterID malloc error");
  262. exit(0);
  263. }
  264. int counter = 1;
  265. for( i = 1; i <= sample_size; i++ )
  266. {
  267. if( auxiliary[i] != 0 )
  268. clusterID[counter++] = i;
  269. }
  270. //test
  271. printf("cluster ID:");
  272. for( i = 1; i <= sample_size - round; i++ )
  273. {
  274. printf("%d ", clusterID[i]);
  275. }
  276. printf("\n");
  277. //test END
  278. printf("writeToFile -----------round: %d\n", round);
  279. FILE** fwrite;
  280. fwrite = (FILE**)malloc(sizeof(FILE*) * (sample_size + 1));
  281. if( !fwrite )
  282. {
  283. printf("fwrite malloc error\n");
  284. exit(0);
  285. }
  286. char filename[200] = "";
  287. char instruction[200] = "";
  288. sprintf(instruction, "md Round_%d", round);
  289. system(instruction);
  290. for( i = 1; i <= sample_size - round; i++ )
  291. {
  292. sprintf(filename, ".//Round_%d//cluster_%d.data", round, clusterID[i]);
  293. if( NULL == (fwrite[clusterID[i]] = fopen(filename, "w")))
  294. {
  295. printf("open file(%s) error\n", filename);
  296. exit(0);
  297. }
  298. }
  299. for( i = 1; i <= sample_size; i++ )
  300. {
  301. printf("%d ", (int)distance[i][0]);
  302. for( j = 1; j <= sample_dimension; j++ )
  303. {
  304. fprintf(fwrite[(int)distance[i][0]], "%f\t", data[i][j]);
  305. printf("%f ", data[i][j]);
  306. }
  307. fprintf(fwrite[(int)distance[i][0]], "\n");
  308. printf("\n");
  309. }
  310. for( i = 1; i <= sample_size - round; i++ )
  311. {
  312. fclose(fwrite[clusterID[i]]);
  313. }
  314. }

complete-linkage算法

  1. #include<stdio.h>
  2. #include<stdlib.h>
  3. #include<math.h>
  4. #include<string.h>
  5. #include<io.h>
  6. int sample_size;
  7. int sample_dimension;
  8. char filename[200];
  9. double** data;
  10. double** distance;
  11. int cluster_count;
  12. void initialization();
  13. void readDataFromFile();
  14. double calculateDistance_BetweenTwoObject(int, int);
  15. void initializationDistanceMatrix();
  16. void findMinDistance_BetweenClsuter_MAX(int*, int*);
  17. void AGNES();
  18. void combineCluster(int, int);
  19. void updateDistance(int);
  20. void writeToFile(int);
  21. int main(int argc, char* argv[])
  22. {
  23. if( argc != 4 )
  24. {
  25. printf("This algorithm requires 4 user-specified parameter"
  26. "\n\t\tthe number of sample"
  27. "\n\t\tthe dimension of sample"
  28. "\n\t\tthe filename contain the sample"
  29. "\n\n");
  30. exit(0);
  31. }
  32. sample_size = atoi(argv[1]);
  33. sample_dimension = atoi(argv[2]);
  34. strcat(filename, argv[3]);
  35. initialization();
  36. readDataFromFile();
  37. initializationDistanceMatrix();
  38. AGNES();
  39. return 0;
  40. }
  41. /*
  42. * initialize the dynamic array for storing sample
  43. * */
  44. void initialization()
  45. {
  46. //initializaion the sample data
  47. int i, j;
  48. data = (double**)malloc(sizeof(double*) * (sample_size + 1));
  49. if( !data )
  50. {
  51. printf("data malloc error: 0");
  52. exit(0);
  53. }
  54. for( i = 1; i <= sample_size; i++ )
  55. {
  56. data[i] = (double*)malloc(sizeof(double) * (sample_dimension + 1));
  57. if( !data[i] )
  58. {
  59. printf("data malloc error: %d", i);
  60. exit(0);
  61. }
  62. }
  63. //initialiation the distance data
  64. distance = (double**)malloc(sizeof(double*) * (sample_size + 1));
  65. if( !distance )
  66. {
  67. printf("distance malloc error: 0");
  68. exit(0);
  69. }
  70. for( i = 0; i <= sample_size; i++ )
  71. {
  72. distance[i] = (double*)malloc(sizeof(double) * (sample_size + 1));
  73. if( !distance[i] )
  74. {
  75. printf("distance malloc error: %d", i);
  76. exit(0);
  77. }
  78. }
  79. //the 0th element of all row indicate the clauterID of the object
  80. for( i = 1; i <= sample_size; i++ )
  81. {
  82. distance[i][0] = i;
  83. }
  84. //where reserved if distance[0][i] = 1,
  85. for( i = 1; i <= sample_size; i++ )
  86. {
  87. distance[0][i] = 1;
  88. }
  89. }
  90. /*
  91. * read the sample data from file
  92. * */
  93. void readDataFromFile()
  94. {
  95. FILE* fread;
  96. int i;
  97. int j;
  98. if( NULL == (fread = fopen(filename, "r")))
  99. {
  100. printf("open file(%s) error: ", filename);
  101. exit(0);
  102. }
  103. for( i = 1; i <= sample_size; i++ )
  104. {
  105. for( j = 1; j <= sample_dimension; j++ )
  106. {
  107. if( 1 != fscanf(fread, "%lf ", &data[i][j]))
  108. {
  109. printf("fscanf error: (%d, %d)", i, j);
  110. exit(0);
  111. }
  112. }
  113. }
  114. }
  115. /*
  116. * calculate distance between two objects
  117. * */
  118. double calculateDistance_BetweenTwoObject(int firstID, int secondID)
  119. {
  120. double distance = 0.0;
  121. int i;
  122. for( i = 1; i <= sample_dimension; i++ )
  123. {
  124. distance = distance + pow(data[firstID][i] - data[secondID][i], 2);
  125. }
  126. return sqrt(distance);
  127. }
  128. /*
  129. * calculate initialization distance matrix
  130. * */
  131. void initializationDistanceMatrix()
  132. {
  133. int i, j;
  134. for( i = 1; i <= sample_size; i++ )
  135. {
  136. for( j = i; j <= sample_size; j++ )
  137. {
  138. distance[i][j] = calculateDistance_BetweenTwoObject(i, j);
  139. distance[j][i] = distance[i][j];
  140. }
  141. }
  142. }
  143. /***************************************************************************************************************
  144. * AGNES
  145. ***************************************************************************************************************/
  146. void AGNES()
  147. {
  148. cluster_count = sample_size;
  149. int cluster_1, cluster_2;
  150. int count = 1;
  151. while( cluster_count > 3 )
  152. {
  153. printf("-------------------%d--------------------\n", count);
  154. findMinDistance_BetweenClsuter_MAX(&cluster_1, &cluster_2);
  155. combineCluster(cluster_1, cluster_2);
  156. updateDistance(cluster_1);
  157. cluster_count--;
  158. writeToFile(count++);
  159. }
  160. }
  161. /*
  162. * find two clusters the distance between them is minimun, and store the clusterID in @cluster_1 and @cluster_2, respectively.
  163. * */
  164. void findMinDistance_BetweenClsuter_MAX(int* cluster_1, int* cluster_2)
  165. {
  166. int i, j;
  167. double min_distance;
  168. int flag = 1;
  169. int object_1;
  170. int object_2;
  171. for( i = 1; i <= sample_size; i++ )
  172. {
  173. for( j = i + 1; j <= sample_size; j++ )
  174. {
  175. if( distance[i][0] == distance[j][0] || ( distance[0][i] == 0 || distance[0][j] == 0 ) )
  176. {
  177. continue;
  178. }
  179. else if( flag == 1 )
  180. {
  181. min_distance = distance[i][j];
  182. object_1 = i;
  183. object_2 = j;
  184. flag = 0;
  185. continue;
  186. }
  187. if( distance[i][j] < min_distance )
  188. {
  189. min_distance = distance[i][j];
  190. object_1 = i;
  191. object_2 = j;
  192. }
  193. }
  194. }
  195. *cluster_1 = distance[object_1][0];
  196. *cluster_2 = distance[object_2][0];
  197. }
  198. /*
  199. * combine the two clusters to one
  200. * */
  201. void combineCluster(int cluster_1, int cluster_2)
  202. {
  203. int i;
  204. for( i = 1; i <= sample_size; i++ )
  205. {
  206. if( distance[i][0] == cluster_2 )
  207. {
  208. distance[i][0] = cluster_1;
  209. }
  210. }
  211. }
  212. /*
  213. * update distance data
  214. * */
  215. void updateDistance(int clusterID)
  216. {
  217. int i, j;
  218. int flag = 1;
  219. int save_i;
  220. for( i = 1; i <= sample_size; i++ )
  221. {
  222. if( distance[i][0] == clusterID && flag == 1 )
  223. {
  224. flag = 0;
  225. save_i = i;
  226. continue;
  227. }
  228. else if( distance[i][0] == clusterID && flag == 0 )
  229. {
  230. distance[0][i] = 0;
  231. for( j = 1; j <= sample_size; j++ )
  232. {
  233. if( distance[i][j] > distance[save_i][j] )
  234. {
  235. distance[save_i][j] = distance[i][j];
  236. }
  237. }
  238. }
  239. }
  240. for( i = 1; i <= sample_size; i++ )
  241. {
  242. distance[i][save_i] = distance[save_i][i];
  243. }
  244. }
  245. /*
  246. * write the result of clustering information to file
  247. * */
  248. void writeToFile(int round)
  249. {
  250. int i;
  251. int j;
  252. int* auxiliary = (int*)malloc(sizeof(int) * (sample_size + 1));
  253. if( !auxiliary )
  254. {
  255. printf("auxiliary malloc error");
  256. exit(0);
  257. }
  258. for( i = 1; i <= sample_size; i++ )
  259. auxiliary[i] = 0;
  260. for( i = 1; i <= sample_size; i++ )
  261. {
  262. auxiliary[(int)distance[i][0]]++;
  263. }
  264. int *clusterID = (int*)malloc(sizeof(int) * (sample_size - round + 1));
  265. if( !clusterID )
  266. {
  267. printf("clusterID malloc error");
  268. exit(0);
  269. }
  270. int counter = 1;
  271. for( i = 1; i <= sample_size; i++ )
  272. {
  273. if( auxiliary[i] != 0 )
  274. clusterID[counter++] = i;
  275. }
  276. printf("writeToFile -----------round: %d\n", round);
  277. FILE** fwrite;
  278. fwrite = (FILE**)malloc(sizeof(FILE*) * (sample_size + 1));
  279. if( !fwrite )
  280. {
  281. printf("fwrite malloc error\n");
  282. exit(0);
  283. }
  284. char filename[200] = "";
  285. char instruction[200] = "";
  286. sprintf(instruction, "md Round_%d", round);
  287. system(instruction);
  288. for( i = 1; i <= sample_size - round; i++ )
  289. {
  290. sprintf(filename, ".//Round_%d//cluster_%d.data", round, clusterID[i]);
  291. if( NULL == (fwrite[clusterID[i]] = fopen(filename, "w")))
  292. {
  293. printf("open file(%s) error\n", filename);
  294. exit(0);
  295. }
  296. }
  297. for( i = 1; i <= sample_size; i++ )
  298. {
  299. printf("%d ", (int)distance[i][0]);
  300. for( j = 1; j <= sample_dimension; j++ )
  301. {
  302. fprintf(fwrite[(int)distance[i][0]], "%f\t", data[i][j]);
  303. }
  304. fprintf(fwrite[(int)distance[i][0]], "\n");
  305. }
  306. for( i = 1; i <= sample_size - round; i++ )
  307. {
  308. fclose(fwrite[clusterID[i]]);
  309. }
  310. }


声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/293299
推荐阅读
相关标签
  

闽ICP备14008679号