赞
踩
层次聚类试图在不同层次上对数据集合进行划分, 从而形成树形的聚类结构。数据集的划分可采用“自底向上”的聚合策略,也可以采用“自顶向下”的分拆策略。
AGNES是一种采用自底向上的聚合策略的层次聚合算法,它先将数据集中的每个样本看作是一个初始的聚类簇,然后在算法进行的每一步中找出距离最近的两个聚类来进行合并,该过程不断的重复,直到到达预设的聚类簇的个数。
改算法的关键是如何计算聚类之间的距离, 实际上,每一个聚类是一个样本的集合,因此,只需要采用关于集合的某种距离即可。
最近距离由两个簇的最近的样本来决定, 最大距离由两个簇的最远的样本来决定,由此分别产生的AGNES算法又分别称为single-linkage和complete-linkage算法。
single-linkage算法
- #include<stdio.h>
- #include<stdlib.h>
- #include<math.h>
- #include<string.h>
- #include<io.h>
-
- int sample_size;
- int sample_dimension;
- char filename[200];
-
- double** data;
- double** distance;
- int cluster_count;
-
- void initialization();
- void readDataFromFile();
- double calculateDistance_BetweenTwoObject(int, int);
- void initializationDistanceMatrix();
- void AGNES();
- void findMinDistance_BetweenClsuter_MIN(int*, int*);
- void combineCluster(int, int);
- void writeToFile(int);
-
-
- int main(int argc, char* argv[])
- {
- if( argc != 4 )
- {
- printf("This algorithm requires 4 user-specified parameter"
- "\n\t\tthe number of sample"
- "\n\t\tthe dimension of sample"
- "\n\t\tthe filename contain the sample"
- "\n\n");
- exit(0);
- }
- sample_size = atoi(argv[1]);
- sample_dimension = atoi(argv[2]);
- strcat(filename, argv[3]);
-
- initialization();
- readDataFromFile();
- initializationDistanceMatrix();
-
- AGNES();
-
- return 0;
- }
-
- /*
- * initialize the dynamic array for storing sample
- * */
- void initialization()
- {
- //initializaion the sample data
- int i, j;
- data = (double**)malloc(sizeof(double*) * (sample_size + 1));
- if( !data )
- {
- printf("data malloc error: 0");
- exit(0);
- }
- for( i = 1; i <= sample_size; i++ )
- {
- data[i] = (double*)malloc(sizeof(double) * (sample_dimension + 1));
- if( !data[i] )
- {
- printf("data malloc error: %d", i);
- exit(0);
- }
- }
-
- //initialiation the distance data
- distance = (double**)malloc(sizeof(double*) * (sample_size + 1));
- if( !distance )
- {
- printf("distance malloc error: 0");
- exit(0);
- }
- for( i = 1; i <= sample_size; i++ )
- {
- distance[i] = (double*)malloc(sizeof(double) * (sample_size + 1));
- if( !distance[i] )
- {
- printf("distance malloc error: %d", i);
- exit(0);
- }
- }
-
- //the 0th element of all row indicate the clauterID of the object
- for( i = 1; i <= sample_size; i++ )
- {
- distance[i][0] = i;
- }
- }
-
- /*
- * read the sample data from file
- * */
- void readDataFromFile()
- {
- FILE* fread;
- int i;
- int j;
- if( NULL == (fread = fopen(filename, "r")))
- {
- printf("open file(%s) error: ", filename);
- exit(0);
- }
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = 1; j <= sample_dimension; j++ )
- {
- if( 1 != fscanf(fread, "%lf ", &data[i][j]))
- {
- printf("fscanf error: (%d, %d)", i, j);
- exit(0);
- }
- }
- }
-
- //test
- printf("print the origin data:\n");
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = 1; j <= sample_dimension; j++ )
- {
- printf("%f\t", data[i][j]);
- }
- printf("\n");
- }
- //test END
- }
-
- /*
- * calculate distance between two objects
- * */
- double calculateDistance_BetweenTwoObject(int firstID, int secondID)
- {
- double distance = 0.0;
- int i;
- for( i = 1; i <= sample_dimension; i++ )
- {
- distance = distance + pow(data[firstID][i] - data[secondID][i], 2);
- }
- return sqrt(distance);
- }
-
- /*
- * calculate initialization distance matrix
- * */
- void initializationDistanceMatrix()
- {
- int i, j;
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = i; j <= sample_size; j++ )
- {
- distance[i][j] = calculateDistance_BetweenTwoObject(i, j);
- distance[j][i] = distance[i][j];
- }
- }
-
- //test
- printf("print the origin distance matrix\n");
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = 0; j <= sample_size; j++ )
- {
- printf("%f ", distance[i][j]);
- }
- printf("\n");
- }
- //test END
- }
-
- /***************************************************************************************************************
- * AGNES
- ***************************************************************************************************************/
- void AGNES()
- {
- cluster_count = sample_size;
- int cluster_1, cluster_2;
- int count = 1;
- while( cluster_count > 3 )
- {
- printf("-------------------%d--------------------\n", count);
- findMinDistance_BetweenClsuter_MIN(&cluster_1, &cluster_2);
- combineCluster(cluster_1, cluster_2);
- cluster_count--;
- writeToFile(count++);
- }
- }
-
- /*
- * find two clusters the distance between them is minimun, and store the clusterID in @cluster_1 and @cluster_2, respectively.
- * */
- void findMinDistance_BetweenClsuter_MIN(int* cluster_1, int* cluster_2)
- {
- int i, j;
- double min_distance;
- int flag = 1;
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = i + 1; j <= sample_size; j++ )
- {
- if( distance[i][0] == distance[j][0] )
- {
- printf("same cluster!!!! %d and %d\n\n", i , j);
- continue;
- }
- else if( flag == 1 )
- {
- min_distance = distance[i][j];
- *cluster_1 = i;
- *cluster_2 = j;
- flag = 0;
- printf("----get the initial minimum distance is %f(%d, %d)\n", min_distance, i, j);
- continue;
- }
- if( distance[i][j] < min_distance )
- {
- min_distance = distance[i][j];
- *cluster_1 = i;
- *cluster_2 = j;
- }
- }
- }
- printf("the minimum is %f :(%d, %d)\n", min_distance, *cluster_1, *cluster_2);
- }
-
- /*
- * combine the two clusters to one
- * */
- void combineCluster(int cluster_1, int cluster_2)
- {
- int i;
- int buffer;
- buffer = distance[cluster_2][0];
- for( i = 1; i <= sample_size; i++ )
- {
- if( distance[i][0] == buffer )
- {
- distance[i][0] = distance[cluster_1][0];
- }
- }
-
- //test
- printf(" the minimum distance is %f, and the object is %d and %d\n", distance[cluster_1][cluster_2], cluster_1, cluster_2);
- int j;
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = 0; j <= sample_size; j++ )
- {
- printf("%f ", distance[i][j]);
- }
- printf("\n");
- }
- }
-
- /*
- * write the result of clustering information to file
- * */
- void writeToFile(int round)
- {
- int i;
- int j;
- int* auxiliary = (int*)malloc(sizeof(int) * (sample_size + 1));
- if( !auxiliary )
- {
- printf("auxiliary malloc error");
- exit(0);
- }
- for( i = 1; i <= sample_size; i++ )
- auxiliary[i] = 0;
- for( i = 1; i <= sample_size; i++ )
- {
- auxiliary[(int)distance[i][0]]++;
- }
- int *clusterID = (int*)malloc(sizeof(int) * (sample_size - round + 1));
- if( !clusterID )
- {
- printf("clusterID malloc error");
- exit(0);
- }
- int counter = 1;
- for( i = 1; i <= sample_size; i++ )
- {
- if( auxiliary[i] != 0 )
- clusterID[counter++] = i;
- }
- //test
- printf("cluster ID:");
- for( i = 1; i <= sample_size - round; i++ )
- {
- printf("%d ", clusterID[i]);
- }
- printf("\n");
- //test END
-
- printf("writeToFile -----------round: %d\n", round);
- FILE** fwrite;
- fwrite = (FILE**)malloc(sizeof(FILE*) * (sample_size + 1));
- if( !fwrite )
- {
- printf("fwrite malloc error\n");
- exit(0);
- }
-
- char filename[200] = "";
- char instruction[200] = "";
- sprintf(instruction, "md Round_%d", round);
- system(instruction);
-
- for( i = 1; i <= sample_size - round; i++ )
- {
- sprintf(filename, ".//Round_%d//cluster_%d.data", round, clusterID[i]);
- if( NULL == (fwrite[clusterID[i]] = fopen(filename, "w")))
- {
- printf("open file(%s) error\n", filename);
- exit(0);
- }
- }
- for( i = 1; i <= sample_size; i++ )
- {
- printf("%d ", (int)distance[i][0]);
- for( j = 1; j <= sample_dimension; j++ )
- {
- fprintf(fwrite[(int)distance[i][0]], "%f\t", data[i][j]);
- printf("%f ", data[i][j]);
- }
- fprintf(fwrite[(int)distance[i][0]], "\n");
- printf("\n");
- }
-
- for( i = 1; i <= sample_size - round; i++ )
- {
- fclose(fwrite[clusterID[i]]);
- }
- }
- #include<stdio.h>
- #include<stdlib.h>
- #include<math.h>
- #include<string.h>
- #include<io.h>
-
- int sample_size;
- int sample_dimension;
- char filename[200];
-
- double** data;
- double** distance;
- int cluster_count;
-
- void initialization();
- void readDataFromFile();
- double calculateDistance_BetweenTwoObject(int, int);
- void initializationDistanceMatrix();
- void findMinDistance_BetweenClsuter_MAX(int*, int*);
- void AGNES();
- void combineCluster(int, int);
- void updateDistance(int);
- void writeToFile(int);
-
-
- int main(int argc, char* argv[])
- {
- if( argc != 4 )
- {
- printf("This algorithm requires 4 user-specified parameter"
- "\n\t\tthe number of sample"
- "\n\t\tthe dimension of sample"
- "\n\t\tthe filename contain the sample"
- "\n\n");
- exit(0);
- }
- sample_size = atoi(argv[1]);
- sample_dimension = atoi(argv[2]);
- strcat(filename, argv[3]);
-
- initialization();
- readDataFromFile();
- initializationDistanceMatrix();
- AGNES();
- return 0;
- }
-
- /*
- * initialize the dynamic array for storing sample
- * */
- void initialization()
- {
- //initializaion the sample data
- int i, j;
- data = (double**)malloc(sizeof(double*) * (sample_size + 1));
- if( !data )
- {
- printf("data malloc error: 0");
- exit(0);
- }
- for( i = 1; i <= sample_size; i++ )
- {
- data[i] = (double*)malloc(sizeof(double) * (sample_dimension + 1));
- if( !data[i] )
- {
- printf("data malloc error: %d", i);
- exit(0);
- }
- }
-
- //initialiation the distance data
- distance = (double**)malloc(sizeof(double*) * (sample_size + 1));
- if( !distance )
- {
- printf("distance malloc error: 0");
- exit(0);
- }
- for( i = 0; i <= sample_size; i++ )
- {
- distance[i] = (double*)malloc(sizeof(double) * (sample_size + 1));
- if( !distance[i] )
- {
- printf("distance malloc error: %d", i);
- exit(0);
- }
- }
-
- //the 0th element of all row indicate the clauterID of the object
- for( i = 1; i <= sample_size; i++ )
- {
- distance[i][0] = i;
- }
- //where reserved if distance[0][i] = 1,
- for( i = 1; i <= sample_size; i++ )
- {
- distance[0][i] = 1;
- }
- }
-
- /*
- * read the sample data from file
- * */
- void readDataFromFile()
- {
- FILE* fread;
- int i;
- int j;
- if( NULL == (fread = fopen(filename, "r")))
- {
- printf("open file(%s) error: ", filename);
- exit(0);
- }
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = 1; j <= sample_dimension; j++ )
- {
- if( 1 != fscanf(fread, "%lf ", &data[i][j]))
- {
- printf("fscanf error: (%d, %d)", i, j);
- exit(0);
- }
- }
- }
- }
-
- /*
- * calculate distance between two objects
- * */
- double calculateDistance_BetweenTwoObject(int firstID, int secondID)
- {
- double distance = 0.0;
- int i;
- for( i = 1; i <= sample_dimension; i++ )
- {
- distance = distance + pow(data[firstID][i] - data[secondID][i], 2);
- }
- return sqrt(distance);
- }
-
- /*
- * calculate initialization distance matrix
- * */
- void initializationDistanceMatrix()
- {
- int i, j;
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = i; j <= sample_size; j++ )
- {
- distance[i][j] = calculateDistance_BetweenTwoObject(i, j);
- distance[j][i] = distance[i][j];
- }
- }
- }
-
- /***************************************************************************************************************
- * AGNES
- ***************************************************************************************************************/
- void AGNES()
- {
- cluster_count = sample_size;
- int cluster_1, cluster_2;
- int count = 1;
- while( cluster_count > 3 )
- {
- printf("-------------------%d--------------------\n", count);
- findMinDistance_BetweenClsuter_MAX(&cluster_1, &cluster_2);
- combineCluster(cluster_1, cluster_2);
- updateDistance(cluster_1);
- cluster_count--;
- writeToFile(count++);
- }
- }
-
- /*
- * find two clusters the distance between them is minimun, and store the clusterID in @cluster_1 and @cluster_2, respectively.
- * */
- void findMinDistance_BetweenClsuter_MAX(int* cluster_1, int* cluster_2)
- {
- int i, j;
- double min_distance;
- int flag = 1;
- int object_1;
- int object_2;
- for( i = 1; i <= sample_size; i++ )
- {
- for( j = i + 1; j <= sample_size; j++ )
- {
- if( distance[i][0] == distance[j][0] || ( distance[0][i] == 0 || distance[0][j] == 0 ) )
- {
- continue;
- }
- else if( flag == 1 )
- {
- min_distance = distance[i][j];
- object_1 = i;
- object_2 = j;
- flag = 0;
- continue;
- }
- if( distance[i][j] < min_distance )
- {
- min_distance = distance[i][j];
- object_1 = i;
- object_2 = j;
- }
- }
- }
- *cluster_1 = distance[object_1][0];
- *cluster_2 = distance[object_2][0];
- }
-
- /*
- * combine the two clusters to one
- * */
- void combineCluster(int cluster_1, int cluster_2)
- {
- int i;
- for( i = 1; i <= sample_size; i++ )
- {
- if( distance[i][0] == cluster_2 )
- {
- distance[i][0] = cluster_1;
- }
- }
- }
-
- /*
- * update distance data
- * */
- void updateDistance(int clusterID)
- {
- int i, j;
- int flag = 1;
- int save_i;
- for( i = 1; i <= sample_size; i++ )
- {
- if( distance[i][0] == clusterID && flag == 1 )
- {
- flag = 0;
- save_i = i;
- continue;
- }
- else if( distance[i][0] == clusterID && flag == 0 )
- {
- distance[0][i] = 0;
- for( j = 1; j <= sample_size; j++ )
- {
- if( distance[i][j] > distance[save_i][j] )
- {
- distance[save_i][j] = distance[i][j];
- }
- }
- }
- }
- for( i = 1; i <= sample_size; i++ )
- {
- distance[i][save_i] = distance[save_i][i];
- }
- }
-
- /*
- * write the result of clustering information to file
- * */
- void writeToFile(int round)
- {
- int i;
- int j;
- int* auxiliary = (int*)malloc(sizeof(int) * (sample_size + 1));
- if( !auxiliary )
- {
- printf("auxiliary malloc error");
- exit(0);
- }
- for( i = 1; i <= sample_size; i++ )
- auxiliary[i] = 0;
- for( i = 1; i <= sample_size; i++ )
- {
- auxiliary[(int)distance[i][0]]++;
- }
- int *clusterID = (int*)malloc(sizeof(int) * (sample_size - round + 1));
- if( !clusterID )
- {
- printf("clusterID malloc error");
- exit(0);
- }
- int counter = 1;
- for( i = 1; i <= sample_size; i++ )
- {
- if( auxiliary[i] != 0 )
- clusterID[counter++] = i;
- }
-
- printf("writeToFile -----------round: %d\n", round);
- FILE** fwrite;
- fwrite = (FILE**)malloc(sizeof(FILE*) * (sample_size + 1));
- if( !fwrite )
- {
- printf("fwrite malloc error\n");
- exit(0);
- }
-
- char filename[200] = "";
- char instruction[200] = "";
- sprintf(instruction, "md Round_%d", round);
- system(instruction);
-
- for( i = 1; i <= sample_size - round; i++ )
- {
- sprintf(filename, ".//Round_%d//cluster_%d.data", round, clusterID[i]);
- if( NULL == (fwrite[clusterID[i]] = fopen(filename, "w")))
- {
- printf("open file(%s) error\n", filename);
- exit(0);
- }
- }
- for( i = 1; i <= sample_size; i++ )
- {
- printf("%d ", (int)distance[i][0]);
- for( j = 1; j <= sample_dimension; j++ )
- {
- fprintf(fwrite[(int)distance[i][0]], "%f\t", data[i][j]);
- }
- fprintf(fwrite[(int)distance[i][0]], "\n");
- }
-
- for( i = 1; i <= sample_size - round; i++ )
- {
- fclose(fwrite[clusterID[i]]);
- }
- }
-
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。