赞
踩
该算法是自底而上的算法,他的主要原理是所有数据每个样本看成一个初始聚类,在算法运行过程中不断找出距离自己最近的聚类之后合并,直到达到自己设定的聚类个数,所用数据集为python的鸢尾花数据,伪代码如下:
C++代码如下
//DataPoint.h 保存基础数据的类 #ifndef _DATAPOINT_H_ #define _DATAPOINT_H_ #include <vector> #include <string> #include <iostream> #include<map> using namespace std; class DataPoint { public: DataPoint() {} ~DataPoint() {} vector<double> GetData(); //获取保存数据 string GetName(); //获取花卉名字 int GetClusterId(); //获取聚类id void SetClusterId(int number);//设置聚类id void SetName(string str);//设置花卉名字 void SetData(vector<double> v);//设置保存数据 private: vector<double> Data; //保存数据 string name; //花卉名字 int clusterId;//聚类id }; #endif // _DATAPOINT_H_
//DataPoint.cpp #include "DataPoint.h" void DataPoint::SetClusterId(int number) { this->clusterId = number; } void DataPoint::SetData(vector<double> v) { for (int i = 0; i < v.size(); i++) { Data.push_back(v[i]); } } void DataPoint::SetName(string name) { this->name = name; } vector<double> DataPoint::GetData() { return this->Data; } string DataPoint::GetName() { return this->name; } int DataPoint::GetClusterId() { return this->clusterId; }
//AGNES.cpp #ifndef _AGNES_H_ #define _AGNES_H_ #include <iostream> #include<string> #include<vector> #include <fstream> #include <sstream> #include "DataPoint.h" using namespace std; //将string数据转为数字 template <class Type> Type stringToNum(const string& str) { istringstream iss(str); Type num; iss >> num; return num; } class AGNES { public: AGNES(double ThreshordDis, int MaxClu) :ThreshordDis(ThreshordDis), MaxClu(MaxClu) {} ~AGNES() {} void StartAgnes(); //算法核心 void GetData();//获取数据 void print();//打印信息 private: double GetDistance(DataPoint &point, DataPoint &point2); //得到两个数据之间的距离 double GetCDistance(vector<DataPoint> C, vector<DataPoint> C2); //得到两个聚类之间的最小距离 double GetAVGDistance(vector<DataPoint> C, vector<DataPoint> C2);//得到两个聚类之间的平均距离 vector<DataPoint> DataBase; //保存所有数据 vector<vector<DataPoint>> CluData; //保存所有簇类数据 double ThreshordDis; //这里暂时无用,可以省略 int MaxClu; //最终获得聚类数量 int p; //存储最初的聚类的个数 }; #endif
//AGNES.cpp #include "AGNES.h" static double DataMap[10000][10000]; //保存两两聚类之间的距离 void AGNES::GetData() { ifstream file; string line; file.open("iris.csv",ios::in); if (file.fail()) { cout << "文件打开失败" << endl; } while (getline(file, line)) { stringstream ss(line); string str; vector<string> temp; vector<double> v; DataPoint d; while (getline(ss, str, ',')) { temp.push_back(str); } for (int i = 1; i < temp.size()-1; i++) { v.push_back(stringToNum<double>(temp[i])); } //初始化数据 d.SetData(v); d.SetName(temp[temp.size() - 1]); d.SetClusterId(stringToNum<int>(temp[0])-1);//因为数据是从1开始编号的所以这里-1使其从0开始编号 DataBase.push_back(d); } } void AGNES::StartAgnes() { //获得每个数据距离其他数据的距离 for (int i = 0; i < DataBase.size(); i++) { vector<DataPoint> t; t.push_back(DataBase[i]); CluData.push_back(t); } //DataMap数组保存两个聚类之间的距离 for (int i = 0; i < CluData.size(); i++) { vector<DataPoint> temp; for (int j =i+1; j < CluData.size(); j++) { double dis = GetAVGDistance(CluData[i], CluData[j]); DataMap[i][j] = dis; DataMap[j][i] = dis; } } p = DataBase.size(); //设置聚类个数 while (p > MaxClu) { double Temp = 9999; int Find_i = 0, Find_j = 0; //寻找最小值点 for (int i = 0; i < p; i++) { for (int j = i+1; j < p; j++) { if (DataMap[i][j] < Temp) { Temp = DataMap[i][j]; Find_i = i; Find_j = j; } } } int NewId = CluData[Find_i][0].GetClusterId(); //获取当前簇的聚类号码 //将寻找到的最小点的两个簇合并 A1+B1->A1 //并将两个簇的簇类号码统一 for (int j = 0; j < CluData[Find_j].size(); j++) { CluData[Find_j][j].SetClusterId(NewId); CluData[Find_i].push_back(CluData[Find_j][j]); } //重编号矩阵 for (int i = Find_j + 1; i < p; i++) { for (int j = 0; j < CluData[i].size(); j++) { CluData[i][j].SetClusterId(CluData[i][j].GetClusterId() - 1); } } //距离矩阵重置 for (int i = Find_j; i < CluData.size()-1; i++) { CluData[i] = CluData[i + 1]; } //删除DataMap矩阵 行列 for (int i = 0; i < p; i++) { for (int j = Find_j; j < p-1; j++) { DataMap[i][j] = DataMap[i][j + 1]; } } for (int i = Find_j; i < p; i++) { for (int j = 0; j < p - 1; j++) { DataMap[i][j] = DataMap[i + 1][j]; } } p = p - 1; //重新计算合并数据后的聚类与其他聚类之间的距离 for (int i = 0; i < p; ++i) { double dis = GetAVGDistance(CluData[Find_i], CluData[i]); if (DataMap[Find_i][i] != dis) { DataMap[Find_i][i] = dis; DataMap[i][Find_i] = dis; } } } } //打印数据 void AGNES::print() { for (int i = 0; i < p; i++) { cout << "聚类编号:" << i << endl; for (int j = 0; j < CluData[i].size(); j++) { vector<double> dat = CluData[i][j].GetData(); for (int s = 0; s < dat.size(); s++) { cout << dat[s] << " "; } cout << CluData[i][j].GetName() << " "; cout << CluData[i][j].GetClusterId(); cout << endl; } } } //两个数据之间的距离 double AGNES::GetDistance(DataPoint &point, DataPoint &point2) { double sum = 0; for (int i = 0; i < point.GetData().size(); i++) { sum += (point.GetData()[i] - point2.GetData()[i])*(point.GetData()[i] - point2.GetData()[i]); } double result = sqrt(sum); return result; } //两个聚类之间最小的距离 double AGNES::GetCDistance(vector<DataPoint> C, vector<DataPoint> C2) { double MinDis = 9999; for (int i = 0; i < C.size(); i++) { for (int j = 0; j < C2.size(); j++) { double dis = GetDistance(C[i], C2[j]); if (dis < MinDis) { MinDis = dis; } } } return MinDis; } //两个聚类的平均距离 double AGNES::GetAVGDistance(vector<DataPoint> C, vector<DataPoint> C2) { double temp[4]; double temp2[4]; for (int i = 0; i < C.size(); i++) { temp[0] += C[i].GetData()[0]; temp[1] += C[i].GetData()[1]; temp[2] += C[i].GetData()[2]; temp[3] += C[i].GetData()[3]; } temp[0] = temp[0] / C.size(); temp[1] = temp[1] / C.size(); temp[2] = temp[2] / C.size(); temp[3] = temp[3] / C.size(); for (int i = 0; i < C2.size(); i++) { temp2[0] += C2[i].GetData()[0]; temp2[1] += C2[i].GetData()[1]; temp2[2] += C2[i].GetData()[2]; temp2[3] += C2[i].GetData()[3]; } temp2[0] = temp2[0] / C2.size(); temp2[1] = temp2[1] / C2.size(); temp2[2] = temp2[2] / C2.size(); temp2[3] = temp2[3] / C2.size(); int sum = 0; for (int i = 0; i < 4; i++) { sum += ((temp[i] - temp2[i])*(temp[i] - temp2[i])); } return sqrt(sum); }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。