当前位置:   article > 正文

机器学习聚类算法AGNES层次聚类 C++实现_c++层次聚类

c++层次聚类

C++实现AGNES算法

该算法是自底而上的算法,他的主要原理是所有数据每个样本看成一个初始聚类,在算法运行过程中不断找出距离自己最近的聚类之后合并,直到达到自己设定的聚类个数,所用数据集为python的鸢尾花数据伪代码如下:
AGNES伪代码

C++代码如下

//DataPoint.h 保存基础数据的类
#ifndef _DATAPOINT_H_
#define  _DATAPOINT_H_

#include <vector>
#include <string>
#include <iostream>
#include<map>
using namespace std;
class DataPoint
{
public:
	DataPoint() {}
	~DataPoint() {}

	vector<double> GetData(); //获取保存数据
	string  GetName(); //获取花卉名字
	int GetClusterId(); //获取聚类id

	void SetClusterId(int number);//设置聚类id
	void SetName(string str);//设置花卉名字
	void SetData(vector<double> v);//设置保存数据
private:

	vector<double> Data; //保存数据
	string name; //花卉名字
	int clusterId;//聚类id

};
#endif // _DATAPOINT_H_
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
//DataPoint.cpp
#include "DataPoint.h"
void DataPoint::SetClusterId(int number)
{
	this->clusterId = number;
}
void DataPoint::SetData(vector<double> v)
{
	for (int i = 0; i < v.size(); i++)
	{
		Data.push_back(v[i]);
	}
}
void DataPoint::SetName(string name)
{
	this->name = name;
}
vector<double> DataPoint::GetData()
{
	return this->Data;
}
string DataPoint::GetName()
{
	return this->name;
}
int DataPoint::GetClusterId()
{
	return this->clusterId;
}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
//AGNES.cpp
#ifndef _AGNES_H_
#define  _AGNES_H_

#include <iostream>
#include<string>
#include<vector>
#include <fstream>
#include <sstream>
#include "DataPoint.h"
using namespace std;
//将string数据转为数字
template <class Type>
Type stringToNum(const string& str)
{
	istringstream iss(str);
	Type num;
	iss >> num;
	return num;
}
class AGNES
{
public:
	AGNES(double ThreshordDis, int MaxClu) :ThreshordDis(ThreshordDis), MaxClu(MaxClu) {}
	~AGNES() {}
	void StartAgnes(); //算法核心
	void GetData();//获取数据
	void print();//打印信息
private:
	double GetDistance(DataPoint &point, DataPoint &point2); //得到两个数据之间的距离
	double GetCDistance(vector<DataPoint> C, vector<DataPoint> C2); //得到两个聚类之间的最小距离
	double GetAVGDistance(vector<DataPoint> C, vector<DataPoint> C2);//得到两个聚类之间的平均距离
	vector<DataPoint> DataBase; //保存所有数据
	vector<vector<DataPoint>> CluData; //保存所有簇类数据


	double ThreshordDis; //这里暂时无用,可以省略
	int MaxClu; //最终获得聚类数量
	int  p; //存储最初的聚类的个数
};
#endif
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
//AGNES.cpp
#include "AGNES.h"
static double DataMap[10000][10000]; //保存两两聚类之间的距离
void AGNES::GetData()
{
	ifstream file;
	string line;
	file.open("iris.csv",ios::in);
	if (file.fail())
	{
		cout << "文件打开失败" << endl;
	}
	while (getline(file, line))
	{
		stringstream ss(line);
		string str;
		vector<string> temp;
		vector<double> v;
		DataPoint d;
		while (getline(ss, str, ','))
		{
			temp.push_back(str);
		}
		for (int i = 1; i < temp.size()-1; i++)
		{
			v.push_back(stringToNum<double>(temp[i]));
		}
		//初始化数据
		d.SetData(v);
		d.SetName(temp[temp.size() - 1]); 
		d.SetClusterId(stringToNum<int>(temp[0])-1);//因为数据是从1开始编号的所以这里-1使其从0开始编号
		DataBase.push_back(d);
	}
}

void AGNES::StartAgnes()
{
	//获得每个数据距离其他数据的距离
	for (int i = 0; i < DataBase.size(); i++)
	{
		vector<DataPoint> t;
		t.push_back(DataBase[i]);
		CluData.push_back(t);
	}
	//DataMap数组保存两个聚类之间的距离
	for (int i = 0; i < CluData.size(); i++)
	{
		vector<DataPoint> temp;
		for (int j =i+1; j < CluData.size(); j++)
		{
			double dis = GetAVGDistance(CluData[i], CluData[j]);
			DataMap[i][j] = dis;
			DataMap[j][i] = dis;
		}
	}
	p = DataBase.size(); //设置聚类个数
	while (p > MaxClu)
	{
		double Temp = 9999;
		int Find_i = 0, Find_j = 0;
		//寻找最小值点
		for (int i = 0; i < p; i++)
		{
			for (int j = i+1; j < p; j++)
			{
				if (DataMap[i][j] < Temp)
				{
					Temp = DataMap[i][j];
					Find_i = i;
					Find_j = j;
				}
			}
		}
		int NewId = CluData[Find_i][0].GetClusterId(); //获取当前簇的聚类号码
		//将寻找到的最小点的两个簇合并 A1+B1->A1
		//并将两个簇的簇类号码统一
		for (int j = 0; j < CluData[Find_j].size(); j++)
		{
			CluData[Find_j][j].SetClusterId(NewId);
			CluData[Find_i].push_back(CluData[Find_j][j]);
		}
		//重编号矩阵
		for (int i = Find_j + 1; i < p; i++)
		{
			for (int j = 0; j < CluData[i].size(); j++)
			{
				CluData[i][j].SetClusterId(CluData[i][j].GetClusterId() - 1);
			}
		}
		//距离矩阵重置
		for (int i = Find_j; i < CluData.size()-1; i++)
		{
			CluData[i] = CluData[i + 1];
		}
		//删除DataMap矩阵 行列
		for (int i = 0; i < p; i++)
		{
			for (int j = Find_j; j < p-1; j++)
			{
				DataMap[i][j] = DataMap[i][j + 1];
			}
		}
		for (int i = Find_j; i < p; i++)
		{
			for (int j = 0; j < p - 1; j++)
			{
				DataMap[i][j] = DataMap[i + 1][j];
			}
		}
		p = p - 1;
		//重新计算合并数据后的聚类与其他聚类之间的距离
		for (int i = 0; i < p; ++i)
		{
			double dis = GetAVGDistance(CluData[Find_i], CluData[i]);
			if (DataMap[Find_i][i] != dis)
			{
				DataMap[Find_i][i] = dis;
				DataMap[i][Find_i] = dis;
			}
		}
	}
}
//打印数据
void AGNES::print()
{
	for (int i = 0; i < p; i++)
	{
		cout << "聚类编号:" << i << endl;
		for (int j = 0; j < CluData[i].size(); j++)
		{
			vector<double> dat = CluData[i][j].GetData();
			for (int s = 0; s < dat.size(); s++)
			{
				cout << dat[s] << " ";
			}
			cout << CluData[i][j].GetName() << " ";
			cout << CluData[i][j].GetClusterId();
			cout << endl;
		}
	}
}
//两个数据之间的距离
double AGNES::GetDistance(DataPoint &point, DataPoint &point2)
{
	double sum = 0;
	for (int i = 0; i < point.GetData().size(); i++)
	{
		sum += (point.GetData()[i] - point2.GetData()[i])*(point.GetData()[i] - point2.GetData()[i]);
	}
	double result = sqrt(sum);
	return result;
}
//两个聚类之间最小的距离
double AGNES::GetCDistance(vector<DataPoint> C, vector<DataPoint> C2)
{
	double MinDis = 9999;
	for (int i = 0; i < C.size(); i++)
	{
		for (int j = 0; j < C2.size(); j++)
		{
			double dis = GetDistance(C[i], C2[j]);
			if (dis < MinDis)
			{
				MinDis = dis;
			}
		}
	}
	return MinDis;
}
//两个聚类的平均距离
double AGNES::GetAVGDistance(vector<DataPoint> C, vector<DataPoint> C2)
{
	double temp[4];
	double temp2[4];
	for (int i = 0; i < C.size(); i++)
	{
		temp[0] += C[i].GetData()[0];
		temp[1] += C[i].GetData()[1];
		temp[2] += C[i].GetData()[2];
		temp[3] += C[i].GetData()[3];
	}
	temp[0] = temp[0] / C.size();
	temp[1] = temp[1] / C.size();
	temp[2] = temp[2] / C.size();
	temp[3] = temp[3] / C.size();

	for (int i = 0; i < C2.size(); i++)
	{
		temp2[0] += C2[i].GetData()[0];
		temp2[1] += C2[i].GetData()[1];
		temp2[2] += C2[i].GetData()[2];
		temp2[3] += C2[i].GetData()[3];
	}
	temp2[0] = temp2[0] / C2.size();
	temp2[1] = temp2[1] / C2.size();
	temp2[2] = temp2[2] / C2.size();
	temp2[3] = temp2[3] / C2.size();
	int sum = 0;
	for (int i = 0; i < 4; i++)
	{
		sum += ((temp[i] - temp2[i])*(temp[i] - temp2[i]));
	}
	return sqrt(sum);
}
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Monodyee/article/detail/293302
推荐阅读
相关标签
  

闽ICP备14008679号