PCA的C++与sklearn实现_sklearn c++实现

作者：我家小花儿 | 2024-03-12 04:18:21

踩

sklearn c++实现

PCA的简要步骤：

数据中心化；
求中心化后数据的协方差矩阵；
求协方差矩阵的特征值、特征向量；
对特征值进行从大到小排序，同步排序特征向量；
从前往后选取一定百分比的排序后的特征向量组成映射矩阵；
将原始数据乘以映射矩阵得到维度降低后的数据。

C++实现PCA

auxiliary.hpp

#pragma once

#include <iostream>
#include <fstream>
#include <string>
#include <vector>

#include <Eigen/Dense>

#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>

/* 解析鸢尾花数据集 */
std::vector<double> ParseStrLine(const std::string& line) {
    std::vector<double> one_sample;
    auto found = line.find_first_of(",");
    double val = 0.0;
    std::string substr = line;
    while (found != std::string::npos)
    {
        val = atof(std::string(substr.begin(), substr.begin() + found).c_str());
        one_sample.push_back(val);
        substr = std::string(substr.begin() + found + 1, substr.end());

        found = substr.find_first_of(",");
    }
    val = atof(substr.c_str());
    one_sample.push_back(val);

    return one_sample;
}

/* 读取鸢尾花数据集 */
int GetData(std::vector<std::vector<double> > &iris)
{
    std::ifstream data_txt("../data/iris.txt");
    if (!data_txt.is_open())
    {
        std::cout << "Read data failed." << std::endl;
        system("pause");
        return -1;
    }

    // iris中存储原始数据
    std::string line("");
    while (std::getline(data_txt, line))
    {
        iris.push_back(ParseStrLine(line));
    }

    return 0;
}

/* 读取鸢尾花数据集标签 */
int GetLabels(std::vector<int> &labels)
{
    std::ifstream label_txt("../data/iris_label.txt");
    if (!label_txt.is_open())
    {
        std::cout << "Read data failed." << std::endl;
        system("pause");
        return -1;
    }

    // iris中原始数据的标签
    std::string line("");
    std::getline(label_txt, line);
    auto found = line.find_first_of(",");
    std::string substr = line;
    int val = 0;
    while (found != std::string::npos)
    {
        val = atoi(std::string(substr.begin(), substr.begin() + found).c_str());
        labels.push_back(val);
        substr = std::string(substr.begin() + found + 1, substr.end());
        found = substr.find_first_of(",");
    }
    val = atoi(substr.c_str());
    labels.push_back(val);

    return 0;
}

/* 保存PCA降维后的数据 */
int SaveDecompositionData(const Eigen::MatrixXd  &pca_m)
{
    std::ofstream pca_txt("../data/pca_iris.txt");
    if (!pca_txt.is_open())
    {
        std::cout << "Read data failed." << std::endl;
        system("pause");
        return -1;
    }
    char content[30] = { '\0' };
    for (int i = 0; i < pca_m.rows(); ++i)
    {
        sprintf_s(content, 30, "%.12f", pca_m(i, 0));
        pca_txt << content << ", ";
        sprintf_s(content, 30, "%.12f", pca_m(i, 1));
        pca_txt << content << std::endl;
    }
    pca_txt.close();

    return 0;
}

/* 可视化PCA降维后的数据 */
void Visialization(const Eigen::MatrixXd  &pca_m, std::vector<int> &labels)
{
    cv::Mat visual(220, 220, CV_8UC3);
    cv::Scalar color[] = { cv::Scalar(255,0,0),cv::Scalar(0,255,0), cv::Scalar(0,0,255) };
    for (int i = 0; i < pca_m.rows(); ++i)
    {
        cv::Point center(pca_m(i, 0) * 30 + 100, pca_m(i, 1) * 30 + 100);
        visual.at<cv::Vec3b>(center.x, center.y) = cv::Vec3b(color[labels[i]][0], color[labels[i]][1], color[labels[i]][2]);
    }
    cv::imshow("visual", visual);
    cv::waitKey(0);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

pca.cpp

#include "auxiliary.hpp"

#define USE_LESS_EIGEN               // 打开该宏尽可能的少用eigen库,更接近PCA的推导过程的实现
#define FEAT_REMAIN_RATIO  0.5       // 特征维度保留比例
#define EQUALIZATION_WITH_SKLEARN    // 使计算结果与sklearn一致

int main()
{
    /* Step0,读取鸢尾花数据  */
    std::vector<std::vector<double> > iris;
    int ret = GetData(iris);
    if (0 != ret)
    {
        std::cout << "GetData() failed." << std::endl;
        system("pause");
        return ret;
    }

    /* Step1,读取鸢尾花数据的标签  */
    std::vector<int> labels;
    ret = GetLabels(labels);
    if (0 != ret)
    {
        std::cout << "GetData() failed." << std::endl;
        system("pause");
        return ret;
    }

    /* Step2,将鸢尾花数据中心化  */
#ifdef USE_LESS_EIGEN
    // 计算原始数据各维度均值
    std::vector<double> mean(iris[0].size());  // 特征各维度均值
    std::vector<double> sum(iris[0].size());
    for (auto sample : iris)
    {
        for (int i = 0; i < sample.size(); ++i)
        {
            sum[i] += sample[i];
        }
    }
    for (int i = 0; i < sum.size(); ++i)
    {
        mean[i] = sum[i] / iris.size();
    }

    // 计算中心化后的数据
    Eigen::MatrixXd m(iris.size(), iris[0].size());  // 存储中心化后的数据
    for (int i = 0; i < iris.size(); ++i)
    {
        for (int j = 0; j < iris[0].size(); ++j)
        {
            m(i, j) = iris[i][j] - mean[j];
            //std::cout << m(i, j) << std::endl;
        }
    }
    Eigen::MatrixXd zero_mean_m = m;
#else
    Eigen::MatrixXd m(iris.size(), iris[0].size());  // 存储中心化后的数据
    for (int i = 0; i < iris.size(); ++i)
    {
        for (int j = 0; j < iris[0].size(); ++j)
        {
            m(i, j) = iris[i][j];
        }
    }
    Eigen::MatrixXd mean_vec = m.colwise().mean();
    Eigen::RowVectorXd mean_vec_row(Eigen::RowVectorXd::Map(mean_vec.data(), m.cols()));
    Eigen::MatrixXd zero_mean_m = m;
    zero_mean_m.rowwise() -= mean_vec_row;
#endif

    /* Step3,计算中心化后数据zero_mean_m的协方差矩阵 */
#ifdef USE_LESS_EIGEN
    Eigen::MatrixXd zero_mean_transpose = zero_mean_m.transpose();
    Eigen::MatrixXd covariance = zero_mean_transpose * zero_mean_m / (zero_mean_m.rows() - 1);
    std::cout << covariance << std::endl;
#else
    Eigen::MatrixXd covariance = (zero_mean_m.adjoint()*zero_mean_m) / double(zero_mean_m.rows() - 1);
    std::cout << covariance << std::endl;
#endif

    /* Step4,计算协方差矩阵的特征值及特征向量 */
    Eigen::EigenSolver<Eigen::MatrixXd>eigen_solver(covariance);
    if (eigen_solver.info() != Eigen::Success)
    {
        std::cout << "Solver error." << std::endl;
    }
    std::cout << "特征值：\n" << eigen_solver.eigenvalues().real() << std::endl;
    std::cout << "特征向量：\n" << eigen_solver.eigenvectors().real() << std::endl;

    /* Step5,将特征值从大到小排序,并同步排序特征向量 */
    std::vector<std::pair<double, std::vector<double>>>eigen_values_vectors;
    for (int i = 0; i < covariance.rows(); ++i)
    {
        std::pair<double, std::vector<double>> eigen_value_vector;
        eigen_value_vector.first = eigen_solver.eigenvalues().real()[i];

        for (int j = 0; j < covariance.cols(); ++j)
        {
            // 注意eigen中特征向量是按列排布,因此下面取特征向量的索引是(j, i)
            eigen_value_vector.second.push_back(eigen_solver.eigenvectors()(j, i).real()); 
        }
        eigen_values_vectors.push_back(eigen_value_vector);
    }
    struct CEigenBig {
        bool operator() (std::pair<double, std::vector<double>> i, std::pair<double, std::vector<double>> j) { return (i.first > j.first); }
    }eigen_big;
    std::sort(eigen_values_vectors.begin(), eigen_values_vectors.end(), eigen_big); // 将特征值从大到小排序,并同步排序特征向量

    int remain = eigen_values_vectors.size() * FEAT_REMAIN_RATIO;     // 计算保留的特征维度数
    Eigen::MatrixXd components(eigen_values_vectors.size(), remain);  // components是保留维度的特征向量
    for (int j = 0; j < components.cols(); ++j)
    {
        for (int i = 0; i < components.rows(); ++i)
        {
            // 该语句块可使最终降维后数据与sklearn中的完全一致,
            // 由于实现细节的差异使得特征向量的正负号有差异导致降维后数据存在正负号的差异,
            // 对于鸢尾花数据sklearn的实现与此处的实现其第二大特征值对应的特征向量存在正负号的差异,
            // 因此想要降维后的数据第二各维度的符号一致可以将第二大特征值取反
#ifdef EQUALIZATION_WITH_SKLEARN
            if (1 == j)
            {
                components(i, j) = -eigen_values_vectors[j].second[i];
            }
            else
            {
                components(i, j) = eigen_values_vectors[j].second[i];
            }
#else
            components(i, j) = eigen_values_vectors[j].second[i];
#endif
        }
    }
    std::cout << components << std::endl;

    /* Step6,将中心化的数据乘以(保留维度的)特征向量得到降维后的数据 */
    Eigen::MatrixXd  pca_m = zero_mean_m * components;
    std::cout << "降维后数据：\n" << pca_m << std::endl;

    /* Step7,保存降维后数据 */
    ret = SaveDecompositionData(pca_m);
    if (0 != ret)
    {
        std::cout << "GetData() failed." << std::endl;
        system("pause");
        return ret;
    }

    /* Step8,可视化降维后数据 */
    Visialization(pca_m, labels);

    system("pause");
    return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

sklearn实现PCA

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import numpy

iris = datasets.load_iris()

x = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)
X_r = pca.fit(x).transform(x)
cov = pca.get_covariance()
# X_r = pca.fit_transform(x)
print("explained variance ratio (first two components): %s" % str(pca.explained_variance_ratio_))

#fout = open(r'..\data\sklearn_pca_iris.txt', 'w',encoding='UTF-8')
#for i in range(len(X_r)):
#    fout.write(str(X_r[i][0])+', '+str(X_r[i][1])+'\n')
#fout.close()

numpy.savetxt(r'..\data\sklearn_pca_iris.txt', X_r, fmt='%.12f', delimiter=', ', newline='\n')

plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2

for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(
        X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
    )
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("PCA of IRIS dataset")
plt.show()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

在这里插入图片描述

Data

iris.txt

5.1,3.5,1.4,0.2
4.9,3. ,1.4,0.2
4.7,3.2,1.3,0.2
4.6,3.1,1.5,0.2
5. ,3.6,1.4,0.2
5.4,3.9,1.7,0.4
4.6,3.4,1.4,0.3
5. ,3.4,1.5,0.2
4.4,2.9,1.4,0.2
4.9,3.1,1.5,0.1
5.4,3.7,1.5,0.2
4.8,3.4,1.6,0.2
4.8,3. ,1.4,0.1
4.3,3. ,1.1,0.1
5.8,4. ,1.2,0.2
5.7,4.4,1.5,0.4
5.4,3.9,1.3,0.4
5.1,3.5,1.4,0.3
5.7,3.8,1.7,0.3
5.1,3.8,1.5,0.3
5.4,3.4,1.7,0.2
5.1,3.7,1.5,0.4
4.6,3.6,1. ,0.2
5.1,3.3,1.7,0.5
4.8,3.4,1.9,0.2
5. ,3. ,1.6,0.2
5. ,3.4,1.6,0.4
5.2,3.5,1.5,0.2
5.2,3.4,1.4,0.2
4.7,3.2,1.6,0.2
4.8,3.1,1.6,0.2
5.4,3.4,1.5,0.4
5.2,4.1,1.5,0.1
5.5,4.2,1.4,0.2
4.9,3.1,1.5,0.1
5. ,3.2,1.2,0.2
5.5,3.5,1.3,0.2
4.9,3.1,1.5,0.1
4.4,3. ,1.3,0.2
5.1,3.4,1.5,0.2
5. ,3.5,1.3,0.3
4.5,2.3,1.3,0.3
4.4,3.2,1.3,0.2
5. ,3.5,1.6,0.6
5.1,3.8,1.9,0.4
4.8,3. ,1.4,0.3
5.1,3.8,1.6,0.2
4.6,3.2,1.4,0.2
5.3,3.7,1.5,0.2
5. ,3.3,1.4,0.2
7. ,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,1.5
5.5,2.3,4. ,1.3
6.5,2.8,4.6,1.5
5.7,2.8,4.5,1.3
6.3,3.3,4.7,1.6
4.9,2.4,3.3,1. 
6.6,2.9,4.6,1.3
5.2,2.7,3.9,1.4
5. ,2. ,3.5,1. 
5.9,3. ,4.2,1.5
6. ,2.2,4. ,1. 
6.1,2.9,4.7,1.4
5.6,2.9,3.6,1.3
6.7,3.1,4.4,1.4
5.6,3. ,4.5,1.5
5.8,2.7,4.1,1. 
6.2,2.2,4.5,1.5
5.6,2.5,3.9,1.1
5.9,3.2,4.8,1.8
6.1,2.8,4. ,1.3
6.3,2.5,4.9,1.5
6.1,2.8,4.7,1.2
6.4,2.9,4.3,1.3
6.6,3. ,4.4,1.4
6.8,2.8,4.8,1.4
6.7,3. ,5. ,1.7
6. ,2.9,4.5,1.5
5.7,2.6,3.5,1. 
5.5,2.4,3.8,1.1
5.5,2.4,3.7,1. 
5.8,2.7,3.9,1.2
6. ,2.7,5.1,1.6
5.4,3. ,4.5,1.5
6. ,3.4,4.5,1.6
6.7,3.1,4.7,1.5
6.3,2.3,4.4,1.3
5.6,3. ,4.1,1.3
5.5,2.5,4. ,1.3
5.5,2.6,4.4,1.2
6.1,3. ,4.6,1.4
5.8,2.6,4. ,1.2
5. ,2.3,3.3,1. 
5.6,2.7,4.2,1.3
5.7,3. ,4.2,1.2
5.7,2.9,4.2,1.3
6.2,2.9,4.3,1.3
5.1,2.5,3. ,1.1
5.7,2.8,4.1,1.3
6.3,3.3,6. ,2.5
5.8,2.7,5.1,1.9
7.1,3. ,5.9,2.1
6.3,2.9,5.6,1.8
6.5,3. ,5.8,2.2
7.6,3. ,6.6,2.1
4.9,2.5,4.5,1.7
7.3,2.9,6.3,1.8
6.7,2.5,5.8,1.8
7.2,3.6,6.1,2.5
6.5,3.2,5.1,2. 
6.4,2.7,5.3,1.9
6.8,3. ,5.5,2.1
5.7,2.5,5. ,2. 
5.8,2.8,5.1,2.4
6.4,3.2,5.3,2.3
6.5,3. ,5.5,1.8
7.7,3.8,6.7,2.2
7.7,2.6,6.9,2.3
6. ,2.2,5. ,1.5
6.9,3.2,5.7,2.3
5.6,2.8,4.9,2. 
7.7,2.8,6.7,2. 
6.3,2.7,4.9,1.8
6.7,3.3,5.7,2.1
7.2,3.2,6. ,1.8
6.2,2.8,4.8,1.8
6.1,3. ,4.9,1.8
6.4,2.8,5.6,2.1
7.2,3. ,5.8,1.6
7.4,2.8,6.1,1.9
7.9,3.8,6.4,2. 
6.4,2.8,5.6,2.2
6.3,2.8,5.1,1.5
6.1,2.6,5.6,1.4
7.7,3. ,6.1,2.3
6.3,3.4,5.6,2.4
6.4,3.1,5.5,1.8
6. ,3. ,4.8,1.8
6.9,3.1,5.4,2.1
6.7,3.1,5.6,2.4
6.9,3.1,5.1,2.3
5.8,2.7,5.1,1.9
6.8,3.2,5.9,2.3
6.7,3.3,5.7,2.5
6.7,3. ,5.2,2.3
6.3,2.5,5. ,1.9
6.5,3. ,5.2,2. 
6.2,3.4,5.4,2.3
5.9,3. ,5.1,1.8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

iris_labels.txt

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
1

Reference

Comparison of LDA and PCA 2D projection of Iris dataset

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/我家小花儿/article/detail/222012