For the nonlinear equations

$$f_i(x) = g_i(x) - y_i = 0, \quad i = 1, \dots, m \tag{1}$$
written in vector form as

$$f(x) = \bigl(f_1(x), \dots, f_m(x)\bigr), \qquad f(x) = 0.$$
For any $x$ we call $f_i(x)$ the $i$-th residual. In general we cannot solve for $x$ exactly, but we can obtain an approximate solution by minimizing the sum of squared residuals:
$$f_1(x)^2 + \dots + f_m(x)^2 = \|f(x)\|^2$$
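As a side note, in Eigen (which the implementation below uses) the sum of squared residuals is a one-liner; a minimal sketch with a made-up residual vector:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main() {
    // residual vector f(x) = (f_1(x), ..., f_m(x)) at some fixed x
    Eigen::VectorXd f(3);
    f << 0.5, -1.0, 0.25;
    // ||f(x)||^2 = f_1(x)^2 + ... + f_m(x)^2
    std::cout << f.squaredNorm() << std::endl; // prints 1.3125
    return 0;
}
```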
This means finding an $\tilde{x}$ such that

$$\|f(x)\|^2 \ge \|f(\tilde{x})\|^2 \quad \text{for all } x.$$

We call such a point $\tilde{x}$ a least-squares approximate solution of equation (1).
Now convert the linear least-squares binary classification problem on the handwritten-digit dataset (from the earlier post) into a nonlinear one (here the parameters of the original post are renamed $\beta$ and the unknown is written $x$, otherwise the notation gets hard to read):
$$g = \hat{x}^T \hat{\beta}$$
$$f_i(x) = g_i(x) - y_i$$
We then normalize the prediction $g_i(x)$, squashing it into $(-1, 1)$ with
$$\varphi(u) = \frac{e^u - e^{-u}}{e^u + e^{-u}}$$
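Note that $\varphi$ is exactly the hyperbolic tangent, $\varphi(u) = \tanh(u)$; a minimal sketch of a numerically safe implementation (the explicit `exp` form overflows for large $|u|$, which matters here since the raw pixel sums can be large):

```cpp
#include <cmath>

// phi(u) = (e^u - e^-u) / (e^u + e^-u) = tanh(u).
// std::tanh stays finite for any u, whereas exp(u) overflows a double
// once u exceeds roughly 709.
double phi(double u) { return std::tanh(u); }
```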
We then choose $x$ by solving the nonlinear least-squares problem

$$\min_x \sum_{i=1}^m \bigl(\varphi(g_i(x)) - y_i\bigr)^2.$$
For one-dimensional root finding $f(x) = 0$, Newton's method is a linearization method based on the Taylor expansion.
For the optimization problem

$$\min f(x)$$

the problem becomes

$$f'(x) = 0.$$
Newton's method turns this into solving

$$f'(x_0) + (x - x_0)\, f''(x_0) = 0.$$
Solving for $x$ iteratively in the same way gives

$$x_{n+1} = x_n - \frac{f'(x_n)}{f''(x_n)}.$$
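A minimal sketch of this 1-D iteration; the example objective $f(x) = e^x - 2x$ (minimizer $x^* = \ln 2$) is hypothetical, chosen only for illustration:

```cpp
#include <cmath>
#include <cstdio>

// Minimize f(x) = exp(x) - 2x: the minimizer satisfies
// f'(x) = exp(x) - 2 = 0, i.e. x = ln 2.
double fp(double x)  { return std::exp(x) - 2.0; } // f'(x)
double fpp(double x) { return std::exp(x); }       // f''(x)

int main() {
    double x = 0.0; // starting point
    for (int n = 0; n < 20; n++) {
        double step = fp(x) / fpp(x); // f'(x_n) / f''(x_n)
        x -= step;                    // x_{n+1} = x_n - f'(x_n)/f''(x_n)
        if (std::fabs(step) < 1e-12) break;
    }
    std::printf("x* = %.10f (ln 2 = %.10f)\n", x, std::log(2.0));
    return 0;
}
```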
For the multidimensional problem $f(x) = 0$, we can use the Jacobian and Hessian matrices to express the high-dimensional derivatives.
$$f(x) = 0, \qquad x = (x_0, \dots, x_n)$$
Jacobian matrix (first-order partial derivatives):

$$J_f = \left(\frac{\partial f}{\partial x_0},\ \frac{\partial f}{\partial x_1},\ \dots,\ \frac{\partial f}{\partial x_n}\right)$$
Hessian matrix (second-order partial derivatives):

$$H_f = \begin{bmatrix}
\frac{\partial^2 f}{\partial x_0^2} & \frac{\partial^2 f}{\partial x_0 \partial x_1} & \cdots & \frac{\partial^2 f}{\partial x_0 \partial x_n} \\
\frac{\partial^2 f}{\partial x_1 \partial x_0} & \frac{\partial^2 f}{\partial x_1^2} & \cdots & \frac{\partial^2 f}{\partial x_1 \partial x_n} \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial^2 f}{\partial x_n \partial x_0} & \frac{\partial^2 f}{\partial x_n \partial x_1} & \cdots & \frac{\partial^2 f}{\partial x_n^2}
\end{bmatrix}$$
The high-dimensional Newton iteration for the optimization problem can then be written as

$$x_{n+1} = x_n - H_f(x_n)^{-1} J_f(x_n)$$
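A minimal sketch of one such step with Eigen, assuming `g` and `H` are the gradient and Hessian already evaluated at `x`; in practice one solves the linear system rather than forming the inverse:

```cpp
#include <Eigen/Dense>

// One high-dimensional Newton step: x <- x - H^{-1} g,
// computed by solving H * step = g instead of inverting H.
Eigen::VectorXd newton_step(const Eigen::VectorXd& x,
                            const Eigen::VectorXd& g,  // gradient at x
                            const Eigen::MatrixXd& H)  // Hessian at x
{
    return x - H.ldlt().solve(g); // LDL^T factorization, H symmetric
}
```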
For the objective function

$$\min \sum_{i=1}^m f_i^2(x)$$
the Jacobian matrix (first-order partial derivatives of the residuals) is

$$J_f = \begin{bmatrix}
\frac{\partial f_1}{\partial x_0} & \frac{\partial f_1}{\partial x_1} & \cdots & \frac{\partial f_1}{\partial x_n} \\
\frac{\partial f_2}{\partial x_0} & \frac{\partial f_2}{\partial x_1} & \cdots & \frac{\partial f_2}{\partial x_n} \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial f_m}{\partial x_0} & \frac{\partial f_m}{\partial x_1} & \cdots & \frac{\partial f_m}{\partial x_n}
\end{bmatrix}$$
and, dropping the terms that contain second partial derivatives, the Hessian is approximated as

$$H = 2 J_f^T J_f$$
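To see where this comes from, expand the derivatives of the sum of squares:

$$\nabla \sum_{i=1}^m f_i^2 = 2 J_f^T f, \qquad
\nabla^2 \sum_{i=1}^m f_i^2 = 2 J_f^T J_f + 2 \sum_{i=1}^m f_i\, \nabla^2 f_i \approx 2 J_f^T J_f,$$

where the second term is the one being dropped; it is small when the residuals $f_i$ are small near the solution.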
Substituting these into the basic form of the high-dimensional Newton iteration (the factors of 2 in the gradient $2J_f^T f$ and the Hessian cancel) gives the Gauss-Newton iteration

$$x_{n+1} = x_n - \left(J_f^T J_f\right)^{-1} J_f(x_n)^T f(x_n)$$
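A minimal sketch of one Gauss-Newton step with Eigen, assuming `J` and `f` are the Jacobian and residual vector already evaluated at the current `x`:

```cpp
#include <Eigen/Dense>

// One Gauss-Newton step: x <- x - (J^T J)^{-1} J^T f,
// computed by solving the normal equations (J^T J) * step = J^T f.
Eigen::VectorXd gauss_newton_step(const Eigen::VectorXd& x,
                                  const Eigen::MatrixXd& J, // m x n Jacobian at x
                                  const Eigen::VectorXd& f) // m residuals at x
{
    Eigen::VectorXd g = J.transpose() * f;          // gradient direction J^T f
    return x - (J.transpose() * J).ldlt().solve(g); // solve, don't invert
}
```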
The matrix $J_f^T J_f$ in Gauss-Newton is not necessarily invertible; Levenberg-Marquardt improves on this:
$$x_{n+1} = x_n - \left(J_f^T J_f + \lambda I\right)^{-1} J_f(x_n)^T f(x_n)$$
A benefit of the Levenberg-Marquardt method is that $\lambda$ can be tuned:

- If the objective is decreasing quickly, use a smaller $\lambda$, making the step closer to Gauss-Newton.
- If it is decreasing slowly, use a larger $\lambda$, making the step closer to gradient descent.
Details: see the post “Levenberg-Marquardt算法浅谈”.
So, on top of the Gauss-Newton foundation, the key point of the LM algorithm is how to determine $\lambda$, i.e. the $\lambda$ update rule, shown below.
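The update implemented in the code below is the standard gain-ratio rule (Nielsen's rule), with a damping factor $\mu$ playing the role of $\lambda$ and $h$ the proposed step:

$$\rho = \frac{F(x) - F(x_{\text{new}})}{\tfrac{1}{2}\, h^T(\mu h - g)}, \qquad
\begin{cases}
\rho > 0: & x \leftarrow x_{\text{new}}, \quad \mu \leftarrow \mu \max\!\bigl(\tfrac{1}{3},\ 1 - (2\rho - 1)^3\bigr), \quad \nu \leftarrow 2 \\
\rho \le 0: & \mu \leftarrow \mu\nu, \quad \nu \leftarrow 2\nu
\end{cases}$$

where $g = J_f^T f$ is the gradient and $F$ is the sum of squared residuals. The ratio $\rho$ compares the actual reduction of the objective with the reduction predicted by the local quadratic model; the step is accepted only when $\rho > 0$.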
[Figure: LM algorithm flowchart]
(Note to self: several steps above are not derived in full and the code may contain mistakes; I will go through it again later.)
Reference: the blog post “LM算法C++实现”.
Nonlinear least-squares binary classification on the handwritten-digit dataset:
```cpp
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <cmath>
#include <algorithm>
#include <Eigen/Dense>
using namespace std;
using namespace Eigen;

const double DERIV_STEP = 1e-5; // step for numerical differentiation
const int MAX_ITER = 100;

// Swap byte order: the MNIST headers are big-endian, so they must be
// reversed to be interpreted correctly on little-endian hosts.
int ReversInt(int i)
{
    unsigned char ch1 = i & 255;
    unsigned char ch2 = (i >> 8) & 255;
    unsigned char ch3 = (i >> 16) & 255;
    unsigned char ch4 = (i >> 24) & 255;
    return ((int)ch1 << 24) + ((int)ch2 << 16) + ((int)ch3 << 8) + ch4;
}

void read_Mnist_label(string filename, vector<float>& labels)
{
    ifstream file(filename, ios::binary);
    if (file.is_open())
    {
        int magic_number = 0, number_of_images = 0;
        file.read((char*)&magic_number, sizeof(magic_number));
        file.read((char*)&number_of_images, sizeof(number_of_images));
        magic_number = ReversInt(magic_number);
        number_of_images = ReversInt(number_of_images);
        cout << endl;
        cout << "\t" << "magic number = " << magic_number << endl;
        cout << "\t" << "number of images = " << number_of_images << endl;
        for (int i = 0; i < number_of_images; i++)
        {
            unsigned char label = 0; // one unsigned byte per label, range 0-255
            file.read((char*)&label, sizeof(label));
            labels.push_back((float)label);
        }
    }
}

void read_Mnist_Images(string filename, vector<vector<float>>& images)
{
    ifstream file(filename, ios::binary);
    if (file.is_open())
    {
        int magic_number = 0, number_of_images = 0, n_rows = 0, n_cols = 0;
        file.read((char*)&magic_number, sizeof(magic_number));
        file.read((char*)&number_of_images, sizeof(number_of_images));
        file.read((char*)&n_rows, sizeof(n_rows));
        file.read((char*)&n_cols, sizeof(n_cols));
        magic_number = ReversInt(magic_number);
        number_of_images = ReversInt(number_of_images);
        n_rows = ReversInt(n_rows);
        n_cols = ReversInt(n_cols);
        cout << endl;
        cout << "\t" << "magic number = " << magic_number << endl;
        cout << "\t" << "number of images = " << number_of_images << endl; // 60000
        cout << "\t" << "rows = " << n_rows << endl; // 28
        cout << "\t" << "cols = " << n_cols << endl; // 28
        for (int i = 0; i < number_of_images; i++)
        {
            vector<float> tp;
            for (int r = 0; r < n_rows; r++)
                for (int c = 0; c < n_cols; c++) // iterate columns with n_cols
                {
                    unsigned char image = 0; // one unsigned byte per pixel
                    file.read((char*)&image, sizeof(image));
                    tp.push_back(image);
                }
            images.push_back(tp); // 60000 x 784
        }
    }
}

// prediction phi(g_i) = tanh(x_i^T beta); tanh avoids overflowing exp()
float f_i(VectorXd* beta, MatrixXf* x, int i)
{
    float sum = 0.0f;
    for (int j = 0; j < 494; j++) // 494 = hard-coded feature count for this dataset
        sum += (*x)(i, j) * (*beta)(j);
    return tanh(sum);
}

// residual f_i = phi(g_i(x)) - y_i
float func_i(VectorXf* labels1, VectorXd* beta, MatrixXf* x, int i)
{
    float sum = 0.0f;
    for (int j = 0; j < 494; j++)
        sum += (*x)(i, j) * (*beta)(j);
    return tanh(sum) - (*labels1)[i];
}

// objective F(beta) = sum of squared residuals
float FUNC(VectorXf* labels1, VectorXd* beta, MatrixXf* x)
{
    float sum = 0;
    for (int i = 0; i < (*labels1).size(); i++)
    {
        float r = func_i(labels1, beta, x, i);
        sum += r * r;
    }
    return sum;
}

// central-difference derivative of the i-th residual w.r.t. the n-th parameter
float Deriv(VectorXf* labels1, VectorXd* beta, MatrixXf* x, int n, int i)
{
    VectorXd params1(*beta);
    VectorXd params2(*beta);
    params1[n] -= DERIV_STEP;
    params2[n] += DERIV_STEP;
    float p1 = func_i(labels1, &params1, x, i);
    float p2 = func_i(labels1, &params2, x, i);
    return (p2 - p1) / (2 * DERIV_STEP);
}

int main(int argc, char* argv[])
{
    Eigen::initParallel();
    // training data
    vector<vector<float>> images; // 60000 x 784
    vector<int> effective_col;    // indices of the effective columns
    vector<float> labels;         // 60000 labels
    cout << endl << "Training images:" << endl;
    read_Mnist_Images("train-images.idx3-ubyte", images);
    cout << endl << "Training labels:" << endl;
    read_Mnist_label("train-labels.idx1-ubyte", labels);

    // keep a column if it is nonzero in at least 600 of the 60000 images
    for (int i = 0; i < images[0].size(); i++)
    {
        int s = 0;
        for (int j = 0; j < images.size(); j++)
        {
            if (images[j][i] > 0) s++;
            if (s == 600) { effective_col.push_back(i); break; }
        }
    }
    auto m = images.size();                  // number of training rows
    auto b = labels.size();
    auto num_key = effective_col.size() + 1; // effective columns + bias

    // build the 60000 x 494 training matrix
    MatrixXf Characteristic_matrix(m, num_key);
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < num_key - 1; j++)
            Characteristic_matrix(i, j) = images[i][effective_col[j]];
        Characteristic_matrix(i, num_key - 1) = 1; // constant bias feature
    }
    vector<vector<float>>().swap(images); // release memory

    VectorXf actual_Y(b); // label +1 for digit 0, -1 for everything else
    for (int i = 0; i < b; i++)
        actual_Y(i) = (labels[i] == 0) ? 1 : -1;
    vector<float>().swap(labels);

    // Levenberg-Marquardt
    VectorXd beta(num_key);
    for (int i = 0; i < num_key; i++) beta(i) = 0.5f;
    VectorXd tempbeta(num_key);
    MatrixXd J(m, num_key);       // Jacobian of the residuals
    VectorXd G(num_key);          // gradient J^T f
    float u = 1.0f;               // damping factor (lambda)
    float v = 2.0f;
    MatrixXd I = MatrixXd::Identity(num_key, num_key);
    float q = 1.0f;               // gain ratio
    MatrixXd H(num_key, num_key); // approximate Hessian J^T J
    float mse, mse_temp;
    VectorXd f(m);
    for (int k = 0; k < 10; k++)
    {
        cout << k << endl;
        if (q > 0) // f, J, G, H only need refreshing after an accepted step
        {
            for (int i = 0; i < m; i++)
                f(i) = func_i(&actual_Y, &beta, &Characteristic_matrix, i);
            for (int i = 0; i < m; i++)
                for (int j = 0; j < num_key; j++)
                    J(i, j) = Deriv(&actual_Y, &beta, &Characteristic_matrix, j, i);
            G = J.transpose() * f;
            H = J.transpose() * J;
        }
        tempbeta = beta - (H + u * I).inverse() * G; // damped Gauss-Newton step
        mse = FUNC(&actual_Y, &beta, &Characteristic_matrix);
        mse_temp = FUNC(&actual_Y, &tempbeta, &Characteristic_matrix);
        // gain ratio: actual reduction over the reduction predicted by the model
        VectorXd h = tempbeta - beta;
        q = (mse - mse_temp) / (0.5 * h.dot(u * h - G));
        if (q > 0) // step accepted: shrink the damping (closer to Gauss-Newton)
        {
            beta = tempbeta;
            float a = 1.0f / 3;
            u = u * max(a, 1 - (2 * q - 1) * (2 * q - 1) * (2 * q - 1));
            v = 2;
        }
        else       // step rejected: grow the damping (closer to gradient descent)
        {
            u = u * v;
            v = v * 2;
        }
        cout << beta << endl;
        if (fabs(mse - mse_temp) < 1e-8) break;
    }

    // test data
    vector<vector<float>> images1;
    vector<float> labels1;
    cout << endl << "Test images:" << endl;
    read_Mnist_Images("t10k-images.idx3-ubyte", images1);
    cout << endl << "Test labels:" << endl;
    read_Mnist_label("t10k-labels.idx1-ubyte", labels1);
    auto m1 = images1.size();
    cout << m1 << endl;

    // build the 10000 x 494 test matrix using the same effective columns
    MatrixXf Characteristic_matrix1(m1, num_key);
    for (int i = 0; i < m1; i++)
    {
        for (int j = 0; j < num_key - 1; j++)
            Characteristic_matrix1(i, j) = images1[i][effective_col[j]];
        Characteristic_matrix1(i, num_key - 1) = 1;
    }
    vector<vector<float>>().swap(images1);

    VectorXf actual_y1(m1); // +1 for digit 0, -1 otherwise
    for (int i = 0; i < m1; i++)
        actual_y1(i) = (labels1[i] == 0) ? 1 : -1;
    vector<float>().swap(labels1);

    // classify by the sign of tanh(x^T beta) and count hits and misses
    int tp = 0, fp = 0;
    for (int i = 0; i < m1; i++)
    {
        int c = f_i(&beta, &Characteristic_matrix1, i) >= 0 ? 1 : -1;
        if (c == actual_y1(i)) tp++;
        else fp++;
    }
    cout << endl << "Test-set predictions:" << endl;
    cout << "\t" << "correct = " << tp << endl << "\t" << "wrong = " << fp << endl;
    cout << "\t" << "error rate = " << 1.0 * fp / m1 << endl;
    return 0;
}
```
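The program should build with any C++11 compiler as long as the Eigen headers are on the include path (for example `g++ -O2 -I /path/to/eigen main.cpp`), with the four MNIST files (`train-images.idx3-ubyte`, `train-labels.idx1-ubyte`, `t10k-images.idx3-ubyte`, `t10k-labels.idx1-ubyte`) placed in the working directory.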