赞
踩
线性回归作为一种回归分析技术,其分析的因变量属于连续型变量,如果因变量转变为离散型变量,将转换为分类问题。回归分析属于有监督学习问题,本博客将重点回顾标准线性回归知识点,并就线性回归中可能出现的问题进行简单探讨,引出线性回归的两个变种岭回归以及Lasso回归,最后通过sklearn库模拟整个回归过程。
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, discriminant_analysis, cross_validation
def load_data():
    """Load the sklearn diabetes dataset and return a 75/25 train/test split.

    Returns:
        (X_train, X_test, y_train, y_test) as produced by train_test_split
        with a fixed random_state for reproducibility.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; the
    # train_test_split helper now lives in sklearn.model_selection.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # very old scikit-learn (< 0.18)
        from sklearn.cross_validation import train_test_split
    diabetes = datasets.load_diabetes()
    return train_test_split(diabetes.data, diabetes.target,
                            test_size=0.25, random_state=0)
def test_LinearRegression(*data):
    """Fit an ordinary least-squares model and report its coefficients,
    test-set mean squared error, and R^2 score."""
    X_train, X_test, y_train, y_test = data
    # Plain OLS: minimizes the squared residuals with no penalty term.
    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)
    # coef_ is the learned weight vector w; intercept_ is the bias b.
    print("权重向量:%s, b的值为:%.2f" % (model.coef_, model.intercept_))
    # Mean squared error on the held-out test split.
    residuals = model.predict(X_test) - y_test
    print("损失函数的值: %.2f" % np.mean(residuals ** 2))
    # score() returns the coefficient of determination R^2.
    print("预测性能得分: %.2f" % model.score(X_test, y_test))
if __name__ == '__main__':
    # Fetch the train/test split, then train the model and print metrics.
    split = load_data()
    test_LinearRegression(*split)
权重向量:[ -43.26774487 -208.67053951 593.39797213 302.89814903 -560.27689824
261.47657106 -8.83343952 135.93715156 703.22658427 28.34844354], b的值为:153.07
损失函数的值: 3180.20
预测性能得分: 0.36
岭回归(L2)与Lasso回归(L1)
岭回归和Lasso回归都是在原有的线性回归的基础上引入了正则化项来解决过拟合的问题,
可以使得到的模型更加符合实际情况,是对简单线性回归的优化。λ为正则化参数,用于调节回归系数。
当λ较大时,会削弱回归系数,此时可以校正过拟合问题;当出现欠拟合时则需要将λ往小了调整。
岭回归与Lasso回归的出现是为了解决线性回归出现的过拟合以及在通过正规方程方法求解θ的过程中出现的x
转置乘以x不可逆这两类问题的,这两种回归均通过在损失函数中引入正则化项来达到目的,具体三者的损失
函数对比见下图:
其中λ称为正则化参数,如果λ选取过大,会把所有参数θ均最小化,造成欠拟合,如果λ选取过小,会导致
对过拟合问题解决不当,因此λ的选取是一个技术活。θ则代表预测变量的系数,θ为0表示该预测变量对因
变量是无用的
岭回归与Lasso回归最大的区别在于岭回归引入的是L2范数惩罚项,Lasso回归引入的是L1范数惩罚项,
Lasso回归能够使得损失函数中的许多θ均变成0,
这点要优于岭回归,因为岭回归是要所有的θ均存在的,这样Lasso回归的计算量将远远小于岭回归。
可以看到,Ridge回归性能随着alpha的增加先提高后降低,最后缓慢降低;但Lasso回归性能最终会趋于一条直线,原因就在于好多θ值已经均为0,而岭回归却有一定平滑度,因为所有的θ值均存在,可以不断进行优化。
岭回归以及Lasso回归代码实现
岭回归代码示例
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, discriminant_analysis, cross_validation
def load_data():
    """Load the sklearn diabetes dataset and return a 75/25 train/test split.

    Returns:
        (X_train, X_test, y_train, y_test) as produced by train_test_split
        with a fixed random_state for reproducibility.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; the
    # train_test_split helper now lives in sklearn.model_selection.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # very old scikit-learn (< 0.18)
        from sklearn.cross_validation import train_test_split
    diabetes = datasets.load_diabetes()
    return train_test_split(diabetes.data, diabetes.target,
                            test_size=0.25, random_state=0)
def test_ridge(*data):
    """Fit a Ridge (L2-penalized) regression with the default alpha and
    report its coefficients, test-set MSE, and R^2 score."""
    X_train, X_test, y_train, y_test = data
    reg = linear_model.Ridge()
    reg.fit(X_train, y_train)
    # coef_ is the learned weight vector w; intercept_ is the bias b.
    print("权重向量:%s, b的值为:%.2f" % (reg.coef_, reg.intercept_))
    # Mean squared error on the held-out test split.
    err = reg.predict(X_test) - y_test
    print("损失函数的值:%.2f" % np.mean(err ** 2))
    print("预测性能得分: %.2f" % reg.score(X_test, y_test))
# Measure how the regularization strength alpha affects Ridge test performance.
def test_ridge_alpha(*data):
    """Train one Ridge model per candidate alpha and collect test scores.

    Returns:
        (alphas, scores): the regularization strengths tried and the
        corresponding R^2 scores on the test split.
    """
    X_train, X_test, y_train, y_test = data
    alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    # The original looped with enumerate() but never used the index;
    # a comprehension over alphas is the idiomatic form.
    scores = [
        linear_model.Ridge(alpha=alpha).fit(X_train, y_train).score(X_test, y_test)
        for alpha in alphas
    ]
    return alphas, scores
def show_plot(alphas, scores, title="Ridge"):
    """Plot R^2 score versus regularization strength on a log-x axis.

    Args:
        alphas: regularization strengths (x axis).
        scores: matching R^2 scores (y axis).
        title: figure title; defaults to "Ridge" so existing callers
            are unchanged.
    """
    figure = plt.figure()
    ax = figure.add_subplot(1, 1, 1)
    ax.plot(alphas, scores)
    ax.set_xlabel(r"$\alpha$")
    ax.set_ylabel(r"score")
    # alpha spans several orders of magnitude, so a log scale is needed.
    ax.set_xscale("log")
    ax.set_title(title)
    plt.show()
if __name__ == '__main__':
    # To run the default-alpha demo instead, load the data and call
    # test_ridge(X_train, X_test, y_train, y_test).
    # Sweep a grid of user-chosen alpha values and plot the scores.
    data_split = load_data()
    alpha_grid, alpha_scores = test_ridge_alpha(*data_split)
    show_plot(alpha_grid, alpha_scores)
Lasso回归代码示例
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, discriminant_analysis, cross_validation
def load_data():
    """Load the sklearn diabetes dataset and return a 75/25 train/test split.

    Returns:
        (X_train, X_test, y_train, y_test) as produced by train_test_split
        with a fixed random_state for reproducibility.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; the
    # train_test_split helper now lives in sklearn.model_selection.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # very old scikit-learn (< 0.18)
        from sklearn.cross_validation import train_test_split
    diabetes = datasets.load_diabetes()
    return train_test_split(diabetes.data, diabetes.target,
                            test_size=0.25, random_state=0)
def test_lasso(*data):
    """Fit a Lasso (L1-penalized) regression with the default alpha and
    report its coefficients, test-set MSE, and R^2 score."""
    X_train, X_test, y_train, y_test = data
    reg = linear_model.Lasso()
    reg.fit(X_train, y_train)
    # coef_ is the learned weight vector w; intercept_ is the bias b.
    print("权重向量:%s, b的值为:%.2f" % (reg.coef_, reg.intercept_))
    # Mean squared error on the held-out test split.
    err = reg.predict(X_test) - y_test
    print("损失函数的值:%.2f" % np.mean(err ** 2))
    print("预测性能得分: %.2f" % reg.score(X_test, y_test))
# Measure how the regularization strength alpha affects Lasso test performance.
def test_lasso_alpha(*data):
    """Train one Lasso model per candidate alpha and collect test scores.

    Returns:
        (alphas, scores): the regularization strengths tried and the
        corresponding R^2 scores on the test split.
    """
    X_train, X_test, y_train, y_test = data
    alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    # The original looped with enumerate() but never used the index;
    # a comprehension over alphas is the idiomatic form.
    scores = [
        linear_model.Lasso(alpha=alpha).fit(X_train, y_train).score(X_test, y_test)
        for alpha in alphas
    ]
    return alphas, scores
def show_plot(alphas, scores, title="Lasso"):
    """Plot R^2 score versus regularization strength on a log-x axis.

    Bug fix: the original hard-coded the title "Ridge" even though this
    script plots Lasso results (copy-paste error); the title now defaults
    to "Lasso" and is configurable.

    Args:
        alphas: regularization strengths (x axis).
        scores: matching R^2 scores (y axis).
        title: figure title.
    """
    figure = plt.figure()
    ax = figure.add_subplot(1, 1, 1)
    ax.plot(alphas, scores)
    ax.set_xlabel(r"$\alpha$")
    ax.set_ylabel(r"score")
    # alpha spans several orders of magnitude, so a log scale is needed.
    ax.set_xscale("log")
    ax.set_title(title)
    plt.show()
if __name__ == '__main__':
    # Sweep a grid of user-chosen alpha values and plot the scores.
    # To run the default-alpha demo instead, call
    # test_lasso(X_train, X_test, y_train, y_test).
    X_train, X_test, y_train, y_test = load_data()
    grid, grid_scores = test_lasso_alpha(X_train, X_test, y_train, y_test)
    show_plot(grid, grid_scores)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。