当前位置:   article > 正文

python-LASSO回归模型_lasso回归 python

lasso回归 python

https://zhuanlan.zhihu.com/p/76055830

这里写图片描述

LASSO回归模型的应用

①可视化方法确定λ的值

  1. import pandas as pd
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. import matplotlib
  5. from sklearn import model_selection
  6. from sklearn.linear_model import Lasso, LassoCV
  7. font = {
  8. 'family': 'FangSong',
  9. 'weight': 'bold',
  10. 'size': 12
  11. }
  12. matplotlib.rc("font", **font)
  13. # 读取糖尿病数据集
  14. diabetes = pd.read_excel('./data/diabetes.xlsx')
  15. # 构造自变量(剔除患者性别、年龄和因变量)
  16. predictors = diabetes.iloc[:, 2: -1]
  17. # 将数据集拆分成训练集和测试集
  18. x_train, x_test, y_train, y_test = model_selection.train_test_split(predictors, diabetes['Y'], test_size=0.2, random_state=1234)
  19. '''
  20. 可视化方法确定λ的值
  21. '''
  22. # 构造不同的Lambda值
  23. Lambdas = np.logspace(-5, 2, 200)
  24. # 构造空列表,用于存储模型的偏回归系数
  25. lasso_cofficients = []
  26. for Lambda in Lambdas:
  27. lasso = Lasso(alpha=Lambda, normalize=True, max_iter=10000)
  28. lasso.fit(x_train, y_train)
  29. lasso_cofficients.append(lasso.coef_)
  30. '''
  31. 可视化方法确定λ的值
  32. '''
  33. # 绘制Lambda与回归线的折线图
  34. plt.plot(Lambdas, lasso_cofficients)
  35. # 对x轴做对数变换
  36. plt.xscale('log')
  37. # 设置折线图x轴和y轴标签
  38. plt.xlabel('Lambda')
  39. plt.ylabel('Cofficients')
  40. # 显示图形
  41. plt.show()

②交叉验证法确定λ的值

  1. # LASSO回归模型的交叉验证
  2. lasso_cv = LassoCV(alphas=Lambdas, normalize=True, cv=10, max_iter=10000)
  3. lasso_cv.fit(x_train, y_train)
  4. # 输出最佳的lambda值
  5. lasso_best_alpha = lasso_cv.alpha_ # 0.06294988990221888
  6. print(lasso_best_alpha)

③模型的预测

  1. # 基于最佳的lambda值建模
  2. lasso = Lasso(alpha=lasso_best_alpha, normalize=True, max_iter=10000)
  3. # 对"类"加以数据实体,执行回归系数的运算
  4. lasso.fit(x_train, y_train)
  5. # 返回LASSO回归的系数
  6. res = pd.Series(index=['Intercept'] + x_train.columns.tolist(), data=[lasso.intercept_] + lasso.coef_.tolist())
  7. '''
  8. Intercept -278.560358
  9. BMI 6.188602
  10. BP 0.860826
  11. S1 -0.127627
  12. S2 -0.000000
  13. S3 -0.488408
  14. S4 0.000000
  15. S5 44.487738
  16. S6 0.324076
  17. 系数中含有两个0,分别是S2和S4,说明这两个变量对糖尿病指数Y没有显著意义
  18. '''
  19. print(res)
  20. # 模型预测
  21. lasso_predict = lasso.predict(x_test)
  22. # 验证预测效果
  23. from sklearn.metrics import mean_squared_error
  24. RMSE = np.sqrt(mean_squared_error(y_test, lasso_predict)) # 53.061437258225745
  25. print(RMSE)

用线性回归做比较

  1. import pandas as pd
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. import matplotlib
  5. from sklearn import model_selection
  6. from statsmodels import api as sms
  7. from sklearn.metrics import mean_squared_error
  8. font = {
  9. 'family': 'FangSong',
  10. 'weight': 'bold',
  11. 'size': 12
  12. }
  13. matplotlib.rc("font", **font)
  14. # 读取糖尿病数据集
  15. diabetes = pd.read_excel('./data/diabetes.xlsx')
  16. # 构造自变量(剔除患者性别、年龄和因变量)
  17. predictors = diabetes.iloc[:, 2: -1]
  18. # 将数据集拆分成训练集和测试集
  19. x_train, x_test, y_train, y_test = model_selection.train_test_split(predictors, diabetes['Y'], test_size=0.2, random_state=1234)
  20. # 为自变量x添加常数列1,用于拟合截距项
  21. x_train2 = sms.add_constant(x_train)
  22. x_test2 = sms.add_constant(x_test)
  23. # 构建多元线性回归模型
  24. liner = sms.formula.OLS(y_train, x_train2).fit()
  25. # 返回线性回归模型的系数
  26. params = liner.params
  27. '''
  28. const -406.699716
  29. BMI 6.217649
  30. BP 0.948245
  31. S1 -1.264772
  32. S2 0.901368
  33. S3 0.962373
  34. S4 6.694215
  35. S5 71.614661
  36. S6 0.376004
  37. dtype: float64
  38. '''
  39. print(params)
  40. # 模型预测
  41. linear_predict = liner.predict(x_test2)
  42. # 预测效果验证
  43. RMSE = np.sqrt(mean_squared_error(y_test, linear_predict)) # 53.42623939722992
  44. print(RMSE)

 

 

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/笔触狂放9/article/detail/598907
推荐阅读
相关标签
  

闽ICP备14008679号