赞
踩
https://zhuanlan.zhihu.com/p/76055830
①可视化方法确定λ的值
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn import model_selection
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler

# Font settings applied to every matplotlib figure in this script.
font = {
    'family': 'FangSong',
    'weight': 'bold',
    'size': 12
}
matplotlib.rc("font", **font)

# Load the diabetes data set.
diabetes = pd.read_excel('./data/diabetes.xlsx')
# Build the predictors: drop the first two columns and the last column
# (patient sex, age and the response variable, per the original note).
predictors = diabetes.iloc[:, 2: -1]
# Split into training and test sets (80% / 20%, fixed seed).
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    predictors, diabetes['Y'], test_size=0.2, random_state=1234)

# --- Choose lambda visually: coefficient path over a grid of penalties ---

# `Lasso(normalize=True)` was deprecated in scikit-learn 1.0 and removed in
# 1.2. The legacy option centred each column and divided it by its L2 norm,
# which equals std * sqrt(n_samples). Standardising with StandardScaler and
# multiplying alpha by sqrt(n_samples) solves the same optimisation problem;
# dividing the fitted coefficients by the scaler's `scale_` expresses them in
# the original feature units, as the legacy `coef_` did.
path_scaler = StandardScaler().fit(x_train)
x_train_scaled = path_scaler.transform(x_train)
alpha_factor = np.sqrt(x_train.shape[0])

# Candidate lambda values, log-spaced between 1e-5 and 1e2.
Lambdas = np.logspace(-5, 2, 200)
# Regression coefficients (original units) for each lambda.
lasso_cofficients = []
for Lambda in Lambdas:
    lasso = Lasso(alpha=Lambda * alpha_factor, max_iter=10000)
    lasso.fit(x_train_scaled, y_train)
    lasso_cofficients.append(lasso.coef_ / path_scaler.scale_)

# Plot one line per coefficient against lambda.
plt.plot(Lambdas, lasso_cofficients)
# Lambda spans seven orders of magnitude, so use a log-scaled x axis.
plt.xscale('log')
plt.xlabel('Lambda')
plt.ylabel('Cofficients')
plt.show()
②交叉验证法确定λ的值
# Cross-validated LASSO: pick lambda by 10-fold cross-validation.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `LassoCV(normalize=True)` was removed in scikit-learn 1.2. Standardise the
# predictors in a pipeline instead and search the grid `Lambdas * sqrt(n)`,
# which is the documented equivalent of the legacy alpha convention.
# NOTE: `lasso_cv` is now a Pipeline; the fitted LassoCV is its last step.
cv_alpha_factor = np.sqrt(x_train.shape[0])
lasso_cv = make_pipeline(
    StandardScaler(),
    LassoCV(alphas=Lambdas * cv_alpha_factor, cv=10, max_iter=10000),
)
lasso_cv.fit(x_train, y_train)
# Report the best lambda on the original (`normalize=True`) scale so that it
# stays comparable with `Lambdas`. The legacy run recorded
# 0.06294988990221888; the value may shift slightly because the scaling is no
# longer recomputed inside each fold.
lasso_best_alpha = lasso_cv[-1].alpha_ / cv_alpha_factor
print(lasso_best_alpha)
③模型的预测
-
# Fit the final LASSO model with the best lambda and evaluate it.
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# `Lasso(normalize=True)` was removed in scikit-learn 1.2. Fit on
# standardised predictors with alpha scaled by sqrt(n_samples), which is the
# documented equivalent, then map the parameters back to original units.
final_scaler = StandardScaler().fit(x_train)
lasso = Lasso(alpha=lasso_best_alpha * np.sqrt(x_train.shape[0]), max_iter=10000)
lasso.fit(final_scaler.transform(x_train), y_train)

# Coefficients per original feature unit, and the matching intercept:
# y = b + ((x - mean) / scale) . w  ==  (b - mean . w/scale) + x . (w/scale)
coef_original = lasso.coef_ / final_scaler.scale_
intercept_original = lasso.intercept_ - np.dot(final_scaler.mean_, coef_original)

# Collect intercept and coefficients in one labelled Series.
res = pd.Series(index=['Intercept'] + x_train.columns.tolist(),
                data=[intercept_original] + coef_original.tolist())
# Output recorded with the legacy implementation:
#   Intercept   -278.560358
#   BMI            6.188602
#   BP             0.860826
#   S1            -0.127627
#   S2            -0.000000
#   S3            -0.488408
#   S4             0.000000
#   S5            44.487738
#   S6             0.324076
# Two coefficients are exactly zero (S2 and S4): LASSO dropped these two
# variables as contributing nothing to predicting the diabetes index Y.
print(res)

# Predict on the test set; it must go through the same scaler as training.
lasso_predict = lasso.predict(final_scaler.transform(x_test))
# Evaluate with RMSE (legacy run recorded 53.061437258225745).
RMSE = np.sqrt(mean_squared_error(y_test, lasso_predict))
print(RMSE)
用线性回归做比较
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn import model_selection
from statsmodels import api as sms
from sklearn.metrics import mean_squared_error

# Font settings applied to every matplotlib figure in this script.
font = {
    'family': 'FangSong',
    'weight': 'bold',
    'size': 12
}
matplotlib.rc("font", **font)

# Load the diabetes data set.
diabetes = pd.read_excel('./data/diabetes.xlsx')
# Build the predictors: drop the first two columns and the last column
# (patient sex, age and the response variable, per the original note).
predictors = diabetes.iloc[:, 2: -1]
# Same split as the LASSO script (same seed) so the RMSE values are comparable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    predictors, diabetes['Y'], test_size=0.2, random_state=1234)

# Add a constant column of ones so that OLS fits an intercept.
x_train2 = sms.add_constant(x_train)
x_test2 = sms.add_constant(x_test)

# Fit the multiple linear regression. Use the array interface `sms.OLS`:
# `sms.formula.OLS` is not available in current statsmodels, whose formula
# namespace only exposes the lowercase formula constructors such as `ols`.
liner = sms.OLS(y_train, x_train2).fit()

# Fitted coefficients. Recorded output:
#   const   -406.699716
#   BMI        6.217649
#   BP         0.948245
#   S1        -1.264772
#   S2         0.901368
#   S3         0.962373
#   S4         6.694215
#   S5        71.614661
#   S6         0.376004
#   dtype: float64
params = liner.params
print(params)

# Predict on the test set.
linear_predict = liner.predict(x_test2)
# Evaluate with RMSE (recorded value: 53.42623939722992).
RMSE = np.sqrt(mean_squared_error(y_test, linear_predict))
print(RMSE)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。