当前位置:   article > 正文

Python机器学习库sklearn几种回归算法建模及分析(实验)_sklearn中都有哪些回归算法regression

sklearn中都有哪些回归算法regression

最简单的回归模型就是线性回归

数据导入与可视化分析

  1. from IPython.display import Image
  2. %matplotlib inline
  3. # Added version check for recent scikit-learn 0.18 checks
  4. from distutils.version import LooseVersion as Version
  5. from sklearn import __version__ as sklearn_version
  1. #原数据网址变了,新换的数据地址需要处理http://lib.stat.cmu.edu/datasets/boston
  2. import pandas as pd
  3. import numpy as np
  4. #df = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',header=19,sep='\s{1,3}')
  5. #df.head()
  6. dfnp=np.genfromtxt('boston.txt')
  7. df=pd.DataFrame(dfnp,columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'])
  8. df.to_csv('boston.csv')
  9. df.head()

  import matplotlib.pyplot as plt
  import seaborn as sns

  # Per the original note: `style` picks the default look, `context` the default plot scaling.
  sns.set(style='whitegrid', context='notebook')
  cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
  # seaborn 0.9 renamed pairplot's `size` keyword to `height`;
  # the old spelling is rejected by current releases.
  sns.pairplot(df[cols], height=2.5)
  plt.tight_layout()
  # plt.savefig('./figures/scatter.png', dpi=300)
  plt.show()


  1. import numpy as np
  2. cm = np.corrcoef(df[cols].values.T) #corrcoef方法按行计算皮尔逊相关系数,cm是对称矩阵
  3. #使用np.corrcoef(a)可计算行与行之间的相关系数,np.corrcoef(a,rowvar=0)用于计算各列之间的相关系数,输出为相关系数矩阵。
  4. sns.set(font_scale=1.5) #font_scale设置字体大小
  5. hm = sns.heatmap(cm,cbar=True,annot=True,square=True,fmt='.2f',annot_kws={'size': 15},yticklabels=cols,xticklabels=cols)
  6. # plt.tight_layout()
  7. # plt.savefig('./figures/corr_mat.png', dpi=300)
  8. plt.show()

  1. sns.reset_orig() #将参数还原为seaborn作图前的原始值
  2. %matplotlib inline

线性回归的最小二乘法(梯度下降求解)

  class LinearRegressionGD(object):
      """Linear regression fitted by batch gradient descent on the squared error.

      Parameters
      ----------
      eta : float
          Learning rate applied to every weight update.
      n_iter : int
          Number of passes (epochs) over the whole training set.

      Attributes set by ``fit``
      -------------------------
      w_ : 1-D array of length ``1 + n_features``
          Final weights only (no per-epoch history is kept);
          ``w_[0]`` is the intercept and ``w_[1:]`` are the coefficients.
      cost_ : list of float
          Half the sum of squared errors of each epoch, computed from the
          errors measured before that epoch's weight update.
      """

      def __init__(self, eta=0.001, n_iter=20):
          self.eta = eta
          self.n_iter = n_iter

      def fit(self, X, y):
          """Fit the weights to ``X`` (n_samples, n_features) and ``y`` (n_samples,).

          Returns ``self`` so calls can be chained.
          """
          # Accept plain lists as well as arrays; `.shape` / `.T` below need arrays.
          X = np.asarray(X)
          y = np.asarray(y)
          self.w_ = np.zeros(1 + X.shape[1])
          self.cost_ = []
          for _ in range(self.n_iter):
              errors = y - self.net_input(X)
              # Both updates use the same pre-update errors.
              self.w_[1:] += self.eta * X.T.dot(errors)
              self.w_[0] += self.eta * errors.sum()
              self.cost_.append((errors ** 2).sum() / 2.0)
          return self

      def net_input(self, X):
          """Return the linear response ``X . w_[1:] + w_[0]``."""
          return np.dot(X, self.w_[1:]) + self.w_[0]

      def predict(self, X):
          """Return predictions for ``X``; identical to ``net_input``."""
          return self.net_input(X)
  1. X = df[['RM']].values #X是(*,1)维列向量
  2. y = df['MEDV'].values #y是(*, )行向量
  3. from sklearn.preprocessing import StandardScaler
  4. sc_x = StandardScaler()
  5. sc_y = StandardScaler()
  6. X_std = sc_x.fit_transform(X)
  7. #fit_transform方法可以拆分成StandardScaler里的fit和transform两步,这里为了区别LinearRegressionGD类将两步合并
  8. y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
  9. #y[:, np.newaxis]作用等同于y[np.newaxis].T,也就是df[['MEDV']].values;flatten方法的作用是变回成1*n的向量
  10. #fit_transform方法是对“列向量”直接规范化
  11. lr = LinearRegressionGD()
  12. lr.fit(X_std, y_std) #这里的fit是LinearRegressionGD类的,注意区分sklearn里不同类的fit方法使用环境
  13. #Output:<__main__.LinearRegressionGD at 0x16add278>
  14. plt.plot(range(1, lr.n_iter+1), lr.cost_)
  15. plt.ylabel('SSE')
  16. plt.xlabel('Epoch')
  17. plt.tight_layout()
  18. # plt.savefig('./figures/cost.png', dpi=300)
  19. plt.show()

  def lin_regplot(X, y, model):
      """Scatter the samples (X, y) and overlay ``model.predict(X)`` as a red line.

      Draws on the current matplotlib axes and returns ``None``; the caller
      is responsible for labels and for calling ``plt.show()``.
      """
      plt.scatter(X, y, c='lightblue')
      plt.plot(X, model.predict(X), color='red', linewidth=2)
      return None
  1. lin_regplot(X_std, y_std, lr)
  2. plt.xlabel('Average number of rooms [RM] (standardized)')
  3. plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
  4. plt.tight_layout()
  5. # plt.savefig('./figures/gradient_fit.png', dpi=300)
  6. plt.show()

  print('Slope: %.3f' % lr.w_[1])      # the a in a*x + b
  print('Intercept: %.3f' % lr.w_[0])  # the b in a*x + b
  # Output:
  # Slope: 0.695
  # Intercept: -0.000

  # Apply the same standardization that was used when fitting the model.
  num_rooms_std = sc_x.transform(np.array([[5.0]]))
  price_std = lr.predict(num_rooms_std)  # a * num_rooms_std + b, shape (1,)
  # StandardScaler works on 2-D (n_samples, n_features) input, so reshape the
  # 1-D prediction to a column first, then pull out the single scalar
  # rather than %-formatting an array.
  price = sc_y.inverse_transform(price_std.reshape(-1, 1))[0, 0]
  print("Price in $1000's: %.3f" % price)
  # Output:
  # Price in $1000's: 10.840
  1. #用sklearn完成回归并查看系数,与上面自己编写的LinearRegressionGD对比
  2. from sklearn.linear_model import LinearRegression
  3. slr = LinearRegression()
  4. slr.fit(X, y)
  5. y_pred = slr.predict(X)
  6. print('Slope: %.3f' % slr.coef_[0])
  7. print('Intercept: %.3f' % slr.intercept_)
  8. #Output:
  9. #Slope: 9.102
  10. #Intercept: -34.671
  11. lin_regplot(X, y, slr)
  12. plt.xlabel('Average number of rooms [RM]')
  13. plt.ylabel('Price in $1000\'s [MEDV]')
  14. plt.tight_layout()
  15. # plt.savefig('./figures/scikit_lr_fit.png', dpi=300)
  16. plt.show()

下面利用法方程求解拟合直线系数

  # Prepend a column of ones so the intercept is estimated as w[0].
  Xb = np.hstack((np.ones((X.shape[0], 1)), X))
  # Solve the normal equations (Xb^T Xb) w = Xb^T y directly;
  # np.linalg.solve avoids forming the explicit inverse of the 2x2 matrix.
  w = np.linalg.solve(np.dot(Xb.T, Xb), np.dot(Xb.T, y))
  print('Slope: %.3f' % w[1])
  print('Intercept: %.3f' % w[0])
  # Output:
  # Slope: 9.102
  # Intercept: -34.671
  # These match the LinearRegression result printed earlier: both solve the
  # same ordinary least-squares problem.

使用sklearn的RANSAC提高鲁棒性

去除噪声点:只用全部数据中的内点子集(a subset of inliers)做回归;与内点相对的 outliers 即离群点。

  from sklearn.linear_model import RANSACRegressor
  # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RANSACRegressor.html#sklearn.linear_model.RANSACRegressor
  # Per the original note: RANSAC fits the regression line on randomly drawn
  # subsets of the samples that are treated as noise-free.
  # Arguments: the first positional argument is the base estimator,
  # max_trials caps the number of random subset draws, min_samples is the
  # minimum number of points per draw, loss is the per-sample loss criterion,
  # residual_threshold is the loss above which a sample counts as an outlier.
  if Version(sklearn_version) < '0.18':
      ransac = RANSACRegressor(LinearRegression(), max_trials=100, min_samples=50,
                               residual_metric=lambda x: np.sum(np.abs(x), axis=1),
                               residual_threshold=5.0, random_state=0)
  else:
      # scikit-learn 1.0 renamed 'absolute_loss' to 'absolute_error' and later
      # removed the old spelling.
      if Version(sklearn_version) < '1.0':
          ransac_loss = 'absolute_loss'
      else:
          ransac_loss = 'absolute_error'
      ransac = RANSACRegressor(LinearRegression(), max_trials=100, min_samples=50,
                               loss=ransac_loss, residual_threshold=5.0, random_state=0)
  ransac.fit(X, y)
  inlier_mask = ransac.inlier_mask_           # True for samples classified as inliers
  outlier_mask = np.logical_not(inlier_mask)  # element-wise NOT of the boolean mask
  line_X = np.arange(3, 10, 1)
  line_y_ransac = ransac.predict(line_X[:, np.newaxis])  # predictions on a synthetic grid
  plt.scatter(X[inlier_mask], y[inlier_mask], c='blue', marker='o', label='Inliers')
  plt.scatter(X[outlier_mask], y[outlier_mask], c='lightgreen', marker='s', label='Outliers')
  plt.plot(line_X, line_y_ransac, color='red')  # regression line over the synthetic grid
  plt.plot(X[inlier_mask], ransac.predict(X[inlier_mask]), color='m')  # same line over the inliers
  plt.xlabel('Average number of rooms [RM]')
  plt.ylabel('Price in $1000\'s [MEDV]')
  plt.legend(loc='upper left')
  plt.tight_layout()
  # plt.savefig('./figures/ransac_fit.png', dpi=300)
  plt.show()

  1. print('Slope: %.3f' % ransac.estimator_.coef_[0])
  2. print('Intercept: %.3f' % ransac.estimator_.intercept_)
  3. #Output:
  4. #Slope: 9.621
  5. #Intercept: -37.137

评估回归模型的性能

  1. if Version(sklearn_version) < '0.18':
  2. from sklearn.cross_validation import train_test_split
  3. else:
  4. from sklearn.model_selection import train_test_split
  5. X = df.iloc[:, :-1].values #除去最后一列,其余都作为特征考虑范围
  6. y = df['MEDV'].values
  7. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
  8. slr = LinearRegression()
  9. slr.fit(X_train, y_train)
  10. y_train_pred = slr.predict(X_train)
  11. y_test_pred = slr.predict(X_test)
  12. plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', label='Training data')
  13. plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', label='Test data')
  14. #预测值与偏差的关系
  15. plt.xlabel('Predicted values')
  16. plt.ylabel('Residuals')
  17. plt.legend(loc='upper left')
  18. plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
  19. plt.xlim([-10, 50])
  20. plt.tight_layout()
  21. # plt.savefig('./figures/slr_residuals.png', dpi=300)
  22. plt.show()

  1. from sklearn.metrics import r2_score
  2. from sklearn.metrics import mean_squared_error #均方误差回归损失
  3. #http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
  4. print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),mean_squared_error(y_test, y_test_pred)))
  5. print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test, y_test_pred)))
  6. #Output:
  7. #MSE train: 19.958, test: 27.196
  8. #R^2 train: 0.765, test: 0.673

添加正则化项

  1. from sklearn.linear_model import Lasso
  2. #http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso
  3. lasso = Lasso(alpha=0.1) #L1正则化系数
  4. lasso.fit(X_train, y_train)
  5. y_train_pred = lasso.predict(X_train)
  6. y_test_pred = lasso.predict(X_test)
  7. print(lasso.coef_)
  8. print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),mean_squared_error(y_test, y_test_pred)))
  9. print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test, y_test_pred)))
  10. #Output:
  11. #[-0.11311792 0.04725111 -0.03992527 0.96478874 -0. 3.72289616
  12. # -0.02143106 -1.23370405 0.20469 -0.0129439 -0.85269025 0.00795847
  13. # -0.52392362]
  14. #MSE train: 20.926, test: 28.876
  15. #R^2 train: 0.753, test: 0.653

多项式回归与曲线拟合

  X = np.array([258.0, 270.0, 294.0, 320.0, 342.0, 368.0, 396.0, 446.0, 480.0, 586.0])[:, np.newaxis]
  y = np.array([236.4, 234.4, 252.8, 298.6, 314.2, 342.2, 360.8, 368.0, 391.2, 390.8])
  # PolynomialFeatures only generates polynomial feature columns; the fit
  # itself is still done by a linear model.
  from sklearn.preprocessing import PolynomialFeatures
  # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures
  lr = LinearRegression()
  pr = LinearRegression()
  quadratic = PolynomialFeatures(degree=2)  # degree = highest power generated
  # For the single column X this builds [1, X, X^2].
  # For example, if an input sample is two dimensional and of the form [a, b],
  # the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].
  X_quad = quadratic.fit_transform(X)

  # fit linear features
  lr.fit(X, y)
  X_fit = np.arange(250, 600, 10)[:, np.newaxis]  # synthetic grid to predict on; X is the training data
  y_lin_fit = lr.predict(X_fit)

  # fit quadratic features
  pr.fit(X_quad, y)
  # The transformer was already fitted on the training X above, so the
  # prediction grid is only transformed, not re-fitted.
  y_quad_fit = pr.predict(quadratic.transform(X_fit))

  # plot results
  plt.scatter(X, y, label='training points')
  plt.plot(X_fit, y_lin_fit, label='linear fit', linestyle='--')
  plt.plot(X_fit, y_quad_fit, label='quadratic fit')
  plt.legend(loc='upper left')
  plt.tight_layout()
  # plt.savefig('./figures/poly_example.png', dpi=300)
  plt.show()

  1. #验证pr.fit(X_quad, y)可以借助LinearRegression类中的判定边界构造部分求多项式系数
  2. pr.fit(X_quad, y)
  3. pr.coef_
  4. #Output:
  5. #array([ 0.00000000e+00, 2.39893018e+00, -2.25020109e-03])
  6. y_lin_pred = lr.predict(X) #用训练数据集训练的线性模型,对训练数据集进行线性预测(因为线性模型建模时,就不是100%学习到了精确模型)
  7. y_quad_pred = pr.predict(X_quad)
  8. print('Training MSE linear: %.3f, quadratic: %.3f' % (mean_squared_error(y, y_lin_pred),mean_squared_error(y, y_quad_pred)))
  9. print('Training R^2 linear: %.3f, quadratic: %.3f' % (r2_score(y, y_lin_pred),r2_score(y, y_quad_pred)))
  10. #Output:
  11. #Training MSE linear: 569.780, quadratic: 61.330
  12. #Training R^2 linear: 0.832, quadratic: 0.982

房价数据集的非线性拟合

  X = df[['LSTAT']].values
  y = df['MEDV'].values
  regr = LinearRegression()

  # create quadratic and cubic features from the training data
  quadratic = PolynomialFeatures(degree=2)
  cubic = PolynomialFeatures(degree=3)
  X_quad = quadratic.fit_transform(X)
  X_cubic = cubic.fit_transform(X)

  # X, y are used for fitting and for the R^2 scores; X_fit is a grid used
  # only to draw the fitted curves.
  X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis]

  regr = regr.fit(X, y)
  y_lin_fit = regr.predict(X_fit)
  linear_r2 = r2_score(y, regr.predict(X))

  # The transformers are already fitted on X, so the grid is only transformed.
  regr = regr.fit(X_quad, y)
  y_quad_fit = regr.predict(quadratic.transform(X_fit))
  quadratic_r2 = r2_score(y, regr.predict(X_quad))

  regr = regr.fit(X_cubic, y)
  y_cubic_fit = regr.predict(cubic.transform(X_fit))
  cubic_r2 = r2_score(y, regr.predict(X_cubic))

  # plot results
  plt.scatter(X, y, label='training points', color='lightgray')
  plt.plot(X_fit, y_lin_fit, label='linear (d=1), $R^2=%.2f$' % linear_r2, color='blue', lw=2, linestyle=':')
  plt.plot(X_fit, y_quad_fit, label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2, color='red', lw=2, linestyle='-')
  plt.plot(X_fit, y_cubic_fit, label='cubic (d=3), $R^2=%.2f$' % cubic_r2, color='green', lw=2, linestyle='--')
  plt.xlabel('% lower status of the population [LSTAT]')
  plt.ylabel('Price in $1000\'s [MEDV]')
  plt.legend(loc='upper right')
  plt.tight_layout()
  # plt.savefig('./figures/polyhouse_example.png', dpi=300)
  plt.show()

使用基于树的算法进行回归:回归树

  1. import numpy
  2. from sklearn.tree import DecisionTreeRegressor
  3. #http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
  4. X = df[['LSTAT']].values
  5. y = df['MEDV'].values
  6. tree = DecisionTreeRegressor(max_depth=3) #max_depth设置树深
  7. tree.fit(X, y) #参考官网attributes部分了解建模后得到的各种属性:树,使用的特征及特征重要性
  8. sort_idx = X.flatten().argsort() #X中最小元素到最大元素的索引构成的向量
  9. lin_regplot(X[sort_idx], y[sort_idx], tree)
  10. plt.xlabel('% lower status of the population [LSTAT]')
  11. plt.ylabel('Price in $1000\'s [MEDV]')
  12. # plt.savefig('./figures/tree_regression.png', dpi=300)
  13. plt.show()
  14. #水平红线表示c值,竖直红线表示特征列选择的切分点

随机森林回归

  from sklearn.ensemble import RandomForestRegressor

  X = df.iloc[:, :-1].values  # every column except the last (MEDV) is a feature
  y = df['MEDV'].values
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
  # scikit-learn 1.0 renamed the 'mse' criterion to 'squared_error' and later
  # removed the old spelling.
  if Version(sklearn_version) < '1.0':
      forest_criterion = 'mse'
  else:
      forest_criterion = 'squared_error'
  forest = RandomForestRegressor(n_estimators=1000, criterion=forest_criterion, random_state=1, n_jobs=-1)
  forest.fit(X_train, y_train)
  y_train_pred = forest.predict(X_train)
  y_test_pred = forest.predict(X_test)
  print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
  print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
  # Output:
  # MSE train: 1.642, test: 11.052
  # R^2 train: 0.979, test: 0.878

  # Residuals (prediction minus truth) against the predicted value.
  plt.scatter(y_train_pred, y_train_pred - y_train, c='black', marker='o', s=35, alpha=0.5, label='Training data')
  plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', s=35, alpha=0.7, label='Test data')
  plt.xlabel('Predicted values')
  plt.ylabel('Residuals')
  plt.legend(loc='upper left')
  plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
  plt.xlim([-10, 50])
  plt.tight_layout()
  # plt.savefig('./figures/slr_residuals.png', dpi=300)
  plt.show()

关注公众号:瑞行AI,欢迎交流AI算法、数据分析、leetcode刷题等技术,提供技术方案咨询和就业指导服务!

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/127981
推荐阅读
相关标签
  

闽ICP备14008679号