Python机器学习09——随机森林_python 随机森林

参考书目:陈强.机器学习及Python应用. 北京:高等教育出版社, 2021.





  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib.pyplot as plt
  4. import seaborn as sns
  5. from sklearn.model_selection import train_test_split
  6. from sklearn.model_selection import KFold, StratifiedKFold
  7. from sklearn.model_selection import GridSearchCV
  8. from sklearn.metrics import mean_squared_error
  9. from sklearn.linear_model import LinearRegression
  10. from sklearn.linear_model import LogisticRegression
  11. from sklearn.tree import DecisionTreeClassifier
  12. from sklearn.tree import DecisionTreeRegressor
  13. from sklearn.ensemble import BaggingClassifier
  14. from sklearn.ensemble import BaggingRegressor
  15. from sklearn.ensemble import RandomForestClassifier
  16. from sklearn.ensemble import RandomForestRegressor
  17. from sklearn.datasets import load_iris, load_boston
  18. from sklearn.metrics import cohen_kappa_score
  19. from sklearn.metrics import plot_roc_curve
  20. from sklearn.inspection import plot_partial_dependence
  21. from mlxtend.plotting import plot_decision_regions
  # Motorcycle example (tree vs. bagging): read the raw data.
  mcycle = pd.read_csv('mcycle.csv')
  mcycle.head()
  # Response is the 'accel' column; the single predictor 'times' is reshaped
  # into an (n_samples, 1) column array.
  y = mcycle.accel
  X = mcycle.times.to_numpy().reshape(-1, 1)


  # Single regression tree: the pruning strength ccp_alpha is chosen by
  # 10-fold CV over the cost-complexity pruning path of the full tree.
  model = DecisionTreeRegressor(random_state=123)
  path = model.cost_complexity_pruning_path(X, y)
  kfold = KFold(n_splits=10, shuffle=True, random_state=1)
  param_grid = {'ccp_alpha': path.ccp_alphas}
  model = GridSearchCV(
      DecisionTreeRegressor(random_state=123),
      param_grid,
      cv=kfold,
  )
  model.fit(X, y)
  pred_tree = model.predict(X)
  # Score on the same data the search was fitted on.
  print(model.score(X, y))
  # Raw observations with the fitted step function drawn on top.
  sns.scatterplot(data=mcycle, x='times', y='accel', alpha=0.6)
  plt.plot(X, pred_tree, 'b')
  plt.title('Single Tree Estimation')


  # Bagging estimation: 500 bootstrapped regression trees.
  # The base learner is passed positionally because its keyword was renamed
  # from `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was
  # removed in 1.4); the positional form works on both sides of the rename.
  model = BaggingRegressor(DecisionTreeRegressor(random_state=123), n_estimators=500, random_state=0)
  pred_bag = model.fit(X, y).predict(X)
  # Score on the same data the ensemble was fitted on.
  print(model.score(X, y))
  sns.scatterplot(x='times', y='accel', data=mcycle, alpha=0.6)
  plt.plot(X, pred_bag, 'b')
  plt.title('Bagging Estimation')
  # Alternatively, one could use 'RandomForestRegressor', which by default
  # sets max_features = n_features that is de facto bagging. The results are slightly different.
  # The advantage of 'BaggingRegressor' is the option to use different base learners.



  # Boston housing data: bagging with out-of-bag (OOB) evaluation vs. OLS.
  # NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
  # in 1.2, so this snippet needs scikit-learn < 1.2 or another data source.
  Boston = load_boston()
  X = pd.DataFrame(Boston.data, columns=Boston.feature_names)
  y = Boston.target
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
  # Bagging estimator with OOB scoring enabled. The base learner is passed
  # positionally because its keyword was renamed from `base_estimator` to
  # `estimator` in scikit-learn 1.2.
  model = BaggingRegressor(DecisionTreeRegressor(random_state=123), n_estimators=500, oob_score=True, random_state=0)
  model.fit(X_train, y_train)
  # OOB predictions for the training observations.
  pred_oob = model.oob_prediction_
  # Results are printed explicitly: a bare expression is discarded when the
  # snippet runs as a script or is not the last line of a notebook cell.
  print('OOB MSE:', mean_squared_error(y_train, pred_oob))
  # oob_score_ is computed on the training set from OOB predictions,
  # not on the test set.
  print('OOB R2:', model.oob_score_)
  print('Bagging test R2:', model.score(X_test, y_test))
  # Comparison with OLS on the same split.
  model = LinearRegression().fit(X_train, y_train)
  print('OLS test R2:', model.score(X_test, y_test))


  # OOB MSE as a function of the number of bagged trees.
  # The loop body is indented explicitly; the flattened original had no
  # loop body at all.
  tree_counts = range(100, 301, 10)
  oob_errors = []
  for n_estimators in tree_counts:
      # Base learner passed positionally: its keyword was renamed from
      # `base_estimator` to `estimator` in scikit-learn 1.2.
      model = BaggingRegressor(DecisionTreeRegressor(random_state=123),
                               n_estimators=n_estimators, n_jobs=-1, oob_score=True, random_state=0)
      model.fit(X_train, y_train)
      pred_oob = model.oob_prediction_
      oob_errors.append(mean_squared_error(y_train, pred_oob))
  plt.plot(tree_counts, oob_errors)
  plt.xlabel('Number of Trees')
  plt.ylabel('OOB MSE')
  plt.title('Bagging OOB Errors')




  # Random forest for regression on the Boston housing data.
  # max_features is the number of features considered at each split;
  # here it is set to one third of the number of predictors.
  max_features = X_train.shape[1] // 3
  print(max_features)
  # Fit and evaluate. (This heading was bare text in the original and would
  # raise a NameError when executed, so it is now a comment.)
  model = RandomForestRegressor(n_estimators=5000, max_features=max_features, random_state=0)
  model.fit(X_train, y_train)
  print(model.score(X_test, y_test))
  # Visualize prediction fit: predicted vs. actual with a 45-degree reference line.
  pred = model.predict(X_test)
  plt.scatter(pred, y_test, alpha=0.6)
  w = np.linspace(min(pred), max(pred), 100)
  plt.plot(w, w)
  plt.xlabel('pred')
  plt.ylabel('y_test')
  plt.title('Random Forest Prediction')


  # Feature importance plot: horizontal bars in ascending order of importance.
  importances = model.feature_importances_
  sorted_index = importances.argsort()
  positions = np.arange(X.shape[1])
  plt.barh(positions, importances[sorted_index])
  # Label each bar with the matching column name.
  plt.yticks(positions, X.columns[sorted_index])
  plt.xlabel('Feature Importance')
  plt.ylabel('Feature')
  plt.title('Random Forest')
  plt.tight_layout()


  # Partial dependence plots of the fitted model for two predictors.
  from sklearn.inspection import PartialDependenceDisplay
  pdp_features = ['LSTAT', 'RM']
  PartialDependenceDisplay.from_estimator(model, X, pdp_features)


  # Choose max_features by test-set R2, trying every value from 1 to the
  # number of predictors. The loop body is indented explicitly (the flattened
  # original had none) and the candidate range is built once.
  candidates = range(1, X.shape[1] + 1)
  scores = []
  for max_features in candidates:
      model = RandomForestRegressor(max_features=max_features,
                                    n_estimators=500, random_state=123)
      model.fit(X_train, y_train)
      score = model.score(X_test, y_test)
      scores.append(score)
  # Position of the best score, mapped back to the max_features value.
  index = np.argmax(scores)
  print(candidates[index])
  plt.plot(candidates, scores, 'o-')
  plt.axvline(candidates[index], linestyle='--', color='k', linewidth=1)
  plt.xlabel('max_features')
  plt.ylabel('R2')
  plt.title('Choose max_features via Test Set')



  # Random forest: test MSE for 1..300 trees.
  # The loop body is indented explicitly; the flattened original had none.
  scores_rf = []
  for n_estimators in range(1, 301):
      model = RandomForestRegressor(max_features=9,
                                    n_estimators=n_estimators, random_state=123)
      model.fit(X_train, y_train)
      pred = model.predict(X_test)
      mse = mean_squared_error(y_test, pred)
      scores_rf.append(mse)
  # Bagging: test MSE for 1..300 trees.
  # The loop body is indented explicitly; the flattened original had none.
  scores_bag = []
  for n_estimators in range(1, 301):
      # Base learner passed positionally: its keyword was renamed from
      # `base_estimator` to `estimator` in scikit-learn 1.2.
      model = BaggingRegressor(DecisionTreeRegressor(random_state=123), n_estimators=n_estimators, random_state=0)
      model.fit(X_train, y_train)
      pred = model.predict(X_test)
      mse = mean_squared_error(y_test, pred)
      scores_bag.append(mse)
  # Single pruned decision tree as a benchmark.
  # The tree used to compute the pruning path is seeded with the same
  # random_state as the grid-searched tree, so the candidate alphas are
  # reproducible and belong to the tree that is actually tuned.
  model = DecisionTreeRegressor(random_state=123)
  path = model.cost_complexity_pruning_path(X_train, y_train)
  param_grid = {'ccp_alpha': path.ccp_alphas}
  kfold = KFold(n_splits=10, shuffle=True, random_state=1)
  model = GridSearchCV(DecisionTreeRegressor(random_state=123), param_grid, cv=kfold, scoring='neg_mean_squared_error')
  model.fit(X_train, y_train)
  # The scorer returns negative MSE, so flip the sign to get test MSE.
  score_tree = -model.score(X_test, y_test)
  # Constant series so the benchmark can be drawn against 1..300 trees.
  scores_tree = [score_tree] * 300
  # Plot the three test-error curves against the number of trees.
  n_trees = range(1, 301)
  for series, line_style, legend_label in (
      (scores_tree, 'k--', 'Single Tree'),
      (scores_bag, 'k-', 'Bagging'),
      (scores_rf, 'b-', 'Random Forest'),
  ):
      plt.plot(n_trees, series, line_style, label=legend_label)
  plt.xlabel('Number of Trees')
  plt.ylabel('MSE')
  plt.title('Test Error')
  plt.legend()


  # Tune max_features by 10-fold CV on the training set, scored by negative MSE.
  max_features = range(1, X.shape[1] + 1)
  kfold = KFold(n_splits=10, shuffle=True, random_state=1)
  param_grid = {'max_features': max_features}
  forest = RandomForestRegressor(n_estimators=300, random_state=123)
  model = GridSearchCV(
      forest,
      param_grid,
      cv=kfold,
      scoring='neg_mean_squared_error',
      return_train_score=True,
  )
  model.fit(X_train, y_train)
  model.best_params_
  # Flip the sign of the mean CV score to obtain MSE.
  cv_mse = -model.cv_results_['mean_test_score']
  best_position = np.argmin(cv_mse)
  plt.plot(max_features, cv_mse, 'o-')
  plt.axvline(max_features[best_position], linestyle='--', color='k', linewidth=1)
  plt.xlabel('max_features')
  plt.ylabel('MSE')
  plt.title('CV Error for Random Forest')



  # Read the Sonar data and take a first look.
  Sonar = pd.read_csv('Sonar.csv')
  Sonar.shape
  Sonar.head(2)
  # The last column is the response; all other columns are predictors.
  X, y = Sonar.iloc[:, :-1], Sonar.iloc[:, -1]
  # Heatmap of the pairwise correlations between the predictors.
  corr_matrix = X.corr()
  sns.heatmap(corr_matrix, cmap='Blues')
  plt.title('Correlation Matrix')
  plt.tight_layout()


  # Stratified split that holds out 50 observations for testing.
  X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=50, random_state=1)
  # Single pruned tree as benchmark. The tree used for the pruning path is
  # seeded like the grid-searched tree so the candidate alphas are reproducible.
  model = DecisionTreeClassifier(random_state=123)
  path = model.cost_complexity_pruning_path(X_train, y_train)
  param_grid = {'ccp_alpha': path.ccp_alphas}
  kfold = KFold(n_splits=10, shuffle=True, random_state=1)
  model = GridSearchCV(DecisionTreeClassifier(random_state=123), param_grid, cv=kfold)
  model.fit(X_train, y_train)
  # Scores are printed explicitly: a bare expression is discarded when it is
  # not the last line of a notebook cell or when run as a script.
  print('Single tree test accuracy:', model.score(X_test, y_test))
  # Random forest
  model = RandomForestClassifier(n_estimators=500, max_features='sqrt', random_state=123)
  model.fit(X_train, y_train)
  print('Random forest test accuracy:', model.score(X_test, y_test))


  # Choose the optimal mtry (max_features) via stratified 10-fold CV.
  # The grid search is run on an indicator for the second dummy column of the
  # label; the final model below is fitted on the raw labels.
  y_train_dummy = pd.get_dummies(y_train)
  y_train_dummy = y_train_dummy.iloc[:, 1]
  param_grid = {'max_features': range(1, 11)}
  kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
  model = GridSearchCV(RandomForestClassifier(n_estimators=300, random_state=123), param_grid, cv=kfold)
  model.fit(X_train, y_train_dummy)
  # Use the value the search selected rather than a hard-coded constant, so
  # the refit stays in sync with the CV result (the original author reports 8).
  best_max_features = model.best_params_['max_features']
  print(best_max_features)
  model = RandomForestClassifier(n_estimators=500, max_features=best_max_features, random_state=123)
  model.fit(X_train, y_train)
  print(model.score(X_test, y_test))
  # Variable importance plot: horizontal bars in ascending order of importance.
  importances = model.feature_importances_
  sorted_index = importances.argsort()
  positions = np.arange(X.shape[1])
  plt.barh(positions, importances[sorted_index])
  plt.yticks(positions, X.columns[sorted_index])
  plt.xlabel('Feature Importance')
  plt.ylabel('Feature')
  plt.title('Random Forest')


  # Prediction performance: cross-tabulate actual (rows) against predicted
  # (columns) classes on the test set.
  pred = model.predict(X_test)
  table = pd.crosstab(
      index=y_test,
      columns=pred,
      rownames=['Actual'],
      colnames=['Predicted'],
  )
  table



  # Classification metrics from the confusion table.
  # Assumes `table` is the 2x2 Actual-by-Predicted crosstab (rows = actual,
  # columns = predicted), with index 1 treated as the positive class.
  table = np.array(table)
  Accuracy = (table[0, 0] + table[1, 1]) / np.sum(table)
  print('Accuracy:', Accuracy)
  # Sensitivity: true positives over all actual positives (row 1).
  Sensitivity = table[1, 1] / (table[1, 0] + table[1, 1])
  print('Sensitivity:', Sensitivity)
  # Specificity: true negatives over all actual negatives (row 0).
  Specificity = table[0, 0] / (table[0, 0] + table[0, 1])
  print('Specificity:', Specificity)
  # True positives over all predicted positives (column 1) is precision;
  # the original stored this ratio under the name `Recall`.
  Precision = table[1, 1] / (table[0, 1] + table[1, 1])
  print('Precision:', Precision)
  # Recall is the same quantity as sensitivity.
  Recall = Sensitivity
  print('Recall:', Recall)
  print('Kappa:', cohen_kappa_score(y_test, pred))
  # ROC curve for the fitted model on the test set.
  # plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
  # RocCurveDisplay.from_estimator is its replacement.
  from sklearn.metrics import RocCurveDisplay
  RocCurveDisplay.from_estimator(model, X_test, y_test)
  # 45-degree reference line.
  x = np.linspace(0, 1, 100)
  plt.plot(x, x, 'k--', linewidth=1)
  plt.title('ROC Curve for Random Forest')



  # Decision boundary of a random forest on two iris features.
  X, y = load_iris(return_X_y=True)
  # Keep only columns 2 and 3 (labelled petal length and width on the axes
  # below) so the regions can be drawn in two dimensions.
  X2 = X[:, 2:4]
  model = RandomForestClassifier(
      n_estimators=500,
      max_features=1,
      random_state=1,
  )
  model.fit(X2, y)
  model.score(X2, y)
  plot_decision_regions(X2, y, model)
  plt.xlabel('petal_length')
  plt.ylabel('petal_width')
  plt.title('Decision Boundary for Random Forest')


