当前位置:   article > 正文

数据分析 | 特征重要性分析 | 树模型、SHAP值法











  1. import optuna
  2. from sklearn.model_selection import KFold,cross_validate
  3. import pandas as pd
  4. import numpy as np
  5. from sklearn.ensemble import GradientBoostingRegressor
  6. from sklearn.tree import DecisionTreeRegressor
  7. from sklearn.ensemble import RandomForestRegressor
  8. import time
  # Start the wall-clock timer for the whole tuning run.
  start_time = time.time()

  # Load the data set: columns 0-7 are the features, column 8 is the target.
  data = pd.read_csv(r'D:\2暂存文件\Sth with Py\重要性程度分析\data.csv')
  feature_columns = list(range(8))
  target_column = 8
  X = data.iloc[:, feature_columns]
  y = data.iloc[:, target_column]
  14. '''贝叶斯随机森林寻优'''
  15. # 定义目标函数和参数空间
  def optuna_objective(trial):
      """Optuna objective for the random forest: mean 5-fold CV score.

      Samples ``n_estimators``, ``max_depth`` and ``max_features`` from the
      trial, builds a ``RandomForestRegressor`` with them and cross-validates
      it on the module-level ``X`` and ``y``.

      Args:
          trial: The Optuna trial used to sample the hyperparameters.

      Returns:
          The mean ``neg_mean_squared_error`` test score over the five folds.
      """
      # Search space. ``step`` is passed by keyword because Optuna documents
      # it as a keyword-only argument of ``suggest_int``.
      n_estimators = trial.suggest_int('n_estimators', 10, 30, step=1)
      max_depth = trial.suggest_int('max_depth', 10, 30, step=1)
      # A split cannot consider more features than X has columns, so the
      # upper bound follows the width of X instead of a fixed 10-30 range
      # (which lay entirely above the eight feature columns).
      max_features = trial.suggest_int('max_features', 1, X.shape[1], step=1)

      # Estimator under evaluation.
      reg = RandomForestRegressor(
          n_estimators=n_estimators,
          max_depth=max_depth,
          max_features=max_features,
          random_state=1412,
          verbose=False,
          n_jobs=-1,
      )

      # Shuffled 5-fold cross-validation scored with negative MSE; any fit
      # error is re-raised instead of being turned into a score.
      cv = KFold(n_splits=5, shuffle=True, random_state=1412)
      validation_loss = cross_validate(
          reg, X, y,
          scoring='neg_mean_squared_error',
          cv=cv,
          verbose=True,
          n_jobs=-1,
          error_score='raise',
      )
      return np.mean(validation_loss['test_score'])
  37. # 定义优化目标函数
  def optimizer_optuna(n_trials):
      """Run a TPE study over ``optuna_objective`` and report the best trial.

      Args:
          n_trials: Number of trials to run.

      Returns:
          A ``(best_params, best_score)`` tuple: the parameter dict of the
          best trial and its scalar objective value.
      """
      study = optuna.create_study(
          sampler=optuna.samplers.TPESampler(n_startup_trials=20, n_ei_candidates=30),
          direction='maximize',
      )
      study.optimize(optuna_objective, n_trials=n_trials, show_progress_bar=True)

      best_trial = study.best_trial
      # Use ``value`` (the scalar objective) rather than ``values`` (a list),
      # so the printed and saved score is a plain number.
      print('随机森林最优参数:\nbest_params:', best_trial.params,
            '随机森林最优得分:\nbest_score:', best_trial.value,
            '\n')
      return best_trial.params, best_trial.value
  import warnings

  # Ignore warnings carrying this message and keep only Optuna's error logs.
  warnings.filterwarnings('ignore',message='The objective has been evaluated at this point before trails')
  optuna.logging.set_verbosity(optuna.logging.ERROR)

  # Run the random-forest search for 100 trials.
  best_params, best_score = optimizer_optuna(100)

  # Save the best parameters (one "name: value" line each) and the best score.
  with open(r'D:\2暂存文件\Sth with Py\重要性程度分析\随机森林贝叶斯优化结果.txt', 'w') as f:
      f.writelines(f'{key}: {value}\n' for key, value in best_params.items())
      f.write(f'Best Score: {best_score}\n')
  55. '''贝叶斯决策树寻优'''
  56. # 定义目标函数和参数空间
  def optuna_objective(trial):
      """Optuna objective for the decision tree: mean 5-fold CV score.

      Samples ``max_depth``, ``min_samples_split``, ``min_samples_leaf`` and
      ``max_features`` from the trial, builds a ``DecisionTreeRegressor`` with
      them and cross-validates it on the module-level ``X`` and ``y``.

      Args:
          trial: The Optuna trial used to sample the hyperparameters.

      Returns:
          The mean ``neg_mean_squared_error`` test score over the five folds.
      """
      # Search space. ``step`` is passed by keyword because Optuna documents
      # it as a keyword-only argument of ``suggest_int``.
      max_depth = trial.suggest_int('max_depth', 10, 30, step=1)
      min_samples_split = trial.suggest_int('min_samples_split', 10, 30, step=1)
      min_samples_leaf = trial.suggest_int('min_samples_leaf', 10, 30, step=1)
      # A split cannot consider more features than X has columns, so the
      # upper bound follows the width of X instead of a fixed 10-30 range
      # (which lay entirely above the eight feature columns).
      max_features = trial.suggest_int('max_features', 1, X.shape[1], step=1)

      # Estimator under evaluation.
      dtr = DecisionTreeRegressor(
          max_depth=max_depth,
          max_features=max_features,
          min_samples_split=min_samples_split,
          min_samples_leaf=min_samples_leaf,
          random_state=1412,
      )

      # Shuffled 5-fold cross-validation scored with negative MSE; any fit
      # error is re-raised instead of being turned into a score.
      cv = KFold(n_splits=5, shuffle=True, random_state=1412)
      validation_loss = cross_validate(
          dtr, X, y,
          scoring='neg_mean_squared_error',
          cv=cv,
          verbose=True,
          n_jobs=-1,
          error_score='raise',
      )
      return np.mean(validation_loss['test_score'])
  79. # 定义优化目标函数
  def optimizer_optuna(n_trials):
      """Run a TPE study over ``optuna_objective`` and report the best trial.

      Args:
          n_trials: Number of trials to run.

      Returns:
          A ``(best_params, best_score)`` tuple: the parameter dict of the
          best trial and its scalar objective value.
      """
      study = optuna.create_study(
          sampler=optuna.samplers.TPESampler(n_startup_trials=20, n_ei_candidates=30),
          direction='maximize',
      )
      study.optimize(optuna_objective, n_trials=n_trials, show_progress_bar=True)

      best_trial = study.best_trial
      # Use ``value`` (the scalar objective) rather than ``values`` (a list),
      # so the printed and saved score is a plain number.
      print('决策树最优参数:\nbest_params:', best_trial.params,
            '决策树最优得分:\nbest_score:', best_trial.value,
            '\n')
      return best_trial.params, best_trial.value
  import warnings

  # Ignore warnings carrying this message and keep only Optuna's error logs.
  warnings.filterwarnings('ignore',message='The objective has been evaluated at this point before trails')
  optuna.logging.set_verbosity(optuna.logging.ERROR)

  # Run the decision-tree search for 100 trials.
  best_params, best_score = optimizer_optuna(100)

  # Save the best parameters (one "name: value" line each) and the best score.
  with open(r'D:\2暂存文件\Sth with Py\重要性程度分析\决策树贝叶斯优化结果.txt', 'w') as f:
      f.writelines(f'{key}: {value}\n' for key, value in best_params.items())
      f.write(f'Best Score: {best_score}\n')
  97. '''贝叶斯梯度提升树寻优'''
  98. # 定义目标函数和参数空间
  def optuna_objective(trial):
      """Optuna objective for gradient boosting: mean 5-fold CV score.

      Samples ``max_depth``, ``learning_rate`` (log scale), ``n_estimators``
      and ``subsample`` from the trial, builds a ``GradientBoostingRegressor``
      with them and cross-validates it on the module-level ``X`` and ``y``.

      Args:
          trial: The Optuna trial used to sample the hyperparameters.

      Returns:
          The mean ``neg_mean_squared_error`` test score over the five folds.
      """
      # Sampled hyperparameters, keyed by the estimator's argument names.
      sampled = {
          'max_depth': trial.suggest_int('max_depth', 3, 10),
          'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
          'n_estimators': trial.suggest_int('n_estimators', 50, 200),
          'subsample': trial.suggest_float('subsample', 0.5, 1.0),
      }
      booster = GradientBoostingRegressor(random_state=1412, **sampled)

      # Shuffled 5-fold cross-validation scored with negative MSE; any fit
      # error is re-raised instead of being turned into a score.
      folds = KFold(n_splits=5, shuffle=True, random_state=1412)
      cv_result = cross_validate(
          booster, X, y,
          scoring='neg_mean_squared_error',
          cv=folds,
          verbose=True,
          n_jobs=-1,
          error_score='raise',
      )
      return np.mean(cv_result['test_score'])
  121. # 定义优化目标函数
  def optimizer_optuna(n_trials):
      """Run a TPE study over ``optuna_objective`` and report the best trial.

      Args:
          n_trials: Number of trials to run.

      Returns:
          A ``(best_params, best_score)`` tuple: the parameter dict of the
          best trial and its objective value.
      """
      tpe = optuna.samplers.TPESampler(n_startup_trials=20, n_ei_candidates=30)
      study = optuna.create_study(sampler=tpe, direction='maximize')
      study.optimize(optuna_objective, n_trials=n_trials, show_progress_bar=True)

      winner_params = study.best_trial.params
      winner_score = study.best_trial.value
      print('梯度提升树最优参数:\nbest_params:', winner_params,
            '梯度提升树最优得分:\nbest_score:', winner_score, '\n')
      return winner_params, winner_score
  import warnings

  # Ignore warnings carrying this message and keep only Optuna's error logs.
  warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before trails')
  optuna.logging.set_verbosity(optuna.logging.ERROR)

  # Run the gradient-boosting search for 100 trials.
  best_params, best_score = optimizer_optuna(100)

  # Save the best parameters (one "name: value" line each) and the best score.
  with open(r'D:\2暂存文件\Sth with Py\重要性程度分析\梯度提升树贝叶斯优化结果.txt', 'w') as f:
      f.writelines(f'{key}: {value}\n' for key, value in best_params.items())
      f.write(f'Best Score: {best_score}\n')

  # Stop the timer and report the elapsed time since start_time.
  end_time = time.time()
  execution_time = end_time - start_time
  print("模型训练执行时间: {:.2f}秒".format(execution_time))


决策树最优参数与得分:
max_depth: 11、min_samples_split: 18、min_samples_leaf: 13、max_features: 12
Best Score: -0.24607607821335736


随机森林最优参数与得分:
n_estimators: 30、max_depth: 19、max_features: 24
Best Score: -0.18016603147647478


梯度提升树最优参数与得分:
max_depth: 9、learning_rate: 0.0418665547136736、n_estimators: 188、subsample: 0.676537978032126
Best Score: -0.16401985559476492




  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.ensemble import RandomForestRegressor
  4. from sklearn.tree import DecisionTreeRegressor
  5. from sklearn.ensemble import GradientBoostingRegressor
  6. import matplotlib.pyplot as plt
  7. import time
  # Plot text uses the SimHei font; minus signs are drawn with the ASCII
  # hyphen so they display with that font.
  plt.rcParams.update({
      'font.sans-serif': ['SimHei'],
      'axes.unicode_minus': False,
  })

  start_time = time.time()

  # Load the data set and keep the first 5000 rows: columns 0-7 are the
  # features, column 8 is the target.
  data = pd.read_csv(r'D:\2暂存文件\Sth with Py\重要性程度分析\data.csv')
  row_window = slice(0, 5000)
  X = data.iloc[row_window, list(range(8))]
  y = data.iloc[row_window, 8]
  '''决策树'''
  # Decision tree with hard-coded hyperparameters.
  # NOTE(review): max_features (12 here, 24 below) is larger than the eight
  # feature columns selected into X - confirm this is intended.
  model_dtr = DecisionTreeRegressor(
      max_depth=11,
      min_samples_split=18,
      min_samples_leaf=13,
      max_features=12,
  )
  model_dtr.fit(X, y)
  importances_dtr = model_dtr.feature_importances_

  '''随机森林'''
  # Random forest with hard-coded hyperparameters.
  model_rfr = RandomForestRegressor(
      n_estimators=30,
      max_depth=19,
      max_features=24,
  )
  model_rfr.fit(X, y)
  importances_rfr = model_rfr.feature_importances_

  '''梯度提升树'''
  # Gradient boosting with hard-coded hyperparameters.
  model_gbr = GradientBoostingRegressor(
      max_depth=9,
      learning_rate=0.0418665547136736,
      n_estimators=188,
      subsample=0.676537978032126,
  )
  model_gbr.fit(X, y)
  importances_gbr = model_gbr.feature_importances_

  # Feature names, listed in the same order as the columns of X.
  feature_names = [
      'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
      'Population', 'AveOccup', 'Latitude', 'Longitude',
  ]

  # Pair every feature name with its importance, model by model, and print.
  feature_importances_dtr = list(zip(feature_names, importances_dtr))
  feature_importances_rfr = list(zip(feature_names, importances_rfr))
  feature_importances_gbr = list(zip(feature_names, importances_gbr))
  print('决策树特征重要性:',feature_importances_dtr)
  print('\n随机森林特征重要性:',feature_importances_rfr)
  print('\n梯度提升树特征重要性:',feature_importances_gbr)
  39. '''绘图'''
  # Read the importance table that holds one row per feature and one column
  # per model (plus the average).
  tree = pd.read_excel(r'D:\2暂存文件\Sth with Py\重要性程度分析\树模型重要性.xlsx')
  labels=["特征","Decision trees","Random Forest","GBDT","Average"]

  # Convert the DataFrame columns to plain lists.
  x = tree['特征'].values.tolist()
  y1 = tree['Decision trees'].values.tolist()
  y2 = tree['Random Forest'].values.tolist()
  y3 = tree['GBDT'].values.tolist()
  y4 = tree['Average'].values.tolist()

  # Every series is placed on the same numeric positions (one slot per
  # feature) with its own offset. The GBDT bars previously used the string
  # labels directly while the others used numeric offsets, which only lined
  # up through the implicit category-to-index mapping.
  positions = np.arange(len(x))
  plt.bar(x=positions-0.2,height=y1,label="Decision trees",color="#AADCE0",width=0.1)
  plt.bar(x=positions-0.1,height=y2,label="Random Forest",color="#FFD06F",width=0.1)
  plt.bar(x=positions,height=y3,label="GBDT",color="#FFE6B7",width=0.1)
  plt.bar(x=positions+0.1,height=y4,label="Average",color="#E76254",width=0.1)
  plt.legend(loc="upper right")

  # One tick per feature, labelled with the feature name and rotated.
  ax = plt.gca()
  ax.set_xticks(positions)
  ax.set_xticklabels(x,rotation=45,ha="center")

  fig = plt.gcf()
  plt.xlabel('Feature',fontsize=15)
  plt.ylabel('Importance',fontsize=15)
  plt.title('Feature Importance',fontsize=18)
  # Leave extra room at the bottom for the rotated tick labels.
  fig.subplots_adjust(left=0.1,bottom=0.3)
  plt.savefig(r'D:\2暂存文件\Sth with Py\重要性程度分析\树模型重要性.png',dpi=600)

  # Report the elapsed time since start_time, then show the figure.
  end_time = time.time()
  execution_time = end_time - start_time
  print("模型训练执行时间: {:.2f}秒".format(execution_time))
  plt.show()




  1. import pandas as pd
  2. import shap
  3. from sklearn.preprocessing import MinMaxScaler
  4. from sklearn.ensemble import RandomForestRegressor
  5. import matplotlib.pyplot as plt
  6. import time
  # Plot text uses the SimHei font; minus signs are drawn with the ASCII
  # hyphen so they display with that font.
  plt.rcParams.update({
      'font.sans-serif': ['SimHei'],
      'axes.unicode_minus': False,
  })

  # Load the normalized data: columns 0-7 are the features, column 8 is the
  # target.
  normalized_data = pd.read_excel(r'D:\2暂存文件\Sth with Py\重要性程度分析\归一化数据.xlsx')
  X = normalized_data.iloc[:, list(range(8))]
  y = normalized_data.iloc[:, 8]

  start_time = time.time()

  # Fit the random forest with hard-coded hyperparameters.
  # NOTE(review): max_features=24 is larger than the eight feature columns
  # selected into X - confirm this is intended.
  model = RandomForestRegressor(n_estimators=30,max_depth=19,max_features=24)
  model.fit(X, y)

  # Build a SHAP explainer for the fitted model and compute SHAP values for
  # every row of X.
  explainer = shap.Explainer(model)
  shap_values = explainer.shap_values(X)

  # Report the time spent on fitting and SHAP computation.
  end_time = time.time()
  execution_time = end_time - start_time
  print("模型训练执行时间: {:.2f}秒".format(execution_time))

  # Default summary plot, followed by the bar-type summary plot.
  shap.summary_plot(shap_values, X)
  shap.summary_plot(shap_values, X, plot_type="bar")





