
Implementing XGBoost Regression in Python

1. This is a stripped-down XGBoost regression implementation whose only purpose is to make the underlying mechanics easy to follow. Sampling parameters such as subsample and colsample_bytree are left out, and all features are assumed to be continuous.

2. Unlike GBDT, XGBoost splits each CART tree node in the direction of maximum gain, and only splits when the gain is positive. With G = sum of g_i and H = sum of h_i over the samples falling into each child, the gain of a candidate split is

   Gain = G_L^2 / (H_L + lambda) + G_R^2 / (H_R + lambda) - (G_L + G_R)^2 / (H_L + H_R + lambda) - gamma

   (The XGBoost paper carries a factor of 1/2 in front of the first three terms; this code drops it, which only rescales the gain relative to gamma. A numeric sanity check of this formula appears right after this list.)

3. A split must additionally satisfy min_child_weight <= min(H_L, H_R); otherwise the node is not split. This acts as a form of pre-pruning.

4. Regression (squared loss): g_i = ypred_i - y_i, h_i = 1. Classification (log loss): g_i = ypred_i - y_i, h_i = ypred_i * (1 - ypred_i).

5. Each leaf produced by a split is assigned its weight with the leaf update formula w* = -G / (H + lambda).

6. Feature importance is computed with importance_type='gain':

   a. accumulate the total gain of all splits made on each feature;

   b. divide by the number of splits on that feature to get its average gain, then normalize so the importances sum to 1.
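
Before the full implementation, here is a minimal sanity check of points 2, 4 and 5. It evaluates one candidate split (feature 0 < 14, which sends the first two of the five regression rows used below to the left child) by hand; the row indices and hyperparameter values simply mirror the example further down.

import numpy as np

# squared loss, so g_i = ypred_i - y_i and h_i = 1 (point 4)
y_true = np.array([1.1, 1.3, 1.7, 1.8, 1.6])
y_pred = np.full(5, 0.5)              # every sample starts at base_score
g = y_pred - y_true                   # first-order gradients
h = np.ones_like(g)                   # second-order gradients, all 1

reg_lambda, gamma = 1.0, 0.05
left, right = [0, 1], [2, 3, 4]       # candidate split: feature 0 < 14

G_L, H_L = g[left].sum(), h[left].sum()
G_R, H_R = g[right].sum(), h[right].sum()

# gain formula from point 2 (no 1/2 factor, matching the implementation below)
gain = (G_L ** 2 / (H_L + reg_lambda)
        + G_R ** 2 / (H_R + reg_lambda)
        - (G_L + G_R) ** 2 / (H_L + H_R + reg_lambda)) - gamma

# leaf weights from point 5: w* = -G / (H + lambda)
w_left = -G_L / (H_L + reg_lambda)
w_right = -G_R / (H_R + reg_lambda)
print(gain, w_left, w_right)          # gain ~ -0.323, so this split is rejected

The gain comes out negative (about -0.323), so this particular candidate fails the gain > 0 test from point 2: with reg_lambda = 1 the regularization terms outweigh the small gradient differences across only five rows.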

Code

import numpy as np
from collections import defaultdict

n_estimators = 10       # number of trees
MAX_DEPTH = 2           # maximum tree depth
LR = 0.3                # learning rate
min_child_weight = 1    # minimum sum of hessians allowed in a leaf
base_score = 0.5        # initial prediction
GAMMA = 0.05            # complexity penalty per leaf

# Regression:     g_i = ypred_i - y_i, h_i = 1
# Classification: g_i = ypred_i - y_i, h_i = ypred_i * (1 - ypred_i)
class XGBoostModel:
    def __init__(self, target, n_estimators, lr, max_depth, min_child_weight,
                 reg_lambda, reg_alpha, base_score, gamma):
        '''
        :param target: regression if it starts with 'reg', otherwise classification
        :param n_estimators: number of CART trees
        :param lr: learning rate
        :param max_depth: maximum tree depth
        :param min_child_weight: minimum sum of hessians allowed in a leaf
        :param reg_lambda: L2 regularization strength
        :param reg_alpha: L1 regularization strength (not implemented yet)
        :param gamma: complexity penalty per leaf
        '''
        self.target = target
        self.n_estimators = n_estimators
        self.lr = lr
        self.max_depth = max_depth
        self.min_child_weight = min_child_weight
        self.reg_lambda = reg_lambda
        self.reg_alpha = reg_alpha
        self.tree_list = []   # tree_list[0] is the base score, the rest are trees
        self.gain_list = []   # one {feature: gain} dict per split, for importance
        if self.target.startswith('reg'):
            self.base_score = base_score
        else:
            # for classification the raw score lives in log-odds space
            self.base_score = np.log(base_score / (1 - base_score))
        self.gamma = gamma

    def calc_G(self, pred, y):
        # sum of first-order gradients; g_i = ypred_i - y_i for both losses
        return np.sum(pred - y)

    def calc_H(self, pred):
        # sum of second-order gradients; h_i = 1 for squared loss,
        # h_i = ypred_i * (1 - ypred_i) for log loss
        if self.target.startswith('reg'):
            return len(pred)
        return np.sum(pred * (1 - pred))

    @staticmethod
    def split_data(data, feat, val, data_type='classifier'):
        # split rows on feature `feat`: == for categorical, < val for continuous
        if data_type == 'classifier':
            mask = data[:, feat] == val
        else:
            mask = data[:, feat].astype(float) < val
        return data[mask], data[~mask], np.nonzero(mask)[0], np.nonzero(~mask)[0]

    @staticmethod
    def continuity_params_process(arr, feat):
        # candidate thresholds for a continuous feature:
        # midpoints between consecutive sorted unique values
        c_sort = sorted(set(arr[:, feat].astype(float)), reverse=True)
        return [(c_sort[i] + c_sort[i + 1]) / 2 for i in range(len(c_sort) - 1)]

    def select_split(self, data, Y):
        # pick the split with the largest gain; a split is admissible only if
        # gain > 0 and min_child_weight <= min(H_left, H_right)
        max_gain = -1
        best_feat = best_val = None
        left = right = left_y = right_y = None
        g_left = h_left = g_right = h_right = None
        data_type = 'continuity'
        for i in range(data.shape[1] - 1):
            for val in self.continuity_params_process(data, i):
                arr1, arr2, arr1_index, arr2_index = self.split_data(data, i, val, data_type)
                gain, G_left, H_left, G_right, H_right = self.calc_gain(
                    arr1, Y[arr1_index], arr2, Y[arr2_index])
                if (max_gain < gain) and (gain > 0) and (self.min_child_weight <= min(H_left, H_right)):
                    max_gain = gain
                    best_feat, best_val = i, val
                    left, right = arr1, arr2
                    left_y, right_y = Y[arr1_index], Y[arr2_index]
                    g_left, h_left = G_left, H_left
                    g_right, h_right = G_right, H_right
        if best_feat is None:
            # no admissible split: return the node's own G and H so the
            # caller can turn it into a leaf
            g = self.calc_G(data[:, -1], Y)
            h = self.calc_H(data[:, -1])
            return best_feat, best_val, left, right, left_y, right_y, g, h, g, h
        # record the raw gain (gamma added back) for feature importance
        self.gain_list.append({best_feat: max_gain + self.gamma})
        return best_feat, best_val, left, right, left_y, right_y, g_left, h_left, g_right, h_right

    def calc_gain(self, left, left_y, right, right_y):
        G_left = self.calc_G(left[:, -1], left_y)
        H_left = self.calc_H(left[:, -1])
        G_right = self.calc_G(right[:, -1], right_y)
        H_right = self.calc_H(right[:, -1])
        gain = (G_left ** 2 / (H_left + self.reg_lambda)
                + G_right ** 2 / (H_right + self.reg_lambda)
                - (G_left + G_right) ** 2 / (H_left + H_right + self.reg_lambda)) - self.gamma
        return gain, G_left, H_left, G_right, H_right

    def create_tree(self, data, Y, n=0):
        '''
        Build a tree recursively; n tracks the current depth so the tree
        never exceeds max_depth.
        '''
        tree = {}
        rows = set(map(tuple, data[:, :-1].tolist()))
        if len(rows) == 1:
            # all remaining rows are identical: leaf with weight -G / (H + lambda)
            g = self.calc_G(data[:, -1], Y)
            h = self.calc_H(data[:, -1])
            return -g / (h + self.reg_lambda)
        (best_feat, best_val, left, right, left_y, right_y,
         g_left, h_left, g_right, h_right) = self.select_split(data, Y)
        if best_feat is None:
            return -g_left / (h_left + self.reg_lambda)
        n += 1
        if n >= self.max_depth:
            tree[(best_feat, best_val, 'left')] = -g_left / (h_left + self.reg_lambda)
            tree[(best_feat, best_val, 'right')] = -g_right / (h_right + self.reg_lambda)
        else:
            tree[(best_feat, best_val, 'left')] = self.create_tree(left, left_y, n)
            tree[(best_feat, best_val, 'right')] = self.create_tree(right, right_y, n)
        return tree

    def fit(self, dataset):
        # the last column of `data` is reused to hold the current prediction;
        # the true labels stay in dataset[:, -1]
        data = dataset.copy()
        self.tree_list.append(self.base_score)
        for _ in range(self.n_estimators):
            for j in range(len(data)):
                data[j, -1] = self.predict(data[j, :-1])
            self.tree_list.append(self.create_tree(data, dataset[:, -1]))

    def predict_one(self, tree, X):
        # walk a single tree down to a leaf
        if not isinstance(tree, dict):
            return tree
        key = next(iter(tree))
        if X[key[0]] < key[1]:
            r = tree[(key[0], key[1], 'left')]
        else:
            r = tree[(key[0], key[1], 'right')]
        return self.predict_one(r, X)

    def predict(self, X):
        result = self.tree_list[0]
        for tree in self.tree_list[1:]:
            result += self.lr * self.predict_one(tree, X)
        if self.target.startswith('reg'):
            return result
        return 1 / (1 + np.exp(-result))  # sigmoid for classification

    def feat_importance(self):
        # importance_type='gain': average gain per split, normalized to sum to 1
        feat_imp = defaultdict(float)
        feat_counts = defaultdict(int)
        for item in self.gain_list:
            k, v = list(item.items())[0]
            feat_imp[k] += v
            feat_counts[k] += 1
        for k in feat_imp:
            feat_imp[k] /= feat_counts[k]
        v_sum = sum(feat_imp.values())
        for k in feat_imp:
            feat_imp[k] /= v_sum
        return feat_imp


from xgboost.sklearn import XGBRegressor, XGBClassifier

# regression example
data = np.array([[5, 20, 1.1],
                 [7, 30, 1.3],
                 [21, 55, 1.7],
                 [30, 60, 1.8],
                 [26, 40, 1.6]])
xgb = XGBRegressor(n_estimators=n_estimators, learning_rate=LR, max_depth=MAX_DEPTH,
                   min_child_weight=min_child_weight, base_score=base_score, gamma=GAMMA)
xgb.fit(data[:, :-1], data[:, -1])
print("xgboost:", xgb.predict(data[0, :-1].reshape(1, -1)))
my_xgb_tree = XGBoostModel(target='regression', n_estimators=n_estimators, lr=LR,
                           max_depth=MAX_DEPTH, min_child_weight=min_child_weight,
                           reg_lambda=1, reg_alpha=0, base_score=base_score, gamma=GAMMA)
my_xgb_tree.fit(data)
print("my xgb tree:", my_xgb_tree.predict(data[0, :-1]))
print(xgb.feature_importances_)
print(my_xgb_tree.feat_importance())

print('----------------classify test---------------------')
data = np.array([[1, -5, 0],
                 [2, 5, 0],
                 [3, -2, 1],
                 [2, 2, 1],
                 [2, 0, 1],
                 [6, -6, 1],
                 [7, 5, 1],
                 [6, -2, 0],
                 [7, 2, 0]])
data = data.astype(float)
xgb = XGBClassifier(n_estimators=n_estimators, learning_rate=LR, max_depth=MAX_DEPTH,
                    min_child_weight=min_child_weight, base_score=base_score, gamma=GAMMA)
xgb.fit(data[:, :-1], data[:, -1])
print("xgboost:", xgb.predict_proba(data[0, :-1].reshape(1, -1)))
my_xgb_tree = XGBoostModel(target='classify', n_estimators=n_estimators, lr=LR,
                           max_depth=MAX_DEPTH, min_child_weight=min_child_weight,
                           reg_lambda=1, reg_alpha=0, base_score=base_score, gamma=GAMMA)
my_xgb_tree.fit(data)
print("my xgb tree:", my_xgb_tree.predict(data[0, :-1]))
print('xgboost feature importance', xgb.feature_importances_)
print(my_xgb_tree.feat_importance())
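
To inspect what was learned, note that the ensemble is just a plain Python list: tree_list[0] holds the base score, and each later entry is either a nested dict keyed by (feature_index, threshold, 'left'/'right') or, when no admissible split was found for that round, a bare leaf weight:

for i, tree in enumerate(my_xgb_tree.tree_list):
    print(i, tree)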