In the previous note we walked through the principles of XGBoost. Recently I came across an article on the WeChat public account 机器学习实验室 (Machine Learning Lab) that gives a pure Python implementation of XGBoost. It makes a good supplement to that note, so I have excerpted the code here for study and reference.
First, define the single-tree model used by XGBoost:
class XGBoostTree(Tree):
    # Split the stacked label matrix back into labels y and current predictions y_pred
    def _split(self, y):
        col = int(np.shape(y)[1]/2)
        y, y_pred = y[:, :col], y[:, col:]
        return y, y_pred

    # Structure score of a node: G^2 / (2H)
    def _gain(self, y, y_pred):
        Gradient = np.power((y * self.loss.gradient(y, y_pred)).sum(), 2)
        Hessian = self.loss.hess(y, y_pred).sum()
        return 0.5 * (Gradient / Hessian)

    # Split gain from the second-order Taylor expansion
    def _gain_by_taylor(self, y, y1, y2):
        # Split labels from current predictions
        y, y_pred = self._split(y)
        y1, y1_pred = self._split(y1)
        y2, y2_pred = self._split(y2)

        true_gain = self._gain(y1, y1_pred)
        false_gain = self._gain(y2, y2_pred)
        gain = self._gain(y, y_pred)
        return true_gain + false_gain - gain

    # Optimal leaf weight
    def _approximate_update(self, y):
        # y is split into y, y_pred
        y, y_pred = self._split(y)
        # Newton's method: ratio of summed gradients to summed Hessians
        gradient = np.sum(y * self.loss.gradient(y, y_pred), axis=0)
        hessian = np.sum(self.loss.hess(y, y_pred), axis=0)
        update_approximation = gradient / hessian
        return update_approximation

    # Fit the tree
    def fit(self, X, y):
        self._impurity_calculation = self._gain_by_taylor
        self._leaf_value_calculation = self._approximate_update
        super(XGBoostTree, self).fit(X, y)
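The excerpt assumes a generic decision-tree base class Tree and a loss object LogisticLoss from the article's accompanying utilities; neither is shown here. Also note that _gain implements a simplified structure score, Gain = 1/2 * [G_L^2/H_L + G_R^2/H_R - (G_L+G_R)^2/(H_L+H_R)], i.e. the regularization terms lambda and gamma from the full XGBoost objective are omitted. For completeness, here is a minimal sketch of the interface LogisticLoss needs to provide, assuming a sigmoid link (this is my reconstruction, not the article's code):

import numpy as np

class LogisticLoss:
    # Sigmoid link that maps raw scores to probabilities
    def _sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    # First derivative of the logistic loss w.r.t. the raw prediction:
    # g = p - y, written as -(y - p) to match the sign convention above
    def gradient(self, y, y_pred):
        p = self._sigmoid(y_pred)
        return -(y - p)

    # Second derivative: h = p * (1 - p)
    def hess(self, y, y_pred):
        p = self._sigmoid(y_pred)
        return p * (1 - p)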
Then, following the forward stagewise additive algorithm, define the full XGBoost model:
class XGBoost(object):
    def __init__(self, n_estimators=200, learning_rate=0.001, min_samples_split=2,
                 min_impurity=1e-7, max_depth=2):
        self.n_estimators = n_estimators            # Number of trees
        self.learning_rate = learning_rate          # Step size for weight update
        self.min_samples_split = min_samples_split  # Minimum number of samples to justify a split
        self.min_impurity = min_impurity            # Minimum impurity reduction to continue
        self.max_depth = max_depth                  # Maximum tree depth
        # Cross-entropy loss
        self.loss = LogisticLoss()
        # Initialize the trees
        self.estimators = []
        # Forward stagewise additive training
        for _ in range(n_estimators):
            tree = XGBoostTree(
                    min_samples_split=self.min_samples_split,
                    min_impurity=self.min_impurity,
                    max_depth=self.max_depth,
                    loss=self.loss)
            self.estimators.append(tree)

    def fit(self, X, y):
        y = to_categorical(y)
        y_pred = np.zeros(np.shape(y))
        for i in range(self.n_estimators):
            tree = self.estimators[i]
            # Stack labels and current predictions so the tree can split them apart
            y_and_pred = np.concatenate((y, y_pred), axis=1)
            tree.fit(X, y_and_pred)
            update_pred = tree.predict(X)
            y_pred -= np.multiply(self.learning_rate, update_pred)

    def predict(self, X):
        y_pred = None
        # Accumulate the predictions of every tree
        for tree in self.estimators:
            update_pred = tree.predict(X)
            if y_pred is None:
                y_pred = np.zeros_like(update_pred)
            y_pred -= np.multiply(self.learning_rate, update_pred)
        # Turn raw scores into probabilities via softmax
        y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1, keepdims=True)
        # Convert probability predictions to class labels
        y_pred = np.argmax(y_pred, axis=1)
        return y_pred
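Besides Tree and LogisticLoss, fit() also relies on a to_categorical helper that one-hot encodes the integer class labels so that one score column is boosted per class. A minimal sketch of that helper (assumed behavior, plain one-hot encoding):

import numpy as np

def to_categorical(y, n_classes=None):
    # One-hot encode integer labels: shape (n_samples,) -> (n_samples, n_classes)
    if n_classes is None:
        n_classes = np.max(y) + 1
    one_hot = np.zeros((len(y), n_classes))
    one_hot[np.arange(len(y)), y] = 1
    return one_hot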
Use a sklearn dataset as an example:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = datasets.load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2)
clf = XGBoost()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.9666666666666667
XGBoost also ships as a native library that we can call directly. Install it with pip:
pip install xgboost
Test it on the same sklearn dataset:
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt

# Set model parameters
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 3,
    'gamma': 0.1,
    'max_depth': 2,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'eta': 0.001,
    'seed': 1000,
    'nthread': 4,
}

dtrain = xgb.DMatrix(X_train, y_train)
num_rounds = 200
model = xgb.train(params, dtrain, num_rounds)

# Predict on the test set
dtest = xgb.DMatrix(X_test)
y_pred = model.predict(dtest)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Plot feature importance
plot_importance(model)
plt.show()
Accuracy: 0.9166666666666666
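As a side note, the xgboost package also provides a scikit-learn compatible wrapper, XGBClassifier, which avoids the DMatrix plumbing. A brief sketch mapping the same hyperparameters onto the wrapper's parameter names (reg_lambda corresponds to lambda above; results may differ slightly from the native API):

from xgboost import XGBClassifier

# Same settings as the native params dict, expressed in sklearn-style names
clf = XGBClassifier(n_estimators=200, max_depth=2, learning_rate=0.001,
                    gamma=0.1, reg_lambda=2, subsample=0.7,
                    colsample_bytree=0.7, min_child_weight=3,
                    random_state=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))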
Note: the code above is adapted from the article 《数学推导+纯Python实现机器学习算法17:XGBoost》 published by the WeChat public account 机器学习实验室.
Contact email: curren_wong@163.com
GitHub: https://github.com/CurrenWong
Reposting, Stars, and Forks are welcome; feel free to reach out by email with any questions.