Task: use the Iris dataset to illustrate multi-class classification.
Loss:
Multi-class classification problems can be evaluated with Multi-LogLoss:
\text{Multi-LogLoss}=-\frac{1}{n}\sum_{i=1}^{n}\sum_{j=1}^{m} y_{i,j}\log(p_{i,j})
where:
n is the number of samples being scored,
m is the number of classes,
y_{i,j} encodes the true label: y_{i,j} = 1 if sample i belongs to class j, and 0 otherwise,
log is the natural logarithm,
p_{i,j} is the model's predicted probability that sample i belongs to class j.
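As a quick sanity check, the formula can be computed by hand. Below is a minimal sketch (the toy labels and probabilities are made up for illustration) comparing the manual computation against sklearn's log_loss:

import numpy as np
from sklearn.metrics import log_loss

# Toy data: 3 samples, 3 classes (values invented for illustration)
y_true = np.array([0, 2, 1])              # true class index per sample
p = np.array([[0.7, 0.2, 0.1],            # predicted probabilities,
              [0.1, 0.3, 0.6],            # one row per sample,
              [0.2, 0.5, 0.3]])           # rows sum to 1

# One-hot encode so that y_onehot[i, j] == 1 iff sample i is in class j
y_onehot = np.eye(3)[y_true]

# Multi-LogLoss = -(1/n) * sum_i sum_j y_{i,j} * log(p_{i,j})
manual = -np.mean(np.sum(y_onehot * np.log(p), axis=1))
print(manual)                              # ≈ 0.5202
print(log_loss(y_true, p))                 # should agree with the manual value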
from sklearn import datasets
from sklearn.model_selection import train_test_split
import xgboost as xgb
import datetime
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
data = iris.data
data[:10]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1]])
# 150 samples in total, each with 4 features
label = iris.target
label
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
feature = iris.data.T
plt.scatter(feature[0], feature[1], alpha=0.5, s=100*feature[3], c=iris.target)
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.show()
# Split the dataset
train_x, test_x, train_y, test_y = train_test_split(data, label, random_state=0)
dtrain = xgb.DMatrix(train_x,label=train_y)
dtest = xgb.DMatrix(test_x)
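Note: the split above is purely random. With only 150 samples, a stratified split keeps the three class ratios identical in the train and test sets; a minimal optional variant (same sklearn function, just with the stratify argument — the DMatrix objects would need to be rebuilt afterwards):

# Stratified variant of the split above (optional; the original code
# uses a plain random split)
train_x, test_x, train_y, test_y = train_test_split(
    data, label, random_state=0, stratify=label)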
param1 = {'verbosity': 2
          , "tree_method": "gpu_hist"      # use the GPU
          , "gpu_id": 0                    # which GPU to use
          , 'objective': 'multi:softmax'   # objective
          , "num_class": 3                 # number of classes
          , 'eval_metric': 'mlogloss'
#          , "eta": 0.01
#          , "gamma": 0
#          , "max_depth": 6
#          , "subsample": 1                # row subsampling
#          , "colsample_bytree": 1         # column subsampling
#          , "colsample_bylevel": 1
#          , "colsample_bynode": 1
#          , "lambda": 1                   # L2 regularization
#          , "alpha": 0                    # L1 regularization
          }
num_round = 400
print("The time of param1:")
t0 = time()
cvresult1 = xgb.cv(params=param1, dtrain=dtrain, num_boost_round=num_round)
print(datetime.datetime.fromtimestamp(time() - t0).strftime("%M:%S:%f"))
The time of param1:
00:02:254538
param2 = {'verbosity': 2
          , "tree_method": "gpu_hist"      # use the GPU
          , "gpu_id": 0                    # which GPU to use
          , 'objective': 'multi:softmax'   # objective
          , "num_class": 3                 # number of classes
          , 'eval_metric': 'mlogloss'
#          , "eta": 0.01
#          , "gamma": 0
#          , "max_depth": 6
#          , "subsample": 1                # row subsampling
#          , "colsample_bytree": 1         # column subsampling
#          , "colsample_bylevel": 1
#          , "colsample_bynode": 1
#          , "lambda": 1                   # L2 regularization
#          , "alpha": 0                    # L1 regularization
          }
t0 = time()
print("The time of param2:")
cvresult2 = xgb.cv(params=param2, dtrain=dtrain, num_boost_round=num_round)
print(datetime.datetime.fromtimestamp(time() - t0).strftime("%M:%S:%f"))
The time of param2:
00:02:002574
param3 = {'verbosity': 2
          , "tree_method": "gpu_hist"      # use the GPU
          , "gpu_id": 0                    # which GPU to use
          , 'objective': 'multi:softmax'   # objective
          , "num_class": 3                 # number of classes
          , 'eval_metric': 'mlogloss'
          , "eta": 0.01
#          , "gamma": 0
          , "max_depth": 6
#          , "subsample": 1                # row subsampling
#          , "colsample_bytree": 1         # column subsampling
#          , "colsample_bylevel": 1
#          , "colsample_bynode": 1
#          , "lambda": 1                   # L2 regularization
#          , "alpha": 0                    # L1 regularization
          }
print("The time of param3:")
t0 = time()
cvresult3 = xgb.cv(params=param3, dtrain=dtrain, num_boost_round=num_round)
print(datetime.datetime.fromtimestamp(time() - t0).strftime("%M:%S:%f"))
The time of param3:
00:02:332151
# Effect of the parameter adjustments
fig, ax = plt.subplots(1, figsize=(15, 8))
ax.grid()
end = num_round + 1
ax.plot(range(1, end), cvresult1.iloc[:, 0], c="red", label="test_origin")
ax.plot(range(1, end), cvresult1.iloc[:, 2], c="green", label="train_origin")
ax.plot(range(1, end), cvresult2.iloc[:, 0], c="pink", label="test_last")
ax.plot(range(1, end), cvresult2.iloc[:, 2], c="yellow", label="train_last")
ax.plot(range(1, end), cvresult3.iloc[:, 0], c="black", label="test_this")
ax.plot(range(1, end), cvresult3.iloc[:, 2], c="blue", label="train_this")
ax.legend(fontsize="xx-large")
ax.set_ylim(bottom=-0.1, top=0.7)
plt.ylabel(cvresult1.columns[1].split("-")[1], fontsize=16)
plt.xlabel("num_round", fontsize=16)
plt.savefig("./imgs/cv.png")
plt.show()
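Besides reading the plot, the cv result DataFrames can be queried directly for the round with the lowest mean test mlogloss (a sketch assuming the default column names that xgb.cv produces for eval_metric='mlogloss'):

# Best boosting round per run, judged by mean test mlogloss
for name, cvres in [("param1", cvresult1), ("param2", cvresult2), ("param3", cvresult3)]:
    best = cvres["test-mlogloss-mean"].idxmin()
    print(name, "best round:", best + 1,
          "test mlogloss:", cvres.loc[best, "test-mlogloss-mean"])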
Multi_xgb = xgb.train(param1, dtrain, num_boost_round=400)
y_hat = Multi_xgb.predict(dtest)
print("Accuracy:",round(accuracy_score(y_true=test_y, y_pred=y_hat), 4))
Accuracy: 0.9737
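With 'multi:softmax', predict() returns hard class indices. If per-class probabilities are needed (for example, to compute the Multi-LogLoss above on the test set), XGBoost's 'multi:softprob' objective returns one probability per class instead; a minimal sketch reusing param1:

# Same setup, but predict per-class probabilities
param_prob = dict(param1, objective="multi:softprob")
prob_xgb = xgb.train(param_prob, dtrain, num_boost_round=400)
prob = prob_xgb.predict(dtest)       # shape (n_test_samples, 3)
y_hat_prob = prob.argmax(axis=1)     # recover hard class labels
print("Accuracy:", round(accuracy_score(test_y, y_hat_prob), 4))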
xgb.plot_importance(Multi_xgb)
plt.show()