
XGBoost demo: classification on the mushroom (agaricus) dataset

Dataset downloads:

training set, test set
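Both files are in LIBSVM sparse text format: each line begins with the 0/1 label, followed by index:value pairs for the one-hot encoded mushroom features. The indices below are illustrative, not copied from the actual files:

1 3:1 10:1 11:1 21:1 ...
0 4:1 9:1 19:1 25:1 ...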

import numpy as np
import xgboost as xgb

# Custom loss function: gradient and hessian of the logistic (log) loss.
# With p = sigmoid(y_hat), the loss L = -[y*log(p) + (1-y)*log(1-p)]
# has first derivative dL/dy_hat = p - y and second derivative p*(1-p).
def log_reg(y_hat, y):
    p = 1.0 / (1.0 + np.exp(-y_hat))
    g = p - y.get_label()
    h = p * (1.0 - p)
    return g, h

# Custom evaluation metric: error rate. With a custom objective, predictions
# are raw margins, so the decision boundary is 0 (since sigmoid(0) = 0.5).
def error_rate(y_hat, y):
    return 'error', float(sum(y.get_label() != (y_hat > 0))) / len(y_hat)

if __name__ == '__main__':
    # Load the data (LIBSVM text format; newer xgboost versions may require
    # the explicit suffix, e.g. 'agaricus_train.txt?format=libsvm').
    data_train = xgb.DMatrix('agaricus_train.txt')
    data_test = xgb.DMatrix('agaricus_test.txt')
    print('data_train:\n', data_train)
    print(type(data_train))

    # Training parameters ('silent' was deprecated in xgboost 1.0;
    # use 'verbosity' in newer versions).
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
    # param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'reg:logistic'}
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    n_round = 10
    bst = xgb.train(param, data_train, num_boost_round=n_round,
                    evals=watchlist, obj=log_reg, feval=error_rate)

    # Compute the error rate on the test set. Because a custom objective was
    # used, predict() returns raw margins, not probabilities.
    y_hat = bst.predict(data_test)
    y = data_test.get_label()
    print('y_hat:\n', y_hat)
    print('y:\n', y)
    error = sum(y != (y_hat > 0))
    err_rate = float(error) / len(y_hat)
    print('total samples: %d' % len(y_hat))
    print('wrong predictions: %d' % error)
    print('error ratio: %.3f%%' % (100 * err_rate))
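Because a custom objective is passed to xgb.train, predict() skips xgboost's built-in sigmoid transformation and returns raw margin scores. A minimal sketch of mapping margins back to probabilities by hand, reusing bst and data_test from the script above:

margins = bst.predict(data_test)          # raw scores under the custom objective
probs = 1.0 / (1.0 + np.exp(-margins))    # apply the sigmoid manually
labels = (probs > 0.5).astype(int)        # same as thresholding margins at 0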


For comparison, we add an ordinary logistic regression model:

import numpy as np
import scipy.sparse
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Custom loss function: gradient and hessian of the logistic (log) loss.
def log_reg(y_hat, y):
    p = 1.0 / (1.0 + np.exp(-y_hat))
    g = p - y.get_label()
    h = p * (1.0 - p)
    return g, h

# Custom evaluation metric: error rate on raw margins (boundary at 0).
def error_rate(y_hat, y):
    return 'error', float(sum(y.get_label() != (y_hat > 0))) / len(y_hat)

# Parse the LIBSVM-format text file into a dense feature matrix and labels.
def read_data(path):
    y = []
    row = []
    col = []
    values = []
    r = 0    # current row index
    for d in open(path):
        d = d.strip().split()    # split on whitespace
        y.append(int(d[0]))      # first field is the label
        d = d[1:]
        for c in d:              # remaining fields are index:value pairs
            key, value = c.split(':')
            row.append(r)
            col.append(int(key))
            values.append(float(value))
        r += 1
    x = scipy.sparse.csr_matrix((values, (row, col))).toarray()
    y = np.array(y)
    return x, y

if __name__ == '__main__':
    # Load the data.
    data_train = xgb.DMatrix('agaricus_train.txt')
    data_test = xgb.DMatrix('agaricus_test.txt')
    print('data_train:\n', data_train)
    print(type(data_train))

    # Training parameters.
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
    # param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'reg:logistic'}
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    n_round = 10
    bst = xgb.train(param, data_train, num_boost_round=n_round,
                    evals=watchlist, obj=log_reg, feval=error_rate)

    # Error rate of the xgboost model (predictions are raw margins).
    y_hat = bst.predict(data_test)
    y = data_test.get_label()
    print('y_hat:\n', y_hat)
    print('y:\n', y)
    error = sum(y != (y_hat > 0))
    err_rate = float(error) / len(y_hat)
    print('total samples: %d' % len(y_hat))
    print('wrong predictions: %d' % error)
    print('error ratio: %.3f%%' % (100 * err_rate))
    print('XGBoost accuracy: %.3f%%' % (100 * (1 - err_rate)))
    print('=========================================')

    # Logistic regression on the same training file.
    x, y = read_data('agaricus_train.txt')
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=1)
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    acc = y_hat.ravel() == y_test.ravel()
    print('acc:\t', acc)
    print('logistic regression accuracy:\t', float(acc.sum()) / y_hat.size)
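For a more direct apples-to-apples comparison, the same train/test split could also be run through xgboost's scikit-learn wrapper. A minimal sketch, assuming the x_train/x_test/y_train/y_test variables from the script above and the same tree settings (max_depth=2, eta=1, 10 rounds):

from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

clf = XGBClassifier(max_depth=2, learning_rate=1.0, n_estimators=10)
clf.fit(x_train, y_train)
print('XGBClassifier accuracy:', accuracy_score(y_test, clf.predict(x_test)))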
