当前位置:   article > 正文

PyML(五)——用sklearn训练决策树模型_sklearn在一个训练好的决策树上继续训练

sklearn在一个训练好的决策树上继续训练

1.决策树

决策树本身没什么好说的。训练完成后可以用 export_graphviz 导出 tree.dot,再用 graphviz 的 dot 命令(例如 dot -Tpng tree.dot -o tree.png)把决策树画成图。

  1. # -*- coding: utf-8 -*-
  2. # @Time : 2018/7/24 9:13
  3. # @Author : Alan
  4. # @Email : xiezhengwen2013@163.com
  5. # @File : decision tree_sk1.py
  6. # @Software: PyCharm
  # Third-party imports used by the decision-tree demo.
  from sklearn.tree import DecisionTreeClassifier
  from sklearn import datasets
  import numpy as np
  import matplotlib.pyplot as plt
  # sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
  # in 0.20; train_test_split is provided by sklearn.model_selection.
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import StandardScaler
  from matplotlib.colors import ListedColormap
  from sklearn.tree import export_graphviz

  # Load iris and keep only feature columns 2 and 3 (exported further down
  # under the names 'petal length' and 'petal width').
  iris = datasets.load_iris()
  X = iris.data[:, [2, 3]]
  y = iris.target

  # Hold out 30% of the samples as a test set; fixed seed for repeatability.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.3, random_state=0)

  # Fit the scaler on the training split only, then apply the same
  # transformation to both splits.
  sc = StandardScaler()
  sc.fit(X_train)
  X_train_std = sc.transform(X_train)
  X_test_std = sc.transform(X_test)

  # Stack training rows first and test rows last, so the test samples occupy
  # the tail of the combined arrays.
  X_combined_std = np.vstack((X_train_std, X_test_std))
  y_combined_std = np.hstack((y_train, y_test))
  def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
      """Draw a classifier's decision regions for a two-feature dataset.

      The classifier is evaluated on a regular grid spanning the range of the
      two columns of ``X`` (padded by 1 on every side), the predictions are
      drawn as filled contours, and the samples are scattered on top, one
      marker/color per class.

      Parameters
      ----------
      X : array indexed as ``X[:, 0]`` and ``X[:, 1]``
          Samples; only the first two columns are used.
      y : array
          Class label per row of ``X``. At most five distinct labels are
          supported, because only five markers and colors are defined.
      classifier : object with a ``predict`` method
          Called once with an ``(n_grid_points, 2)`` array.
      test_idx : sequence of int, optional
          Row indices of ``X`` to highlight with hollow circles. ``None`` or
          an empty sequence disables the highlight.
      resolution : float
          Grid step used for both axes.
      """
      # setup marker generator and color map
      markers = ('s', 'x', 'o', '^', 'v')
      colors = ('red', 'blue', 'lightgreen', 'cyan', 'gray')
      classes = np.unique(y)
      cmap = ListedColormap(colors[:len(classes)])

      # plot the decision surface
      x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
      x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
      xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                             np.arange(x2_min, x2_max, resolution))
      Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
      Z = Z.reshape(xx1.shape)
      plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
      plt.xlim(xx1.min(), xx1.max())
      plt.ylim(xx2.min(), xx2.max())

      # plot all samples, one scatter call per class
      for idx, cl in enumerate(classes):
          # `color=` (not `c=`) so the RGBA tuple is read as a single color
          # rather than as a sequence of values to be colormapped.
          plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                      alpha=0.8, color=cmap(idx),
                      marker=markers[idx], label=cl)

      # highlight test samples
      # Explicit None/length check: the truth value of a NumPy index array is
      # ambiguous, and an empty index list has nothing to draw.
      if test_idx is not None and len(test_idx) > 0:
          X_test = X[test_idx, :]
          # Hollow circles: current matplotlib rejects c='', so request an
          # empty face explicitly and give the outline a visible color.
          plt.scatter(X_test[:, 0], X_test[:, 1],
                      facecolors='none', edgecolors='black',
                      alpha=1.0, linewidth=1, marker='o',
                      s=55, label='test set')
  # Train an entropy-based decision tree, limited to depth 3, on the
  # standardized training split.
  tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
  tree.fit(X_train_std, y_train)

  # The test rows were stacked after the training rows, so their indices run
  # from len(X_train_std) to the end of the combined array (105..149 for the
  # 70/30 split of the 150 iris samples).
  plot_decision_regions(X_combined_std, y_combined_std, classifier=tree,
                        test_idx=range(len(X_train_std), len(X_combined_std)))
  # The plotted features are standardized, so the axes are not in centimetres.
  plt.xlabel('petal length [standardized]')
  plt.ylabel('petal width [standardized]')
  plt.legend(loc='upper left')
  plt.show()

  # Export the fitted tree in Graphviz format. The split thresholds written to
  # tree.dot are in standardized units because the tree was fit on X_train_std.
  export_graphviz(tree, out_file='tree.dot',
                  feature_names=['petal length', 'petal width'])

2.随机森林

随机森林(random forest)的一大优点是对超参数不太敏感,性能受其影响波动不大,但几个主要参数仍然需要认真调整。例如,在实际运用随机森林模型时,树的数目 k(即 n_estimators)需要仔细选择:一般来说,k 越大,随机森林的性能越好,但计算成本也越高。

  1. # -*- coding: utf-8 -*-
  2. # @Time : 2018/7/24 10:41
  3. # @Author : Alan
  4. # @Email : xiezhengwen2013@163.com
  5. # @File : decision_tree_sk2.py
  6. # @Software: PyCharm
  # Third-party imports used by the random-forest demo.
  from sklearn.ensemble import RandomForestClassifier
  from sklearn import datasets
  import numpy as np
  import matplotlib.pyplot as plt
  # sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
  # in 0.20; train_test_split is provided by sklearn.model_selection.
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import StandardScaler
  from matplotlib.colors import ListedColormap
  from sklearn.tree import export_graphviz

  # Load iris and keep only feature columns 2 and 3 (labelled 'petal length'
  # and 'petal width' on the plot axes further down).
  iris = datasets.load_iris()
  X = iris.data[:, [2, 3]]
  y = iris.target

  # Hold out 30% of the samples as a test set; fixed seed for repeatability.
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.3, random_state=0)

  # Fit the scaler on the training split only, then apply the same
  # transformation to both splits.
  sc = StandardScaler()
  sc.fit(X_train)
  X_train_std = sc.transform(X_train)
  X_test_std = sc.transform(X_test)

  # Stack training rows first and test rows last, so the test samples occupy
  # the tail of the combined arrays.
  X_combined_std = np.vstack((X_train_std, X_test_std))
  y_combined_std = np.hstack((y_train, y_test))
  def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
      """Draw a classifier's decision regions for a two-feature dataset.

      The classifier is evaluated on a regular grid spanning the range of the
      two columns of ``X`` (padded by 1 on every side), the predictions are
      drawn as filled contours, and the samples are scattered on top, one
      marker/color per class.

      Parameters
      ----------
      X : array indexed as ``X[:, 0]`` and ``X[:, 1]``
          Samples; only the first two columns are used.
      y : array
          Class label per row of ``X``. At most five distinct labels are
          supported, because only five markers and colors are defined.
      classifier : object with a ``predict`` method
          Called once with an ``(n_grid_points, 2)`` array.
      test_idx : sequence of int, optional
          Row indices of ``X`` to highlight with hollow circles. ``None`` or
          an empty sequence disables the highlight.
      resolution : float
          Grid step used for both axes.
      """
      # setup marker generator and color map
      markers = ('s', 'x', 'o', '^', 'v')
      colors = ('red', 'blue', 'lightgreen', 'cyan', 'gray')
      classes = np.unique(y)
      cmap = ListedColormap(colors[:len(classes)])

      # plot the decision surface
      x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
      x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
      xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                             np.arange(x2_min, x2_max, resolution))
      Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
      Z = Z.reshape(xx1.shape)
      plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
      plt.xlim(xx1.min(), xx1.max())
      plt.ylim(xx2.min(), xx2.max())

      # plot all samples, one scatter call per class
      for idx, cl in enumerate(classes):
          # `color=` (not `c=`) so the RGBA tuple is read as a single color
          # rather than as a sequence of values to be colormapped.
          plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                      alpha=0.8, color=cmap(idx),
                      marker=markers[idx], label=cl)

      # highlight test samples
      # Explicit None/length check: the truth value of a NumPy index array is
      # ambiguous, and an empty index list has nothing to draw.
      if test_idx is not None and len(test_idx) > 0:
          X_test = X[test_idx, :]
          # Hollow circles: current matplotlib rejects c='', so request an
          # empty face explicitly and give the outline a visible color.
          plt.scatter(X_test[:, 0], X_test[:, 1],
                      facecolors='none', edgecolors='black',
                      alpha=1.0, linewidth=1, marker='o',
                      s=55, label='test set')
  # Build a 10-tree, entropy-based random forest (trained on 2 parallel jobs)
  # and fit it on the standardized training split; fit() returns the estimator.
  forest = RandomForestClassifier(
      criterion='entropy',
      n_estimators=10,
      random_state=1,
      n_jobs=2,
  ).fit(X_train_std, y_train)

  # Rows 105..149 of the combined arrays are passed as the samples to highlight.
  plot_decision_regions(
      X_combined_std,
      y_combined_std,
      classifier=forest,
      test_idx=range(105, 150),
  )

  # Decorate and display the figure.
  plt.ylabel('petal width')
  plt.xlabel('petal length')
  plt.legend(loc='upper left')
  plt.show()

reference:

《Python Machine Learning》(Sebastian Raschka)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/AllinToyou/article/detail/643585
推荐阅读
相关标签
  

闽ICP备14008679号