赞
踩
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz

# Load the iris dataset.
iris = load_iris()
# Wrap the feature matrix in a DataFrame and name the columns.
data = pd.DataFrame(iris.data)
data.columns = iris.feature_names
# Append the class labels, reusing the dataset that is already loaded.
data['Species'] = iris.target
# Features: only the first two columns (sepal length and sepal width).
x = data.iloc[:, :2]
# Target: the last column (the class labels added above).
y = data.iloc[:, -1]
# Split into train/test sets with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=42)

# Decision tree limited to depth 4, using Gini impurity as the split criterion.
# NOTE(review): no random_state is passed to the classifiers in this script,
# so scores may differ slightly between runs -- confirm whether that is wanted.
tree_clf = DecisionTreeClassifier(max_depth=4, criterion='gini')
# Fit on the training set.
tree_clf.fit(x_train, y_train)
# Predict on the held-out test set.
y_test_hat = tree_clf.predict(x_test)
# Report test accuracy.
print("acc score:", accuracy_score(y_test, y_test_hat))

# Examine how test error changes with tree depth (1..14, entropy criterion).
depth = np.arange(1, 15)
err_list = []
for d in depth:
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
    clf.fit(x_train, y_train)
    y_test_hat = clf.predict(x_test)
    # Error rate = 1 - fraction of test samples predicted correctly.
    result = (y_test_hat == y_test)
    err = 1 - np.mean(result)
    err_list.append(err)
    print(d, '错误率:%.2f%%' % (100 * err))

# Use a font that can render the Chinese axis labels and title.
mpl.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(facecolor='w')
plt.plot(depth, err_list, 'ro-', lw=2)
plt.xlabel('决策树深度', fontsize=15)
plt.ylabel('错误率', fontsize=15)
plt.title('决策树深度和过拟合', fontsize=18)
plt.grid(True)
plt.show()
运行结果:
acc score: 0.7631578947368421
1 错误率:36.84%
2 错误率:34.21%
3 错误率:31.58%
4 错误率:26.32%
5 错误率:34.21%
6 错误率:28.95%
7 错误率:31.58%
8 错误率:26.32%
9 错误率:26.32%
10 错误率:39.47%
11 错误率:42.11%
12 错误率:34.21%
13 错误率:36.84%
14 错误率:36.84%
from sklearn.datasets import load_iris
from sklearn.datasets import make_moons
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, :2]  # sepal length and sepal width
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Random forest with 15 trees, at most 20 leaf nodes per tree, trained with
# n_jobs=5 (parallel jobs).
rnd_clf = RandomForestClassifier(n_estimators=15, max_leaf_nodes=20, n_jobs=5)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_rf))

# Feature importances: fit a larger forest on all features of the full
# dataset (already loaded above) and print one score per feature name.
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris['target'])
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)
运行结果:
0.78
sepal length (cm) 0.10222109801813306
sepal width (cm) 0.02548487947686973
petal length (cm) 0.4183213187649062
petal width (cm) 0.4539727037400911
Process finished with exit code 0
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。