Chapter 7. Ensemble Learning
Ensemble learning combines multiple learners into a single learner that performs better than any individual member. This chapter covers Bagging (bootstrap sampling), Boosting (AdaBoost), Stacking, and Voting.
AdaBoost is short for "Adaptive Boosting". To see where the "adaptive" part comes from, start from bootstrap sampling.
Bootstrap sampling draws from the training set with replacement, so some samples appear several times and others not at all. For example:
①. Original training set: {0,1,2,3,4,5,6,7,8,9}
②. Three bootstrap samples:
{7,2,6,7,5,4,8,8,1,0} — 3 and 9 were not sampled
{1,3,8,4,3,5,4,0,1,4} — 2, 6, 7 and 9 were not sampled
{4,9,4,2,4,4,3,0,1,4} — 5, 6, 7 and 8 were not sampled
③. If, say, sample 4 has a high error rate, we can sample it more often so that later learners focus on it; this adaptive resampling is what improves the learner's performance (a NumPy sketch of the sampling follows below).
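To make the sampling concrete, here is a minimal NumPy sketch of one bootstrap draw; the dataset and random seed are illustrative choices, not part of the original example.

import numpy as np

rng = np.random.default_rng(0)    # illustrative seed
data = np.arange(10)              # original training set {0, ..., 9}

# One bootstrap sample: 10 draws with replacement
sample = rng.choice(data, size=len(data), replace=True)
oob = np.setdiff1d(data, sample)  # points never drawn ("out of bag")

print('bootstrap sample:', sample)
print('not sampled:', oob)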
AdaBoost trains a sequence of weak classifiers on reweighted data:
①. Initialize the weight distribution D1 over the training samples (uniform to start).
②. Train a weak classifier hi on the weighted data, then raise the weights of the samples it misclassifies and lower the weights of those it classifies correctly, so the next weak classifier concentrates on the hard samples.
③. Combine the trained weak classifiers into a single strong classifier, weighting each one by its accuracy.
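For reference, these three steps correspond to the standard binary AdaBoost update rules (labels $y_i \in \{-1, +1\}$, $N$ samples, $T$ rounds):

\[
D_1(i) = \frac{1}{N}, \qquad
e_t = \sum_{i=1}^{N} D_t(i)\,\mathbf{1}\!\left[h_t(x_i) \ne y_i\right], \qquad
\alpha_t = \frac{1}{2}\ln\frac{1 - e_t}{e_t}
\]
\[
D_{t+1}(i) = \frac{D_t(i)\,e^{-\alpha_t y_i h_t(x_i)}}{Z_t}, \qquad
H(x) = \operatorname{sign}\!\left(\sum_{t=1}^{T} \alpha_t h_t(x)\right)
\]

where $Z_t$ normalizes $D_{t+1}$ into a probability distribution. Note that $\alpha_t$ grows as the round error $e_t$ shrinks, so more accurate weak classifiers get larger votes.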
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles

# Plot the decision boundary of a fitted model
def plot(model):
    x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
    y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1
    # Build a grid over the feature space
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    # Filled contours show the predicted class regions
    cs = plt.contourf(xx, yy, z)
    # Scatter the training points on top
    plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)

# A 2-D Gaussian split into two classes by quantiles: 500 samples, 2 features
x1, y1 = make_gaussian_quantiles(n_samples=500, n_features=2, n_classes=2)
# A second Gaussian blob, centered at (3, 3)
x2, y2 = make_gaussian_quantiles(mean=(3, 3), n_samples=500, n_features=2, n_classes=2)
# Merge the two blobs into one dataset (flipping the second blob's labels)
x_data = np.concatenate((x1, x2))
y_data = np.concatenate((y1, -y2 + 1))

# Plain decision tree
dtree = DecisionTreeClassifier(max_depth=3)
dtree.fit(x_data, y_data)
dtree_accuracy = dtree.score(x_data, y_data)
print('dtree_accuracy:', dtree_accuracy)

# AdaBoost with the same tree as base estimator
adaboost = AdaBoostClassifier(dtree, n_estimators=10)
adaboost.fit(x_data, y_data)
adaboost_accuracy = adaboost.score(x_data, y_data)
print('adaboost_accuracy:', adaboost_accuracy)

# Decision boundaries side by side
plt.subplot(1, 2, 1)
plot(dtree)
plt.subplot(1, 2, 2)
plot(adaboost)
plt.show()
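To watch boosting improve round by round, scikit-learn's AdaBoostClassifier exposes staged scoring; a minimal sketch that reuses the fitted adaboost model and the x_data/y_data from the code above:

# Training accuracy after each of the 10 boosting rounds
for i, acc in enumerate(adaboost.staged_score(x_data, y_data), start=1):
    print('round %d: %.3f' % (i, acc))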
Stacking uses several different classifiers to predict the training data, and feeds those predictions into a second-level (meta) classifier; the meta-classifier's output is the final prediction of the whole model. A prediction therefore passes through two layers of classifiers before the result comes out.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier

# Load the iris dataset
iris = load_iris()
# Use only feature columns 1 and 2
x_data = iris.data[:, 1:3]
y_data = iris.target

# Three different first-level classifiers
clf1 = LogisticRegression()      # logistic regression
clf2 = DecisionTreeClassifier()  # decision tree
clf3 = KNeighborsClassifier()    # KNN

# Second-level (meta) classifier
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['LogisticRegression', 'DecisionTree', 'KNN', 'Stacking']):
    # 3-fold cross-validation: the data is split into 3 parts; each fold
    # trains on 2 parts and tests on the remaining part
    scores = model_selection.cross_val_score(clf, x_data, y_data, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f [%s]" % (scores.mean(), label))
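mlxtend's StackingClassifier hides the two-layer mechanics. As a rough sketch of what the second-level classifier consumes, reusing clf1, clf2, clf3, lr, x_data and y_data from the code above (the out-of-fold construction here is an illustrative choice, not mlxtend's exact internals):

import numpy as np
from sklearn.model_selection import cross_val_predict

# Each base classifier's out-of-fold predictions become one input column
# for the second-level classifier
meta_features = np.column_stack(
    [cross_val_predict(clf, x_data, y_data, cv=3) for clf in (clf1, clf2, clf3)])
lr.fit(meta_features, y_data)  # meta-classifier trains on first-level outputs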
Voting is a simpler combination scheme: every classifier predicts a label and the majority vote decides.

from sklearn import model_selection
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Load the iris dataset
iris = load_iris()
# Use only feature columns 1 and 2
x_data = iris.data[:, 1:3]
y_data = iris.target

# Three different classifiers
clf1 = KNeighborsClassifier()    # KNN
clf2 = DecisionTreeClassifier()  # decision tree
clf3 = LogisticRegression()      # logistic regression

# Voting ensemble: combines the three by majority vote (no meta-classifier)
sclf = VotingClassifier([('knn', clf1), ('dtree', clf2), ('lr', clf3)])

for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['KNN', 'Decision Tree', 'LogisticRegression', 'VotingClassifier']):
    scores = model_selection.cross_val_score(clf, x_data, y_data, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f [%s]" % (scores.mean(), label))
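By default VotingClassifier uses hard voting: each model casts one vote for a class label and the majority wins. When every base model implements predict_proba, soft voting averages the predicted probabilities instead, which often performs slightly better; a minimal variant of the ensemble above:

# Soft voting: average class probabilities instead of counting label votes
sclf_soft = VotingClassifier([('knn', clf1), ('dtree', clf2), ('lr', clf3)],
                             voting='soft')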
Finally, a comparison of the individual models against Bagging, AdaBoost, and a voting ensemble on the Titanic dataset.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

# Load the data
titanic = pd.read_csv('D:\\Data\\titanic_train.csv')
# Fill missing ages with the median age
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
# Encode sex: male -> 0, female -> 1
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1
# Fill missing embarkation ports with 'S'
titanic["Embarked"] = titanic["Embarked"].fillna('S')
# Encode the port categories as numbers
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

# Select features
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
x_data = titanic[predictors]
y_data = titanic["Survived"]

# Standardize the features
scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)

# Logistic regression
LR = LogisticRegression()
scores = model_selection.cross_val_score(LR, x_data, y_data, cv=3)  # 3-fold cross-validation
print('LogisticRegression accuracy:', scores.mean())

# Neural network
mlp = MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=2000)
scores = model_selection.cross_val_score(mlp, x_data, y_data, cv=3)
print('MLP accuracy:', scores.mean())

# KNN
knn = KNeighborsClassifier(n_neighbors=21)
scores = model_selection.cross_val_score(knn, x_data, y_data, cv=3)
print('KNN accuracy:', scores.mean())

# Decision tree
dtree = DecisionTreeClassifier(max_depth=5, min_samples_split=4)
scores = model_selection.cross_val_score(dtree, x_data, y_data, cv=3)
print('DecisionTree accuracy:', scores.mean())

# Random forest
RF = RandomForestClassifier(n_estimators=100, min_samples_split=4)
scores = model_selection.cross_val_score(RF, x_data, y_data, cv=3)
print('RandomForest accuracy:', scores.mean())

# Bagging over the random forest
bagging = BaggingClassifier(RF, n_estimators=20)
scores = model_selection.cross_val_score(bagging, x_data, y_data, cv=3)
print('Bagging accuracy:', scores.mean())

# AdaBoost over the bagging ensemble
adaboost = AdaBoostClassifier(bagging, n_estimators=10)
scores = model_selection.cross_val_score(adaboost, x_data, y_data, cv=3)
print('AdaBoost accuracy:', scores.mean())

# Voting ensemble over all of the models above (VotingClassifier votes;
# it is not a true stacking meta-classifier)
voting = VotingClassifier([('adaboost', adaboost), ('mlp', mlp), ('LR', LR),
                           ('knn', knn), ('dtree', dtree)])
scores = model_selection.cross_val_score(voting, x_data, y_data, cv=3)
print('Voting accuracy:', scores.mean())