当前位置:   article > 正文

随机森林算法java代码_数据挖掘实践(27):算法基础(五)Random Forest(随机森林)算法(集成学习)(一)...

import numpy as np
import os
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

0 简介

0.1 主题

7466be55c3b281d24cf7a875a4a75ea4.png

0.2 目标

199d130f2c211142932b9561c1ecf474.png

1. Bootstraping与Bagging策略

1.1 Bootstraping/自助算法

91c7c268972ddcf024417fe1e0c2e47c.png

62212a1098052ee2d255b3f89d208c85.png

11ccee47c5738525a8d795eb8c8ca147.png

2f1a305e51e815ed908d3d6fde258188.png

968984f406c5355908c63008fad8dc5a.png

1.2 分类

dbc81d7d6452a537d9a6408b73705206.png

1.3 Bagging/套袋法

250adc78fd3bbe8d5f23d01196d8ad0e.png

3b0b1970c9d767c2df86ca70c3c36eec.png

1.4 集成学习之结合策略

8451348dfd9bf3880cc555155bc8ba39.png

427262a3e19ee0ea033eb0ab496a7c24.png 

cdba0e3ed44107b2590258367f237a8b.png

5c01b99ec299b5188a0e185a2776ecd6.png

cb714ea5df570d579aee08e19210a84f.png

2998cbc83ae42d748fa998b5fb222a2f.png

1.5 代码实验

import numpy as np
import os
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

import warnings

warnings.filterwarnings('ignore')

np.random.seed(42)

from sklearn.model_selection import train_test_split #分割数据集

from sklearn.datasets import make_moons #生成数据

"""主要参数作用如下:

n_samples:生成样本数量

noise:默认是None,控制加入数据集的高斯噪声的标准差

random_state:生成随机种子,给定一个int型数据,能够保证每次生成数据相同。"""
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)

X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=42)

plt.plot(X[:,0][y==0],X[:,1][y==0],'yo',alpha = 0.6) #黄色的圆

plt.plot(X[:,0][y==1],X[:,1][y==1],'bs',alpha = 0.6) #蓝色的矩形

[]

9be847c9065a59d0a87b36fe2c005d3c.png

82b3723d69f9e32e00fdfb2a1700e116.png

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier #投票分类器

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf= LogisticRegression(random_state=42)

rnd_clf= DecisionTreeClassifier(random_state=42)

svm_clf = SVC(random_state=42)
#投票 参数估计

voting_clf = VotingClassifier(estimators=[('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)], voting='hard')
#voting_clf = VotingClassifier(estimators=[('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)], voting='soft')

voting_clf.fit(X_train,y_train)

VotingClassifier(estimators=[('lr',

LogisticRegression(C=1.0, class_weight=None,

dual=False, fit_intercept=True,

intercept_scaling=1,

l1_ratio=None, max_iter=100,

multi_class='warn',

n_jobs=None, penalty='l2',

random_state=42, solver='warn',

tol=0.0001, verbose=0,

warm_start=False)),

('rf',

DecisionTreeClassifier(class_weight=None,

criterion='gini',

max_depth=None,

ma...

min_weight_fraction_leaf=0.0,

presort=False,

random_state=42,

splitter='best')),

('svc',

SVC(C=1.0, cache_size=200, class_weight=None,

coef0=0.0, decision_function_shape='ovr',

degree=3, gamma='auto_deprecated',

kernel='rbf', max_iter=-1, probability=False,

random_state=42, shrinking=True, tol=0.001,

verbose=False))],

flatten_transform=True, n_jobs=None, voting='hard',

weights=None)

from sklearn.metrics import accuracy_score #导入准确率

for clf in(log_clf,rnd_clf,svm_clf,voting_clf):

clf.fit(X_train,y_train)

    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864

DecisionTreeClassifier 0.856

SVC 0.888

VotingClassifier 0.896

b22fda3b6599474a9fb4a1d13f4c0c95.png

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
"""n_estimators:int, optional (default=10),要集成的基估计器的个数

max_samples: int or float, optional (default=1.0)。

决定从x_train抽取去训练基估计器的样本数量。int 代表抽取数量,float代表抽取比例

bootstrap : boolean, optional (default=True) 决定样本子集的抽样方式(有放回和不放回)

n_jobs : int, optional (default=1)

random_state:如果int,random_state是随机数生成器使用的种子"""

#用集成BaggingClassifier分类器

bag_clf =BaggingClassifier(DecisionTreeClassifier(),

n_estimators= 500,

max_samples= 100,

bootstrap=True,

n_jobs= -1,

random_state= 42)

bag_clf.fit(X_train,y_train)

y_pred= bag_clf.predict(X_test)

accuracy_score(y_test,y_pred)

0.904

#用单棵决策树分类器(作为与Bagging集成的对比基线)

tree_clf = DecisionTreeClassifier(random_state = 42)

tree_clf.fit(X_train,y_train)

y_pred_tree=tree_clf.predict(X_test)

accuracy_score(y_test,y_pred_tree)

0.856

2 随机森林

cfef4f01f6e2167f9a0cb3d5794ae734.png

da29460a8975b371ffa51f3f0f83462c.png

094ae9091dbd2c47c9aa6976dbf497c2.png

3 扩展点

3.1 使用场景:数据维度相对低(几十维),同时对准确性有较高要求时

3.2 随机森林在现实分析中被大量使用,它相对于决策树,在准确性上有了很大的提升

4.总结

4.1 随机森林的生成步骤

5367eb9e316f1cddd1c358c90bb14266.png

4.2 RF与传统bagging的区别

e8220fc4b06ccdc9f7984ebb343e6db0.png

4.3 RF的优点

36a6bedb5c2545a5152570c605c31fe3.png

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/blog/article/detail/53046
推荐阅读
相关标签
  

闽ICP备14008679号