当前位置:   article > 正文

机器学习:PySpark 使用决策树算法生成训练模型(PySpark ML 导出树模型结构)

pyspark ml导出树模型结构
# encoding:utf-8
import os

# HADOOP_HOME is set before the Spark session is created below; the value
# is a Windows-local Hadoop directory.
os.environ['HADOOP_HOME'] = 'D:\\system\\hadoop-common-2.6.0-bin-master'
from pyspark.sql import SparkSession
import pandas as pd

# Create (or reuse) the session for this script and expose its SparkContext.
spark = (
    SparkSession.builder
    .appName("envTest")
    .getOrCreate()
)
sc = spark.sparkContext

# Read the iris CSV with pandas. header=None makes pandas treat every row as
# data and label the columns with integers.
data = pd.read_csv("E:\\data\\t_iris.csv", header=None)
print(data)
print(data[0:2])  # first two rows
# Create the Spark DataFrame, naming the five columns explicitly.
data000002 = spark.createDataFrame(
    data,
    ["sepal_length", "sepal_width", "petal_length", "petal_width", "iris_class"],
)
print(data000002.dtypes)
print(data000002.take(2))
# show() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
data000002.show(2)

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from Distributed.common import getEvaluationDF
from Distributed.common import preProcessing
def DT(label_data, label_col, features_col, test_data_pro=0.3, maxdepth=5, impurity="gini",
       seed=1234, maxbins=320):
    """Train a decision-tree classification pipeline and evaluate it on a hold-out split.

    Relies on the module-level ``spark`` session to build the ``train_result``
    DataFrame.

    Args:
        label_data: Spark DataFrame containing the feature columns and the
            label column.
        label_col: Name of the label column in ``label_data``.
        features_col: List of feature column names. Their order is used to
            translate ``feature <i>`` in the tree dump back to column names
            (assumes ``preProcessing`` builds the feature vector in this
            order -- TODO confirm in ``Distributed.common``).
        test_data_pro: Weight of the test split handed to ``randomSplit``;
            the training split gets ``1.0 - test_data_pro``.
        maxdepth: Passed to ``DecisionTreeClassifier`` as ``maxDepth``.
        impurity: Passed to ``DecisionTreeClassifier`` as ``impurity``.
        seed: Seed for ``randomSplit``. Defaults to the previously
            hard-coded 1234.
        maxbins: Passed to ``DecisionTreeClassifier`` as ``maxBins``.
            Defaults to the previously hard-coded 320.

    Returns:
        dict with the keys:
            ``"test_data"``: test-split predictions restricted to the feature
                columns, ``label_col`` and ``"prediction"``.
            ``"conf_matrix"`` / ``"evaluation"``: the two values returned by
                ``getEvaluationDF``.
            ``"train_result"``: one-row DataFrame (column ``trainResult``)
                holding the tree's debug string with feature names filled in.
            ``"model"``: the fitted pipeline model, with the ``decode`` stage
                appended when ``preProcessing`` returned one.
    """
    import re  # local import so the block does not depend on a file-level change

    label_data, prePipeline, decode, label_col_indexed = preProcessing(label_data, label_col, features_col)
    training_data, test_data = label_data.randomSplit([1.0 - test_data_pro, test_data_pro], seed)

    dt = DecisionTreeClassifier(
        labelCol=label_col_indexed,
        maxDepth=maxdepth,
        impurity=impurity,
        maxBins=maxbins,
    )
    model = Pipeline(stages=[prePipeline, dt]).fit(training_data)

    # The decode stage is appended after fitting so that transform() also
    # produces the "prediction_str" column used below.
    if decode is not None:
        model.stages.append(decode)
    predictions = model.transform(test_data)

    # Evaluate before the prediction column is replaced by its decoded form.
    conf_matrix, evaluation = getEvaluationDF(predictions, label_col_indexed)
    if decode is not None:
        predictions = predictions.drop("prediction").withColumnRenamed("prediction_str", "prediction")

    def _feature_name(match):
        # Map a whole "feature <i>" token to its column name; indices outside
        # features_col are left untouched rather than partially rewritten.
        index = int(match.group(1))
        if index < len(features_col):
            return features_col[index]
        return match.group(0)

    # stages[1] is the fitted tree (stage 0 is the preprocessing pipeline).
    # A single regex pass over complete indices avoids the prefix problem of
    # chained str.replace calls, where "feature 1" would also rewrite the
    # start of "feature 10", and stops substituted names being rewritten again.
    train_result = re.sub(r"feature (\d+)", _feature_name, model.stages[1].toDebugString)

    return {
        "test_data": predictions.select(features_col + [label_col, "prediction"]),
        "conf_matrix": conf_matrix,
        "evaluation": evaluation,
        "train_result": spark.createDataFrame([[train_result]], ["trainResult"]),
        "model": model,
    }
# Feature columns of the iris DataFrame, in the order they were declared.
features_col = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Train and evaluate the decision tree on the iris data.
result = DT(
    label_data=data000002,
    label_col="iris_class",
    features_col=features_col,
    test_data_pro=0.3,
    maxdepth=5,
    impurity='gini',
)

# Display a sample of the test predictions, then both evaluation tables.
result["test_data"].show(5)
for table_key in ("conf_matrix", "evaluation"):
    result[table_key].show()

# Persist the fitted pipeline model to the local file system.
result["model"].save("file:///E:/out/iris")
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/羊村懒王/article/detail/643553
推荐阅读
  

闽ICP备14008679号