赞
踩
# encoding:utf-8
import os
import re

# HADOOP_HOME is set before the Spark session is created below.
os.environ['HADOOP_HOME'] = 'D:\\system\\hadoop-common-2.6.0-bin-master'

from pyspark.sql import SparkSession
import pandas as pd

spark = (
    SparkSession.builder
    .appName("envTest")
    .getOrCreate()
)
sc = spark.sparkContext

# The CSV has no header row; column names are assigned when the Spark
# DataFrame is created.
data = pd.read_csv("E:\\data\\t_iris.csv", header=None)
print(data)
print(data[0:2])  # first two rows

# Create the Spark DataFrame from the pandas frame, naming the five columns.
data000002 = spark.createDataFrame(
    data,
    ["sepal_length", "sepal_width", "petal_length", "petal_width", "iris_class"],
)
print(data000002.dtypes)
print(data000002.take(2))
# show() prints the rows itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
data000002.show(2)

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from Distributed.common import getEvaluationDF
from Distributed.common import preProcessing

# Matches "feature <index>" with the complete index, so "feature 1" is never
# mistaken for the prefix of "feature 10".
_FEATURE_REF = re.compile(r"feature (\d+)")


def _name_features(debug_string, features_col):
    """Replace each ``feature <i>`` in *debug_string* with ``features_col[i]``.

    Indices outside ``features_col`` are left unchanged. The substitution is
    done in a single pass, so text inserted for one feature is never rewritten
    by the replacement of another.
    """
    names = list(features_col)

    def _substitute(match):
        index = int(match.group(1))
        return names[index] if index < len(names) else match.group(0)

    return _FEATURE_REF.sub(_substitute, debug_string)


def DT(label_data, label_col, features_col, test_data_pro=0.3, maxdepth=5, impurity="gini"):
    """Train a decision-tree classifier and evaluate it on a held-out split.

    Args:
        label_data: Spark DataFrame holding the label and feature columns.
        label_col: Name of the label column.
        features_col: Sequence of feature column names.
        test_data_pro: Weight of the test split; the training split gets
            ``1.0 - test_data_pro``. The split uses the fixed seed 1234.
        maxdepth: Passed to ``DecisionTreeClassifier`` as ``maxDepth``.
        impurity: Passed to ``DecisionTreeClassifier`` as ``impurity``.

    Returns:
        dict with the keys:
            ``"test_data"``: predictions restricted to the feature columns,
                the label column and ``"prediction"``.
            ``"conf_matrix"`` / ``"evaluation"``: the two values returned by
                ``getEvaluationDF``.
            ``"train_result"``: one-row DataFrame (column ``trainResult``)
                with the tree's debug string, feature indices replaced by
                the names in ``features_col``.
            ``"model"``: the fitted pipeline model.
    """
    # NOTE(review): preProcessing is project-local; it presumably returns the
    # prepared data, a preprocessing stage, an optional label-decoding stage
    # and the name of the indexed label column -- confirm against its source.
    label_data, prePipeline, decode, label_col_indexed = preProcessing(
        label_data, label_col, features_col
    )
    trainingData, testData = label_data.randomSplit(
        [1.0 - test_data_pro, test_data_pro], 1234
    )

    dt = DecisionTreeClassifier(
        labelCol=label_col_indexed,
        maxDepth=maxdepth,
        impurity=impurity,
        maxBins=320,
    )
    pipeline = Pipeline(stages=[prePipeline, dt])
    model = pipeline.fit(trainingData)

    # The decode stage is appended after fitting so it runs as part of
    # transform() (and is part of the returned model).
    if decode is not None:
        model.stages.append(decode)

    predictions = model.transform(testData)
    conf_matrix, evaluation = getEvaluationDF(predictions, label_col_indexed)

    if decode is not None:
        # Expose the decoded column under the name "prediction".
        predictions = predictions.drop("prediction")
        predictions = predictions.withColumnRenamed("prediction_str", "prediction")

    # Stage 0 is the fitted preprocessing stage; stage 1 is the fitted tree.
    treeModel = model.stages[1]
    # NOTE(review): assumes position i in features_col is feature index i in
    # the assembled vector -- depends on preProcessing; confirm.
    trainResult = _name_features(treeModel.toDebugString, features_col)

    return {
        "test_data": predictions.select(list(features_col) + [label_col, "prediction"]),
        "conf_matrix": conf_matrix,
        "evaluation": evaluation,
        "train_result": spark.createDataFrame([[trainResult]], ["trainResult"]),
        "model": model,
    }
# Train on the four iris measurements, then display and persist the results.
features_col = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

result = DT(
    label_data=data000002,
    label_col="iris_class",
    features_col=features_col,
    test_data_pro=0.3,
    maxdepth=5,
    impurity='gini',
)

result["test_data"].show(5)
for table_key in ("conf_matrix", "evaluation"):
    result[table_key].show()

# Save the fitted pipeline model to the local filesystem.
result["model"].save("file:///E:/out/iris")
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。