from __future__ import print_function
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import PCA
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import StandardScaler
# Collect the distinct labels found in column 10 of `data`
# (an RDD of indexable rows; assumed defined earlier in the file — TODO confirm).
label_set = data.map(lambda x: x[10]).distinct().collect()

# Map each distinct label to a consecutive integer id (0, 1, 2, ...),
# in the order returned by collect(). distinct() guarantees uniqueness,
# so no membership check is needed while numbering.
label_dict = {key: i for i, key in enumerate(label_set)}
print(label_dict)
print('999')
# Build the modeling dataset: for each row, take the first 10 columns as
# features and encode the label in column 10 via label_dict.
# First map:  row -> (feature list x[0..9], integer label).
# Second map: cast features to numeric types — columns 0-7 become float,
# columns 8-9 become int (via float to tolerate strings like "3.0") —
# and append the label as the last element.
# NOTE: a single-argument lambda is used because Python 3 removed
# tuple-parameter unpacking in lambdas (PEP 3113).
data1 = data.map(lambda x: ([x[i] for i in range(10)], label_dict[x[10]])) \
            .map(lambda fy: [float(v) for v in fy[0][:8]]
                            + [int(float(v)) for v in fy[0][8:10]]
                            + [fy[1]])