Preparing the data for modeling:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

df = spark.createDataFrame([
    (1, 144.5, 5.9, 33, 'M'),
    (2, 167.2, 5.4, 45, 'M'),
    (3, 124.1, 5.2, 23, 'F'),
    (4, 144.5, 5.9, 33, 'M'),
    (5, 133.2, 5.7, 54, 'F'),
    (3, 124.1, 5.2, 23, 'F'),
    (5, 129.2, 5.3, 42, 'M'),
], ['id', 'weight', 'height', 'age', 'gender'])

df.show()
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
|  1| 144.5|   5.9| 33|     M|
|  2| 167.2|   5.4| 45|     M|
|  3| 124.1|   5.2| 23|     F|
|  4| 144.5|   5.9| 33|     M|
|  5| 133.2|   5.7| 54|     F|
|  3| 124.1|   5.2| 23|     F|
|  5| 129.2|   5.3| 42|     M|
+---+------+------+---+------+

print(df.count())             # number of rows: 7
print(df.distinct().count())  # number of distinct rows: 6

# drop duplicate rows
df = df.dropDuplicates()
df.show()
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
|  5| 133.2|   5.7| 54|     F|
|  5| 129.2|   5.3| 42|     M|
|  1| 144.5|   5.9| 33|     M|
|  4| 144.5|   5.9| 33|     M|
|  2| 167.2|   5.4| 45|     M|
|  3| 124.1|   5.2| 23|     F|
+---+------+------+---+------+
# count the total number of ids and the number of distinct ids
import pyspark.sql.functions as F

df.agg(
    F.count('id').alias('all'),
    F.countDistinct('id').alias('distinct_id')
).show()
+---+-----------+
|all|distinct_id|
+---+-----------+
|  6|          5|
+---+-----------+
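There are six rows but only five distinct ids, so id 5 is reused for two different people. Conversely, rows 1 and 4 carry identical measurements under different ids; if such rows should also count as duplicates, dropDuplicates() accepts a subset argument. A minimal sketch (the df_no_dup name is just illustrative):

# deduplicate on every column except 'id'; one of rows 1 and 4 is dropped
df_no_dup = df.dropDuplicates(subset=[c for c in df.columns if c != 'id'])
df_no_dup.show()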
# assign a unique id to every row
df.withColumn('new_id', F.monotonically_increasing_id()).show()
+---+------+------+---+------+-------------+
| id|weight|height|age|gender|       new_id|
+---+------+------+---+------+-------------+
|  5| 133.2|   5.7| 54|     F| 171798691840|
|  5| 129.2|   5.3| 42|     M| 326417514496|
|  1| 144.5|   5.9| 33|     M| 481036337152|
|  4| 144.5|   5.9| 33|     M| 644245094400|
|  2| 167.2|   5.4| 45|     M| 721554505728|
|  3| 124.1|   5.2| 23|     F|1623497637888|
+---+------+------+---+------+-------------+
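monotonically_increasing_id() guarantees unique, increasing values but not consecutive ones; the large gaps above come from the partition id being encoded in the upper bits. If consecutive numbering is needed, a window function is one option; a minimal sketch (note that a window without partitionBy pulls all rows onto a single partition, so this only suits small data):

from pyspark.sql import Window

# number the rows 1..n; ordering by all columns makes the numbering deterministic
w = Window.orderBy(*df.columns)
df.withColumn('new_id', F.row_number().over(w)).show()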
Handling missing values
df_miss = spark.createDataFrame([
    (1, 143.5, 5.6, 28, 'M', 100000),
    (2, 167.2, 5.4, 45, 'M', None),
    (3, None, 5.2, None, None, None),
    (4, 144.5, 5.9, 33, 'M', None),
    (5, 133.2, 5.7, 54, 'F', None),
    (6, 124.1, 5.2, None, 'F', None),
    (7, 129.2, 5.3, 42, 'M', 76000),
], ['id', 'weight', 'height', 'age', 'gender', 'income'])

df_miss.show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
|  1| 143.5|   5.6|  28|     M|100000|
|  2| 167.2|   5.4|  45|     M|  null|
|  3|  null|   5.2|null|  null|  null|
|  4| 144.5|   5.9|  33|     M|  null|
|  5| 133.2|   5.7|  54|     F|  null|
|  6| 124.1|   5.2|null|     F|  null|
|  7| 129.2|   5.3|  42|     M| 76000|
+---+------+------+----+------+------+
# count the number of missing values in each row
df_miss.rdd.map(lambda row: (row['id'], sum([c is None for c in row]))).collect()
Out[9]:
[(1, 0), (2, 1), (3, 4), (4, 1), (5, 1), (6, 2), (7, 0)]
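The same per-row count can be computed without dropping to the RDD API; a small sketch that sums isNull() flags cast to integers (F is the pyspark.sql.functions alias imported earlier, and the 'missing' column name is illustrative):

df_miss.select(
    'id',
    sum(F.col(c).isNull().cast('int') for c in df_miss.columns).alias('missing')
).show()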
# calculate the fraction of missing values in each column
# (F.count('*') counts all rows including nulls, while F.count(c) counts only non-null values;
#  the leading * unpacks the list of expressions into separate arguments of agg())
df_miss.agg(
    *[(1 - (F.count(c) / F.count('*'))).alias(c + '_missing') for c in df_miss.columns]
).show()
+----------+------------------+--------------+------------------+------------------+------------------+
|id_missing|    weight_missing|height_missing|       age_missing|    gender_missing|    income_missing|
+----------+------------------+--------------+------------------+------------------+------------------+
|       0.0|0.1428571428571429|           0.0|0.2857142857142857|0.1428571428571429|0.7142857142857143|
+----------+------------------+--------------+------------------+------------------+------------------+
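These fractions can also drive an automatic column filter; a small sketch that keeps only the columns with less than 50% missing values (the 0.5 threshold and the df_keep name are illustrative assumptions; here only income would be dropped):

miss = df_miss.agg(
    *[(1 - (F.count(c) / F.count('*'))).alias(c) for c in df_miss.columns]
).collect()[0].asDict()

keep = [c for c, frac in miss.items() if frac < 0.5]
df_keep = df_miss.select(keep)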
# drop the income column (over 70% of its values are missing)
data_drop_income = df_miss.select([c for c in df_miss.columns if c != 'income'])
data_drop_income.show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
|  1| 143.5|   5.6|  28|     M|
|  2| 167.2|   5.4|  45|     M|
|  3|  null|   5.2|null|  null|
|  4| 144.5|   5.9|  33|     M|
|  5| 133.2|   5.7|  54|     F|
|  6| 124.1|   5.2|null|     F|
|  7| 129.2|   5.3|  42|     M|
+---+------+------+----+------+
# drop rows with too many missing values: thresh=3 keeps only rows with at least 3 non-null values,
# so row 3 (only id and height observed) is removed
data_drop_income.dropna(thresh=3).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
|  1| 143.5|   5.6|  28|     M|
|  2| 167.2|   5.4|  45|     M|
|  4| 144.5|   5.9|  33|     M|
|  5| 133.2|   5.7|  54|     F|
|  6| 124.1|   5.2|null|     F|
|  7| 129.2|   5.3|  42|     M|
+---+------+------+----+------+
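dropna() also accepts how and subset arguments; two hedged examples on the same DataFrame:

# keep only rows where 'age' is not null
data_drop_income.dropna(subset=['age']).show()

# drop rows in which every column is null (a no-op on this data)
data_drop_income.dropna(how='all').show()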
# fill continuous features with the column mean and the categorical feature with 'missing'
means = data_drop_income.agg(
    *[F.mean(c).alias(c) for c in data_drop_income.columns if c != 'gender']
).toPandas().to_dict('records')[0]

means['gender'] = 'missing'
# note: age is an integer column, so its mean (40.4) ends up truncated to 40
data_drop_income.fillna(means).show()
+---+------------------+------+---+-------+
| id|            weight|height|age| gender|
+---+------------------+------+---+-------+
|  1|             143.5|   5.6| 28|      M|
|  2|             167.2|   5.4| 45|      M|
|  3|140.28333333333333|   5.2| 40|missing|
|  4|             144.5|   5.9| 33|      M|
|  5|             133.2|   5.7| 54|      F|
|  6|             124.1|   5.2| 40|      F|
|  7|             129.2|   5.3| 42|      M|
+---+------------------+------+---+-------+
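For the numeric columns, pyspark.ml.feature.Imputer (available since Spark 2.2) offers the same mean or median imputation as a reusable pipeline stage; a minimal sketch with an explicit cast to double, since some Spark versions require float/double input (it does not handle string columns such as gender, and the column names are illustrative):

from pyspark.ml.feature import Imputer

num_cols = ['weight', 'age']
df_num = data_drop_income.select('id', *[F.col(c).cast('double').alias(c) for c in num_cols])

imputer = Imputer(strategy='mean',
                  inputCols=num_cols,
                  outputCols=[c + '_imputed' for c in num_cols])
imputer.fit(df_num).transform(df_num).show()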
Outliers
# outliers
df_outliers = spark.createDataFrame([
    (1, 143.5, 5.3, 28),
    (2, 154.2, 5.5, 45),
    (3, 342.3, 5.1, 99),
    (4, 144.5, 5.5, 33),
    (5, 133.2, 5.4, 54),
    (6, 124.1, 5.1, 21),
    (7, 129.2, 5.3, 42),
], ['id', 'weight', 'height', 'age'])

df_outliers.show()
+---+------+------+---+
| id|weight|height|age|
+---+------+------+---+
|  1| 143.5|   5.3| 28|
|  2| 154.2|   5.5| 45|
|  3| 342.3|   5.1| 99|
|  4| 144.5|   5.5| 33|
|  5| 133.2|   5.4| 54|
|  6| 124.1|   5.1| 21|
|  7| 129.2|   5.3| 42|
+---+------+------+---+
# compute the IQR-based bounds [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each column
cols = ['weight', 'height', 'age']
bounds = {}

for col in cols:
    quan = df_outliers.approxQuantile(col, [0.25, 0.75], 0.05)
    IQR = quan[1] - quan[0]
    bounds[col] = [quan[0] - 1.5 * IQR, quan[1] + 1.5 * IQR]

bounds
{'weight': [91.69999999999999, 191.7],
 'height': [4.499999999999999, 6.1000000000000005],
 'age': [-11.0, 93.0]}
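approxQuantile() also accepts a list of column names (Spark 2.2+, as far as I recall), which computes all the quartiles in a single pass over the data; an equivalent sketch:

# one pass over the data for all three columns
quantiles = df_outliers.approxQuantile(cols, [0.25, 0.75], 0.05)
bounds = {
    c: [q1 - 1.5 * (q3 - q1), q3 + 1.5 * (q3 - q1)]
    for c, (q1, q3) in zip(cols, quantiles)
}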
# flag the values that fall outside the bounds
outliers = df_outliers.select(*['id'] + [
    (
        (df_outliers[c] < bounds[c][0]) |
        (df_outliers[c] > bounds[c][1])
    ).alias(c + '_o') for c in cols
])

outliers.show()
+---+--------+--------+-----+
| id|weight_o|height_o|age_o|
+---+--------+--------+-----+
|  1|   false|   false|false|
|  2|   false|   false|false|
|  3|    true|   false| true|
|  4|   false|   false|false|
|  5|   false|   false|false|
|  6|   false|   false|false|
|  7|   false|   false|false|
+---+--------+--------+-----+
# show the actual outlier values
df_outliers = df_outliers.join(outliers, on='id')
df_outliers.filter('weight_o').select('id', 'weight').show()
df_outliers.filter('age_o').select('id', 'age').show()
+---+------+
| id|weight|
+---+------+
|  3| 342.3|
+---+------+

+---+---+
| id|age|
+---+---+
|  3| 99|
+---+---+
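To drop the flagged rows before modeling, the boolean columns can be combined into a single filter; a minimal sketch based on the joined df_outliers above (the clean name is illustrative):

# keep only rows that are not flagged as an outlier in any column
clean = df_outliers.filter(
    ~F.col('weight_o') & ~F.col('height_o') & ~F.col('age_o')
).select('id', 'weight', 'height', 'age')

clean.show()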