赞
踩
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode client --class lifecycle01_tool.Tool02_CountCsv s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar lifecyclebigdata/dataWareHouse/BALABALA/09_testData/01_csv
9
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode client --class lifecycle01_tool.Tool03_LookCsv s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar lifecyclebigdata/dataWareHouse/BALABALA/09_testData/01_csv
+----+----+-------+
|学号|姓名|年龄 |
+----+----+-------+
|7 |sfl |3223432|
|8 |fe |432 |
|9 |dsds|9868 |
|4 |dd |50 |
|5 |ee |1210 |
|6 |ff |ds |
|1 |aa |10 |
|2 |bb |20 |
|3 |cc |30 |
+----+----+-------+
package lifecycle01_tool

import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Converts a headered CSV directory on S3 into snappy-compressed parquet.
 *
 * Args:
 *   args(0) - input CSV path (S3 key prefix, without the scheme)
 *   args(1) - output parquet path (S3 key prefix, without the scheme)
 *   args(2) - comma-separated column names to apply to the output
 */
object Tool04_ParseCsvToParquetSnappy {
  def main(args: Array[String]): Unit = {
    // 1. Arguments: input path, output path, column names.
    val pathIn: String = args(0)
    val pathOut: String = args(1)
    val colNames: Array[String] = args(2).split(",")

    // 2. Spark session from the shared project factory.
    val spark: SparkSession = Tool01_SparkSession.getSpark()

    // 3. Read the CSV: comma-delimited, first row is the header
    //    (header=true both skips it and uses it for the initial column names).
    val df: DataFrame = spark.read
      .option("header", "true")
      .option("delimiter", ",")
      .csv("s3a://" + pathIn)

    // 4. Rename the columns to the caller-supplied names.
    val res: DataFrame = df.toDF(colNames: _*)

    // 5. Write snappy-compressed parquet.
    //    NOTE(review): the original also passed "delimiter" and "header",
    //    which are CSV-only options silently ignored by the parquet writer;
    //    they are dropped here. Unused imports (HamletSpec.COL, RDD,
    //    StructType/StructField, ArrayBuffer) are removed as well.
    res.write
      .option("compression", "snappy")
      .parquet("s3a://" + pathOut)

    spark.close()
  }
}
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode cluster --class lifecycle01_tool.Tool04_ParseCsvToParquetSnappy s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar lifecyclebigdata/dataWareHouse/BALABALA/09_testData/01_csv lifecyclebigdata/dataWareHouse/BALABALA/09_testData/02_parquet/res "id,name,age"
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode client --class lifecycle01_tool.Tool05_CountParquet s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar lifecyclebigdata/dataWareHouse/BALABALA/09_testData/02_parquet/res
9
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode client --class lifecycle01_tool.Tool06_LookParquet s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar lifecyclebigdata/dataWareHouse/BALABALA/09_testData/02_parquet/res
+----+----+-------+
|学号|姓名|年龄 |
+----+----+-------+
|7 |sfl |3223432|
|8 |fe |432 |
|9 |dsds|9868 |
|4 |dd |50 |
|5 |ee |1210 |
|6 |ff |ds |
|1 |aa |10 |
|2 |bb |20 |
|3 |cc |30 |
+----+----+-------+
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode cluster --class lifecycle01_tool.Tool07_ParseParquetToCsv s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar lifecyclebigdata/dataWareHouse/BALABALA/09_testData/02_parquet/res lifecyclebigdata/dataWareHouse/BALABALA/09_testData/03_csv/res
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode client --class lifecycle01_tool.Tool02_CountCsv s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar lifecyclebigdata/dataWareHouse/BALABALA/09_testData/03_csv/res
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode client --class lifecycle01_tool.Tool03_LookCsv s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar lifecyclebigdata/dataWareHouse/BALABALA/09_testData/03_csv/res
-- External Hive table over the snappy parquet produced by Tool04.
-- Fix: the recognized table property key is 'parquet.compression';
-- the original 'parquet.compress' is not a known key and was a silent no-op.
create external table person(id string,name string,age string)
stored as parquet
location 's3://lifecyclebigdata/dataWareHouse/BALABALA/09_testData/02_parquet/res'
TBLPROPERTIES ('parquet.compression'='SNAPPY');
package lifecycle01_tool

import org.apache.spark.sql.SparkSession

/**
 * Creates the external Hive table over the parquet data written by Tool04.
 *
 * Optional args generalize the previously hard-coded values (fully
 * backward compatible — running with no args keeps the old behavior):
 *   args(0) - table name   (default: "person")
 *   args(1) - s3 location  (default: the original 02_parquet/res path)
 */
object Tool08_CreateTable {
  def main(args: Array[String]): Unit = {
    val tableName: String =
      if (args.length > 0) args(0) else "person"
    val location: String =
      if (args.length > 1) args(1)
      else "s3://lifecyclebigdata/dataWareHouse/BALABALA/09_testData/02_parquet/res"

    // 1. Hive-enabled Spark session from the shared project factory.
    val spark: SparkSession = Tool01_SparkSession.getHiveSpark()

    // 2. Build the DDL. Fix: the recognized property key is
    //    'parquet.compression'; the original 'parquet.compress' was a
    //    silent no-op.
    val sql: String =
      s"create external table $tableName (id string,name string,age string) " +
        "stored as parquet " +
        s"location '$location' " +
        "TBLPROPERTIES ('parquet.compression'='SNAPPY')"

    // 3. Create the table in the default database.
    spark.sql("use default")
    spark.sql(sql)
    spark.close()
  }
}
spark-submit --master yarn --num-executors 5 --executor-cores 3 --executor-memory 6144m --deploy-mode client --class lifecycle01_tool.Tool08_CreateTable s3://lifecyclebigdata/dataWareHouse/BALABALA/00jar/03_hive/hiveLifeCycle-1.0-SNAPSHOT.jar
select * from person;
=================================
7 sfl 3223432
8 fe 432
9 dsds 9868
4 dd 50
5 ee 1210
6 ff ds
1 aa 10
2 bb 20
3 cc 30
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。