```sql
-- Set the execution engine
set hive.execution.engine=spark;

-- Set the Spark submit mode ("yarn-cluster" is the pre-Spark-2.0 spelling;
-- newer versions use master=yarn plus spark.submit.deployMode=cluster)
set spark.master=yarn-cluster;

-- Queue the job is submitted to
set spark.yarn.queue=${queue_name};

-- Queue name for MapReduce stages
set mapreduce.job.queuename=root.users.hdfs;

-- Job name
set spark.app.name=${job_name};

-- Total number of executor processes the Spark job will use
set spark.executor.instances=25;

-- Number of CPU cores per executor
set spark.executor.cores=4;

-- Memory per executor
set spark.executor.memory=8g;

-- Reduce-side parallelism
set mapred.reduce.tasks=600;

-- Off-heap (overhead) memory per executor JVM, in MB
set spark.yarn.executor.memoryOverhead=2048;

-- Fraction of heap used for execution and storage (Spark 2.0+)
set spark.memory.fraction=0.8;

-- Object serializer (correct class name: KryoSerializer, under org.apache.spark.serializer)
set spark.serializer=org.apache.spark.serializer.KryoSerializer;

-- Dynamic partitioning
set hive.exec.dynamic.partition=true;               -- enable dynamic partitioning
set hive.exec.dynamic.partition.mode=nonstrict;     -- allow all partition columns to be dynamic
set hive.exec.max.dynamic.partitions.pernode=1000;  -- max dynamic partitions each mapper/reducer may create
-- set hive.exec.max.dynamic.partitions=10000;      -- total cap across the job; usually not needed
insert overwrite table test partition(country,state) select * from test2;  -- dynamic-partition insert example
```
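For comparison, roughly the same setup can be done from PySpark. A minimal sketch, assuming the test/test2 tables from the insert example above already exist: executor sizing has to be fixed before the SparkSession starts, while the hive.exec.* switches can be toggled at runtime with SET statements.

```python
from pyspark.sql import SparkSession

# Executor sizing must be set before the session (and its JVM) starts;
# it cannot be changed with SET once the application is running.
spark = (SparkSession.builder
         .appName("settings-demo")
         .config("spark.executor.instances", "25")
         .config("spark.executor.cores", "4")
         .config("spark.executor.memory", "8g")
         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
         .enableHiveSupport()
         .getOrCreate())

# The hive.exec.* switches can be toggled at runtime.
spark.sql("SET hive.exec.dynamic.partition=true")
spark.sql("SET hive.exec.dynamic.partition.mode=nonstrict")
spark.sql("SET hive.exec.max.dynamic.partitions.pernode=1000")

# Same dynamic-partition insert as the HQL example above.
spark.sql("INSERT OVERWRITE TABLE test PARTITION (country, state) SELECT * FROM test2")
```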
Submitting a Spark job with spark-submit (Python version):
```bash
spark-submit \
  --master yarn \
  --deploy-mode client \
  --driver-memory 24G \
  --driver-cores 8 \
  --num-executors 100 \
  --executor-cores 8 \
  --executor-memory 24G \
  --conf spark.driver.maxResultSize=24G \
  --conf spark.kubernetes.executor.limit.cores=12 \
  --conf spark.kryoserializer.buffer.max=1024m \
  --conf spark.kryoserializer.buffer=512m \
  --conf spark.dynamicAllocation.enabled=true \
  --conf spark.shuffle.service.enabled=true \
  --conf spark.sql.shuffle.partitions=3200 \
  --conf spark.default.parallelism=3200 \
  --conf spark.storage.memoryFraction=0.3 \
  --conf spark.shuffle.memoryFraction=0.3 \
  --conf spark.sql.hive.mergeFiles=true \
  --conf spark.blacklist.enabled=true \
  --conf spark.speculation=true \
  --conf spark.sql.sources.readWithSubdirectories.enabled=false \
  --conf spark.sql.autoBroadcastJoinThreshold=102400 \
  --py-files utils.py \
  --name analysis \
  analysis.py ${calc_date}
```
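Two notes on the flags above. spark.kubernetes.executor.limit.cores only takes effect on a Kubernetes master, so under --master yarn it is a no-op. And --py-files utils.py ships the module to the driver and every executor, so analysis.py can import it even inside UDFs. A minimal sketch under that assumption (the normalize helper is hypothetical, not from the original post):

```python
# utils.py (hypothetical module shipped via --py-files):
#
#     def normalize(s):
#         return s.strip().lower() if s else s
#
# analysis.py can then import it on the driver and inside UDFs that run
# on executors, because spark-submit distributed the file cluster-wide.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import utils  # resolvable everywhere thanks to --py-files utils.py

spark = SparkSession.builder.appName("analysis").getOrCreate()
normalize_udf = udf(utils.normalize, StringType())
spark.createDataFrame([(" Foo ",), ("BAR",)], ["raw"]) \
    .select(normalize_udf("raw").alias("clean")) \
    .show()
```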
Submitting a Spark job with spark-submit (Scala version):
```bash
spark-submit --class com.ad.data.algorithms.model.runModel \
  --master yarn \
  --deploy-mode cluster \
  --driver-memory 16G \
  --conf spark.driver.maxResultSize=16G \
  --driver-cores 8 \
  --num-executors 100 \
  --executor-cores 8 \
  --executor-memory 16G \
  --conf spark.dynamicAllocation.enabled=true \
  --conf spark.shuffle.service.enabled=true \
  --conf spark.sql.shuffle.partitions=3200 \
  --conf spark.default.parallelism=3200 \
  --conf spark.storage.memoryFraction=0.4 \
  --conf spark.shuffle.memoryFraction=0.4 \
  --conf spark.sql.hive.mergeFiles=true \
  --conf spark.blacklist.enabled=true \
  --conf spark.speculation=true \
  --conf spark.hadoop.hive.exec.orc.split.strategy=ETL \
  --name segment-model \
  ${basePath}/../algorithms-model.jar ${calculateDate} ${cateCodes}
```
The skeleton of analysis.py from the Python submission above:

```python
from sys import argv  # needed for the argv unpacking below

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, expr

if __name__ == '__main__':
    # calc_date arrives as the positional argument passed by spark-submit
    script, calc_date = argv
    # Mirrors the spark-submit flags; autoBroadcastJoinThreshold is in bytes
    spark = SparkSession.builder.appName("analysis")\
        .config("spark.sql.autoBroadcastJoinThreshold", 102400)\
        .config("spark.driver.maxResultSize", "24G")\
        .enableHiveSupport().getOrCreate()
```
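From there the script would typically query by the date argument and persist the result. A minimal continuation sketch (the events table, dt column, and output table name are hypothetical, not from the original post):

```python
    # ...continuing inside the __main__ block above.
    # Filter a source table by the date argument and persist the aggregate;
    # events, dt, and analysis_result are hypothetical names.
    df = spark.sql(
        "SELECT country, count(*) AS cnt "
        "FROM events WHERE dt = '{}' GROUP BY country".format(calc_date)
    )
    df.write.mode("overwrite").saveAsTable("analysis_result")
    spark.stop()
```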