# Submit the demo script to YARN in client mode; in Spark 2.x and later,
# --master yarn-client is the legacy spelling of --master yarn --deploy-mode client.
spark-submit --master yarn-client --num-executors 8 --driver-memory 4g --executor-memory 2g spark_demo.py
from pyspark.sql import SparkSession

# version_1: dynamic allocation on YARN (with the external shuffle service) and Kryo serialization.
# Parentheses are used for the builder chain so the commented-out configs do not break the statement.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_1")
    # .config("spark.sql.warehouse.dir", "spark.warehouse")
    .config("spark.executor.heartbeatInterval", 120)
    .config("spark.debug.maxToStringFields", 1000)
    # .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "256m")
    .config("spark.rpc.message.maxSize", 256)
    .config("spark.port.maxRetries", 32)
    .config("spark.rpc.numRetries", 9)
    .config("spark.rpc.retry.wait", "9s")
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s")
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.maxExecutors", 108)
    .config("spark.dynamicAllocation.minExecutors", 1)
    .config("spark.dynamicAllocation.schedulerBacklogTimeout", "10s")
    .config("spark.dynamicAllocation.executorIdleTimeout", "120s")
    .config("spark.network.timeout", "240s")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.driver.cores", 16)
    .config("spark.driver.memory", "64g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.python.worker.memory", "1g")
    .config("spark.python.worker.reuse", "true")
    .enableHiveSupport()
    .getOrCreate()
)
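A minimal usage sketch for the session built above; db.demo_table is a hypothetical Hive table name, used only to show the read-then-stop pattern that lets the dynamic-allocation settings release idle executors:

# Hypothetical table name; replace with a real table in your Hive warehouse.
df = spark.sql("SELECT * FROM db.demo_table LIMIT 100")
df.show(10, truncate=False)
print(df.count())
spark.stop()   # release YARN resources when the job is done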
spark = SparkSession.builder \
    .master("yarn") \
    .appName("version_2") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "512m") \
    .config("spark.debug.maxToStringFields", 1024) \
    .config("spark.broadcast.blockSize", "4m") \
    .config("spark.executor.heartbeatInterval", 240) \
    .config("spark.network.timeout", "360s") \
    .config("spark.rpc.message.maxSize", 256) \
    .config("spark.port.maxRetries", 32) \
    .config("spark.rpc.numRetries", 24) \
    .config("spark.rpc.retry.wait", "3s") \
    .config("spark.shuffle.io.numConnectionsPerPeer", 3) \
    .config("spark.shuffle.io.maxRetries", 9) \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s") \
    .config("spark.driver.cores", 16) \
    .config("spark.driver.memory", "64g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.driver.memoryOverhead", "2g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.memoryOverhead", "2g") \
    .enableHiveSupport() \
    .getOrCreate()
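Since version_2 enables spark.sql.execution.arrow.enabled, DataFrame-to-pandas conversion goes through Arrow. A hedged sketch of the pattern this speeds up; the table, partition column, and field names are made up for illustration:

# toPandas() uses Arrow for the columnar transfer when the flag above is true;
# keep the result small enough to fit in spark.driver.maxResultSize (4g here).
sdf = spark.sql("SELECT id, score FROM db.some_table WHERE pt = '20240101'")
pdf = sdf.limit(100000).toPandas()
print(pdf.describe())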
import sys
from pyspark.sql import SparkSession

reload(sys)                        # Python 2 only; not needed (and not a builtin) on Python 3
sys.setdefaultencoding('utf-8')    # Python 2 only

spark = SparkSession.builder \
    .master("yarn") \
    .appName("version_3") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "512m") \
    .config("spark.debug.maxToStringFields", 1024) \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .config("spark.driver.cores", 4) \
    .config("spark.driver.memory", "128g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.driver.memoryOverhead", "4g") \
    .config("spark.executor.cores", 4) \
    .config("spark.executor.memory", "6g") \
    .config("spark.executor.memoryOverhead", "4g") \
    .config("spark.dynamicAllocation.maxExecutors", 1000) \
    .config("spark.default.parallelism", 1000) \
    .config("spark.cleaner.periodicGC.interval", "10min") \
    .enableHiveSupport() \
    .getOrCreate()
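A hedged sketch of how a wide job might line up with version_3's parallelism settings; db.target_table is a made-up name, and the row count is only there to give the shuffle something to do:

# Explicitly repartition to 1000 partitions, matching the parallelism level configured above,
# then persist the result as a Hive table.
df = spark.range(0, 100000000)
df = df.repartition(1000)
df.write.mode("overwrite").saveAsTable("db.target_table")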
# Common date/time helper functions
from __future__ import division, print_function, absolute_import
import sys, time, datetime

from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

reload(sys)                        # Python 2 only
sys.setdefaultencoding('utf-8')    # Python 2 only

spark = SparkSession.builder \
    .master("yarn") \
    .appName("xxx") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "1024m") \
    .config("spark.debug.maxToStringFields", 1024) \
    .config("spark.driver.cores", 4) \
    .config("spark.driver.memory", "16g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.driver.memoryOverhead", "4g") \
    .config("spark.executor.cores", 4) \
    .config("spark.executor.memory", "16g") \
    .config("spark.executor.memoryOverhead", "4g") \
    .config("spark.dynamicAllocation.maxExecutors", 1000) \
    .config("spark.default.parallelism", 1000) \
    .config("spark.cleaner.periodicGC.interval", "10min") \
    .enableHiveSupport() \
    .getOrCreate()


def getDate(fakeToday='today', moveDays=0, isReturnPt=False):
    """Return fakeToday shifted by moveDays, as 'YYYYMMDD' if isReturnPt else 'YYYY-MM-DD'."""
    fakeToday = datetime.date.today() if fakeToday == 'today' else datetime.datetime.strptime(fakeToday, '%Y-%m-%d')
    the_day = fakeToday + datetime.timedelta(days=moveDays)
    date_format = '%Y%m%d' if isReturnPt else '%Y-%m-%d'
    return the_day.strftime(date_format)


def ptToDate(pt, isDetial=False, isMinTime=True):
    """Convert a 'YYYYMMDD' partition value to a date string, with a day-start or day-end time if isDetial."""
    dt = datetime.datetime.strptime(pt, '%Y%m%d').strftime('%Y-%m-%d')
    dt = dt + ' 00:00:00' if isMinTime else dt + ' 23:59:59'
    return dt if isDetial else dt[:10]


def toTimestamp(dt_str):
    """Convert 'YYYYMMDD', 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' to a millisecond timestamp."""
    fmt = '%Y-%m-%d %H:%M:%S'
    if len(dt_str) == 8:
        fmt = '%Y%m%d'
    elif len(dt_str) == 10:
        fmt = '%Y-%m-%d'
    elif len(dt_str) == 19:
        fmt = '%Y-%m-%d %H:%M:%S'
    return int(time.mktime(time.strptime(dt_str, fmt)) * 1000)  # 13-digit millisecond value


def getLastMonday(fakeToday='today', includeTodayMonday=False, isReturnPt=False):
    """Return the most recent past Monday; if fakeToday is a Monday it counts only when includeTodayMonday is True."""
    fakeToday = datetime.date.today() if fakeToday == 'today' else datetime.datetime.strptime(fakeToday, '%Y-%m-%d')
    moveDays = -7 if fakeToday.weekday() == 0 else -fakeToday.weekday()
    moveDays = 0 if includeTodayMonday and moveDays == -7 else moveDays
    the_day = fakeToday + datetime.timedelta(days=moveDays)
    date_format = '%Y%m%d' if isReturnPt else '%Y-%m-%d'
    return the_day.strftime(date_format)


def timeStat(func):
    """Decorator that prints the wrapped function's running time in minutes."""
    def wrapper(*args, **kwargs):
        t1 = time.time()
        r = func(*args, **kwargs)
        t2 = time.time()
        print('UseTimeMinutes(fn: {}): '.format(func.__name__), round((t2 - t1) / 60, 2))
        return r
    return wrapper


import pandas as pd

last_1_date = getDate(moveDays=-1)   # end of the partition range, e.g. yesterday
pt_list = [str(x)[:10].replace('-', '') for x in pd.date_range(end=last_1_date, freq='D', periods=30)]
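A few example calls for the helpers defined above; the printed values depend on the date the snippet runs, and run_job is just a made-up function to show the decorator:

print(getDate(moveDays=-1))                    # yesterday, 'YYYY-MM-DD'
print(getDate(moveDays=-1, isReturnPt=True))   # yesterday as a 'YYYYMMDD' partition value
print(ptToDate('20240101', isDetial=True))     # '2024-01-01 00:00:00'
print(toTimestamp('2024-01-01 12:00:00'))      # 13-digit millisecond timestamp
print(getLastMonday())                         # most recent Monday before today

@timeStat
def run_job():
    return spark.sql('SELECT 1').collect()

run_job()   # prints UseTimeMinutes(fn: run_job) with the elapsed minutes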