当前位置:   article > 正文

pyspark参数设置,常用时间函数_pyspark strptime

pyspark strptime
# Submit the job to YARN with the driver running on the submitting machine.
# The "yarn-client" master URL is deprecated since Spark 2.0; the supported
# spelling is "--master yarn" plus an explicit "--deploy-mode".
spark-submit --master yarn --deploy-mode client --num-executors 8 --driver-memory 4g --executor-memory 2g spark_demo.py
  • 1
# Build (or reuse) the Hive-enabled SparkSession for the "version_1" job.
#
# The chain is wrapped in parentheses instead of being joined with trailing
# backslashes: a "#" comment line inside a backslash continuation terminates
# the statement, so the commented-out options used to turn every following
# ".config(...)" line into a syntax error.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_1")
    # Disabled option: .config("spark.sql.warehouse.dir", "spark.warehouse")
    # NOTE(review): given without a time unit; confirm which unit Spark
    # assumes here and that it stays below spark.network.timeout.
    .config("spark.executor.heartbeatInterval", 120)
    .config("spark.debug.maxToStringFields", 1000)
    # Disabled option: .config("spark.sql.execution.arrow.enabled", "true")
    # Serialization
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "256m")
    # RPC / networking retries
    .config("spark.rpc.message.maxSize", 256)
    .config("spark.port.maxRetries", 32)
    .config("spark.rpc.numRetries", 9)
    .config("spark.rpc.retry.wait", "9s")
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s")
    # Dynamic allocation (enabled together with the external shuffle service)
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.maxExecutors", 108)
    .config("spark.dynamicAllocation.minExecutors", 1)
    .config("spark.dynamicAllocation.schedulerBacklogTimeout", "10s")
    .config("spark.dynamicAllocation.executorIdleTimeout", "120s")
    .config("spark.network.timeout", "240s")
    # Executor and driver sizing
    .config("spark.executor.memory", "4g")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.driver.cores", 16)
    # NOTE(review): driver memory set here may be ignored when the driver JVM
    # is already running (e.g. spark-submit in client mode) -- verify.
    .config("spark.driver.memory", "64g")
    .config("spark.driver.maxResultSize", "4g")
    # Python workers
    .config("spark.python.worker.memory", "1g")
    .config("spark.python.worker.reuse", "true")
    .enableHiveSupport()
    .getOrCreate()
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
# Build (or reuse) the Hive-enabled SparkSession for the "version_2" job.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_2")
    # Serialization and plan/debug output
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "512m")
    .config("spark.debug.maxToStringFields", 1024)
    .config("spark.broadcast.blockSize", "4m")
    # Heartbeats, timeouts and retry behaviour
    .config("spark.executor.heartbeatInterval", 240)
    .config("spark.network.timeout", "360s")
    .config("spark.rpc.message.maxSize", 256)
    .config("spark.port.maxRetries", 32)
    .config("spark.rpc.numRetries", 24)
    .config("spark.rpc.retry.wait", "3s")
    .config("spark.shuffle.io.numConnectionsPerPeer", 3)
    .config("spark.shuffle.io.maxRetries", 9)
    # Arrow-based transfer between Spark and pandas
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s")
    # Driver sizing
    .config("spark.driver.cores", 16)
    .config("spark.driver.memory", "64g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.driver.memoryOverhead", "2G")
    # Executor sizing
    .config("spark.executor.memory", "4g")
    .config("spark.executor.memoryOverhead", "2g")
    .enableHiveSupport()
    .getOrCreate()
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# On Python 3 `reload` is not a builtin and `sys.setdefaultencoding` does not
# exist (the default encoding is already UTF-8), so the hack is skipped there
# instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # re-exposes setdefaultencoding, which is removed at startup
    sys.setdefaultencoding('utf-8')

# Build (or reuse) the Hive-enabled SparkSession for the "version_3" job.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_3")
    # Serialization and plan/debug output
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "512m")
    .config("spark.debug.maxToStringFields", 1024)
    .config("spark.sql.execution.arrow.enabled", "true")
    # Driver sizing
    .config("spark.driver.cores", 4)
    .config("spark.driver.memory", "128g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.driver.memoryOverhead", "4g")
    # Executor sizing
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "6g")
    .config("spark.executor.memoryOverhead", "4g")
    # Cluster-wide scale and housekeeping
    .config("spark.dynamicAllocation.maxExecutors", 1000)
    .config("spark.default.parallelism", 1000)
    .config("spark.cleaner.periodicGC.interval", "10min")
    .enableHiveSupport()
    .getOrCreate()
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
#常用时间函数
from __future__ import division, print_function, absolute_import
import sys, time, datetime
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# On Python 3 `reload` is not a builtin and `sys.setdefaultencoding` does not
# exist (the default encoding is already UTF-8), so the hack is skipped there
# instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # re-exposes setdefaultencoding, which is removed at startup
    sys.setdefaultencoding('utf-8')

# Build (or reuse) the Hive-enabled SparkSession used by the helpers below.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("xxx")
    # Serialization and plan/debug output
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    .config("spark.debug.maxToStringFields", 1024)
    # Driver sizing
    .config("spark.driver.cores", 4)
    .config("spark.driver.memory", "16g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.driver.memoryOverhead", "4g")
    # Executor sizing
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "16g")
    .config("spark.executor.memoryOverhead", "4g")
    # Cluster-wide scale and housekeeping
    .config("spark.dynamicAllocation.maxExecutors", 1000)
    .config("spark.default.parallelism", 1000)
    .config("spark.cleaner.periodicGC.interval", "10min")
    .enableHiveSupport()
    .getOrCreate()
)


def getDate(fakeToday='today', moveDays=0, isReturnPt=False):
    """Return the day `moveDays` days away from a reference day, as a string.

    Args:
        fakeToday: 'today' to use the current local date, otherwise a
            'YYYY-MM-DD' string that is parsed and used as the reference day.
        moveDays: number of days added to the reference day (may be negative).
        isReturnPt: when true the result is formatted 'YYYYMMDD', otherwise
            'YYYY-MM-DD'.

    Returns:
        The shifted day formatted as described above.
    """
    if fakeToday == 'today':
        base_day = datetime.date.today()
    else:
        base_day = datetime.datetime.strptime(fakeToday, '%Y-%m-%d')

    shifted = base_day + datetime.timedelta(days=moveDays)

    if isReturnPt:
        return shifted.strftime('%Y%m%d')
    return shifted.strftime('%Y-%m-%d')


def ptToDate(pt, isDetial=False, isMinTime=True):
    """Convert a 'YYYYMMDD' string to 'YYYY-MM-DD', optionally with a time.

    Args:
        pt: day string in '%Y%m%d' format.
        isDetial: when true, a time-of-day suffix is kept in the result;
            otherwise only the first 10 characters are returned.
        isMinTime: selects the suffix ' 00:00:00' (true) or ' 23:59:59'
            (false); only visible in the result when `isDetial` is true.

    Returns:
        The reformatted day, with or without the time suffix.
    """
    day = datetime.datetime.strptime(pt, '%Y%m%d').strftime('%Y-%m-%d')
    suffix = ' 00:00:00' if isMinTime else ' 23:59:59'
    stamped = day + suffix
    if isDetial:
        return stamped
    # Keep the original truncation rather than returning `day` directly.
    return stamped[:10]


def toTimestamp(datetime):
    """Convert a date/time string to an integer timestamp in milliseconds.

    The parse format is chosen from the length of the input: 8 characters
    use '%Y%m%d', 10 characters use '%Y-%m-%d', and every other length uses
    '%Y-%m-%d %H:%M:%S'. The parsed time is passed through `time.mktime`.

    Args:
        datetime: the date/time string to convert. (The name shadows the
            `datetime` module inside this function only.)

    Returns:
        int: `time.mktime` of the parsed value multiplied by 1000.
    """
    full_format = '%Y-%m-%d %H:%M:%S'
    format_by_length = {8: '%Y%m%d', 10: '%Y-%m-%d'}
    fmt = format_by_length.get(len(datetime), full_format)
    parsed = time.strptime(datetime, fmt)
    seconds = time.mktime(parsed)
    return int(seconds * 1000)  # len(x)==13


def getLastMonday(fakeToday='today',includeTodayMonday=False,isReturnPt=False):
    """Return the most recent Monday relative to a reference day, as a string.

    Args:
        fakeToday: 'today' to use the current local date, otherwise a
            'YYYY-MM-DD' string that is parsed and used as the reference day.
        includeTodayMonday: when the reference day is itself a Monday, return
            that same day if true, or the Monday one week earlier if false.
        isReturnPt: when true the result is formatted 'YYYYMMDD', otherwise
            'YYYY-MM-DD'.

    Returns:
        The selected Monday formatted as described above.
    """
    if fakeToday == 'today':
        reference = datetime.date.today()
    else:
        reference = datetime.datetime.strptime(fakeToday, '%Y-%m-%d')

    weekday = reference.weekday()  # Monday == 0
    if weekday == 0:
        days_back = 0 if includeTodayMonday else 7
    else:
        days_back = weekday

    monday = reference - datetime.timedelta(days=days_back)
    if isReturnPt:
        return monday.strftime('%Y%m%d')
    return monday.strftime('%Y-%m-%d')

def timeStat(func):
    """Decorator that prints how many minutes each call to `func` took.

    The wrapped function's positional and keyword arguments are passed
    through unchanged and its return value is returned as-is. If `func`
    raises, the exception propagates and nothing is printed.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that keeps `func`'s name and docstring.
    """
    # Imported once per decoration rather than on every call of the wrapper.
    import functools
    import time

    @functools.wraps(func)  # keep __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        t1 = time.time()
        r = func(*args, **kwargs)
        t2 = time.time()
        print('UseTimeMinutes(fn: {}): '.format(func.__name__), round((t2 - t1) / 60, 2))
        return r
    return wrapper

import pandas as pd
# 'YYYYMMDD' strings for the 30 consecutive days ending at last_1_date.
pt_list = [
    day.strftime('%Y%m%d')
    for day in pd.date_range(end=last_1_date, periods=30, freq='D')
]
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小小林熬夜学编程/article/detail/66437
推荐阅读
相关标签
  

闽ICP备14008679号