当前位置:   article > 正文

数据分析SQL日期维度表生成(含节假日)_sql 创建时间维度表

sql 创建时间维度表

概述

  • 在商业数据分析中,需要用到日期维度
    例如:绘制连续日期的销量曲线;按周、月、季聚合求和…
  • 按周、月、季聚合的格式:
    周:2022年第01周2022w51202251
    月:2022年01月2022-01
    季:2022年第1季2022q12022Q1
  • 在某些零售行业中,线下零售假日销量高于工作日
    因此,在按周汇总时,希望以星期一作为周的起点
    对于跨年,以 年内第1个星期一 作为 年的第一周,此前归入上1年最后1周,例如下面

例如,2023-01-01归属到2022w52那周

Python生成日期维度表

from datetime import datetime, timedelta
from math import ceil

START = '2022-01-01'
END = '2022-12-31'
HOLIDAY = {
    '2022-01-01': ('元旦', False), '2022-01-02': ('元旦', False),
    '2022-01-03': ('元旦', False), '2022-01-29': ('补班', True),
    '2022-01-30': ('补班', True), '2022-01-31': ('春节', False),
    '2022-02-01': ('春节', False), '2022-02-02': ('春节', False),
    '2022-02-03': ('春节', False), '2022-02-04': ('春节', False),
    '2022-02-05': ('春节', False), '2022-02-06': ('春节', False),
    '2022-04-02': ('补班', True), '2022-04-03': ('清明节', False),
    '2022-04-04': ('清明节', False), '2022-04-05': ('清明节', False),
    '2022-04-24': ('补班', True), '2022-04-30': ('劳动节', False),
    '2022-05-01': ('劳动节', False), '2022-05-02': ('劳动节', False),
    '2022-05-03': ('劳动节', False), '2022-05-04': ('劳动节', False),
    '2022-05-07': ('补班', True), '2022-06-03': ('端午节', False),
    '2022-06-04': ('端午节', False), '2022-06-05': ('端午节', False),
    '2022-09-11': ('中秋节', False), '2022-09-12': ('中秋节', False),
    '2022-09-10': ('中秋节', False), '2022-10-01': ('国庆节', False),
    '2022-10-02': ('国庆节', False), '2022-10-03': ('国庆节', False),
    '2022-10-04': ('国庆节', False), '2022-10-05': ('国庆节', False),
    '2022-10-06': ('国庆节', False), '2022-10-07': ('国庆节', False),
    '2022-10-08': ('补班', True), '2022-10-09': ('补班', True),
}

def get_day_of_year(d: datetime) -> int:
    return (d - datetime(year=d.year, month=1, day=1)).days + 1

def get_week_of_year(d: datetime) -> int:
    day_of_year = get_day_of_year(d)
    return ceil((day_of_year - d.weekday()) / 7)

def get_weekday_cn(d: datetime) -> str:
    return ['周一', '周二', '周三', '周四', '周五', '周六', '周日'][d.weekday()]

def get_year_week(d: datetime) -> (int, int, str):
    day_of_year = get_day_of_year(d)
    week_of_year = get_week_of_year(d)
    if week_of_year == 0:
        y = d.year - 1
        yw = '%04dw%02d' % (y, get_week_of_year(datetime(year=y, month=12, day=31)))
    else:
        y = d.year
        yw = '%04dw%02d' % (y, week_of_year)
    return day_of_year, week_of_year, yw

def generate_date():
    start = datetime.strptime(START, '%Y-%m-%d')
    end = datetime.strptime(END, '%Y-%m-%d')
    for i in range((end - start).days + 1):
        d = start + timedelta(i)
        ymd = d.strftime('%Y-%m-%d')
        ym = d.strftime('%Y-%m')
        yq = '%d%s%d' % (d.year, 'q', ceil(d.month / 3))
        day_of_year, week_of_year, yw = get_year_week(d)
        weekday_cn = get_weekday_cn(d)
        holiday, is_workday = HOLIDAY.get(ymd, (
            '休息日' if d.weekday() > 4 else '工作日',
            d.weekday() < 5))
        print(ymd, ym, yq, yw, weekday_cn, day_of_year, week_of_year, holiday, is_workday, sep='\t')

if __name__ == '__main__':
    generate_date()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65

HIVE日期维度表

datediff

SELECT datediff('2022-01-04','2022-01-01');
  • 1

输出:3

repeat

SELECT repeat('a',3);
  • 1

输出:aaa

split

SELECT split('aaa','a');
  • 1

输出:["","","",""]

posexplode

SELECT posexplode(split('aaa','a'));
  • 1

输出:

date_add

SELECT date_add('2022-01-01',1)
  • 1

输出:
2022-01-02

生成日期序列

WITH t AS (SELECT posexplode(split(repeat('a',3),'a')))
SELECT date_add('2022-01-01',pos) AS ymd FROM t;
  • 1
  • 2

输出:
2022-01-01
2022-01-02
2022-01-03
2022-01-04

创建节日临时表(以2022为例,缺双十一、七夕……)

CREATE TABLE temp_holiday2022 AS
SELECT '2022-01-01','元旦',false UNION ALL
SELECT '2022-01-02','元旦',false UNION ALL
SELECT '2022-01-03','元旦',false UNION ALL
SELECT '2022-01-29','补班',true UNION ALL
SELECT '2022-01-30','补班',true UNION ALL
SELECT '2022-01-31','春节',false UNION ALL
SELECT '2022-02-01','春节',false UNION ALL
SELECT '2022-02-02','春节',false UNION ALL
SELECT '2022-02-03','春节',false UNION ALL
SELECT '2022-02-04','春节',false UNION ALL
SELECT '2022-02-05','春节',false UNION ALL
SELECT '2022-02-06','春节',false UNION ALL
SELECT '2022-04-02','补班',true UNION ALL
SELECT '2022-04-03','清明节',false UNION ALL
SELECT '2022-04-04','清明节',false UNION ALL
SELECT '2022-04-05','清明节',false UNION ALL
SELECT '2022-04-24','补班',true UNION ALL
SELECT '2022-04-30','劳动节',false UNION ALL
SELECT '2022-05-01','劳动节',false UNION ALL
SELECT '2022-05-02','劳动节',false UNION ALL
SELECT '2022-05-03','劳动节',false UNION ALL
SELECT '2022-05-04','劳动节',false UNION ALL
SELECT '2022-05-07','补班',true UNION ALL
SELECT '2022-06-03','端午节',false UNION ALL
SELECT '2022-06-04','端午节',false UNION ALL
SELECT '2022-06-05','端午节',false UNION ALL
SELECT '2022-09-11','中秋节',false UNION ALL
SELECT '2022-09-12','中秋节',false UNION ALL
SELECT '2022-09-10','中秋节',false UNION ALL
SELECT '2022-10-01','国庆节',false UNION ALL
SELECT '2022-10-02','国庆节',false UNION ALL
SELECT '2022-10-03','国庆节',false UNION ALL
SELECT '2022-10-04','国庆节',false UNION ALL
SELECT '2022-10-05','国庆节',false UNION ALL
SELECT '2022-10-06','国庆节',false UNION ALL
SELECT '2022-10-07','国庆节',false UNION ALL
SELECT '2022-10-08','补班',true UNION ALL
SELECT '2022-10-09','补班',true;

SELECT `_c0`,`_c1`,`_c2` FROM temp_holiday2022 ORDER BY `_c0`;
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41

生成日期维度

WITH
t0 AS (SELECT posexplode(split(repeat('a',datediff('2022-12-31','2022-01-01')),'a'))),
t1 AS (SELECT date_add('2022-01-01',pos) AS d FROM t0),
t2 AS (
  SELECT date_format(d,'yyyy-MM-dd') AS ymd
        ,year(d) AS y
        ,date_format(d,'yyyy-MM') AS ym
        ,concat(year(d),'q',quarter(d)) AS yq
        ,CAST(date_format(d,'D') AS SMALLINT) AS day_of_year
        ,CAST(date_format(d,'w') AS SMALLINT) AS week_of_year
        ,CAST(date_format(d,'u') AS TINYINT) AS day_of_week
        ,date_format(d,'EEEE') AS day_of_week_en
  FROM t1
)
SELECT t2.*
      ,NVL(`_c1`,IF(day_of_week>5,'休息日','工作日')) AS holiday
      ,NVL(`_c2`,day_of_week<6) AS is_workday
FROM t2
LEFT JOIN temp_holiday2022 ON ymd=`_c0`;
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19

日期维度表

CREATE TABLE dim_date (
  ymd            STRING   COMMENT '日期',
  y              INT      COMMENT '年',
  ym             STRING   COMMENT '年月',
  yq             STRING   COMMENT '季度',
  day_of_year    SMALLINT COMMENT '一年中的第几天',
  week_of_year   TINYINT  COMMENT '一年中的第几周,以周日为起点',
  day_of_week    TINYINT  COMMENT '星期',
  day_of_week_en STRING   COMMENT '星期',
  holiday        STRING   COMMENT '节假日',
  is_workday     BOOLEAN  COMMENT 'true=工作日、false=休息日'
) COMMENT '日期维度表';

INSERT INTO TABLE dim_date
SELECT t2.*
      ,NVL(`_c1`,IF(day_of_week>5,'休息日','工作日')) AS holiday
      ,NVL(`_c2`,day_of_week<6) AS is_workday
FROM (
  SELECT date_format(d,'yyyy-MM-dd') AS ymd
        ,year(d) AS y
        ,date_format(d,'yyyy-MM') AS ym
        ,concat(year(d),'q',quarter(d)) AS yq
        ,CAST(date_format(d,'D') AS SMALLINT) AS day_of_year
        ,CAST(date_format(d,'w') AS SMALLINT) AS week_of_year
        ,CAST(date_format(d,'u') AS TINYINT) AS day_of_week
        ,date_format(d,'EEEE') AS day_of_week_en
  FROM (
    SELECT date_add('2022-01-01',pos) AS d FROM (
      SELECT posexplode(split(repeat('a',datediff('2022-12-31','2022-01-01')),'a')))t0)t1
)t2
LEFT JOIN temp_holiday2022 ON ymd=`_c0`;
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31

HIVE近30天日期序列

-- 近30天
WITH t AS (SELECT posexplode(split(repeat('a',29),'a')))
SELECT date_sub(current_date(),pos) AS ymd FROM t ORDER BY ymd;

-- 1~31天前
WITH t AS (SELECT posexplode(split(repeat('a',29),'a')))
SELECT date_sub(current_date(),pos+1) AS ymd FROM t ORDER BY ymd;
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
-- 近7天
WITH t AS (SELECT posexplode(split(repeat('a',6),'a')))
SELECT date_sub(current_date(),pos) AS ymd FROM t ORDER BY ymd;

-- 1~8天前
WITH t AS (SELECT posexplode(split(repeat('a',6),'a')))
SELECT date_sub(current_date(),pos+1) AS ymd FROM t ORDER BY ymd;
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

昨天1024:
2022-10-18
2022-10-19
2022-10-20
2022-10-21
2022-10-22
2022-10-23
2022-10-24

HIVE近48小时序列

WITH t AS (SELECT posexplode(split(repeat('a',47),'a')))
SELECT date_format(from_unixtime(unix_timestamp()-((pos+1)*3600)),'yyyy-MM-dd HH') AS h
FROM t ORDER BY h;
  • 1
  • 2
  • 3

2022-10-25 08:42时刻的搜索结果:
2022-10-23 08
2022-10-23 09
2022-10-23 10
2022-10-23 11
2022-10-23 12
2022-10-23 13
2022-10-23 14
2022-10-23 15
2022-10-23 16
2022-10-23 17
2022-10-23 18
2022-10-23 19
2022-10-23 20
2022-10-23 21
2022-10-23 22
2022-10-23 23
2022-10-24 00
2022-10-24 01
2022-10-24 02
2022-10-24 03
2022-10-24 04
2022-10-24 05
2022-10-24 06
2022-10-24 07
2022-10-24 08
2022-10-24 09
2022-10-24 10
2022-10-24 11
2022-10-24 12
2022-10-24 13
2022-10-24 14
2022-10-24 15
2022-10-24 16
2022-10-24 17
2022-10-24 18
2022-10-24 19
2022-10-24 20
2022-10-24 21
2022-10-24 22
2022-10-24 23
2022-10-25 00
2022-10-25 01
2022-10-25 02
2022-10-25 03
2022-10-25 04
2022-10-25 05
2022-10-25 06
2022-10-25 07

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/2023面试高手/article/detail/675585
推荐阅读
相关标签
  

闽ICP备14008679号