赞
踩
目录
DWD层的目的是进行维度建模(尽可能的使用星型模型,减少雪花模型的使用),所以一般情况下一个DWD层的表为一个事实表:
注:脚本实现如下所示,关于具体使用和总结可以查看文章最后DWD层流量日志解析总结章节。
- -- 建表语句
- create table if not exists ${yishou_data_dbname}.dwd_goods_click_dt (
- `user_id` STRING COMMENT '用户id',
- `goods_id` STRING COMMENT '商品id',
- `is_rec` STRING COMMENT '是否搜索结果推荐商品',
- `special_id` STRING COMMENT '专场id',
- `os` STRING COMMENT '操作系统',
- `goods_no` STRING COMMENT '货号',
- `pid` STRING COMMENT '来源',
- `ptime` STRING COMMENT '页面停留时间',
- `source` STRING COMMENT '专场列表来源',
- `report_time` STRING COMMENT '上报时间',
- `event_id` STRING COMMENT '事件id',
- `search_event_id` STRING COMMENT '搜索事件id',
- `keyword` STRING COMMENT '关键词',
- `app_version` STRING COMMENT 'APP版本',
- `index` STRING COMMENT '商品下标',
- `strategy_id` STRING COMMENT '策略id',
- `is_default` STRING COMMENT '主被动切换(1主动,0被动)',
- `is_operat` STRING COMMENT '是否广告位商品'
- )
- comment 'APP商品点击分区表'
- PARTITIONED BY (`dt` STRING COMMENT '日期分区(yyyymmdd)')
- STORED AS orc LOCATION 'obs://yishou-test/yishou_data_qa_test.db/dwd_goods_click_dt'
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
- -- 写数脚本
- insert overwrite table ${yishou_data_dbname}.dwd_goods_click_dt partition(dt)
- select
- user_id,
- coalesce(get_json_object(click_goods, '$.goods_id'), get_json_object(click_goods, '$.good_id')) as goods_id,
- get_json_object(click_goods, '$.is_rec') as is_rec,
- special_id,
- os,
- get_json_object(click_goods, '$.goods_no') as goods_no,
- pid,
- ptime,
- source,
- time as report_time,
- event_id,
- search_event_id,
- keyword,
- app_version,
- get_json_object(click_goods, '$.index') as index,
- get_json_object(click_goods, '$.strategy_id') as strategy_id,
- get_json_object(click_goods, '$.is_default') as is_default,
- get_json_object(click_goods, '$.is_operat') as is_operat,
- from_unixtime(time - 25200, 'yyyyMMdd') as dt
- from (
- select
- get_json_object(scdata, '$.user_id') as user_id,
- get_json_object(scdata, '$.special_id') as special_id,
- get_json_object(scdata, '$.os') as os,
- get_json_object(scdata, '$.pid') as pid,
- get_json_object(scdata, '$.ptime') as ptime,
- get_json_object(scdata, '$.source') as source,
- __time__ as time,
- get_json_object(scdata, '$.event_id') as event_id,
- get_json_object(scdata, '$.search_event_id') as search_event_id,
- get_json_object(scdata, '$.keyword') as keyword,
- get_json_object(scdata, '$.app_version') as app_version,
- regexp_replace1(
- regexp_replace1(
- regexp_replace1(
- get_json_object(scdata, '$.click_goods_arr'),
- '^\\[',
- ''
- ),
- '\\]$',
- ''
- ),
- '},\\{',
- '}|-|{'
- ) as click_goods_arr
- from
- ${yishou_data_dbname}.ods_yishou_log_exposure_dt
- where
- dt BETWEEN ${one_day_ago} and ${today}
- and from_unixtime(__time__ - 25200, 'yyyyMMdd') = ${one_day_ago}
- and (lower(topic) = 'goodsexposure' or lower(__topic__) = 'goodsexposure')
- )
- lateral view outer explode (split(click_goods_arr, '\\|-\\|'))t as click_goods
- where
- click_goods is not null
- and click_goods != ''
- DISTRIBUTE by floor(rand() * 10)
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
- -- 创建视图
- drop view if exists ${yishou_data_dbname}.dwd_goods_click;
- create view if not exists ${yishou_data_dbname}.dwd_goods_click (
- `user_id` COMMENT '用户id',
- `goods_id` COMMENT '商品id',
- `is_rec` COMMENT '是否搜索结果推荐商品',
- `special_id` COMMENT '专场id',
- `os` COMMENT '操作系统',
- `goods_no` COMMENT '货号',
- `pid` COMMENT '来源',
- `ptime` COMMENT '页面停留时间',
- `source` COMMENT '专场列表来源',
- `report_time` COMMENT '上报时间',
- `event_id` COMMENT '事件id',
- `search_event_id` COMMENT '搜索事件id',
- `keyword` COMMENT '关键词',
- `app_version` COMMENT 'APP版本',
- `index` COMMENT '商品下标',
- `strategy_id` COMMENT '策略id',
- `is_default` COMMENT '主被动切换(1主动,0被动)',
- `is_operat` COMMENT '是否广告位商品',
- `dt` COMMENT '日期分区(yyyymmdd)'
-
- ) comment 'APP商品点击视图'
- as
- select
- user_id,
- coalesce(get_json_object(click_goods, '$.goods_id'), get_json_object(click_goods, '$.good_id')) as goods_id,
- get_json_object(click_goods, '$.is_rec') as is_rec,
- special_id,
- os,
- get_json_object(click_goods, '$.goods_no') as goods_no,
- pid,
- ptime,
- source,
- time as report_time,
- event_id,
- search_event_id,
- keyword,
- app_version,
- get_json_object(click_goods, '$.index') as index,
- get_json_object(click_goods, '$.strategy_id') as strategy_id,
- get_json_object(click_goods, '$.is_default') as is_default,
- get_json_object(click_goods, '$.is_operat') as is_operat,
- from_unixtime(time - 25200, 'yyyyMMdd') as dt
- from (
- select
- get_json_object(scdata, '$.user_id') as user_id,
- get_json_object(scdata, '$.special_id') as special_id,
- get_json_object(scdata, '$.os') as os,
- get_json_object(scdata, '$.pid') as pid,
- get_json_object(scdata, '$.ptime') as ptime,
- get_json_object(scdata, '$.source') as source,
- __time__ as time,
- get_json_object(scdata, '$.event_id') as event_id,
- get_json_object(scdata, '$.search_event_id') as search_event_id,
- get_json_object(scdata, '$.keyword') as keyword,
- get_json_object(scdata, '$.app_version') as app_version,
- regexp_replace1(
- regexp_replace1(
- regexp_replace1(
- get_json_object(scdata, '$.click_goods_arr'),
- '^\\[',
- ''
- ),
- '\\]$',
- ''
- ),
- '},\\{',
- '}|-|{'
- ) as click_goods_arr
- from ${yishou_data_dbname}.ods_yishou_log_exposure_dt
- where
- dt >= ${thirty_day_ago}
- and (lower(topic) = 'goodsexposure' or lower(__topic__) = 'goodsexposure')
- )
- lateral view outer explode (split(click_goods_arr, '\\|-\\|'))t as click_goods
- where
- click_goods is not null
- and click_goods != ''
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
分区表查询结果展示:
视图结果展示:
注:脚本实现如下所示,关于具体使用和总结可以查看文章最后DWD层流量日志解析总结章节。
- -- 建表语句
- create table if not exists ${yishou_data_dbname}.dwd_goods_exposure_dt (
- `user_id` STRING COMMENT '用户id',
- `goods_id` STRING COMMENT '商品id',
- `is_rec` STRING COMMENT '是否搜索结果推荐商品',
- `special_id` STRING COMMENT '专场id',
- `os` STRING COMMENT '操作系统',
- `goods_no` STRING COMMENT '货号',
- `pid` STRING COMMENT '来源',
- `ptime` STRING COMMENT '页面停留时间',
- `source` STRING COMMENT '专场列表来源',
- `report_time` STRING COMMENT '上报时间',
- `event_id` STRING COMMENT '事件id',
- `search_event_id` STRING COMMENT '搜索事件id',
- `keyword` STRING COMMENT '关键词',
- `app_version` STRING COMMENT 'APP版本',
- `exposure_index` STRING COMMENT '曝光商品下标',
- `strategy_id` STRING COMMENT '策略id',
- `is_default` STRING COMMENT '主被动切换(1主动,0被动)',
- `is_operat` STRING COMMENT '是否广告位商品'
- )
- comment 'APP商品曝光分区表'
- PARTITIONED BY (`dt` STRING COMMENT '日期分区(yyyymmdd)')
- STORED AS orc LOCATION 'obs://yishou-test/yishou_data_qa_test.db/dwd_goods_exposure_dt'
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
- -- 写数脚本
- insert overwrite table ${yishou_data_dbname}.dwd_goods_exposure_dt partition(dt)
- select
- user_id,
- coalesce(get_json_object(exposure_goods, '$.goods_id'), get_json_object(exposure_goods, '$.good_id')) as goods_id,
- get_json_object(exposure_goods, '$.is_rec') as is_rec,
- special_id,
- os,
- get_json_object(exposure_goods, '$.goods_no') as goods_no,
- pid,
- ptime,
- source,
- time as report_time,
- event_id,
- search_event_id,
- keyword,
- app_version,
- get_json_object(exposure_goods, '$.index') as exposure_index,
- get_json_object(exposure_goods, '$.strategy_id') as strategy_id,
- get_json_object(exposure_goods, '$.is_default') as is_default,
- get_json_object(exposure_goods, '$.is_operat') as is_operat,
- from_unixtime(time - 25200, 'yyyyMMdd') as dt
- from (
- select
- get_json_object(scdata, '$.user_id') as user_id,
- get_json_object(scdata, '$.special_id') as special_id,
- get_json_object(scdata, '$.os') as os,
- get_json_object(scdata, '$.pid') as pid,
- get_json_object(scdata, '$.ptime') as ptime,
- get_json_object(scdata, '$.source') as source,
- __time__ as time,
- get_json_object(scdata, '$.event_id') as event_id,
- get_json_object(scdata, '$.search_event_id') as search_event_id,
- get_json_object(scdata, '$.keyword') as keyword,
- get_json_object(scdata, '$.app_version') as app_version,
- regexp_replace1(
- regexp_replace1(
- regexp_replace1(
- get_json_object(scdata, '$.goods_arr'),
- '^\\[',
- ''
- ),
- '\\]$',
- ''
- ),
- '},\\{',
- '}|-|{'
- ) as exposure_goods_arr
- from
- ${yishou_data_dbname}.ods_yishou_log_exposure_dt
- where
- dt BETWEEN ${one_day_ago} and ${today}
- and from_unixtime(__time__ - 25200, 'yyyyMMdd') = ${one_day_ago}
- and (lower(topic) = 'goodsexposure' or lower(__topic__) = 'goodsexposure')
- )
- lateral view outer explode (split(exposure_goods_arr, '\\|-\\|'))t as exposure_goods
- where
- exposure_goods is not null
- and exposure_goods != ''
- DISTRIBUTE by floor(rand() * 20)
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
-
- -- 创建视图
- drop view if exists ${yishou_data_dbname}.dwd_goods_exposure;
- create view if not exists ${yishou_data_dbname}.dwd_goods_exposure (
- `user_id` COMMENT '用户id',
- `goods_id` COMMENT '商品id',
- `is_rec` COMMENT '是否搜索结果推荐商品',
- `special_id` COMMENT '专场id',
- `os` COMMENT '操作系统',
- `goods_no` COMMENT '货号',
- `pid` COMMENT '来源',
- `ptime` COMMENT '页面停留时间',
- `source` COMMENT '专场列表来源',
- `report_time` COMMENT '上报时间',
- `event_id` COMMENT '事件id',
- `search_event_id` COMMENT '搜索事件id',
- `keyword` COMMENT '关键词',
- `app_version` COMMENT 'APP版本',
- `exposure_index` COMMENT '曝光商品下标',
- `strategy_id` COMMENT '策略id',
- `is_default` COMMENT '主被动切换(1主动,0被动)',
- `is_operat` COMMENT '是否广告位商品',
- `dt` COMMENT '日期分区(yyyymmdd)'
-
- ) comment 'APP商品曝光视图'
- as
- select
- user_id,
- coalesce(get_json_object(exposure_goods, '$.goods_id'), get_json_object(exposure_goods, '$.good_id')) as goods_id,
- get_json_object(exposure_goods, '$.is_rec') as is_rec,
- special_id,
- os,
- get_json_object(exposure_goods, '$.goods_no') as goods_no,
- pid,
- ptime,
- source,
- time as report_time,
- event_id,
- search_event_id,
- keyword,
- app_version,
- get_json_object(exposure_goods, '$.index') as exposure_index,
- get_json_object(exposure_goods, '$.strategy_id') as strategy_id,
- get_json_object(exposure_goods, '$.is_default') as is_default,
- get_json_object(exposure_goods, '$.is_operat') as is_operat,
- from_unixtime(time - 25200, 'yyyyMMdd') as dt
- from (
- select
- get_json_object(scdata, '$.user_id') as user_id,
- get_json_object(scdata, '$.special_id') as special_id,
- get_json_object(scdata, '$.os') as os,
- get_json_object(scdata, '$.pid') as pid,
- get_json_object(scdata, '$.ptime') as ptime,
- get_json_object(scdata, '$.source') as source,
- __time__ as time,
- get_json_object(scdata, '$.event_id') as event_id,
- get_json_object(scdata, '$.search_event_id') as search_event_id,
- get_json_object(scdata, '$.keyword') as keyword,
- get_json_object(scdata, '$.app_version') as app_version,
- regexp_replace1(
- regexp_replace1(
- regexp_replace1(
- get_json_object(scdata, '$.goods_arr'),
- '^\\[',
- ''
- ),
- '\\]$',
- ''
- ),
- '},\\{',
- '}|-|{'
- ) as exposure_goods_arr
- from
- ${yishou_data_dbname}.ods_yishou_log_exposure_dt
- where
- dt > ${thirty_day_ago}
- and (lower(topic) = 'goodsexposure' or lower(__topic__) = 'goodsexposure')
- )
- lateral view outer explode (split(exposure_goods_arr, '\\|-\\|'))t as exposure_goods
- where
- exposure_goods is not null
- and exposure_goods != ''
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
分区表查询结果展示:
视图结果展示:
注:脚本实现如下所示,关于具体使用和总结可以查看文章最后DWD层流量日志解析总结章节。
- -- 建表语句
- create table if not exists ${yishou_data_dbname}.dwd_app_start_dt (
- `report_time` bigint COMMENT '上报时间(10位时间戳,秒值)',
- `time` bigint COMMENT '触发时间(10位时间戳,秒值)',
- `distinct_id` STRING COMMENT '唯一标识码',
- `os` STRING COMMENT '手机操作系统',
- `ab_test` STRING COMMENT 'AB版本区分',
- `is_first_day` STRING COMMENT '是否首日访问',
- `is_first_time` STRING COMMENT '是否首次访问',
- `resume_from_background` STRING COMMENT '是否从后台唤起',
- `user_id` STRING COMMENT '用户id',
- `app_version` STRING COMMENT 'app版本',
- `network_type` STRING COMMENT '网络环境',
- `manufacturer` STRING COMMENT '设备名称',
- `device_id` STRING COMMENT '设备号',
- `os_version` STRING COMMENT '手机操作系统版本号',
- `carrier` STRING COMMENT '运营商',
- `is_pro` STRING COMMENT '是否为Pro版{1:童装货源APP,0:一手APP}',
- `model` STRING COMMENT '手机型号'
- )
- comment 'APP启动分区表'
- PARTITIONED BY (`dt` STRING COMMENT '日期分区(yyyymmdd)')
- STORED AS orc LOCATION 'obs://yishou-test/yishou_data_qa_test.db/dwd_app_start_dt'
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
- -- 写数脚本
- insert overwrite table ${yishou_data_dbname}.dwd_app_start_dt partition(dt)
- select
- __time__ as report_time,
- get_json_object(scdata, '$.time') / 1000 as time,
- get_json_object(scdata, '$.distinct_id') as distinct_id,
- get_json_object(scdata, '$.properties.os') as os,
- get_json_object(scdata, '$.properties.isAorB') as ab_test,
- get_json_object(scdata, '$.properties.is_first_day') as is_first_day,
- get_json_object(scdata, '$.properties.is_first_time') as is_first_time,
- get_json_object(scdata, '$.properties.resume_from_background') as resume_from_background,
- get_json_object(scdata, '$.properties.userid') as user_id,
- get_json_object(scdata, '$.properties.app_version') as app_version,
- get_json_object(scdata, '$.properties.network_type') as network_type,
- get_json_object(scdata, '$.properties.manufacturer') as manufacturer,
- get_json_object(scdata, '$.properties.device_id') as device_id,
- get_json_object(scdata, '$.properties.os_version') as os_version,
- get_json_object(scdata, '$.properties.carrier') as carrier,
- get_json_object(scdata, '$.properties.isPro') as is_pro,
- get_json_object(scdata, '$.properties.model') as model,
- from_unixtime(__time__ - 25200, 'yyyyMMdd') as dt
- from
- ${yishou_data_dbname}.ods_new_app_log_store_dt
- where
- dt BETWEEN ${one_day_ago} and ${today}
- and from_unixtime(__time__ - 25200, 'yyyyMMdd') = ${one_day_ago}
- and (lower(topic) = 'appstart' or lower(__topic__) = 'appstart')
- DISTRIBUTE by floor(rand() * 3)
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
- -- 创建视图
- drop view if exists ${yishou_data_dbname}.dwd_app_start;
- create view if not exists ${yishou_data_dbname}.dwd_app_start (
- `report_time` COMMENT '上报时间(10位时间戳,秒值)',
- `time` COMMENT '触发时间(10位时间戳,秒值)',
- `distinct_id` COMMENT '唯一标识码',
- `os` COMMENT '手机操作系统',
- `ab_test` COMMENT 'AB版本区分',
- `is_first_day` COMMENT '是否首日访问',
- `is_first_time` COMMENT '是否首次访问',
- `resume_from_background` COMMENT '是否从后台唤起',
- `user_id` COMMENT '用户id',
- `app_version` COMMENT 'app版本',
- `network_type` COMMENT '网络环境',
- `manufacturer` COMMENT '设备名称',
- `device_id` COMMENT '设备号',
- `os_version` COMMENT '手机操作系统版本号',
- `carrier` COMMENT '运营商',
- `is_pro` COMMENT '是否为Pro版{1:童装货源APP,0:一手APP}',
- `model` COMMENT '手机型号',
- `dt` COMMENT '日期分区(yyyymmdd)'
- )
- as
- select
- __time__ as report_time,
- get_json_object(scdata, '$.time') / 1000 as time,
- get_json_object(scdata, '$.distinct_id') as distinct_id,
- get_json_object(scdata, '$.properties.os') as os,
- get_json_object(scdata, '$.properties.isAorB') as ab_test,
- get_json_object(scdata, '$.properties.is_first_day') as is_first_day,
- get_json_object(scdata, '$.properties.is_first_time') as is_first_time,
- get_json_object(scdata, '$.properties.resume_from_background') as resume_from_background,
- get_json_object(scdata, '$.properties.userid') as user_id,
- get_json_object(scdata, '$.properties.app_version') as app_version,
- get_json_object(scdata, '$.properties.network_type') as network_type,
- get_json_object(scdata, '$.properties.manufacturer') as manufacturer,
- get_json_object(scdata, '$.properties.device_id') as device_id,
- get_json_object(scdata, '$.properties.os_version') as os_version,
- get_json_object(scdata, '$.properties.carrier') as carrier,
- get_json_object(scdata, '$.properties.isPro') as is_pro,
- get_json_object(scdata, '$.properties.model') as model,
- from_unixtime(__time__ - 25200, 'yyyyMMdd') as dt
- from
- ${yishou_data_dbname}.ods_new_app_log_store_dt
- where
- dt > ${thirty_day_ago}
- and (lower(topic) = 'appstart' or lower(__topic__) = 'appstart')
- ;
![](https://csdnimg.cn/release/blogv2/dist/pc/img/newCodeMoreWhite.png)
分区表查询结果展示:
视图结果展示:
在上述代码中,会从ODS中获取数据,然后根据不同业务主题进行数据ETL工作,再写入到DWD层对应表中,每次都是获取ODS所有的数据,一个ODS表中会包含几十个DWD的主题;这里可以考虑在ODS层就对以日期和主题名进行分区,但博主这里没有划分,因为ODS层会根据APP曝光、APP点击、H5日志等已经划分为9个大主题了,并且每次读取数据也很快(20秒以内),如下图SparkUI所示:
在对ODS是否进行主题分区,可以综合考虑读取文件的时间和文件系统中小文件的大小个数;根据 离线数仓(8):ODS层实现之导入流量日志 文章中介绍,会3分钟生成一个文件,每天会有480个文件生成,每个文件大小平均为20M左右,但如果进行分区,那有多少个分区就会有多少倍的小文件;可以对这2方面综合考虑,来看是否在ODS层中对日期和主题进行分区。
在上述代码中,我们可以看到每天会将前一天的数据进行ELT,并将结果数据写入到DWD层对应表的分区中;那为什么还要创建视图呢?这是因为博主这会有近实时作业,要求使用最新的数据,每小时跑一次,这种作业如果每小时将数据重写一次到文件系统中会比较麻烦;而且现在是每小时,后续可能会要求30分钟,但综合考虑又不需要使用Flink(使用Flink完全实时对资源要求比较高);这时候就能使用视图了,在ODS中为3分钟级别更新,这样就能满足需要了。
并且注意,视图为每天早晨会删除旧视图,然后创建新的视图,视图中获取的是近30天的数据;这是因为需要当天的数据时,一般来说30天的历史数据就完全足够,这可以根据实际情况增长或者缩短。
在HQL中(博主这里使用的是华为云的DLI服务,使用的是Spark on Hive,所以编写的为SparkSQL),如果要获取JSONObject中的数据,可以使用get_json_object函数来获取;但没有现成的解析JSONArray的函数,所以一般情况下需要通过编写代码,自定义UDTF函数来实现此功能,然后通过 lateral view outer explode () 来将数据由一行转换成多行。
博主这里由于历史原因,导致埋点的数据不是很符合JSONArray格式(即商品点击或商品曝光中,既有JSONArray,又有JSONObject);所以博主使用的方法是,通过regexp_replace方法和正则表达式(在代码中是regexp_replace1,是由于原先是阿里云后面迁移到华为云,迁移导致的),将其中的开始的 [ 和最后的 ] 进行替换,再将其中的 },{ 进行替换,最后使用 split 切割成集合,最后炸裂实现,具体可以查看上述代码。
注:其他 离线数仓 相关文章链接由此进 -> 离线数仓文章汇总
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。