赞
踩
全量 select * from 表 where 1 = 1
增量 select * from 表 where createtime = 当天
新增和变化 select * from 表 where createtime or updatetime = 当天
#! /bin/bash sqoop=/opt/module/sqoop/bin/sqoop do_date=`date -d '-1 day' +%F` import_data(){ $sqoop import \ --connect jdbc:mysql://hadoop102:3306/gmall \ --username root \ --password 000000 \ --target-dir /origin_data/gmall/db/$1/$do_date \ --delete-target-dir \ --query "$2 and \$CONDITIONS" \ --num-mappers 1 \ --fields-terminated-by '\t' \ --compress \ --compression-codec lzop \ --null-string '\\N' \ --null-non-string '\\N' hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /origin_data/gmall/db/$1/$do_date } import_order_info(){ import_data order_info "select id, final_total_amount, order_status, user_id, out_trade_no, create_time, operate_time, province_id, benefit_reduce_amount, original_total_amount, feight_fee from order_info where (date_format(create_time,'%Y-%m-%d')='$do_date' or date_format(operate_time,'%Y-%m-%d')='$do_date')" } case $1 in "order_info") import_order_info ;; "first") import_base_category1 import_base_category2 import_base_category3 import_order_info import_order_detail import_sku_info import_user_info import_payment_info import_base_province import_base_region import_base_trademark import_activity_info import_activity_order import_cart_info import_comment_info import_coupon_use import_coupon_info import_favor_info import_order_refund_info import_order_status_log import_spu_info import_activity_rule import_base_dic ;; "all") import_base_category1 import_base_category2 import_base_category3 import_order_info import_order_detail import_sku_info import_user_info import_payment_info import_base_trademark import_activity_info import_activity_order import_cart_info import_comment_info import_coupon_use import_coupon_info import_favor_info import_order_refund_info import_order_status_log import_spu_info import_activity_rule import_base_dic ;; esac
chmod 777 mysql_to_hdfs.sh
mysql_to_hdfs.sh first 2020-03-10
mysql_to_hdfs.sh all 2020-03-11
create external table ods_order_info ( `id` string COMMENT '订单号', `final_total_amount` decimal(10,2) COMMENT '订单金额', `order_status` string COMMENT '订单状态', `user_id` string COMMENT '用户id', `out_trade_no` string COMMENT '支付流水号', `create_time` string COMMENT '创建时间', `operate_time` string COMMENT '操作时间', `province_id` string COMMENT '省份ID', `benefit_reduce_amount` decimal(10,2) COMMENT '优惠金额', `original_total_amount` decimal(10,2) COMMENT '原价金额', `feight_fee` decimal(10,2) COMMENT '运费' ) COMMENT '订单表' PARTITIONED BY (`dt` string) row format delimited fields terminated by '\t' STORED AS INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' location '/warehouse/gmall/ods/ods_order_info/';
维度表:一般是对事实的描述信息。每一张维表对应现实世界中的一个对象或者概念。 例如:用户、商品、日期、地区等。
维表的特征:
维表的范围很宽(具有多个属性、列比较多)
跟事实表相比,行数相对较小:通常< 10万条
内容相对固定:编码表
事实表中的每行数据代表一个业务事件(下单、支付、退款、评价等)。“事实”这个术语表示的是业务事件的度量值(可统计次数、个数、金额等),例如,订单事件中的下单金额。
每一个事实表的行包括:具有可加性的数值型的度量值、与维表相连接的外键、通常具有两个和两个以上的外键、外键之间表示维表之间多对多的关系。
事实表的特征:
非常的大
内容相对的窄:列数较少
经常发生变化,每天会新增加很多。
CREATE EXTERNAL TABLE `dwd_dim_sku_info` ( `id` string COMMENT '商品id', `spu_id` string COMMENT 'spuid', `price` double COMMENT '商品价格', `sku_name` string COMMENT '商品名称', `sku_desc` string COMMENT '商品描述', `weight` double COMMENT '重量', `tm_id` string COMMENT '品牌id', `tm_name` string COMMENT '品牌名称', `category3_id` string COMMENT '三级分类id', `category2_id` string COMMENT '二级分类id', `category1_id` string COMMENT '一级分类id', `category3_name` string COMMENT '三级分类名称', `category2_name` string COMMENT '二级分类名称', `category1_name` string COMMENT '一级分类名称', `spu_name` string COMMENT 'spu名称', `create_time` string COMMENT '创建时间' ) COMMENT '商品维度表' PARTITIONED BY (`dt` string) stored as parquet location '/warehouse/gmall/dwd/dwd_dim_sku_info/' tblproperties ("parquet.compression"="lzo");
insert overwrite table dwd_dim_sku_info partition(dt='2020-03-10') select sku.id, sku.spu_id, sku.price, sku.sku_name, sku.sku_desc, sku.weight, sku.tm_id, ob.tm_name, sku.category3_id, c2.id category2_id, c1.id category1_id, c3.name category3_name, c2.name category2_name, c1.name category1_name, spu.spu_name, sku.create_time from ( select * from ods_sku_info where dt='2020-03-10' )sku join ( select * from ods_base_trademark where dt='2020-03-10' )ob on sku.tm_id=ob.tm_id join ( select * from ods_spu_info where dt='2020-03-10' )spu on spu.id = sku.spu_id join ( select * from ods_base_category3 where dt='2020-03-10' )c3 on sku.category3_id=c3.id join ( select * from ods_base_category2 where dt='2020-03-10' )c2 on c3.category2_id=c2.id join ( select * from ods_base_category1 where dt='2020-03-10' )c1 on c2.category1_id=c1.id;
select * from dwd_dim_sku_info where dt='2020-03-10';
事实表的设计都遵循5个步骤:
1、确定业务过程
2、定义粒度
3、确定维度
4、确定事实
1)事务型事实表
以每个事务或事件为单位,例如一个销售订单记录,一笔支付记录等,作为事实表里的一行数据。一旦事务被提交,事实表数据被插入,数据就不再进行更改,其更新方式为增量更新。
2)周期型快照事实表
周期型快照事实表中不会保留所有数据,只保留固定时间间隔的数据,例如每天或者每月的销售额,或每月的账户余额等。
3)累积型快照事实表
累计快照事实表用于跟踪业务事实的变化。例如,数据仓库中可能需要累积或者存储订单从下订单开始,到订单商品被打包、运输、和签收的各个业务阶段的时间点数据来跟踪订单声明周期的进展情况。当这个业务过程进行时,事实表的记录也要不断更新。
create external table dwd_fact_payment_info ( `id` string COMMENT '', `out_trade_no` string COMMENT '对外业务编号', `order_id` string COMMENT '订单编号', `user_id` string COMMENT '用户编号', `alipay_trade_no` string COMMENT '支付宝交易流水编号', `payment_amount` decimal(16,2) COMMENT '支付金额', `subject` string COMMENT '交易内容', `payment_type` string COMMENT '支付类型', `payment_time` string COMMENT '支付时间', `province_id` string COMMENT '省份ID' ) PARTITIONED BY (`dt` string) stored as parquet location '/warehouse/gmall/dwd/dwd_fact_payment_info/' tblproperties ("parquet.compression"="lzo");
insert overwrite table dwd_fact_payment_info partition(dt='2020-03-10') select pi.id, pi.out_trade_no, pi.order_id, pi.user_id, pi.alipay_trade_no, pi.total_amount, pi.subject, pi.payment_type, pi.payment_time, oi.province_id from ( select * from ods_payment_info where dt='2020-03-10' )pi join ( select id, province_id from ods_order_info where dt='2020-03-10' )oi on pi.order_id = oi.id;
select * from dwd_fact_payment_info where dt='2020-03-10';
create external table dwd_fact_favor_info(
`id` string COMMENT '编号',
`user_id` string COMMENT '用户id',
`sku_id` string COMMENT 'skuid',
`spu_id` string COMMENT 'spuid',
`is_cancel` string COMMENT '是否取消',
`create_time` string COMMENT '收藏时间',
`cancel_time` string COMMENT '取消时间'
) COMMENT '收藏事实表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/dwd/dwd_fact_favor_info/';
insert overwrite table dwd_fact_favor_info partition(dt='2020-03-10')
select
id,
user_id,
sku_id,
spu_id,
is_cancel,
create_time,
cancel_time
from ods_favor_info
where dt='2020-03-10';
select * from dwd_fact_favor_info where dt='2020-03-10';
create external table dwd_fact_order_info ( `id` string COMMENT '订单编号', `order_status` string COMMENT '订单状态', `user_id` string COMMENT '用户id', `out_trade_no` string COMMENT '支付流水号', `create_time` string COMMENT '创建时间(未支付状态)', `payment_time` string COMMENT '支付时间(已支付状态)', `cancel_time` string COMMENT '取消时间(已取消状态)', `finish_time` string COMMENT '完成时间(已完成状态)', `refund_time` string COMMENT '退款时间(退款中状态)', `refund_finish_time` string COMMENT '退款完成时间(退款完成状态)', `province_id` string COMMENT '省份ID', `activity_id` string COMMENT '活动ID', `original_total_amount` string COMMENT '原价金额', `benefit_reduce_amount` string COMMENT '优惠金额', `feight_fee` string COMMENT '运费', `final_total_amount` decimal(10,2) COMMENT '订单金额' ) PARTITIONED BY (`dt` string) stored as parquet location '/warehouse/gmall/dwd/dwd_fact_order_info/' tblproperties ("parquet.compression"="lzo");
set hive.exec.dynamic.partition.mode=nonstrict; insert overwrite table dwd_fact_order_info partition(dt) select if(new.id is null,old.id,new.id), if(new.order_status is null,old.order_status,new.order_status), if(new.user_id is null,old.user_id,new.user_id), if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no), if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001对应未支付状态 if(new.tms['1002'] is null,old.payment_time,new.tms['1002']), if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']), if(new.tms['1004'] is null,old.finish_time,new.tms['1004']), if(new.tms['1005'] is null,old.refund_time,new.tms['1005']), if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']), if(new.province_id is null,old.province_id,new.province_id), if(new.activity_id is null,old.activity_id,new.activity_id), if(new.original_total_amount is null,old.original_total_amount,new.original_total_amount), if(new.benefit_reduce_amount is null,old.benefit_reduce_amount,new.benefit_reduce_amount), if(new.feight_fee is null,old.feight_fee,new.feight_fee), if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount), date_format(if(new.tms['1001'] is null,old.create_time,new.tms['1001']),'yyyy-MM-dd') from ( select id, order_status, user_id, out_trade_no, create_time, payment_time, cancel_time, finish_time, refund_time, refund_finish_time, province_id, activity_id, original_total_amount, benefit_reduce_amount, feight_fee, final_total_amount from dwd_fact_order_info where dt in ( select date_format(create_time,'yyyy-MM-dd') from ods_order_info where dt='2020-03-10' ) )old full outer join ( select info.id, info.order_status, info.user_id, info.out_trade_no, info.province_id, act.activity_id, log.tms, info.original_total_amount, info.benefit_reduce_amount, info.feight_fee, info.final_total_amount from ( select order_id, str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=') tms from ods_order_status_log where dt='2020-03-10' group by order_id )log join ( select * from ods_order_info where dt='2020-03-10' )info on log.order_id=info.id left join ( select * from ods_activity_order where dt='2020-03-10' )act on log.order_id=act.order_id )new on old.id=new.id;
select * from dwd_fact_order_info where dt='2020-03-10';
create external table dws_user_action_daycount
(
user_id string comment '用户 id',
login_count bigint comment '登录次数',
cart_count bigint comment '加入购物车次数',
cart_amount double comment '加入购物车金额',
order_count bigint comment '下单次数',
order_amount decimal(16,2) comment '下单金额',
payment_count bigint comment '支付次数',
payment_amount decimal(16,2) comment '支付金额'
) COMMENT '每日用户行为'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action_daycount/'
tblproperties ("parquet.compression"="lzo");
insert overwrite table dws_user_action_daycount partition(dt='2020-03-10') select user_actions.user_id, sum(user_actions.login_count), sum(user_actions.cart_count), sum(user_actions.cart_amount), sum(user_actions.order_count), sum(user_actions.order_amount), sum(user_actions.payment_count), sum(user_actions.payment_amount) from ( select user_id, login_count, 0 cart_count, 0 cart_amount, 0 order_count, 0 order_amount, 0 payment_count, 0 payment_amount from tmp_login union all select user_id, 0 login_count, cart_count, cart_amount, 0 order_count, 0 order_amount, 0 payment_count, 0 payment_amount from tmp_cart union all select user_id, 0 login_count, 0 cart_count, 0 cart_amount, order_count, order_amount, 0 payment_count, 0 payment_amount from tmp_order union all select user_id, 0 login_count, 0 cart_count, 0 cart_amount, 0 order_count, 0 order_amount, payment_count, payment_amount from tmp_payment ) user_actions group by user_id;
select * from dws_user_action_daycount where dt='2020-03-10';
create external table dwt_user_topic ( user_id string comment '用户id', login_date_first string comment '首次登录时间', login_date_last string comment '末次登录时间', login_count bigint comment '累积登录天数', login_last_30d_count bigint comment '最近30日登录天数', order_date_first string comment '首次下单时间', order_date_last string comment '末次下单时间', order_count bigint comment '累积下单次数', order_amount decimal(16,2) comment '累积下单金额', order_last_30d_count bigint comment '最近30日下单次数', order_last_30d_amount bigint comment '最近30日下单金额', payment_date_first string comment '首次支付时间', payment_date_last string comment '末次支付时间', payment_count decimal(16,2) comment '累积支付次数', payment_amount decimal(16,2) comment '累积支付金额', payment_last_30d_count decimal(16,2) comment '最近30日支付次数', payment_last_30d_amount decimal(16,2) comment '最近30日支付金额' )COMMENT '用户主题宽表' stored as parquet location '/warehouse/gmall/dwt/dwt_user_topic/' tblproperties ("parquet.compression"="lzo");
insert overwrite table dwt_user_topic select nvl(new.user_id,old.user_id), if(old.login_date_first is null and new.login_count>0,'2020-03-10',old.login_date_first), if(new.login_count>0,'2020-03-10',old.login_date_last), nvl(old.login_count,0)+if(new.login_count>0,1,0), nvl(new.login_last_30d_count,0), if(old.order_date_first is null and new.order_count>0,'2020-03-10',old.order_date_first), if(new.order_count>0,'2020-03-10',old.order_date_last), nvl(old.order_count,0)+nvl(new.order_count,0), nvl(old.order_amount,0)+nvl(new.order_amount,0), nvl(new.order_last_30d_count,0), nvl(new.order_last_30d_amount,0), if(old.payment_date_first is null and new.payment_count>0,'2020-03-10',old.payment_date_first), if(new.payment_count>0,'2020-03-10',old.payment_date_last), nvl(old.payment_count,0)+nvl(new.payment_count,0), nvl(old.payment_amount,0)+nvl(new.payment_amount,0), nvl(new.payment_last_30d_count,0), nvl(new.payment_last_30d_amount,0) from dwt_user_topic old full outer join ( select user_id, sum(if(dt='2020-03-10',login_count,0)) login_count, sum(if(dt='2020-03-10',order_count,0)) order_count, sum(if(dt='2020-03-10',order_amount,0)) order_amount, sum(if(dt='2020-03-10',payment_count,0)) payment_count, sum(if(dt='2020-03-10',payment_amount,0)) payment_amount, sum(if(login_count>0,1,0)) login_last_30d_count, sum(order_count) order_last_30d_count, sum(order_amount) order_last_30d_amount, sum(payment_count) payment_last_30d_count, sum(payment_amount) payment_last_30d_amount from dws_user_action_daycount where dt>=date_add( '2020-03-10',-30) group by user_id )new on old.user_id=new.user_id;
select * from dwt_user_topic limit 5;
(1)mysql_to_hdfs.job type=command command=/home/atguigu/bin/mysql_to_hdfs.sh all ${dt} (2)hdfs_to_ods_log.job type=command command=/home/atguigu/bin/hdfs_to_ods_log.sh ${dt} (3)hdfs_to_ods_db.job type=command command=/home/atguigu/bin/hdfs_to_ods_db.sh all ${dt} dependencies=mysql_to_hdfs (4)ods_to_dwd_start_log.job type=command command=/home/atguigu/bin/ods_to_dwd_start_log.sh ${dt} dependencies=hdfs_to_ods_log (5)ods_to_dwd_db.job type=command command=/home/atguigu/bin/ods_to_dwd_db.sh ${dt} dependencies=hdfs_to_ods_db (6)dwd_to_dws.job type=command command=/home/atguigu/bin/dwd_to_dws.sh ${dt} dependencies=ods_to_dwd_db,ods_to_dwd_start_log (7)dws_to_dwt.job type=command command=/home/atguigu/bin/dws_to_dwt.sh ${dt} dependencies=dwd_to_dws (8)dwt_to_ads.job type=command command=/home/atguigu/bin/dwt_to_ads.sh ${dt} dependencies=dws_to_dwt (9)hdfs_to_mysql.job type=command command=/home/atguigu/bin/hdfs_to_mysql.sh ads_user_topic dependencies=dwt_to_ads
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。