赞
踩
-- 本地(指hive服务所在的主机)加载数据 load data local inpath '/root/hivedata/data1.tsv' into table db_msg.tb_msg_source; load data local inpath '/root/hivedata/data2.tsv' into table db_msg.tb_msg_source; -- 验证数据文件是否映射成功 select count(*) from db_msg.tb_msg_source; select * from db_msg.tb_msg_source limit 10; ------------- ETL -------------- -- 测试 select msg_time, substr(msg_time, 1, 10) dayinfo, substr(msg_time, 12, 2) hourinfo, sender_gps, split(sender_gps, ',')[0] sender_lng, split(sender_gps, ',')[1] sender_lat from db_msg.tb_msg_source where length(sender_gps) > 0 -- 过滤为空的非法数据 limit 3; --如果表已存在就删除 drop table if exists db_msg.tb_msg_etl; --将Select语句的结果保存到新表中 create table db_msg.tb_msg_etl as select *, substr(msg_time,0,10) as dayinfo, --获取天 substr(msg_time,12,2) as hourinfo, --获取小时 split(sender_gps,",")[0] as sender_lng, --提取经度 split(sender_gps,",")[1] as sender_lat --提取纬度 from db_msg.tb_msg_source where length(sender_gps) > 0; --过滤字段为空的数据 -- 检查数据 select * from db_msg.tb_msg_etl limit 3; --------------- 需求指标统计分析 ---------------- --需求:统计今日总消息量 drop table if exists tb_rs_total_msg_cnt; create table tb_rs_total_msg_cnt comment "今日总消息量" as select dayinfo, count(*) total_msg_cnt from tb_msg_etl group by dayinfo; select * from tb_rs_total_msg_cnt; -- 验证 --需求:统计今日每小时消息量、发送和接收用户数 drop table if exists tb_rs_hour_msg_cnt; create table tb_rs_hour_msg_cnt comment "每小时消息量趋势" as select dayinfo, hourinfo, count(*) total_msg_cnt, count(distinct sender_account) sender_usr_cnt, count(distinct receiver_account) receiver_usr_cnt from tb_msg_etl group by dayinfo, hourinfo; select * from tb_rs_hour_msg_cnt; --需求:统计今日各地区发送消息数据量 drop table if exists tb_rs_loc_cnt; create table tb_rs_loc_cnt comment "今日各地区发送消息总量" as select dayinfo, sender_gps, cast(sender_lng as double) longtitude, cast(sender_lat as double) latitude, count(*) total_msg_cnt from tb_msg_etl group by dayinfo,sender_gps,sender_lng,sender_lat; select * from tb_rs_loc_cnt; --结果验证 --需求:统计今日发送消息和接收消息的用户数 create table if not exists tb_rs_usr_cnt comment "今日发送消息人数、接受消息人数" as select dayinfo, count(distinct sender_account) as sender_usr_cnt, count(distinct receiver_account) as receiver_usr_cnt from db_msg.tb_msg_etl group by dayinfo; select * from tb_rs_usr_cnt; --结果验证 --需求:统计今日发送消息最多的Top10用户 create table if not exists tb_rs_susr_top10 comment "发送消息条数最多的Top10用户" as select dayinfo, sender_name as username, count(*) as sender_msg_cnt from db_msg.tb_msg_etl group by dayinfo,sender_name order by sender_msg_cnt desc limit 10; select * from tb_rs_susr_top10; --结果验证 --需求:统计今日接收消息最多的Top10用户 create table if not exists tb_rs_rusr_top10 comment "接受消息条数最多的Top10用户" as select dayinfo, receiver_name as username, count(*) as receiver_msg_cnt from db_msg.tb_msg_etl group by dayinfo,receiver_name order by receiver_msg_cnt desc limit 10; select * from tb_rs_rusr_top10; --结果验证 --需求:统计发送人的手机型号分布情况 create table if not exists tb_rs_sender_phone comment "发送人的手机型号分布" as select dayinfo, sender_phonetype, count(distinct sender_account) as cnt from tb_msg_etl group by dayinfo,sender_phonetype; select * from tb_rs_sender_phone; --结果验证 --需求:统计发送人的设备操作系统分布情况 create table if not exists tb_rs_sender_os comment "发送人的OS分布" as select dayinfo, sender_os, count(distinct sender_account) as cnt from tb_msg_etl group by dayinfo,sender_os; select * from tb_rs_sender_os; --结果验证
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。