赞
踩
--创建数据库
create database db_msg;
--切换数据库
use db_msg;
--建表 create table db_msg.tb_msg_source( msg_time string comment "消息发送时间" , sender_name string comment "发送人昵称" , sender_account string comment "发送人账号" , sender_sex string comment "发送人性别" , sender_ip string comment "发送人ip地址" , sender_os string comment "发送人操作系统" , sender_phonetype string comment "发送人手机型号" , sender_network string comment "发送人网络类型" , sender_gps string comment "发送人的GPS定位" , receiver_name string comment "接收人昵称" , receiver_ip string comment "接收人IP" , receiver_account string comment "接收人账号" , receiver_os string comment "接收人操作系统" , receiver_phonetype string comment "接收人手机型号" , receiver_network string comment "接收人网络类型" , receiver_gps string comment "接收人的GPS定位" , receiver_sex string comment "接收人性别" , msg_type string comment "消息类型" , distance string comment "双方距离" , message string comment "消息内容" ) row format delimited fields terminated by '\t';
--加载数据到表中
load data local inpath '/root/hivedata/data1.tsv' into table db_msg.tb_msg_source;
load data local inpath '/root/hivedata/data2.tsv' into table db_msg.tb_msg_source;
--查询表 验证数据文件是否映射成功
select * from tb_msg_source limit 10;
--统计行数
select count(*) as cnt from tb_msg_source;
--共有140465行
select
msg_time,
sender_name,
sender_gps
from db_msg.tb_msg_source
where length(sender_gps) = 0
limit 10;
select
msg_time
from db_msg.tb_msg_source
limit 10;
select
sender_gps
from db_msg.tb_msg_source
limit 10;
create table db_msg.tb_msg_etl as
select
*,
substr(msg_time,0,10) as dayinfo, --获取天
substr(msg_time,12,2) as hourinfo, --获取小时
split(sender_gps,",")[0] as sender_lng, --提取经度
split(sender_gps,",")[1] as sender_lat --提取纬度
from db_msg.tb_msg_source
--过滤字段为空的数据
where length(sender_gps) > 0 ;
select
msg_time,dayinfo,hourinfo,sender_gps,sender_lng,sender_lat
from db_msg.tb_msg_etl
limit 10;
create table if not exists tb_rs_total_msg_cnt
comment "今日消息总量"
as
select
dayinfo,
count(*) as total_msg_cnt
from db_msg.tb_msg_etl
group by dayinfo;
--结果验证
select * from tb_rs_total_msg_cnt;
create table if not exists tb_rs_hour_msg_cnt
comment "每小时消息量趋势"
as
select
dayinfo,
hourinfo,
count(*) as total_msg_cnt,
count(distinct sender_account) as sender_usr_cnt,
count(distinct receiver_account) as receiver_usr_cnt
from db_msg.tb_msg_etl
group by dayinfo, hourinfo;
--结果验证
select * from tb_rs_hour_msg_cnt;
create table if not exists tb_rs_loc_cnt
comment "今日各地区发送消息总量"
as
select
dayinfo,
sender_gps,
cast(sender_lng as double) as longitude, -- 转换为double
cast(sender_lat as double) as latitude, -- 转换为double
count(*) as total_msg_cnt
from db_msg.tb_msg_etl
group by dayinfo,sender_gps,sender_lng,sender_lat;
--结果验证
select * from tb_rs_loc_cnt;
create table if not exists tb_rs_usr_cnt
comment "今日发送消息人数、接受消息人数"
as
select
dayinfo,
count(distinct sender_account) as sender_usr_cnt, --去重
count(distinct receiver_account) as receiver_usr_cnt --去重
from db_msg.tb_msg_etl
group by dayinfo;
--结果验证
select * from tb_rs_usr_cnt;
create table if not exists tb_rs_susr_top10
comment "发送消息条数最多的Top10用户"
as
select
dayinfo,
sender_name as username,
count(*) as sender_msg_cnt
from db_msg.tb_msg_etl
group by dayinfo,sender_name
order by sender_msg_cnt desc
limit 10;
--结果验证
select * from tb_rs_susr_top10;
create table if not exists tb_rs_sender_phone
comment "发送人的手机型号分布"
as
select
dayinfo,
sender_phonetype,
count(distinct sender_account) as cnt
from tb_msg_etl
group by dayinfo,sender_phonetype;
--结果验证
select * from tb_rs_sender_phone;
create table if not exists tb_rs_sender_os
comment "发送人的OS分布"
as
select
dayinfo,
sender_os,
count(distinct sender_account) as cnt
from tb_msg_etl
group by dayinfo,sender_os;
--结果验证
select * from tb_rs_sender_os;
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。