赞
踩
创建临时视频表
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
创建临时用户表
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
加载原数据到临时表
load data local inpath "/opt/module/data/video" into table gulivideo_ori;
load data local inpath "/opt/module/user" into table gulivideo_user_ori;
创建视频表
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
创建用户表
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
由于初始表和最终表他们两者中的结构都是一样的,所以对其中一种进行解读
对于用户表
视频表
需求Ⅰ:统计视频观看数 Top10
代码:
select videoId
from gulivideo_orc
order by views
limit 10;
需求Ⅱ:统计视频类别热度 Top10
步骤代码:
--步骤一使用下列任意一种都可以,第一种使用侧偏移,第二种直接炸裂 select videoId, category_name from gulivideo_orc lateral view explode(category) tmp_category as category_name;t1 select explode(category) categoru_name from gulivideo_orc;t1 --第二步代码: select category_name, count(*) ct from t1 group by category_name order by ct desc limit 10;
最终代码:
select
category_name,
count(*) ct
from
(select
explode(category) category_name
from gulivideo_orc)t1
group by category_name
order by ct desc
limit 10;
需求Ⅲ:统计出视频观看数最高的 20 个视频的所属类别以及每一个类别包含 Top20 视频的个数
select
category
from gulivideo_orc
order by views desc
limit 20;t1
select
explode(category) category_name
from t1;t2
select
category_name,
count(*) video_sum
from t2
group by category_name;
最终代码:
select
category_name,
count(*) video_sum
from
(select
explode(category) category_name
from
(select
category
from gulivideo_orc
order by views desc
limit 20)t1)t2
group by category_name;
需求Ⅳ: 统计视频观看数 Top50 所关联视频的所属类别排序
分步代码:
select relatedId from gulivideo_orc order by views desc limit 50;t1 select explode(relatedId) relatedId_id from t1;t2 select category from t2 join gulivideo_orc t3 on t2.relatedId_id = t3.videoId;t4 select explode(category) category_name from t4;t5 select category_name, count(*) ct from t5 group by category_name order by ct desc;
代码:
select category_name, count(*) ct from (select explode(category) category_name from (select category from (select explode(relatedId) relatedId_id from (select relatedId from gulivideo_orc order by views desc limit 50)t1)t2 join gulivideo_orc t3 on t2.relatedId_id = t3.videoId)t4)t5 group by category_name order by ct desc;
需求Ⅴ:统计每个类别中的视频热度 Top10,以 Music 为例
分布代码:
select
videoId,
category_name
from gulivideo_orc
lateral view explode(category) tmp_category as category_name;t1
select
videoId,
views,
category_name
from t1
where category_name = "Music"
order by views desc
limit 10;
代码:
select
videoId,
views,
category_name
from
(select
videoId,
views,
category_name
from gulivideo_orc
lateral view explode(category) tmp_category as category_name)t1
where category_name = "Music"
order by views desc
limit 10;
需求Ⅵ:统计每个类别视频观看数 Top10
分步代码:
select videoId, views, category_name from gulivideo_orc lateral view explode(category) tmp_category as category_name;t1 select videoId, views, category_name rank() over(partition by t1.category_name order by t1.views desc) rk from t1;t2 select videoId, views, category_name, rk from t2 where t2.rk <= 10;
代码:
select videoId, views, category_name rk from ( select videoId, views, category_name, rank() over(partition by t1.category_name order by t1.views desc) rk from (select videoId, views, category_name from gulivideo_orc lateral view explode(category) tmp_category as category_name)t1)t2 where t2.rk <= 10;
需求Ⅶ:统计上传视频最多的用户 Top10 以及他们上传的视频观看次数在前 20 的视频
分布代码:
select uploader from gulivideo_user_orc order by videos desc limit 10;t1 select videoId, views, t2.uploader from t1 join gulivideo_orc g on t1.uploader = g.uploader;t2 select videoId, views, uploader, rank()over(partition by uploader order by views desc) rk from t2;t3 select videoId, views, uploader from t3 where rk<= 20;
代码:
select videoId, views, uploader, rank()over(partition by uploader order by views desc) rk from (select videoId, views, t1.uploader from (select uploader from gulivideo_user_orc order by videos desc limit 10)t1 join gulivideo_orc g on t1.uploader = g.uploader) t2;
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。