赞
踩
很多的UDTF不能将结果与源表进行关联,使用lateral view ,可以将UDTF拆分的单个字段数据与原始表数据关联上
select …… from tabelA lateral view UDTF(xxx) 视图名 as a,b,c
-- 假设我们有一张表pageAds,它有两列数据 -- 第一列是pageid(string类型),第二列是adid_list(Array<int>类型),即用逗号分隔的广告ID集合: pageid adid_list "front_page" [1, 2, 3] "contact_page" [3, 4, 5] -- 要统计所有广告ID在所有页面中出现的次数。 -- 首先分拆广告ID: SELECT pageid, adid FROM pageAds . LATERAL VIEW explode(adid_list) adTable AS adid; -- 执行结果如下:pageid(string类型),adid(int类型) pageid adid "front_page" 1 "front_page" 2 "front_page" 3 "contact_page" 3 "contact_page" 4 "contact_page" 5 -- 接下来就是一个聚合的统计: SELECT adid, count(1) FROM pageAds LATERAL VIEW explode(adid_list) adTable AS adid GROUP BY adid; -- 执行结果如下: adid count(1) 1 1 2 1 3 2 4 1 5 1
--准备数据:vim /export/datas/lateral.txt 1 http://facebook.com/path/p1.php?query=1 2 http://www.baidu.com/news/index.jsp?uuid=frank 3 http://www.jd.com/index?source=baidu --创建表 create table tb_url( id int, url string ) row format delimited fields terminated by '\t'; --加载数据 load data local inpath '/export/datas/lateral.txt' into table tb_url; --使用UDTF解析 SELECT parse_url_tuple(url, 'HOST') from tb_url; --使用UDTF+lateral view select a.*, b.host, b.path from tb_url a lateral view parse_url_tuple(url, 'HOST',"PATH") b as host,path; --对比 SELECT id,parse_url_tuple(url, 'HOST') from tb_url;--失败,UDTF函数不能与字段连用 select id, a.* from tb_url lateral view parse_url_tuple(url, 'HOST','PATH') a as host,path;
一个FROM语句后可以跟多个lateral view语句,后面的lateral view语句能够引用它前面的所有表和列名。
-- 以下面的表为例: --col1(Array<int>类型) col2(Array<string>类型) col1 col2 [1, 2] [a", "b", "c"] [3, 4] [d", "e", "f"] SELECT myCol1, col2 FROM baseTable LATERAL VIEW explode(col1) myTable1 AS myCol1; --执行结果为:mycol1(int类型) col2(Array<string>类型) mycol1 col2 1 [a", "b", "c"] 2 [a", "b", "c"] 3 [d", "e", "f"] 4 [d", "e", "f"] --加上一个lateral view: SELECT myCol1, myCol2 FROM baseTable LATERAL VIEW explode(col1) myTable1 AS myCol1 LATERAL VIEW explode(col2) myTable2 AS myCol2; --它的执行结果为:mycol1(int类型) col2(string类型) myCol1 myCol2 1 "a" 1 "b" 1 "c" 2 "a" 2 "b" 2 "c" 3 "d" 3 "e" 3 "f" 4 "d" 4 "e" 4 "f" 注意上面语句中,两个lateral view按照出现的次序被执行。
将array列表里的每个元素生成一行
每一对元素作为一行,key为一列,value为一列
1-实现wordcount【explode(array)】
select explode(split(word," ")) from words; --直接使用,word为array
2-将兴趣爱好展开【explode(map)】
select explode(hobby) from complex_map; --直接使用,hobby为map
3-与侧视图连用
select a.name,b.* from complex_map a lateral view explode(hobby) b as hobby,deep;
-- 与lateral view连用
-》创建数据:vim /export/datas/r2c1.txt
a c 1
a d 2
a e 3
b c 4
b d 5
b e 6
-》希望结果
+-------+----+----+----+--+
| col1 | c | d | e |
+-------+----+----+----+--+
| a | 1 | 2 | 3 |
| b | 4 | 5 | 6 |
+-------+----+----+----+--+
-》创建表
create table row2col1(
col1 string,
col2 string,
col3 int
)row format delimited fields terminated by '\t';
-》加载数据
load data local inpath '/export/datas/r2c1.txt' into table row2col1;
a c 1
a d 2
a e 3
case col2 when 'c' then col3 else 0 end =》 1 0 0 =》 1
case col2 when 'd' then col3 else 0 end =》 0 2 0 =》 2
case col2 when 'e' then col3 else 0 end =》 0 0 3 =》 3
select
col1 as col1,
max(case col2 when 'c' then col3 else 0 end) as c,
max(case col2 when 'd' then col3 else 0 end) as d,
max(case col2 when 'e' then col3 else 0 end) as e
from
row2col1
group by
col1;
dt | city_name | source_type | pv |
---|---|---|---|
2022-06-01 | 北京 | 0 | 1 |
2022-06-01 | 北京 | 0 | 3 |
2022-06-01 | 北京 | 1 | 5 |
2022-06-01 | 北京 | 2 | 7 |
dt | city_name | index_type | index_type_pv | index_type_cnt |
---|---|---|---|---|
2022-06-01 | 北京 | 1 | 11 | 3 |
2022-06-01 | 北京 | 2 | 4 | 2 |
2022-06-01 | 北京 | 3 | 7 | 1 |
方法一:union all with a as ( select '2022-06-01' as dt,'北京' as city_name,0 as source_type,1 as pv union all select '2022-06-01' as dt,'北京' as city_name,1 as source_type,5 as pv union all select '2022-06-01' as dt,'北京' as city_name,2 as source_type,7 as pv union all select '2022-06-01' as dt,'北京' as city_name,0 as source_type,3 as pv ) select dt ,city_name ,1 as index_type ,sum(pv) as index_type_pv ,count(1) as index_type_cnt from a where source_type in (0,2) group by dt ,city_name union all select dt ,city_name ,2 as index_type ,sum(pv) as index_type_pv ,count(1) as index_type_cnt from a where source_type in (0) group by dt ,city_name union all select dt ,city_name ,3 as index_type ,sum(pv) as index_type_pv ,count(1) as index_type_cnt from a where source_type in (2) group by dt ,city_name ;
方法二:groupingsets select dt ,city_name ,coalesce(index_type,1) as index_type ,sum(index_type_pv) as index_type_pv ,sum(index_type_cnt) as index_type_cnt from (select a.dt ,a.city_name ,case when a.source_type=0 then 2 when a.source_type=2 then 3 else -99 end as index_type ,sum(pv) as index_type_pv ,count(1) as index_type_cnt from ( select '2022-06-01' as dt,'北京' as city_name,0 as source_type,1 as pv union all select '2022-06-01' as dt,'北京' as city_name,1 as source_type,5 as pv union all select '2022-06-01' as dt,'北京' as city_name,2 as source_type,7 as pv union all select '2022-06-01' as dt,'北京' as city_name,0 as source_type,3 as pv ) a where source_type in (0,2) group BY a.dt ,a.city_name ,case when a.source_type=0 then 2 when a.source_type=2 then 3 else -99 end )a group by dt ,city_name ,index_type grouping sets( (dt,city_name) ,(dt,city_name,index_type) ) ;
方法三,这里给一个利用map结构的方法 select a.dt ,a.city_name ,v.index_type ,case when v.index_type=1 then index_type_pv_list['index_type_1_pv'] when v.index_type=2 then index_type_pv_list['index_type_2_pv'] when v.index_type=3 then index_type_pv_list['index_type_3_pv'] end as index_type_pv ,case when v.index_type=1 then index_type_cnt_list['index_type_1_cnt'] when v.index_type=2 then index_type_cnt_list['index_type_2_cnt'] when v.index_type=3 then index_type_cnt_list['index_type_3_cnt'] end as index_type_cnt from(select dt ,city_name ,concat_ws(',',index_type_1,index_type_2,index_type_3) as index_type_list ,map('index_type_1_pv',index_type_1_pv,'index_type_2_pv',index_type_2_pv,'index_type_3_pv',index_type_3_pv) as index_type_pv_list ,map('index_type_1_cnt',index_type_1_cnt,'index_type_2_cnt',index_type_2_cnt,'index_type_3_cnt',index_type_3_cnt) as index_type_cnt_list from (select a.dt ,a.city_name ,1 as index_type_1 ,max(case when a.source_type=0 then 2 end ) as index_type_2 ,max(case when a.source_type=2 then 3 end ) as index_type_3 ,sum(pv) as index_type_1_pv ,sum(case when a.source_type=0 then pv end ) as index_type_2_pv ,sum(case when a.source_type=2 then pv end ) as index_type_3_pv ,count(1) as index_type_1_cnt ,count(case when a.source_type=0 then 1 end ) as index_type_2_cnt ,count(case when a.source_type=2 then 1 end ) as index_type_3_cnt from ( select '2022-06-01' as dt,'北京' as city_name,0 as source_type,1 as pv union all select '2022-06-01' as dt,'北京' as city_name,1 as source_type,5 as pv union all select '2022-06-01' as dt,'北京' as city_name,2 as source_type,7 as pv union all select '2022-06-01' as dt,'北京' as city_name,0 as source_type,3 as pv ) a where source_type in (0,2) group BY a.dt ,a.city_name )a )a lateral view explode(split(index_type_list,','))v as index_type ;
案例:多行转单列
-》创建数据:vim /export/datas/r2c2.txt a b 1 a b 2 a b 3 c d 4 c d 5 c d 6 -》希望结果:concat_ws(分隔符,col3) +-------+-------+--------+--+ | col1 | col2 | col3 | +-------+-------+--------+--+ | a | b | 1,2,3 | | c | d | 4,5,6 | +-------+-------+--------+--+ -》创建表 create table row2col2( col1 string, col2 string, col3 int )row format delimited fields terminated by '\t'; -》加载数据 load data local inpath '/export/datas/r2c2.txt' into table row2col2;
select
col1,
col2,
concat_ws(',', collect_set(cast(col3 as string))) as col3
from
row2col2
group by
col1, col2;
-》创建数据:vim /export/datas/c2r1.txt
a 1 2 3
b 4 5 6
-》希望结果
+-----------+-----------+-----------+--+
| _u1.col1 | _u1.col2 | _u1.col3 |
+-----------+-----------+-----------+--+
| a | c | 1 |
| b | c | 4 |
| a | d | 2 |
| b | d | 5 |
| a | e | 3 |
| b | e | 6 |
+-----------+-----------+-----------+--+
-》创建表 create table col2row1( col1 string, col2 int, col3 int, col4 int )row format delimited fields terminated by '\t'; -》加载数据 load data local inpath '/export/datas/c2r1.txt' into table col2row1; 或者 drop table if exists row2col1; create table row2col1 as select 'a' as col1,1 as col2,2 as col3,3 as col4 union all select 'b' as col1,4 as col2,5 as col3,6 as col4 ;
select col1, 'c' as col2, col2 as col3 from col2row1
UNION ALL
select col1, 'd' as col2, col3 as col3 from col2row1
UNION ALL
select col1, 'e' as col2, col4 as col3 from col2row1;
select split(value_list,':')[0] as col1
,split(value_list,':')[1] as col2
,split(value_list,':')[2] as col3
from (select concat_ws(',',concat(col1,':c:',col2),concat(col1,':d:',col3),concat(col1,':e:',col4)) as value
from col2row1
)t
lateral view explode(split(value,','))v as value_list
;
select col1 -- ,tb1.a as tb1a -- ,tb2.a as tb2a -- ,tb3.a as tb3a ,col2 ,col3 from(select array(col1,col1,col1) as col1_list ,array('c','d','e') as col2_list ,array(col2,col3,col4) as col3_list from col2row1 )t lateral view posexplode(col1_list) tb1 as a,col1 lateral view posexplode(col2_list) tb2 as a,col2 lateral view posexplode(col3_list) tb3 as a,col3 where tb1.a=tb2.a and tb2.a=tb3.a ;
posexplode 我经常会用来 做膨胀数据使用~供参考
select a.a ,a.b ,date_add('2023-08-01',pos) as dt ,pos+1 as n_day from (select 1 as a ,2 as b union all select 3 as a ,4 as b )a lateral view posexplode(split(repeat('a,a',datediff('2023-08-31','2023-08-01')),',')) tab as pos,day -- lateral view posexplode(split(repeat('a,a',30),',')) tab as pos,day -- 这里如果直接写30会被炸开成31(n+1) -- pos是从0开始,到30(repeat次数) -- 为什么不是lateral view posexplode(split(repeat(',',datediff('2023-08-31','2023-08-01')),',')) tab as pos,day 呢??? -- 因为hive引擎下 和spark引擎下是不一样的,可以自行测试一下,hive 切出来不是你想要的。。。 -- select split(repeat(',',datediff('2023-08-31','2023-08-01')),','); -- [] -- select split(repeat('a,a',datediff('2023-08-31','2023-08-01')),','); -- ["a","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","a"] order by dt desc ;
多表查询中 Join:多表之间列合并 union:多表之间行的合并
案例:单列转多行
-》创建数据:vim /export/datas/c2r2.txt a b 1,2,3 c d 4,5,6 -》希望结果 +-------+-------+-------+--+ | col1 | col2 | col3 | +-------+-------+-------+--+ | a | b | 1 | | a | b | 2 | | a | b | 3 | | c | d | 4 | | c | d | 5 | | c | d | 6 | +-------+-------+-------+--+ -》创建表 create table col2row2( col1 string, col2 string, col3 string )row format delimited fields terminated by '\t'; -》加载数据 load data local inpath '/export/datas/c2r2.txt' into table col2row2;
select
col1,
col2,
lv.col3 as col3
from
col2row2
lateral view explode(split(col3, ',')) lv as col3;
SELECT
statisticLadder_json
,get_json_object(statisticLadder_json, '$.activityId') as activityId
,get_json_object(statisticLadder_json, '$.strategyId') as strategyId
FROM ( SELECT '[{"activityId":1,"strategyId":2,"status":0,"strategyMatch":{"dmp":true,"property":false,"scene":true},"propertyMatch":{"clientType":false}},{"activityId":2,"strategyId":3,"status":0,"strategyMatch":{"dmp":true,"property":false,"scene":true},"propertyMatch":{"clientType":false}}]' as str
) a
LATERAL view explode(split(regexp_replace(str,'^\\[|]$',''), ',(?!")')) b as statisticLadder_json
-- 结果
statisticLadder_json activityId strategyId
{"activityId":1,"strategyId":2,"status":0,"strategyMatch":{"dmp":true,"property":false,"scene":true},"propertyMatch":{"clientType":false}} 1 2
{"activityId":2,"strategyId":3,"status":0,"strategyMatch":{"dmp":true,"property":false,"scene":true},"propertyMatch":{"clientType":false}} 2 3
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。