赞
踩
常见分区表逻辑
df表
每天的分区都保存全量数据
比如dwd_order_df和dwd_order_di
-- df 表装载:昨天的全量分区 full join 当天的增量分区,合并后写入今天的全量分区。
-- 注意:外连接的分区过滤要放在子查询/ON 里,放 WHERE 会把 full join 退化成 inner join;
-- '${yesterday}'-1 是字符串减整数,Hive 中应使用 date_sub。
insert overwrite table dwd_order_df partition (ds='${yesterday}')
select coalesce(a.order_id, b.order_id) as order_id  -- 以主键合并新旧数据,增量优先
from (select * from dwd_order_df where ds=date_sub('${yesterday}', 1)) a
full join (select * from dwd_order_di where ds='${yesterday}') b
on a.order_id = b.order_id;
di表
意思是每天的分区中保存的是昨天的全部数据
比如ods_click_di,dwd_click_di
-- di 表装载:把 ods 当天分区的增量数据原样写入 dwd 当天分区。
-- Hive 的 insert overwrite 需要 table 关键字;引号必须是 ASCII 单引号。
insert overwrite table dwd_click_di partition (ds='${yesterday}')
select * from ods_click_di where ds='${yesterday}';
di表比如要统计30天内的用户(user_id)点击pv
-- 统计最近 30 天(含当天)的点击 pv / uv。
-- count(user_id) 是总点击次数(pv),count(distinct user_id) 是去重用户数(uv)。
-- ds 是字符串分区值,区间端点用 date_sub 计算,不能对字符串做减法。
select count(user_id)          as click_pv,  -- 所有用户总共点击了多少次
       count(distinct user_id) as click_uv   -- 有多少个不同的用户点击过
from dwd_click_di
where ds between date_sub('${yesterday}', 29) and '${yesterday}';
创建分区表
-- 创建分区表:分区字段 dt 不能与普通列重名,且必须声明类型。
-- 原文 PARTITINO BY(dt) 拼写错误且缺少类型,正确语法是 PARTITIONED BY (col type)。
CREATE TABLE emp
(EMPNO bigint,
ENAME string,
JOB string,
MGR bigint,
HIREDATE string,
SAL bigint,
COMM bigint,
DEPTNO bigint)
PARTITIONED BY (dt string);
如果报Permission denied错误
可以使用hdfs dfs -chmod 777 xxx
分区表的好处,避免全表扫描
插入数据需要指定分区
分区就有点索引的意思了
多级分区
例如在每天再以小时分区分为24个区
可以按照原表中的某一个字段作为分区,但是注意这个字段不能存在于表中
例如:emp中我们按照deptno,将数据插入到emp_dynamic_partition中
默认最后一个字段为分区
动态分区直接insert 会报错,因为动态分区模式hive默认是关闭的开启命令为
set hive.exec.dynamic.partiton=true; set hive.exec.dynamic.partiton.mode=nonstrict;
但是每次执行命令前都需要手动进行开启,如果不想这么麻烦,可以将这两个命令配置到hive-site.xml
里去
varchar => string
int => bigint
double => double
datetime => datetime
cast(xxx as double)
这也是在创建表的时候发现的
build-in 内置函数
UDF user define function 用户自定义函数
UDF :一进一出的(abs,upper)
UDAF:多进一出(sum,count,avg)
UDTF:一进多出(explode,json_tuple)
所有内置函数均可通过desc function xxx;
查看具体用法
回顾:mysql里有hive里没有的是dayofyear
UDF进行一些配置后也可以用desc function 查看
时间函数 – 有人比较注重
current_date();年月日
current_timestamp();年月时分秒
unix_timestamp();从1970-01-01 00:00:00的秒数
from_unixtime(unix_timestamp(current_timestamp()),'yyyy-MM-dd') -- 格式串要加引号;月份是 MM,mm 表示分钟
to_date('2022-04-04 21:15:30');
dayofmonth(current_timestamp);
dayofweek(current_timestamp);
weekofyear(current_timestamp);
months_between(current_timestamp,'2020-01-01');两个日期相差的月数
add_months(current_timestamp,5);增加月
datediff(current_timestamp,‘2020-01-01’);两个日期相差的天数
date_add(current_timestamp,5);增加天数
date_sub(current_timestamp,5);减少天数
last_day(current_timestamp);最后一天
trunc(current_timestamp,'YYYY');返回按照格式截取日期
数值函数(round/ceil/floor 处理的是数字,不是字符)
round(3.14,0)四舍五入
ceil(3.14);向上取整
floor(3.14)向下取整
处理json
-- json_tuple 的第一个参数是 JSON 字符串列,后面才是要取的 key:
-- json_tuple(line, 'ceo', 'company') as (ceo, company)
-- 用 lateral view + json_tuple 解析 JSON,取不到 phone 时给默认值:
select case when phone is not null then phone else '00000000' end as phone
from json_test
lateral view json_tuple(line, 'phone') temp as phone;
-- get_json_object 用 JSONPath 取单个字段:
select get_json_object(line, '$.address') as address
from json_test;
json数据举例
{"ceo":"马化腾","company":"腾讯","address":"深圳市"}
-- 按「部门,岗位」分组,把组内所有员工名用 '-' 拼成一个字符串
select
    concat_ws(',', cast(deptno as string), job) as deptno_job,
    concat_ws('-', collect_list(ename))         as ename_list
from emp
group by
    concat_ws(',', cast(deptno as string), job)
;
bigint转string
cast(deptno as string)
合在一起再炸开
-- 先聚合成 '-' 分隔的名单,再用 explode 炸开还原为一行一个员工
select deptno_job, ename
from (
    select
        concat_ws(',', cast(deptno as string), job) as deptno_job,
        concat_ws('-', collect_list(ename))         as ename_list
    from emp
    group by concat_ws(',', cast(deptno as string), job)
) agg
LATERAL VIEW explode(split(ename_list, '-')) t as ename;
--区分大小写的情况下
-- 词频统计(区分大小写):先按空格拆词并 explode,再对词分组计数
select word, count(*)
from (
    select explode(split(word, ' ')) as word
    from hive_wc
) exploded
group by word;
--全部为小写的情况下
-- 词频统计(不区分大小写):分组与输出都先 lower() 统一为小写
select lower(word), count(*)
from (
    select explode(split(word, ' ')) as word
    from hive_wc
) exploded
group by lower(word);
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。