View the built-in system functions:
show functions;
View the description of a built-in function:
desc function upper;
View the detailed description of a built-in function:
desc function extended upper;
Data preparation:
create table stu_fun
(
id int,
name string,
class string,
sex string,
skill string,
scala double
)
row format delimited fields terminated by '\t'
stored as textfile;
load data local inpath '/root/stu_fun.txt' into table stu_fun;
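For reference, the queries below assume /root/stu_fun.txt is a tab-separated file shaped roughly like this (a made-up sample for illustration, not the author's actual data; on load, Hive reads \N as NULL by default):
1	Tom	class1	男	sing,dance	90.0
2	Jerry	class1	女	rap	\N
3	Spike	class2	男	sing,rap	85.5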
Syntax: nvl(value, default_value)
Description: returns default_value if value is NULL, otherwise returns value.
Example 1: when the field is NULL, substitute -1
select id, name, nvl(scala, -1) from stu_fun;
Example 2: when the field is NULL, substitute another field
select id, name, nvl(scala, id) from stu_fun;
Syntax: case col when b then c [when d then e]* [else f] end
Description: returns c when col = b; returns e when col = d; otherwise returns f.
Example: count the number of students of each sex in each class
select class,
sum(case sex when '男' then 1 else 0 end) boy,
sum(case sex when '女' then 1 else 0 end) girl
from stu_fun group by class;
Syntax: concat(str1, str2, ... strN)
Description: returns the concatenation of str1, str2, ... strN; or concat(bin1, bin2, ... binN), which returns the concatenation of the binary data bin1, bin2, ... binN.
Derived function:
concat_ws(separator, [string | array(string)]+)
Description: returns the concatenation of the given strings, joined by the separator.
Note: the arguments to CONCAT_WS must be "string or array<string>"; in other words, fields of any other type need to be wrapped in an array (or cast to string, as in the sketch after Example 2).
Example 1: merge several columns into one
select concat(id, name, class) from stu_fun;
Example 2: merge with a separator
select concat_ws(" / ", `array`(id, name, class)) from stu_fun;
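Equivalently (a sketch, not from the original), a non-string field can be cast explicitly instead of being wrapped in an array:
select concat_ws(" / ", cast(id as string), name, class) from stu_fun;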
Example 3: merge students that share the same class and sex
select
t1.class_sex,
concat_ws(" / ", collect_set(t1.name)) stus
from (
select stu_fun.name,
concat_ws(" - ", class, sex) class_sex
from stu_fun
) t1
group by t1.class_sex;
collect_set(col): accepts only primitive types; deduplicates and aggregates the values of a column, producing an array-typed field.
Syntax: explode(col)
Description: splits the elements of an array into multiple rows, or the elements of a map into multiple rows and columns.
Example 1: split each student's skills apart
select explode(split(skill, ",")) sks from stu_fun;
Example 2: split each student's skills apart and take the Cartesian product with the original row
lateral view
Usage: lateral view udtf(expression) tableAlias as columnAlias
Description: used together with UDTFs such as split and explode; it splits one column of data into multiple rows, and the split rows can then be aggregated.
Note: the column handed to explode must be of array (or map) type; split produces exactly such an array, and after exploding, columnAlias holds one array element per row.
select name,sks from stu_fun
lateral view
explode(split(skill, ",")) tmp_skill as sks;
Example 3: count how many students have each distinct skill
P.S. I originally wanted to list, for each distinct skill, the names of the students who have it, merged together with concat_ws, but I couldn't get it to work. If you know a good way, please leave a comment!! (One possible approach is sketched after the query below.)
select map_keys(t2.sk_stu)[0], count(map_values(t2.sk_stu))
from (
select str_to_map(concat_ws(",", concat_ws(":", t1.sks, t1.name))) sk_stu
from (
select name, sks
from stu_fun
lateral view explode(split(skill, ",")) tmp_skill as sks
) t1
) t2
group by map_keys(t2.sk_stu);
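For reference, a sketch of one approach (my addition, untested against the author's data): group directly on the exploded skill column and reuse collect_set with concat_ws, as in Example 3 of the concat section:
select t1.sks,
       count(*) num,
       concat_ws(" / ", collect_set(t1.name)) stus
from (
    select name, sks
    from stu_fun
    lateral view explode(split(skill, ",")) tmp_skill as sks
) t1
group by t1.sks;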
Window functions: also called analytic or OVER-clause functions; they specify the size of the data window an analytic function works over, and this window size may change from row to row.
Data preparation:
create table order_fun(
name string,
dt string,
cost int
) row format delimited fields terminated by '\t';
load data local inpath '/root/order_fun.txt' into table order_fun;
Example 1: query the customers who purchased in a given month, together with the total number of such customers
select name, count(*) over()
from order_fun
where substr(dt, 1, 7) = "2021-11"
-- where month(dt)=11 and year(dt)=2021
group by name;
Example 2: query customer purchase details along with the total purchases per month
select *, sum(cost) over(partition by month(dt))
from order_fun;
Example 3: accumulate each customer's cost by date
select *,
       -- sum over all rows
       sum(cost) over () sample1,
       -- partition by name, sum within each partition
       sum(cost) over (partition by name) sample2,
       -- partition by name, running total within the partition
       sum(cost) over (partition by name order by dt) sample3,
       -- same as sample3: aggregate from the start of the partition to the current row
       sum(cost) over (partition by name order by dt rows between unbounded preceding and current row) sample4,
       -- aggregate the previous row and the current row
       sum(cost) over (partition by name order by dt rows between 1 preceding and current row) sample5,
       -- aggregate the previous row, the current row, and the following row
       sum(cost) over (partition by name order by dt rows between 1 preceding and 1 following) sample6,
       -- aggregate the current row and all following rows
       sum(cost) over (partition by name order by dt rows between current row and unbounded following) sample7
from order_fun;
The rows clause must follow the order by clause; it restricts the sorted result, using a fixed number of rows to bound how many rows of the partition fall inside the window. (When order by is given without an explicit frame, as in sample3, the default frame runs from the start of the partition to the current row.)
Example 4: look up each customer's previous purchase date
select *,
lag(dt, 1, '2021-11-17') over (partition by name order by dt) time1,
lag(dt, 2) over (partition by name order by dt) time2
from order_fun;
Example 5: query the orders in the earliest 20% of the time range
select *
from (
select *,
ntile(5) over (order by dt) sorted
from order_fun
) t1
where sorted = 1;
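ntile(n) splits the ordered rows into n buckets of as equal size as possible, numbered 1 through n; bucket 1 therefore holds the earliest fifth of the orders, i.e. the first 20% by date.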
rank(): rows that tie get the same rank and the total count is unchanged, so ranks are skipped after a tie.
dense_rank(): rows that tie get the same rank, but no ranks are skipped, so the highest rank can be smaller than the row count.
row_number(): numbers the rows strictly in order, ignoring ties.
For example, for the scores 90, 90, 80 the three functions return 1, 1, 3; 1, 1, 2; and 1, 2, 3 respectively.
Data preparation:
create table stu_score_fun
(
name string,
subject string,
score int
) row format delimited fields terminated by '\t';
load data local inpath '/root/stu_score_fun.txt' into table stu_score_fun;
Example: rank students by score
select *,
rank() over (partition by subject order by score desc) rk,
dense_rank() over (partition by subject order by score desc) dr,
row_number() over (partition by subject order by score desc) rn
from stu_score_fun;
User Defined Functions (UDF): besides Hive's built-in functions, user-defined functions are supported.
Categories of user-defined functions: UDF (one row in, one row out), UDAF (aggregate function: many rows in, one row out), and UDTF (table-generating function: one row in, many rows out).
Programming steps:
1. Import the hive-exec dependency.
2. Extend the org.apache.hadoop.hive.ql.udf.UDF class and implement an evaluate method.
3. Register the function:
create [temporary] function db_name.func_name as '<class reference>' using jar '<HDFS jar path>';
Requirement: resolve the IPs in the logs to their corresponding locations.
1️⃣ Create an empty Maven project
2️⃣ Import the dependencies: httpclient (for issuing HTTP requests) and hive-exec (required Hive dependency)
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.3.9</version>
</dependency>
</dependencies>
3️⃣ Implement the IP lookup: IPGEOUtils
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

public class IPGEOUtils {
    private static final String URL = "http://ip.ws.126.net/ipquery?ip=";
    // Shared HTTP client, reused across lookups.
    private static CloseableHttpClient client = HttpClients.createDefault();

    public static List<String> getGEO(String ip) throws IOException {
        HttpGet request = new HttpGet(URL + ip);
        if (client == null) {
            client = HttpClients.createDefault();
        }
        CloseableHttpResponse response = client.execute(request);
        HttpEntity entity = response.getEntity();
        String res = EntityUtils.toString(entity);
        // The response body quotes the province and city values; they sit at
        // indices 1 and 3 after splitting on the double-quote character.
        String[] split = res.split("\"");
        String pro = split[1];
        String city = split[3];
        return Arrays.asList(pro, city);
    }
}
4️⃣ Write the custom UDF: extend UDF; the method must be named evaluate
import org.apache.hadoop.hive.ql.exec.UDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IP_Get extends UDF {
    private static Logger logger = LoggerFactory.getLogger(IP_Get.class);

    // Hive invokes the method named evaluate; this one maps an IP string to a city name.
    public String evaluate(String ip) {
        String city = null;
        try {
            city = IPGEOUtils.getGEO(ip).get(1);
        } catch (Exception e) {
            logger.warn("IP lookup failed!", e);
        }
        return city;
    }
}
5️⃣ Package the project as a jar, upload it to the server, then put it on HDFS
hdfs dfs -put <jar_name> <HDFS_path>
6️⃣ Create the function and associate it with the jar (create temporary creates a temporary function, i.e. one valid only for the current session)
create function get_city as 'IP_Get' using jar 'hdfs://node1:8020/IP_GEO.jar';
7️⃣ Test the function
select get_city(split(line, " ")[0]) from weblog00 limit 10;
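To replace or remove the function later (using the same name as above), drop it first and re-run the create statement:
drop function get_city;
-- for a temporary function: drop temporary function get_city;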