赞
踩
首先明白是什么正确的json格式:
{"code":o} -- 错误的json字符串
{"code":true} -- 正确的json字符串
{"code":"o"} -- 正确的json字符串
{"code":{"code_1":"o_1","code_2":"o_2"}} -- 正确的json字符串
{"code":"{"code_1":"o_1","code_2":"o_2"}"} -- 错误的json字符串(明显是双引号错乱了)
{"code":[{"code_1":"o_1","code_2":"o_2"},{"code_3":"o_3","code_4":"o_4"}]} -- 正确的json字符串(放的json数组)
使用from_json
函数:
语法:
from_json(jsonStr, schema [, options]) -- 按照指定schema格式字符串,变为json对象
例子1:
select from_json('[{"text":"Tea"},{"text":"Apple"}]', 'ARRAY<STRUCT<text: STRING>>'); -- 输出:[{"text":"Tea"},{"text":"Apple"}]
select from_json('[{"text":"Tea"},{"text":"Apple"}]', 'ARRAY<STRUCT<text: STRING>>')[0]['text'] as q; -- 输出:tea
例子2:
https://learn.microsoft.com/zh-cn/azure/databricks/sql/language-manual/functions/from_json
event_action='display_list' and page_type = 0 and array_contains(from_json(params['templates'],'STRUCT<id: ARRAY<INT>>').id, 30)
若json数据以字符串的形式存放在在表中,则hive有专门的函数来获取json数据中key的value。分别是:
get_json_object(jsonStr, path)
:一次只能获取json字符串某个key的valuejson_tuple(jsonStr, key1, key2, ...)
:一次能获取json字符串多个key的value另外,若需要表加载json数据时,会解析所有字段变为规规矩矩的行列形式。
Jsonserde
to_json(str)
函数因此:
如果需要使用json的所有字段,则使用指定内置json解析器Jsonserde
如果需要使用json的部分字段,则使用 get_json_object(jsonStr, path)
、json_tuple(jsonStr, key1, key2, ...)
函数
get_json_object()函数是UDF,即 输入一行数据,输出一行数据。
缺点:一次只能获取json字符串某个key的value
特点:整个json数据以string
数据类型存放在表中,则可以使用该函数来获取value。如:
get_json_object()
执行图:
参考文档:https://blog.51cto.com/u_16213314/7440507
假设有一个Hive表employees,其中包含了员工的信息,其中的employee列是一个JSON字符串,具有以下结构:
'{
"id": 1,
"name": "John Doe",
"salary": 5000,
"skills": [
"Java",
"Python",
"SQL"
],
"address": {
"street": "123 Main St",
"city": "New York",
"state": "NY"
}
}'
如果只是转为json对象,可以使用
to_json(字符串)
将字符串转为 json 类型。
获取某个属性值:
SELECT get_json_object(employee, "$.name") AS name
FROM employees
获取数组元素
SELECT get_json_object(employee, "$.skills[0]") AS name
FROM employees
获取嵌套属性值
SELECT get_json_object(employee, "$.address.street") AS name
FROM employees
判断属性值是否存在
SELECT if(get_json_object(employee, "$.id") is null, '不存在', '存在')
FROM employees
select
to_json(map('code', 'daima')),
get_json_object(to_json(map('code', 'daima')), '$.code'),
get_json_object(to_json(map('code', 'daima')), '$.col'),
get_json_object(to_json(map('code', 'daima')), '$.col') is null,
get_json_object(to_json(map('code', 'daima')), '$.col') = ''
获取json数组中的一个对象
select get_json_object('[{"payAmount":"375000","payChannelCode":"BOC"},{"payAmount":"376000","payChannelCode":"AOC"}]','$[0]')
-- 结果:{“payAmount”:“375000”,“payChannelCode”:“BOC”}
获取json数组所有对象的某个值:用*
表示所有对象
-- 1. 结果不分行
select get_json_object('[{"payAmount":"375000","payChannelCode":"BOC"},{"payAmount":"376000","payChannelCode":"AOC"}]','$[*].payChannelCode')
-- 结果:[“BOC”,“AOC”]
-- 2. 结果分行:参考博客 https://blog.csdn.net/qq_41110377/article/details/124949265
SELECT
tab1.除了JSON数组外想要展示的字段,
GET_JSON_OBJECT(临时表名.列重命名,'$.想要获取的字段') as 字段重命名
FROM (
SELECT
除了JSON数组外想要展示的字段,
split(regexp_replace
(regexp_extract(JSON数组,'^\\[(.+)\\]$',1),--regexp_extract正则表达式解析函数
'\\}\\,\\{', '\\}\\|\\|\\{'),--regexp_replace替换
'\\|\\|') --split分割符
as str --处理json数组
FROM 表名
)
tab1
lateral view explode(tab1.str) 临时表名--随意取 as 列重命名--随意取;
常见标准化手段
select regexp_replace(
get_json_object('[{"payAmount":"375000","payChannelCode":"BOC"},{"payAmount":"376000","payChannelCode":"AOC"}]','$[*].payChannelCode'),
'\\[|\\]|\"','')
-- 结果:BOC,AOC
;
分隔数组元素-- 1. 替换[]
select
regexp_replace('[{"payAmount":"375000","payChannelCode":"BOC"},{"payAmount":"376000"}]','\\[|\\]','')
-- 结果:{“payAmount”:“375000”,“payChannelCode”:“BOC”},{“payAmount”:“376000”}
-- 2. 将 },{ 替换成 };{ 这样就可以用split函数切割成数组(替换的字符不能在数组值内出现)
select regexp_replace('{"payAmount":"375000","payChannelCode":"BOC"},{"payAmount":"376000"}','\\}\,\\{','\\}\;\\{')
-- 3. 将上面两步合在一起为
select regexp_replace(regexp_replace('[{"payAmount":"375000","payChannelCode":"BOC"},{"payAmount":"376000"}]','\\[|\\]',''),'\\}\,\\{','\\}\;\\{')
-- 结果:{“payAmount”:“375000”,“payChannelCode”:“BOC”};{“payAmount”:“376000”}
},{
分隔元素select regexp_replace('[{"payAmount":"375000","payChannelCode":"BOC"},{"payAmount":"376000"}]','\\[\\{|\\}\\]|\"','')
select split(
regexp_replace('[{"payAmount":"375000","payChannelCode":"BOC"},{"payAmount":"376000"}]','\\[\\{|\\}\\]|\"',''),
'\\}\,\\{')
-- 结果:[“payAmount:375000,payChannelCode:BOC”,“payAmount:376000”]
例子:
① device.json
文件数据如下:
{"device":"device_30","deviceType":"kafka","signal":98.0,"time":1616817201390}
{"device":"device_40","deviceType":"route","signal":99.0,"time":1616817201887}
{"device":"device_21","deviceType":"bigdata","signal":77.0,"time":1616817202142}
{"device":"device_31","deviceType":"kafka","signal":98.0,"time":1616817202405}
{"device":"device_20","deviceType":"bigdata","signal":12.0,"time":1616817202513}
{"device":"device_54","deviceType":"bigdata","signal":14.0,"time":1616817202913}
{"device":"device_10","deviceType":"db","signal":39.0,"time":1616817203356}
{"device":"device_94","deviceType":"bigdata","signal":59.0,"time":1616817203771}
{"device":"device_32","deviceType":"kafka","signal":52.0,"time":1616817204010}
{"device":"device_21","deviceType":"bigdata","signal":85.0,"time":1616817204229}
{"device":"device_74","deviceType":"bigdata","signal":27.0,"time":1616817204720}
{"device":"device_91","deviceType":"bigdata","signal":50.0,"time":1616817205164}
{"device":"device_62","deviceType":"db","signal":89.0,"time":1616817205328}
{"device":"device_21","deviceType":"bigdata","signal":25.0,"time":1616817205457}
{"device":"device_76","deviceType":"bigdata","signal":62.0,"time":1616817205984}
{"device":"device_74","deviceType":"bigdata","signal":44.0,"time":1616817206571}
{"device":"device_42","deviceType":"route","signal":43.0,"time":1616817206681}
{"device":"device_32","deviceType":"kafka","signal":65.0,"time":1616817207131}
{"device":"device_32","deviceType":"kafka","signal":95.0,"time":1616817207714}
{"device":"device_71","deviceType":"bigdata","signal":45.0,"time":1616817207907}
{"device":"device_32","deviceType":"kafka","signal":81.0,"time":1616817208320}
{"device":"device_10","deviceType":"db","signal":81.0,"time":1616817208907}
{"device":"device_20","deviceType":"bigdata","signal":69.0,"time":1616817209287}
{"device":"device_61","deviceType":"db","signal":98.0,"time":1616817209785}
{"device":"device_30","deviceType":"kafka","signal":95.0,"time":1616817210104}
{"device":"device_43","deviceType":"route","signal":57.0,"time":1616817210540}
{"device":"device_10","deviceType":"db","signal":36.0,"time":1616817211134}
{"device":"device_20","deviceType":"bigdata","signal":75.0,"time":1616817211248}
{"device":"device_64","deviceType":"db","signal":68.0,"time":1616817211812}
{"device":"device_53","deviceType":"bigdata","signal":60.0,"time":1616817212237}
{"device":"device_52","deviceType":"bigdata","signal":57.0,"time":1616817212709}
{"device":"device_30","deviceType":"kafka","signal":75.0,"time":1616817213073}
{"device":"device_31","deviceType":"kafka","signal":83.0,"time":1616817213614}
{"device":"device_93","deviceType":"bigdata","signal":54.0,"time":1616817214101}
{"device":"device_20","deviceType":"bigdata","signal":84.0,"time":1616817214639}
② 建表并导入数据:
create table tb_json_test1 (
json string
);
-- 加载数据:上面tb_json_test1表没有使用 stored as 关键字,故使用的是textFile存储方式。
-- (textFile就是进行内容复制,而json数据是原始数据,并没有编码解码等,所以打开textFile和原数据一样。)
load data local inpath '/root/hivedata/device.json' into table tb_json_test1;
select * from tb_json_test1;
此时,存放在底层其实是这样:
"{\"device\":\"device_30\",\"deviceType\":\"kafka\",\"signal\":98.0,\"time\":1616817201390}"
- 1
③ 获取json数据key的value
select
--获取设备名称
get_json_object(json,"$.device") as device,
--获取设备类型
get_json_object(json,"$.deviceType") as deviceType,
--获取设备信号强度
get_json_object(json,"$.signal") as signal,
--获取时间
get_json_object(json,"$.time") as stime
from tb_json_test1;
数据[{“website”:“baidu.com”,“name”:“百度”},{“website”:“google.com”,“name”:“谷歌”}]
转换为:
name | website |
---|---|
百度 | baidu.com |
谷歌 | google.com |
关键点:因为数组元素之间用,
隔开,json的字段间页用,
隔开,所以要将数组元素之间的,
换一个符号,方便split
,一般换为;
-- 思路
-- 1. 使用 regexp_replace 函数将原数据转换为 {"website":"baidu.com","name":"百度"};{"website":"google.com","name":"谷歌"}
-- 2. 使用 split 函数按照 ';' 分割 {"website":"baidu.com","name":"百度"};{"website":"google.com","name":"谷歌"},返回 [{"website":"baidu.com","name":"百度"},{"website":"google.com","name":"谷歌"}]
-- 3. 使用 explode 炸裂为
-- {"website":"baidu.com","name":"百度"}
-- {"website":"google.com","name":"谷歌"}
-- 4. 使用 json_tuple 解析数据
-- 实现
-- 1. 先将json数组中的元素解析出来,转化为每行显示
SELECT explode(split(regexp_replace(regexp_replace(
'[{"website":"baidu.com","name":"百度"},{"website":"google.com","name":"谷歌"}]',
'\\[|\\]', ''), '\\}\\,\\{', '\\}\\;\\{'), '\\;'));
-- 2. 使用 json_tuple 解析数据
select json_tuple(json, 'website', 'name') as (website, name)
from (
select explode(split(regexp_replace(regexp_replace('[{"website":"baidu.com","name":"百度"},{"website":"google.com","name":"谷歌"}]', '\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),'\\;'))
as json
) t1;
get_json_object(jsonStr, path)
获取json数据的key时,并不是直接传入key名,需要在key名前面加上$.
。($ 表示当前json对象)。比如:get_json_object(json,"$.device")
$
后面点多次。比如:SELECT get_json_object('{
"person": {
"name": "John",
"age": 30,
"address": {
"street": "123 Main St",
"city": "New York"
}
}
}', '$.person.address.city');
-- 输出:"New York"
json_tuple()函数是UDTF,即 输入一行数据,输出多行数据。
优点:一次能获取json字符串多个key的value
特点:整个json数据以string
数据类型存放在表中,则可以使用该函数来获取value。如:
tb_json_test1
表数据:select
json_tuple(json,"device","deviceType","signal","time") as (device,deviceType,signal,stime)
from tb_json_test1;
虽然json_tuple()是输入一行,输出一行,但是其属于UDTF
。而对于UDTF是不能直接查询表字段的,要使用侧视图才行。
-- 错误例子:json是原表字段,两张表的字段不能同时出现
select
json,
json_tuple(json,"device","deviceType","signal","time") as (device,deviceType,signal,stime)
from tb_json_test1;
-- 正确例子:表字段要出现,必须使用侧视图
select
a.json as json,
b.device as device,
b.signal as signal,
b.time as time
from tb_json_test1 a lateral view json_tuple(json,"device","deviceType","signal","time") b as (device,deviceType,signal,stime);
可以搭配as (xxx, xxx)
取别名:
select json_tuple(json, 'website', 'name') as (website, name)
from (
select explode(split(regexp_replace(regexp_replace('[{"website":"baidu.com","name":"百度"},{"website":"google.com","name":"谷歌"}]', '\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),'\\;'))
as json
) t1
特点:在创建表时,只要指定使用JsonSerde解析器的表,则向该表加载.json
数据的文件时,就会将json数据解析为规规矩矩的行列形式。如:
create table tb_json_test2 (
device string,
deviceType string,
signal double,
time string
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
stored as textfile;
-- 还是加载前面的device.json文件
load data local inpath '/root/hivedata/device.json' into table tb_json_test2;
select * from tb_json_test2;
结果:
缺点:有时候只需要获取json的部分字段,而使用内置的JsonSerde解析器会解析所有字段。
TO_JSON(struct_value)
:将Hive结构体类型转换为JSON字符串
Map结构体:
Struct结构体
方式一:使用cast()
强转函数
SELECT CAST(json_column AS STRING) FROM sample_data;
-- 比如:
SELECT CAST('{"device":"device_30","deviceType":"kafka","signal":98.0,"time":1616817201390}' AS STRING) FROM sample_data;
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。