赞
踩
数据背景
hive (ods)> select * from jsont1;
OK
jsont1.username jsont1.age jsont1.sex jsont1.json
user1 18 male {"id":1,"ids":[101,102,103],"total_number":3}
user2 28 female {"id":2,"ids":[201,202,203],"total_number":3}
user3 38 male {"id":3,"ids":[301,302,303,304],"total_number":4}
user4 48 female {"id":4,"ids":[401,402,403,404,405],"total_number":5}
user5 58 male {"id":5,"ids":[501,502,503],"total_number":3}
user6 68 female {"id":6,"ids":[601,602,603],"total_number":3}
user7 78 male {"id":7,"ids":[701,702,703],"total_number":3}
NULL NULL NULL
Time taken: 0.933 seconds, Fetched: 8 row(s)
hive (ods)> select username,age,sex,get_json_object(json,"$.id") id ,get_json_object(json,"$.ids") ids ,get_json_object(json,"$.total_number") totalNumber
> from jsont1;
OK
username age sex id ids totalnumber
user1 18 male 1 [101,102,103] 3
user2 28 female 2 [201,202,203] 3
user3 38 male 3 [301,302,303,304] 4
user4 48 female 4 [401,402,403,404,405] 5
user5 58 male 5 [501,502,503] 3
user6 68 female 6 [601,602,603] 3
user7 78 male 7 [701,702,703] 3
NULL NULL NULL NULL NULL
Time taken: 0.562 seconds, Fetched: 8 row(s)
还可以取数组中的某个元素
hive (ods)> select username,age,sex,get_json_object(json,"$.id") id , > get_json_object(json,"$.ids[0]") ids0, > get_json_object(json,"$.ids[1]") ids1, > get_json_object(json,"$.ids[2]") ids2, > get_json_object(json,"$.ids[3]") ids3, > get_json_object(json,"$.ids[4]") ids4, > get_json_object(json,"$.ids[5]") ids5, > get_json_object(json,"$.total_number") totalNumber > from jsont1; OK username age sex id ids0 ids1 ids2 ids3 ids4 ids5 totalnumber user1 18 male 1 101 102 103 NULL NULL NULL 3 user2 28 female 2 201 202 203 NULL NULL NULL 3 user3 38 male 3 301 302 303 304 NULL NULL 4 user4 48 female 4 401 402 403 404 405 NULL 5 user5 58 male 5 501 502 503 NULL NULL NULL 3 user6 68 female 6 601 602 603 NULL NULL NULL 3 user7 78 male 7 701 702 703 NULL NULL NULL 3 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL Time taken: 6.033 seconds, Fetched: 8 row(s) hive (ods)>
hive (ods)> select json_tuple(json,'id','ids','total_number')
> from jsont1;
OK
c0 c1 c2
1 [101,102,103] 3
2 [201,202,203] 3
3 [301,302,303,304] 4
4 [401,402,403,404,405] 5
5 [501,502,503] 3
6 [601,602,603] 3
7 [701,702,703] 3
NULL NULL NULL
Time taken: 1.076 seconds, Fetched: 8 row(s)
以上处理方式缺点:
(1).通过hive内置函数解析出的json都为json字符串并非我们想要的字符数组.
我们仔细观察它们俩者之间的不同
"[101,102,103]"-->["101","102","103"]
如果我们要通过hql处理如下:
hive (ods)> select regexp_replace("[101,102,103]","\\[|\\]","");
OK
_c0
101,102,103
Time taken: 0.237 seconds, Fetched: 1 row(s)
hive (ods)> select split(regexp_replace("[101,102,103]","\\[|\\]",""),",");
OK
_c0
["101","102","103"]
(2)通过json_tuple()进行都分个取出,但是发现不能和原字段并列显示
hive (ods)> select username,age,sex,json_tuple(json,'id','ids','total_number')
> from jsont1;
FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECT clause, nor nested in expressions
ps:报错:我自己猜测是hive中没有json数组类型
补救措施 lateral view视图
通过下列方式可以(lateral view 拼接视图)
hive (ods)> select username,age,sex,id,ids,num
> from jsont1
> lateral view json_tuple(json,'id','ids','total_number') t1 as id,ids,num;
OK
username age sex id ids num
user1 18 male 1 [101,102,103] 3
user2 28 female 2 [201,202,203] 3
user3 38 male 3 [301,302,303,304] 4
user4 48 female 4 [401,402,403,404,405] 5
user5 58 male 5 [501,502,503] 3
user6 68 female 6 [601,602,603] 3
user7 78 male 7 [701,702,703] 3
NULL NULL NULL NULL NULL
Time taken: 0.64 seconds, Fetched: 8 row(s)
(3)如果我们需要对字符数组炸裂做进一步处理更加繁琐
hive (ods)> with tmp as (select username,age,sex,id,ids,num > from jsont1 > lateral view json_tuple(json,'id','ids','total_number') t1 as id,ids,num) > select username,age,sex,id,ids1,num > from tmp > lateral view explode(split(regexp_replace(ids,"\\[|\\]",""),",")) t1 as ids1; OK username age sex id ids1 num user1 18 male 1 101 3 user1 18 male 1 102 3 user1 18 male 1 103 3 user2 28 female 2 201 3 user2 28 female 2 202 3 user2 28 female 2 203 3 user3 38 male 3 301 4 user3 38 male 3 302 4 user3 38 male 3 303 4 user3 38 male 3 304 4 user4 48 female 4 401 5 user4 48 female 4 402 5 user4 48 female 4 403 5 user4 48 female 4 404 5 user4 48 female 4 405 5 user5 58 male 5 501 3 user5 58 male 5 502 3 user5 58 male 5 503 3 user6 68 female 6 601 3 user6 68 female 6 602 3 user6 68 female 6 603 3 user7 78 male 7 701 3 user7 78 male 7 702 3 user7 78 male 7 703 3 Time taken: 0.459 seconds, Fetched: 24 row(s) hive (ods)>
(4)只能处理简单的json,复杂的json不好处理
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.3.7</version>
</dependency>
public class ParseJsonArray extends UDF { public ArrayList<String> evaluate(String jsonStr, String key) { if (Strings.isNullOrEmpty(jsonStr)) { return null; } try { JSONObject jsonObject = JSON.parseObject(jsonStr); JSONArray jsonArray = jsonObject.getJSONArray(key); ArrayList<String> strings = new ArrayList<>(); for (Object o : jsonArray) { strings.add(o.toString()); } return strings; }catch (JsonException e){ return null; } } @Test public void junitParseJsonArray(){ String jsonStr = "{\"id\":1,\"ids\":[101,102,103],\"total_number\":3}"; String key = "ids"; ArrayList<String> list = evaluate(jsonStr, key); System.out.println(JSON.toJSONString(list));//["101","102","103"] } }
hive (ods)> add jar /root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar ;
Added [/root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar] to class path
Added resources: [/root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar]
hive (ods)> create temporary function json_array_me as "com.lagou.dw.flume.interceptor.hive.udf.ParseJsonArray";
OK
Time taken: 2.046 seconds
hive (ods)>
hive (ods)> select username,age,sex,json_array_me(json,“ids”) ids
> from jsont1;
OK
username age sex ids
user1 18 male ["101","102","103"]
user2 28 female ["201","202","203"]
user3 38 male ["301","302","303","304"]
user4 48 female ["401","402","403","404","405"]
user5 58 male ["501","502","503"]
user6 68 female ["601","602","603"]
user7 78 male ["701","702","703"]
NULL NULL NULL
Time taken: 0.748 seconds, Fetched: 8 row(s)
突然发现自定义的好处:不用繁琐解决解析json字符串和普通字段不能同时显示问题.
同时我们对解析后字符数据炸裂更加方便
hive (ods)> select username,age,sex,ids1 > from jsont1 > lateral view explode(json_array_me(json,"ids")) t1 as ids1; OK username age sex ids1 user1 18 male 101 user1 18 male 102 user1 18 male 103 user2 28 female 201 user2 28 female 202 user2 28 female 203 user3 38 male 301 user3 38 male 302 user3 38 male 303 user3 38 male 304 user4 48 female 401 user4 48 female 402 user4 48 female 403 user4 48 female 404 user4 48 female 405 user5 58 male 501 user5 58 male 502 user5 58 male 503 user6 68 female 601 user6 68 female 602 user6 68 female 603 user7 78 male 701 user7 78 male 702 user7 78 male 703 Time taken: 1.503 seconds, Fetched: 24 row(s)
拓展:我们如果需要将json都展开呢?
解决办法: 使用两次lateral view处理
hive (ods)> select username,age,sex,id,num,ids1 > from jsont1 > lateral view json_tuple(json,'id','total_number') t1 as id,num > lateral view explode(json_array_me(json,"ids")) t2 as ids1; OK username age sex id num ids1 user1 18 male 1 3 101 user1 18 male 1 3 102 user1 18 male 1 3 103 user2 28 female 2 3 201 user2 28 female 2 3 202 user2 28 female 2 3 203 user3 38 male 3 4 301 user3 38 male 3 4 302 user3 38 male 3 4 303 user3 38 male 3 4 304 user4 48 female 4 5 401 user4 48 female 4 5 402 user4 48 female 4 5 403 user4 48 female 4 5 404 user4 48 female 4 5 405 user5 58 male 5 3 501 user5 58 male 5 3 502 user5 58 male 5 3 503 user6 68 female 6 3 601 user6 68 female 6 3 602 user6 68 female 6 3 603 user7 78 male 7 3 701 user7 78 male 7 3 702 user7 78 male 7 3 703 Time taken: 0.568 seconds, Fetched: 24 row(s)
创建表可以指定表的序列化方式
TextFile:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
OrcSerde:org.apache.hadoop.hive.ql.io.orc.OrcSerde
Parquet:org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
例如:指定序列化方式org.apache.hive.hcatalog.data.JsonSerDe
[root@Linux122 source]# cat json_data.txt {"id":1,"ids":[101,102,103],"total_number":3} {"id":2,"ids":[201,202,203],"total_number":3} {"id":3,"ids":[301,302,303,304],"total_number":4} {"id":4,"ids":[401,402,403,404,405],"total_number":5} {"id":5,"ids":[501,502,503],"total_number":3} {"id":6,"ids":[601,602,603],"total_number":3} {"id":7,"ids":[701,702,703],"total_number":3} hive (ods)> create table json2( > id int, > ids array<string>, > total_number int) > row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'; OK Time taken: 0.702 seconds hive (ods)> load data local inpath '/root/data/source/json_data.txt' into table json2; Loading data to table ods.json2 OK Time taken: 17.651 seconds hive (ods)> select * from json2; OK json2.id json2.ids json2.total_number 1 ["101","102","103"] 3 2 ["201","202","203"] 3 3 ["301","302","303","304"] 4 4 ["401","402","403","404","405"] 5 5 ["501","502","503"] 3 6 ["601","602","603"] 3 7 ["701","702","703"] 3 Time taken: 0.94 seconds, Fetched: 7 row(s)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。