        // the second column's inspector is of type string
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        // the name of the third column
        fieldNames.add("value");
        // the third column's inspector is of type string
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }
}
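Before deploying, the UDTF can be exercised locally. The sketch below is a hypothetical smoke test (the class name WordSplitSingleRowSmokeTest is made up, and it assumes WordSplitSingleRow overrides the initialize(ObjectInspector[]) variant shown above); it wires a Collector so each forward() call is printed instead of being handed to Hive:

import java.util.Arrays;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class WordSplitSingleRowSmokeTest {
    public static void main(String[] args) throws HiveException {
        WordSplitSingleRow udtf = new WordSplitSingleRow();
        // declare a single string-typed argument, matching the initialize() contract above
        udtf.initialize(new ObjectInspector[]{
                PrimitiveObjectInspectorFactory.javaStringObjectInspector});
        // print every row the UDTF forwards
        udtf.setCollector(row -> System.out.println(Arrays.toString((Object[]) row)));
        udtf.process(new Object[]{"1:province:guangdong"}); // expect [1, province, guangdong]
        udtf.close();
    }
}

With the Hive dependencies on the classpath this runs without a cluster, which makes it easy to check the split logic before packaging.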
Next, deploy WordSplitSingleRow.java as a temporary function and verify it;
Once coding is complete, run mvn clean package -U in the directory containing pom.xml;
This produces the file hiveudf-1.0-SNAPSHOT.jar in the target directory;
Copy the jar to the Hive server; here it goes in this directory: /home/hadoop/udf/
In a Hive session, run the following commands to add the local jar and create a temporary function:
add jar /home/hadoop/udf/hiveudf-1.0-SNAPSHOT.jar;
create temporary function udf_wordsplitsinglerow as 'com.bolingcavalry.hiveudf.udtf.WordSplitSingleRow';
Verify with the query below, which splits string_field into the id, key, and value fields:
select udf_wordsplitsinglerow(string_field) from t16;
hive> select udf_wordsplitsinglerow(string_field) from t16;
OK
id key value
1 province guangdong
2 city shenzhen
3 can not split to valid array -
Time taken: 0.066 seconds, Fetched: 3 row(s)
However, selecting other columns alongside the UDTF in the same SELECT fails:
select person_name,udf_wordsplitsinglerow(string_field) from t16;
hive> select person_name,udf_wordsplitsinglerow(string_field) from t16;
FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECT clause, nor nested in expressions
A UDTF must be the only expression in the SELECT list; to combine its output with other columns, wrap it in a LATERAL VIEW:
select t.person_name, udtf_id, udtf_key, udtf_value
from (
select person_name, string_field
from t16
) t LATERAL VIEW udf_wordsplitsinglerow(t.string_field) v as udtf_id, udtf_key, udtf_value;
hive> select t.person_name, udtf_id, udtf_key, udtf_value
from (
select person_name, string_field
from t16
) t LATERAL VIEW udf_wordsplitsinglerow(t.string_field) v as udtf_id, udtf_key, udtf_value;
OK
t.person_name udtf_id udtf_key udtf_value
tom 1 province guangdong
jerry 2 city shenzhen
john 3 can not split to valid array -
Time taken: 0.122 seconds, Fetched: 3 row(s)
So far string_field has been split into the id, key, and value fields, but the total number of rows stays the same. The next UDTF, WordSplitMultiRow, splits string_field into multiple records, each carrying those three fields;
This requires fresh data in table t16. Create a text file 016_multi.txt with the content below (each line is a person_name, a pipe, then one or more comma-separated id:key:value groups; the john line is deliberately incomplete to exercise the error branch):
tom|1:province:guangdong,4:city:yangjiang
jerry|2:city:shenzhen
john|3
Load it in a Hive session:
load data
local inpath '/home/hadoop/temp/202010/25/016_multi.txt'
overwrite into table t16;
WordSplitMultiRow.java is shown below; note how its process() differs from the single-row version:
package com.bolingcavalry.hiveudf.udtf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
 * @Description: split the specified field into multiple rows, each row with multiple columns
 * @author: willzhao E-mail: zq2599@gmail.com
 * @date: 2020/11/5 14:43
 */
public class WordSplitMultiRow extends GenericUDTF {

    private PrimitiveObjectInspector stringOI = null;

    private final static String[] EMPTY_ARRAY = {"NULL", "NULL", "NULL"};
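    // NOTE: the excerpt omits initialize(); the version below is the editor's
    // reconstruction, mirroring the single-row class above (same one-string-argument
    // checks, same three string output columns). Treat it as an assumption rather
    // than the author's exact code.
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        // exactly one argument is expected
        if (1 != args.length) {
            throw new UDFArgumentLengthException("WordSplitMultiRow takes exactly one argument");
        }
        // the argument must be a primitive (string) type
        if (args[0].getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentException("WordSplitMultiRow takes a string as its argument");
        }
        // keep the inspector so process() can read the incoming value
        stringOI = (PrimitiveObjectInspector) args[0];
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        // three output columns: id, key, value, all strings
        fieldNames.add("id");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("key");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("value");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }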
    /**
     * the logic for splitting one column into multiple rows, each with multiple columns, lives here
     * @param args
     * @throws HiveException
     */
    @Override
    public void process(Object[] args) throws HiveException {
        String input = stringOI.getPrimitiveJavaObject(args[0]).toString();
        // invalid string
        if (StringUtils.isBlank(input)) {
            forward(EMPTY_ARRAY);
        } else {
            // split on commas; each element will become one output row
            String[] rowArray = input.split(",");
            // handle the failure case
            if (null == rowArray || rowArray.length < 1) {
                String[] errRlt = new String[3];
                errRlt[0] = input;
                errRlt[1] = "can not split to valid row array";
                errRlt[2] = "-";
                forward(errRlt);
            } else {
                // each element of rowArray is a string of the form "id:key:value";
                // the excerpt breaks off at this point, so the loop below is a
                // reconstruction based on the single-row class and the sample data
                for (String singleRow : rowArray) {
                    String[] array = singleRow.split(":");
                    if (null == array || array.length < 3) {
                        String[] errRlt = new String[3];
                        errRlt[0] = singleRow;
                        errRlt[1] = "can not split to valid array";
                        errRlt[2] = "-";
                        forward(errRlt);
                    } else {
                        // one forward() per element: a single input row fans out
                        forward(array);
                    }
                }
            }
        }
    }

    @Override
    public void close() throws HiveException {
        // nothing to clean up
    }
}
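As with the single-row class, a quick local run makes the fan-out visible. This is again a hypothetical sketch (the class name is invented, and it relies on the reconstructed initialize() above):

import java.util.Arrays;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class WordSplitMultiRowSmokeTest {
    public static void main(String[] args) throws HiveException {
        WordSplitMultiRow udtf = new WordSplitMultiRow();
        udtf.initialize(new ObjectInspector[]{
                PrimitiveObjectInspectorFactory.javaStringObjectInspector});
        udtf.setCollector(row -> System.out.println(Arrays.toString((Object[]) row)));
        // one input value triggers two forward() calls, i.e. two output rows:
        // [1, province, guangdong] and [4, city, yangjiang]
        udtf.process(new Object[]{"1:province:guangdong,4:city:yangjiang"});
        udtf.close();
    }
}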