这里分类和汇总了欣宸的全部原创(含配套源码):https://github.com/zq2599/blog_demos
hive> select explode(address) from t3;
OK
province guangdong
city shenzhen
province jiangsu
city nanjing
Time taken: 0.081 seconds, Fetched: 4 row(s)
| 名称 | 链接 | 备注 |
|---|---|---|
| 项目主页 | https://github.com/zq2599/blog_demos | 该项目在GitHub上的主页 |
| git仓库地址(https) | https://github.com/zq2599/blog_demos.git | 该项目源码的仓库地址,https协议 |
| git仓库地址(ssh) | git@github.com:zq2599/blog_demos.git | 该项目源码的仓库地址,ssh协议 |
为了验证UDTF的功能,咱们要先把表和数据都准备好:
create table t16(
person_name string,
string_field string
)
row format delimited
fields terminated by '|'
stored as textfile;
tom|1:province:guangdong
jerry|2:city:shenzhen
john|3
load data
local inpath '/home/hadoop/temp/202010/25/016.txt'
overwrite into table t16;
package com.bolingcavalry.hiveudf.udtf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
* @Description: 把指定字段拆成多列
* @author: willzhao E-mail: zq2599@gmail.com
* @date: 2020/11/5 14:43
*/
/**
 * UDTF that splits a single "id:key:value" string field into one row with
 * three string columns: id, key, value.
 *
 * Example: "1:province:guangdong" -> ("1", "province", "guangdong").
 * A blank or NULL input yields a single placeholder row; a string that does
 * not split into at least three parts yields the original string plus an
 * error marker, so malformed data is visible in the query result instead of
 * failing the query.
 */
public class WordSplitSingleRow extends GenericUDTF {

    // Inspector for the single string argument; set in initialize().
    private PrimitiveObjectInspector stringOI = null;

    // Placeholder row forwarded for blank or NULL input.
    private final static String[] EMPTY_ARRAY = {"NULL", "NULL", "NULL"};

    /**
     * Splits one input value into a single three-column row.
     *
     * @param args UDTF arguments; args[0] is the string field to split
     * @throws HiveException if forwarding the output row fails
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // Guard against SQL NULL: getPrimitiveJavaObject may return null,
        // and calling toString() on it would throw NullPointerException.
        Object raw = stringOI.getPrimitiveJavaObject(args[0]);
        String input = (raw == null) ? null : raw.toString();

        // Blank or NULL input: emit the placeholder row instead of failing.
        if (StringUtils.isBlank(input)) {
            forward(EMPTY_ARRAY);
        } else {
            // Expected format is "id:key:value".
            String[] array = input.split(":");
            // Malformed input: surface the original string and an error
            // marker in the result set rather than throwing.
            // (String.split never returns null, so only the length matters.)
            if (array.length < 3) {
                String[] errRlt = new String[3];
                errRlt[0] = input;
                errRlt[1] = "can not split to valid array";
                errRlt[2] = "-";
                forward(errRlt);
            } else {
                forward(array);
            }
        }
    }

    /**
     * Releases resources; this UDTF holds none.
     *
     * @throws HiveException never thrown here
     */
    @Override
    public void close() throws HiveException {
    }

    /**
     * Validates the argument (exactly one primitive-category value) and
     * declares the output schema: three string columns id, key, value.
     *
     * @param argOIs inspector describing the call-site arguments
     * @return struct inspector for the (id, key, value) output row
     * @throws UDFArgumentException if the argument count or type is wrong
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
        // This UDTF accepts exactly one argument.
        if (1 != inputFields.size()) {
            throw new UDFArgumentLengthException("WordSplitSingleRow takes only one argument");
        }
        // Only primitive (string) arguments are supported.
        if (!Category.PRIMITIVE.equals(inputFields.get(0).getFieldObjectInspector().getCategory())) {
            throw new UDFArgumentException("WordSplitSingleRow takes string as a parameter");
        }
        stringOI = (PrimitiveObjectInspector) inputFields.get(0).getFieldObjectInspector();

        // Output column names.
        ArrayList<String> fieldNames = new ArrayList<String>();
        // Matching output column inspectors (all string-typed).
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("id");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("key");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("value");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }
}
接下来将WordSplitSingleRow.java部署成临时函数并验证;
add jar /home/hadoop/udf/hiveudf-1.0-SNAPSHOT.jar;
create temporary function udf_wordsplitsinglerow as 'com.bolingcavalry.hiveudf.udtf.WordSplitSingleRow';
select udf_wordsplitsinglerow(string_field) from t16;
hive> select udf_wordsplitsinglerow(string_field) from t16;
OK
id key value
1 province guangdong
2 city shenzhen
3 can not split to valid array -
Time taken: 0.066 seconds, Fetched: 3 row(s)
select person_name,udf_wordsplitsinglerow(string_field) from t16;
hive> select person_name,udf_wordsplitsinglerow(string_field) from t16;
FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECT clause, nor nested in expressions
select t.person_name, udtf_id, udtf_key, udtf_value
from (
select person_name, string_field
from t16
) t LATERAL VIEW udf_wordsplitsinglerow(t.string_field) v as udtf_id, udtf_key, udtf_value;
hive> select t.person_name, udtf_id, udtf_key, udtf_value
> from (
> select person_name, string_field
> from t16
> ) t LATERAL VIEW udf_wordsplitsinglerow(t.string_field) v as udtf_id, udtf_key, udtf_value;
OK
t.person_name udtf_id udtf_key udtf_value
tom 1 province guangdong
jerry 2 city shenzhen
john 3 can not split to valid array -
Time taken: 0.122 seconds, Fetched: 3 row(s)
tom|1:province:guangdong,4:city:yangjiang
jerry|2:city:shenzhen
john|3
load data
local inpath '/home/hadoop/temp/202010/25/016_multi.txt'
overwrite into table t16;
package com.bolingcavalry.hiveudf.udtf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
* @Description: 把指定字段拆成多行,每行有多列
* @author: willzhao E-mail: zq2599@gmail.com
* @date: 2020/11/5 14:43
*/
/**
 * UDTF that splits a comma-separated list of "id:key:value" triples into
 * multiple rows, each with three string columns: id, key, value.
 *
 * Example: "1:province:guangdong,4:city:yangjiang" -> two rows.
 * A blank or NULL input yields a single placeholder row; any element that
 * does not split into at least three parts yields the original string plus
 * an error marker, so malformed data is visible in the query result instead
 * of failing the query.
 */
public class WordSplitMultiRow extends GenericUDTF {

    // Inspector for the single string argument; set in initialize().
    private PrimitiveObjectInspector stringOI = null;

    // Placeholder row forwarded for blank or NULL input.
    private final static String[] EMPTY_ARRAY = {"NULL", "NULL", "NULL"};

    /**
     * Splits one input value into one or more three-column rows.
     *
     * @param args UDTF arguments; args[0] is the string field to split
     * @throws HiveException if forwarding an output row fails
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // Guard against SQL NULL: getPrimitiveJavaObject may return null,
        // and calling toString() on it would throw NullPointerException.
        Object raw = stringOI.getPrimitiveJavaObject(args[0]);
        String input = (raw == null) ? null : raw.toString();

        // Blank or NULL input: emit the placeholder row instead of failing.
        if (StringUtils.isBlank(input)) {
            forward(EMPTY_ARRAY);
        } else {
            // Each comma-separated element should be one "id:key:value" triple.
            String[] rowArray = input.split(",");
            // split can return a zero-length array (e.g. input ","): report it.
            if (rowArray.length < 1) {
                String[] errRlt = new String[3];
                errRlt[0] = input;
                errRlt[1] = "can not split to valid row array";
                errRlt[2] = "-";
                forward(errRlt);
            } else {
                for (String singleRow : rowArray) {
                    // A blank element still produces a (placeholder) row.
                    if (StringUtils.isBlank(singleRow)) {
                        forward(EMPTY_ARRAY);
                    } else {
                        // Expected element format is "id:key:value".
                        String[] array = singleRow.split(":");
                        // Malformed element: surface the original string and
                        // an error marker rather than throwing.
                        if (array.length < 3) {
                            String[] errRlt = new String[3];
                            errRlt[0] = input;
                            errRlt[1] = "can not split to valid array";
                            errRlt[2] = "-";
                            forward(errRlt);
                        } else {
                            forward(array);
                        }
                    }
                }
            }
        }
    }

    /**
     * Releases resources; this UDTF holds none.
     *
     * @throws HiveException never thrown here
     */
    @Override
    public void close() throws HiveException {
    }

    /**
     * Validates the argument (exactly one primitive-category value) and
     * declares the output schema: three string columns id, key, value.
     *
     * @param argOIs inspector describing the call-site arguments
     * @return struct inspector for the (id, key, value) output rows
     * @throws UDFArgumentException if the argument count or type is wrong
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        List<? extends StructField> inputFields = argOIs.getAllStructFieldRefs();
        // This UDTF accepts exactly one argument.
        if (1 != inputFields.size()) {
            throw new UDFArgumentLengthException("WordSplitMultiRow takes only one argument");
        }
        // Only primitive (string) arguments are supported.
        if (!Category.PRIMITIVE.equals(inputFields.get(0).getFieldObjectInspector().getCategory())) {
            throw new UDFArgumentException("WordSplitMultiRow takes string as a parameter");
        }
        stringOI = (PrimitiveObjectInspector) inputFields.get(0).getFieldObjectInspector();

        // Output column names.
        ArrayList<String> fieldNames = new ArrayList<String>();
        // Matching output column inspectors (all string-typed).
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("id");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("key");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("value");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }
}
接下来将WordSplitMultiRow.java部署成临时函数并验证;
drop temporary function if exists udf_wordsplitsinglerow;
delete jar /home/hadoop/udf/hiveudf-1.0-SNAPSHOT.jar;
add jar /home/hadoop/udf/hiveudf-1.0-SNAPSHOT.jar;
create temporary function udf_wordsplitmultirow as 'com.bolingcavalry.hiveudf.udtf.WordSplitMultiRow';
select udf_wordsplitmultirow(string_field) from t16;
hive> select udf_wordsplitmultirow(string_field) from t16;
OK
id key value
1 province guangdong
4 city yangjiang
2 city shenzhen
3 can not split to valid array -
Time taken: 0.041 seconds, Fetched: 4 row(s)
select t.person_name, udtf_id, udtf_key, udtf_value
from (
select person_name, string_field
from t16
) t LATERAL VIEW udf_wordsplitmultirow(t.string_field) v as udtf_id, udtf_key, udtf_value;
hive> select t.person_name, udtf_id, udtf_key, udtf_value
> from (
> select person_name, string_field
> from t16
> ) t LATERAL VIEW udf_wordsplitmultirow(t.string_field) v as udtf_id, udtf_key, udtf_value;
OK
t.person_name udtf_id udtf_key udtf_value
tom 1 province guangdong
tom 4 city yangjiang
jerry 2 city shenzhen
john 3 can not split to valid array -
Time taken: 0.056 seconds, Fetched: 4 row(s)
如果您不想自己搭建kubernetes环境,推荐使用腾讯云容器服务TKE:无需自建,即可在腾讯云上使用稳定, 安全,高效,灵活扩展的 Kubernetes 容器平台;
如果您希望自己的镜像可以通过外网上传和下载,推荐腾讯云容器镜像服务TCR:镜像数据加密存储,大镜像多节点快速分发,跨地域镜像同步
微信搜索「程序员欣宸」,我是欣宸,期待与您一同畅游Java世界...
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。