这里分类和汇总了欣宸的全部原创(含配套源码):https://github.com/zq2599/blog_demos
public abstract class AbstractGenericUDAFResolver implements GenericUDAFResolver2 {

    /**
     * Bridges the extended resolver API onto the legacy one: wildcard
     * ({@code *}) invocations cannot be expressed as a plain TypeInfo array,
     * so they are rejected before delegating.
     */
    @SuppressWarnings("deprecation")
    @Override
    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info)
            throws SemanticException {
        if (!info.isAllColumns()) {
            // Delegate to the deprecated TypeInfo[] overload for back-compat.
            return getEvaluator(info.getParameters());
        }
        throw new SemanticException(
            "The specified syntax for UDAF invocation is invalid.");
    }

    /**
     * Legacy entry point; subclasses that only implement the
     * {@link GenericUDAFParameterInfo} overload inherit this rejection.
     */
    @Override
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] info)
            throws SemanticException {
        throw new SemanticException(
            "This UDAF does not support the deprecated getEvaluator() method.");
    }
}
名称 | 链接 | 备注 |
---|---|---|
项目主页 | https://github.com/zq2599/blog_demos | 该项目在GitHub上的主页 |
git仓库地址(https) | https://github.com/zq2599/blog_demos.git | 该项目源码的仓库地址,https协议 |
git仓库地址(ssh) | git@github.com:zq2599/blog_demos.git | 该项目源码的仓库地址,ssh协议 |
开发UDAF分为以下几步:
接下来就按照上述步骤开始操作;
package com.bolingcavalry.hiveudf.udaf;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
/**
 * Aggregation buffer for the field-length UDAF: holds the running total of
 * all field lengths seen so far within one group.
 */
public class FieldLengthAggregationBuffer extends GenericUDAFEvaluator.AbstractAggregationBuffer {

    // Running total of accumulated field lengths.
    private Integer value = 0;

    public Integer getValue() {
        return value;
    }

    public void setValue(Integer value) {
        this.value = value;
    }

    /**
     * Adds one field's length to the running total.
     *
     * BUG FIX: the original wrapped this in {@code synchronized (value)}, but
     * {@code value += addValue} rebinds the field to a NEW Integer object, so
     * successive callers would lock different objects (and small Integers are
     * cached and shared JVM-wide, risking contention on unrelated code). Hive
     * drives each aggregation buffer from a single thread, so no locking is
     * needed at all; the broken lock is removed.
     *
     * @param addValue length (in characters) to accumulate
     */
    public void add(int addValue) {
        value += addValue;
    }

    /**
     * Estimated in-memory size of this buffer. It holds a single int-sized
     * counter, so report one primitive slot (4 bytes).
     *
     * @return estimated size in bytes
     */
    @Override
    public int estimate() {
        return JavaDataModel.PRIMITIVES1;
    }
}
package com.bolingcavalry.hiveudf.udaf;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
/**
* @Description: 这里是UDAF的实际处理类
* @author: willzhao E-mail: zq2599@gmail.com
* @date: 2020/11/4 9:57
*/
/**
 * Evaluator for the field-length UDAF: sums the string length of one column
 * across all rows of a group.
 *
 * @author: willzhao E-mail: zq2599@gmail.com
 * @date: 2020/11/4 9:57
 */
public class FieldLengthUDAFEvaluator extends GenericUDAFEvaluator {

    // Inspector for raw table values (PARTIAL1/COMPLETE input).
    PrimitiveObjectInspector inputOI;
    // Inspector describing this evaluator's output (an Integer).
    ObjectInspector outputOI;
    // Inspector for partial results from a previous stage (PARTIAL2/FINAL input).
    PrimitiveObjectInspector integerOI;

    /**
     * Called once per stage; captures the inspectors each later method needs.
     *
     * @param m          current aggregation mode (PARTIAL1/PARTIAL2/FINAL/COMPLETE)
     * @param parameters inspectors for this stage's input
     * @return inspector describing this stage's output, consumed by the next stage
     * @throws HiveException on initialization failure
     */
    @Override
    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
        super.init(m, parameters);
        if (Mode.PARTIAL1.equals(m) || Mode.COMPLETE.equals(m)) {
            // COMPLETE and PARTIAL1 receive raw rows from the table.
            inputOI = (PrimitiveObjectInspector) parameters[0];
        } else {
            // PARTIAL2 and FINAL receive the previous stage's output
            // (whose shape is whatever the previous init() returned).
            integerOI = (PrimitiveObjectInspector) parameters[0];
        }
        // Every mode outputs a plain Java Integer.
        outputOI = ObjectInspectorFactory.getReflectionObjectInspector(
            Integer.class,
            ObjectInspectorFactory.ObjectInspectorOptions.JAVA
        );
        return outputOI;
    }

    /** Creates a fresh, zeroed accumulation buffer for a new group. */
    @Override
    public AggregationBuffer getNewAggregationBuffer() throws HiveException {
        return new FieldLengthAggregationBuffer();
    }

    /**
     * Resets the buffer so it can be reused for another group.
     *
     * @param agg buffer to clear
     * @throws HiveException never thrown here
     */
    @Override
    public void reset(AggregationBuffer agg) throws HiveException {
        ((FieldLengthAggregationBuffer) agg).setValue(0);
    }

    /**
     * Called once per input row (map side); accumulates the row's field length.
     *
     * @param agg        buffer accumulating this group's total
     * @param parameters the row's column values (one expected)
     * @throws HiveException on evaluation failure
     */
    @Override
    public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
        if (null == parameters || parameters.length < 1) {
            return;
        }
        // BUG FIX: skip SQL NULLs. The original ran String.valueOf(null),
        // which counts every NULL as the 4-character string "null".
        if (parameters[0] == null) {
            return;
        }
        Object javaObj = inputOI.getPrimitiveJavaObject(parameters[0]);
        ((FieldLengthAggregationBuffer) agg).add(String.valueOf(javaObj).length());
    }

    /**
     * Returns the group's final result.
     *
     * @param agg buffer holding the accumulated total
     * @return total field length for the group
     * @throws HiveException never thrown here
     */
    @Override
    public Object terminate(AggregationBuffer agg) throws HiveException {
        return ((FieldLengthAggregationBuffer) agg).getValue();
    }

    /**
     * Returns the partial result at the end of a stage (map/combiner side);
     * for this UDAF the partial and final shapes are identical.
     *
     * @param agg buffer holding the partial total
     * @return partial total to pass to the next stage
     * @throws HiveException never thrown here
     */
    @Override
    public Object terminatePartial(AggregationBuffer agg) throws HiveException {
        return terminate(agg);
    }

    /**
     * Merges a partial total from an earlier stage into the buffer
     * (combiner/reduce side).
     *
     * @param agg     buffer accumulating the merged total
     * @param partial partial total produced by terminatePartial(), may be null
     * @throws HiveException on evaluation failure
     */
    @Override
    public void merge(AggregationBuffer agg, Object partial) throws HiveException {
        // Guard against null partials to avoid an NPE during the merge phase.
        if (partial == null) {
            return;
        }
        ((FieldLengthAggregationBuffer) agg).add((Integer) integerOI.getPrimitiveJavaObject(partial));
    }
}
package com.bolingcavalry.hiveudf.udaf;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
/**
 * Resolver registered as the UDAF entry point; hands out evaluators after
 * validating the call site.
 */
public class FieldLength extends AbstractGenericUDAFResolver {

    /**
     * @param info describes the arguments of the UDAF invocation
     * @return a fresh evaluator instance
     * @throws SemanticException if the call does not pass exactly one argument
     */
    @Override
    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException {
        // Fail at compile/analysis time rather than with an obscure runtime
        // error: this UDAF aggregates exactly one column.
        if (info.getParameters().length != 1) {
            throw new SemanticException("FieldLength requires exactly one argument.");
        }
        return new FieldLengthUDAFEvaluator();
    }

    /**
     * Legacy overload kept for older call paths.
     *
     * @param info argument types of the UDAF invocation
     * @return a fresh evaluator instance
     * @throws SemanticException if the call does not pass exactly one argument
     */
    @Override
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] info) throws SemanticException {
        if (info == null || info.length != 1) {
            throw new SemanticException("FieldLength requires exactly one argument.");
        }
        return new FieldLengthUDAFEvaluator();
    }
}
至此,编码完成,接下来是部署和体验;
本次部署的注册方式是临时函数,如果您想注册为永久函数,请参考前文;
add jar /home/hadoop/udf/hiveudf-1.0-SNAPSHOT.jar;
create temporary function udf_fieldlength as 'com.bolingcavalry.hiveudf.udaf.FieldLength';
hive> select * from address;
OK
1 guangdong guangzhou
2 guangdong shenzhen
3 shanxi xian
4 shanxi hanzhong
6 jiangshu nanjing
select province, count(city), udf_fieldlength(city) from address group by province;
执行结果如下,可见guangdong的guangzhou和shenzhen总长度为17,jiangshu的nanjing总长度为7,shanxi的xian和hanzhong总长度为12,符合预期:
Total MapReduce CPU Time Spent: 2 seconds 730 msec
OK
guangdong 2 17
jiangshu 1 7
shanxi 2 12
Time taken: 28.484 seconds, Fetched: 3 row(s)
至此,UDAF的学习和实践就完成了,咱们掌握了多进一出的函数开发,由于涉及到多个阶段和外部调用的逻辑,使得UDAF的开发难度略大,接下来的文章是一进多出的开发,会简单一些。