Part01: Hive UDF Overview
Hive user-defined functions (UDFs) let users extend HiveQL for cases the built-in functions cannot handle. Hive supports three types of custom functions (each has a familiar built-in analogue, sketched after the list):
- UDF (User-Defined Function): one value in, one value out
- UDAF (User-Defined Aggregate Function): many rows in, one value out (aggregation)
- UDTF (User-Defined Table-Generating Function): one row in, many rows out (table-generating)
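With Hive built-ins, the three shapes look like this (the tables referenced are the sample tables created later in Part07):
sql
SELECT upper(name) FROM user_info;                 -- UDF: one value in, one value out
SELECT sum(score) FROM student_scores;             -- UDAF: many rows in, one value out
SELECT explode(split(scores, ',')) FROM user_info; -- UDTF: one row in, many rows out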
Part02: Development Environment Setup
Maven Dependency Configuration
xml
<dependencies>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>3.1.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.2.1</version>
    </dependency>
</dependencies>
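Both artifacts are already on the cluster's classpath at runtime, so it is common to mark them as provided and keep them out of the UDF JAR; a sketch for one of them:
xml
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>3.1.2</version>
    <scope>provided</scope>
</dependency>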
Part03: UDF Development Cases
Case01: String-Processing UDF
Develop a function that masks the middle four digits of a phone number with ****.
java
package com.example.hive.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

@Description(
    name = "mask_phone",
    value = "Returns a masked phone number, e.g., 138****8000",
    extended = "Example: SELECT mask_phone('13800138000') -> 138****8000"
)
public class MaskPhoneUDF extends UDF {

    public Text evaluate(Text input) {
        if (input == null) {
            return null;
        }
        String phone = input.toString();
        if (phone.length() != 11) {
            return new Text("Invalid Phone");
        }
        // Replace the middle 4 digits with ****
        String masked = phone.substring(0, 3) + "****" + phone.substring(7);
        return new Text(masked);
    }

    // Overload to support additional input types
    public Text evaluate(String input) {
        if (input == null) {
            return null;
        }
        return evaluate(new Text(input));
    }
}
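Side note: recent Hive releases deprecate the simple UDF base class in favor of GenericUDF, which avoids reflection and type-checks arguments at query-compile time. A minimal sketch of the same masking logic on the newer API (the class name MaskPhoneGenericUDF is ours, not part of the article's JAR):
java
package com.example.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;

public class MaskPhoneGenericUDF extends GenericUDF {

    private final Text result = new Text();

    @Override
    public ObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("mask_phone takes exactly one argument");
        }
        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] args) throws HiveException {
        Object value = args[0].get();
        if (value == null) {
            return null;
        }
        // toString() is a shortcut; a stricter version would convert via the input ObjectInspector
        String phone = value.toString();
        if (phone.length() != 11) {
            result.set("Invalid Phone");
        } else {
            result.set(phone.substring(0, 3) + "****" + phone.substring(7));
        }
        return result;
    }

    @Override
    public String getDisplayString(String[] children) {
        return "mask_phone(" + children[0] + ")";
    }
}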
Case02: JSON-Parsing UDF
Develop a UDF that extracts a field from a JSON string.
java
package com.example.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.json.JSONObject;

public class JsonExtractUDF extends UDF {

    public Text evaluate(Text jsonText, Text field) {
        if (jsonText == null || field == null) {
            return null;
        }
        try {
            JSONObject jsonObj = new JSONObject(jsonText.toString());
            String value = jsonObj.optString(field.toString());
            return new Text(value);
        } catch (Exception e) {
            // Malformed JSON: return null rather than failing the query
            return null;
        }
    }
}
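org.json is not shipped with Hive, so the JSONObject import above needs an extra Maven dependency (the version shown is just a recent one; any current org.json release should work):
xml
<dependency>
    <groupId>org.json</groupId>
    <artifactId>json</artifactId>
    <version>20230227</version>
</dependency>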
Part04: UDAF Development Case
Develop a UDAF that computes an average, using the classic UDAF/UDAFEvaluator bridge API (deprecated in recent Hive releases, but still the easiest to read).
java
package com.example.hive.udaf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;

public class AverageUDAF extends UDAF {

    public static class AverageState {
        private double sum = 0;
        private long count = 0;
    }

    public static class AverageEvaluator implements UDAFEvaluator {

        private AverageState state;

        public AverageEvaluator() {
            super();
            state = new AverageState();
            init();
        }

        // Initialization
        public void init() {
            state.sum = 0;
            state.count = 0;
        }

        // Map phase: process a single row
        public boolean iterate(DoubleWritable value) {
            if (value != null) {
                state.sum += value.get();
                state.count++;
            }
            return true;
        }

        // Return the partial aggregation result
        public AverageState terminatePartial() {
            return state.count == 0 ? null : state;
        }

        // Merge a partial aggregation result
        public boolean merge(AverageState other) {
            if (other != null) {
                state.sum += other.sum;
                state.count += other.count;
            }
            return true;
        }

        // Return the final result
        public DoubleWritable terminate() {
            return state.count == 0 ? null :
                    new DoubleWritable(state.sum / state.count);
        }
    }
}
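A quick way to sanity-check the evaluator outside Hive is to drive it by hand; a minimal sketch (the AverageUDAFCheck class is ours, for local testing only):
java
package com.example.hive.udaf;

import org.apache.hadoop.hive.serde2.io.DoubleWritable;

public class AverageUDAFCheck {
    public static void main(String[] args) {
        // The constructor already calls init()
        AverageUDAF.AverageEvaluator eval = new AverageUDAF.AverageEvaluator();
        eval.iterate(new DoubleWritable(85.0));
        eval.iterate(new DoubleWritable(90.0));
        eval.iterate(new DoubleWritable(78.0));
        // (85 + 90 + 78) / 3 = 84.333...
        System.out.println(eval.terminate().get());
    }
}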
Part05: UDTF Development Case
Develop a UDTF that splits a CSV string into multiple rows.
java
package com.example.hive.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;

public class ExplodeCSVUDTF extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(ObjectInspector[] args)
            throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("ExplodeCSV takes only one argument");
        }
        // Define output column names and types
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("value");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(
                fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        if (args[0] == null) {
            return; // Skip null rows instead of throwing a NullPointerException
        }
        String input = args[0].toString();
        String[] tokens = input.split(",");
        for (String token : tokens) {
            String[] output = new String[1];
            output[0] = token.trim();
            forward(output);
        }
    }

    @Override
    public void close() throws HiveException {
        // Clean up resources (nothing to release here)
    }
}
Part06: Build and Deployment
Build and Package
bash
mvn clean package
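If the JAR pulls in third-party libraries that are not on the Hive classpath (org.json here), bundle them into a fat JAR; a minimal maven-shade-plugin sketch (the plugin version shown is illustrative):
xml
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.2.4</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>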
Upload the JAR to HDFS
bash
hdfs dfs -put hive-udf-1.0.0.jar /user/hive/udfs/
Register the Functions in Hive
sql
-- Add the JAR file
ADD JAR hdfs:///user/hive/udfs/hive-udf-1.0.0.jar;
-- Create temporary functions
CREATE TEMPORARY FUNCTION mask_phone AS 'com.example.hive.udf.MaskPhoneUDF';
CREATE TEMPORARY FUNCTION json_extract AS 'com.example.hive.udf.JsonExtractUDF';
CREATE TEMPORARY FUNCTION average_udaf AS 'com.example.hive.udaf.AverageUDAF';
CREATE TEMPORARY FUNCTION explode_csv AS 'com.example.hive.udtf.ExplodeCSVUDTF';
-- Create a permanent function (recommended for production)
CREATE FUNCTION default.mask_phone AS 'com.example.hive.udf.MaskPhoneUDF'
USING JAR 'hdfs:///user/hive/udfs/hive-udf-1.0.0.jar';
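After registering, it is worth confirming that Hive sees the functions and their @Description metadata:
sql
-- Verify registration
SHOW FUNCTIONS LIKE 'mask_phone';
DESCRIBE FUNCTION EXTENDED mask_phone;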
Part07: Usage Examples
Test Data Preparation
sql
-- Create a test table
CREATE TABLE user_info (
    id INT,
    name STRING,
    phone STRING,
    json_data STRING,
    scores STRING
);
-- Insert test data
INSERT INTO user_info VALUES
(1, '张三', '13800138000', '{"age":25,"city":"北京"}', '85,90,78'),
(2, '李四', '13900139000', '{"age":30,"city":"上海"}', '92,88,95'),
(3, '王五', '13700137000', '{"age":28,"city":"广州"}', '76,85,90');
UDF Usage
sql
-- Use the mask_phone function
SELECT name, phone, mask_phone(phone) AS masked_phone
FROM user_info;
-- Use the json_extract function
SELECT name,
json_extract(json_data, 'age') AS age,
json_extract(json_data, 'city') AS city
FROM user_info;
UDAF Usage
sql
-- Create a test table
CREATE TABLE student_scores (
    student_id INT,
    subject STRING,
    score DOUBLE
);
INSERT INTO student_scores VALUES
(1, 'Math', 85.0), (1, 'English', 90.0), (2, 'Math', 92.0),
(2, 'English', 88.0), (3, 'Math', 76.0), (3, 'English', 85.0);
-- Use the custom aggregate function
SELECT subject, average_udaf(score) AS avg_score
FROM student_scores
GROUP BY subject;
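With the sample rows above, the expected result is easy to check by hand: Math is (85 + 92 + 76) / 3 ≈ 84.33, and English is (90 + 88 + 85) / 3 ≈ 87.67.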
UDTF Usage
sql
-- Use the explode_csv function
SELECT id, name, exploded_score
FROM user_info
LATERAL VIEW explode_csv(scores) exploded AS exploded_score;
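With the sample data, each user fans out into one row per score: 张三 yields the rows (1, 张三, 85), (1, 张三, 90), (1, 张三, 78), followed by 李四's scores (92, 88, 95) and 王五's (76, 85, 90).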
Part08: Best Practices and Caveats
Performance Optimization
java
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// Reuse objects to reduce GC pressure
public class EfficientUDF extends UDF {

    private final Text result = new Text();

    public Text evaluate(Text input) {
        if (input == null) {
            return null;
        }
        // Reuse the result instance instead of allocating a new Text per row
        result.set(input.toString().toUpperCase());
        return result;
    }
}
Exception Handling
java
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SafeUDF extends UDF {
    private static final Logger LOG = LoggerFactory.getLogger(SafeUDF.class);

    public Text evaluate(Text input) {
        try {
            if (input == null) return null;
            return new Text(process(input.toString()));
        } catch (Exception e) {
            // Log the error but do not rethrow, so one bad row cannot fail the whole job
            LOG.warn("SafeUDF failed on input: " + input, e);
            return new Text("ERROR");
        }
    }

    private String process(String s) {
        return s; // Placeholder for the real business logic
    }
}
Data Type Handling
java
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class TypeSafeUDF extends UDF {

    // Overloads to support multiple input types
    public Text evaluate(Text input) {
        return process(input);
    }

    public Text evaluate(String input) {
        return input == null ? null : process(new Text(input));
    }

    public Text evaluate(Integer input) {
        return input == null ? null : process(new Text(input.toString()));
    }

    private Text process(Text input) {
        // Shared processing logic
        return input;
    }
}
Part09: Debugging and Testing
Unit Test Example
java
package com.example.hive.udf; // Same package as MaskPhoneUDF, so no extra import is needed

import org.apache.hadoop.io.Text;
import org.junit.Test;

import static org.junit.Assert.*;

public class MaskPhoneUDFTest {

    @Test
    public void testMaskPhone() {
        MaskPhoneUDF udf = new MaskPhoneUDF();
        // Normal phone number
        assertEquals("138****8000", udf.evaluate("13800138000").toString());
        // Invalid input
        assertEquals("Invalid Phone", udf.evaluate("123").toString());
        // Null input
        assertNull(udf.evaluate((Text) null));
    }
}
Part10: Summary
Through the cases in this article, we walked through developing and using all three kinds of Hive custom functions:
- UDF: suited to simple row-level transformations
- UDAF: suited to complex aggregate computations
- UDTF: suited to expanding a single row into multiple rows
For real-world development, we recommend:
- Choose the function type that fits the job
- Pay attention to performance optimization
- Handle exceptions defensively
- Write complete unit tests
- Use permanent functions for easier management