Purging historical partition data with a UDF on a legacy Hive version (1.2.1)

Goal: purge a table's historical partition data via a UDF.

Inputs: table name, number of days to retain (N).

1. POM file

xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.example</groupId>
  <artifactId>hive-udf-example</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>hive-udf-example</name>
  <description>Hive UDF for deleting partitions by date</description>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- Hive Exec (Hive 1.2.1) -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>1.2.1</version>
    </dependency>

    <!-- Hive Metastore (Hive 1.2.1) -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-metastore</artifactId>
      <version>1.2.1</version>
    </dependency>

    <!-- Hadoop Client (Hive 1.2.1 depends on Hadoop 2.7.3 by default) -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.3</version>
    </dependency>
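
    <!-- Note (assumption): on a real cluster, hive-exec, hive-metastore and
         hadoop-client are already on Hive's classpath, so they can be marked
         <scope>provided</scope> to keep the UDF jar small. -->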


    <!-- SLF4J logging -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.25</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.25</version>
    </dependency>

    <!-- Unit testing -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>


  </dependencies>

  <build>
    <plugins>
      <!-- Compiler plugin -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.1</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

2. Java code

java
package org.udf;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.conf.Configuration;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;

@Description(
        name = "del_dt",
        value = "Deletes a table's partitions older than N days by removing the HDFS files and syncing the metastore - args: table name, N (days)"
)
public class del_dt extends UDF {
    /*
     * To reuse this UDF, adjust two settings: dbName (the default warehouse schema)
     * and sdf (the partition date format).
     */

    private static final Logger LOG = LoggerFactory.getLogger(del_dt.class);

    public String evaluate(String tableName, int days) {
        if (tableName == null || tableName.trim().isEmpty() || days < 0) {
            return "Error: table name must not be empty and days must not be negative";
        }

        Configuration conf = new Configuration();
        FileSystem fs = null;
        HiveMetaStoreClient client = null;
        int deletedCount = 0;

        try {
            // Get the HDFS file system
            fs = FileSystem.get(conf);

            // Get a Hive metastore client
            HiveConf hiveConf = new HiveConf(conf, this.getClass());
            client = new HiveMetaStoreClient(hiveConf);

            // Resolve the table name (handles the db.table form)
            String dbName = "bjsythzczcpt";        // default warehouse schema -- adjust for reuse
            String tableOnlyName = tableName;
            if (tableName.contains(".")) {
                String[] parts = tableName.split("\\.");
                dbName = parts[0];
                tableOnlyName = parts[1];
            }

            // Check that the table exists
            if (!client.tableExists(dbName, tableOnlyName)) {
                return "Error: table " + tableName + " does not exist";
            }

            // Fetch the table metadata
            Table table = client.getTable(dbName, tableOnlyName);

            // Check that the table is partitioned
            List<FieldSchema> partitionKeys = table.getPartitionKeys();
            if (partitionKeys == null || partitionKeys.isEmpty()) {
                return "Error: table " + tableName + " is not partitioned";
            }

            // Check for the date partition column (assumed to be named dt)
            boolean hasDatePartition = false;
            for (FieldSchema key : partitionKeys) {
                if (key.getName().equalsIgnoreCase("dt")) {
                    hasDatePartition = true;
                    break;
                }
            }

            if (!hasDatePartition) {
                return "Error: table " + tableName + " has no date partition column (dt)";
            }

            // Compute the cutoff date, N days ago
            Calendar cal = Calendar.getInstance();
            cal.add(Calendar.DAY_OF_YEAR, -days);
            Date cutoffDate = cal.getTime();
            SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");    // partition date format -- adjust for reuse
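            // e.g. with days = 7 run on 2024-01-10, cutoffDate is 2024-01-03,
            // so any partition with dt earlier than 20240103 is removed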

            // List all partitions of the table
            List<Partition> partitions = client.listPartitions(dbName, tableOnlyName, (short) -1);

            // Table location, used to build each partition path
            String tableLocation = table.getSd().getLocation();

            // Walk the partitions and delete those older than the cutoff
            for (Partition partition : partitions) {
                Map<String, String> partitionValues = getPartitionValues(partitionKeys, partition);
                String dtValue = partitionValues.get("dt");

                if (dtValue != null) {
                    try {
                        Date partitionDate = sdf.parse(dtValue);
                        if (partitionDate.before(cutoffDate)) {
                            // Build the partition path
                            String partitionPath = buildPartitionPath(tableLocation, partition.getValues(), partitionKeys);
                            Path hdfsPath = new Path(partitionPath);

                            // Delete the partition data on HDFS, if it is still there
                            if (fs.exists(hdfsPath)) {
                                fs.delete(hdfsPath, true);
                                LOG.info("Deleted HDFS partition path: {}", partitionPath);
                            }

                            // Drop the partition from the metastore even if the HDFS
                            // path was already gone, so data and metadata stay in sync
                            client.dropPartition(dbName, tableOnlyName, partition.getValues(), true);
                            deletedCount++;
                            LOG.info("Dropped partition: {}", partition.getValues());
                        }
                    } catch (Exception e) {
                        LOG.error("Failed to process partition ({}): {}", partition.getValues(), e.getMessage());
                    }
                }
            }

            return "操作完成:成功删除 " + deletedCount + " 个分区";

        } catch (IOException | TException e) {
            LOG.error("执行失败: {}", e.getMessage());
            return "错误:执行失败 - " + e.getMessage();
        } finally {
            // 关闭资源
            if (fs != null) {
                try {
                    fs.close();
                } catch (IOException e) {
                    LOG.error("关闭HDFS连接失败: {}", e.getMessage());
                }
            }
            if (client != null) {
                client.close();
            }
        }
    }

    // Map the table's partition key names to this partition's values
    private Map<String, String> getPartitionValues(List<FieldSchema> partitionKeys, Partition partition) {
        Map<String, String> values = new HashMap<>();
        List<String> partitionVals = partition.getValues();

        // Reuse the partition keys already fetched in evaluate instead of
        // issuing another metastore call per partition
        for (int i = 0; i < Math.min(partitionKeys.size(), partitionVals.size()); i++) {
            values.put(partitionKeys.get(i).getName(), partitionVals.get(i));
        }

        return values;
    }

    // Build the partition's HDFS path, e.g. <tableLocation>/dt=20240101
    private String buildPartitionPath(String tableLocation, List<String> partitionValues, List<FieldSchema> partitionKeys) {
        StringBuilder pathBuilder = new StringBuilder(tableLocation);

        for (int i = 0; i < partitionValues.size(); i++) {
            if (i < partitionKeys.size()) {
                pathBuilder.append("/")
                        .append(partitionKeys.get(i).getName())
                        .append("=")
                        .append(partitionValues.get(i));
            }
        }

        return pathBuilder.toString();
    }
}
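
The only date arithmetic in the UDF is the cutoff computation and the before() comparison. As a quick way to sanity-check that logic off-cluster, here is a minimal standalone sketch mirroring it; the class name CutoffDemo and the sample dt values are illustrative only, not part of the UDF.

java
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

// Illustrative sketch of the cutoff logic used in del_dt.evaluate
public class CutoffDemo {
    public static void main(String[] args) throws Exception {
        int days = 7;                                            // N: days to keep
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd"); // partition date format

        // Cutoff = now minus N days, as in evaluate
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_YEAR, -days);
        Date cutoffDate = cal.getTime();

        for (String dt : new String[]{"20200101", sdf.format(new Date())}) {
            Date partitionDate = sdf.parse(dt);
            // Partitions strictly before the cutoff would be deleted
            System.out.println("dt=" + dt + " -> "
                    + (partitionDate.before(cutoffDate) ? "delete" : "keep"));
        }
    }
}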

3. Creating and updating the function

sql
-- Create the function
ADD JAR hdfs:///<hdfs-path>/<jar-name>.jar;
CREATE FUNCTION del_dt AS 'org.udf.del_dt';

-- Update the function
DELETE JAR hdfs:///<hdfs-path>/<jar-name>.jar;
ADD JAR hdfs:///<hdfs-path>/<jar-name>.jar;
DROP FUNCTION del_dt;
CREATE FUNCTION del_dt AS 'org.udf.del_dt';
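
-- Note: ADD JAR only registers the jar for the current session. Since Hive 0.13,
-- the jar can instead be bound to the function permanently:
-- CREATE FUNCTION del_dt AS 'org.udf.del_dt' USING JAR 'hdfs:///<hdfs-path>/<jar-name>.jar';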

4. Usage example

sql
-- Delete historical partitions of dwd_abc_df, keeping the most recent 7 days
hive> SELECT del_dt('dwd_abc_df', 7);
-- Output
OK
Done: deleted 0 partition(s)
Time taken: 0.192 seconds
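
-- A quick way to verify is listing the partitions that remain:
hive> SHOW PARTITIONS dwd_abc_df;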