Linux下GCC和C++实现统计Clickhouse数据仓库指定表中各字段的空值、空字符串或零值比例

Clickhouse SQL 统计脚本

sql 复制代码
/* 创建结果表(如果不存在) */
CREATE TABLE IF NOT EXISTS field_empty_stats (
    id UInt32 AUTO_INCREMENT,
    database String,
    table_name String,
    column_name String,
    column_type String,
    empty_count UInt64,
    total_rows UInt64,
    empty_percent Float64,
    run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);

/* 字段空值统计查询 */
WITH tables AS (
    SELECT 
        database,
        name AS table_name
    FROM system.tables 
    WHERE database = 'your_database'
      AND table_name IN ('table1', 'table2')  -- 替换为实际表名
)
SELECT 
    t.database,
    t.table_name,
    c.name AS column_name,
    c.type AS column_type,
    CASE 
        WHEN c.type LIKE 'String%' THEN
            (SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') 
            FROM {db:Identifier}.{table:Identifier})
        WHEN c.type IN (
            'UInt8','UInt16','UInt32','UInt64',
            'Int8','Int16','Int32','Int64',
            'Float32','Float64','Decimal'
        ) THEN
            (SELECT sum(IFNULL({col:Identifier}, 0) = 0) 
            FROM {db:Identifier}.{table:Identifier})
        ELSE 
            (SELECT sum({col:Identifier} IS NULL) 
            FROM {db:Identifier}.{table:Identifier})
    END AS empty_count,
    (SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows,
    ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent
FROM system.columns c
JOIN tables t 
    ON c.database = t.database AND c.table = t.table_name
SETTINGS 
    allow_experimental_analyzer = 1,
    format_template = 'SELECT \'{database}\', \'{table_name}\', \'{column_name}\', \'{column_type}\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'

C++ 控制台程序

cpp 复制代码
#include <clickhouse/client.h>
#include <iostream>
#include <vector>
#include <ctime>
#include <iomanip>
#include <sstream>

using namespace clickhouse;

// 获取当前时间戳字符串
std::string currentDateTime() {
    auto now = std::time(nullptr);
    auto tm = *std::localtime(&now);
    std::ostringstream oss;
    oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S");
    return oss.str();
}

int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <database> <table1> [table2 ...]\n";
        return 1;
    }

    const std::string database = argv[1];
    std::vector<std::string> tables;
    for (int i = 2; i < argc; ++i) {
        tables.push_back(argv[i]);
    }

    // 连接Clickhouse
    ClientOptions opts;
    opts.SetHost("localhost");
    opts.SetPort(9000);
    opts.SetUser("default");
    opts.SetPassword("");
    Client client(opts);

    // 构建动态SQL
    std::ostringstream sql;
    sql << "WITH tables AS ("
        << "  SELECT database, name AS table_name "
        << "  FROM system.tables "
        << "  WHERE database = '" << database << "' "
        << "    AND table_name IN (";
    
    for (size_t i = 0; i < tables.size(); ++i) {
        sql << "'" << tables[i] << "'";
        if (i < tables.size() - 1) sql << ", ";
    }
    
    sql << ") )"
        << "INSERT INTO field_empty_stats "
        << "(database, table_name, column_name, column_type, empty_count, total_rows, empty_percent) "
        << "SELECT "
        << "  t.database, "
        << "  t.table_name, "
        << "  c.name AS column_name, "
        << "  c.type AS column_type, "
        << "  CASE "
        << "    WHEN c.type LIKE 'String%' THEN "
        << "      (SELECT sum(TRIM(IFNULL({col:Identifier}, '')) = '') "
        << "       FROM {db:Identifier}.{table:Identifier}) "
        << "    WHEN c.type IN ( "
        << "      'UInt8','UInt16','UInt32','UInt64', "
        << "      'Int8','Int16','Int32','Int64', "
        << "      'Float32','Float64','Decimal' "
        << "    ) THEN "
        << "      (SELECT sum(IFNULL({col:Identifier}, 0) = 0) "
        << "      FROM {db:Identifier}.{table:Identifier}) "
        << "    ELSE "
        << "      (SELECT sum({col:Identifier} IS NULL) "
        << "       FROM {db:Identifier}.{table:Identifier}) "
        << "  END AS empty_count, "
        << "  (SELECT count() FROM {db:Identifier}.{table:Identifier}) AS total_rows, "
        << "  ROUND(empty_count * 100.0 / total_rows, 2) AS empty_percent "
        << "FROM system.columns c "
        << "JOIN tables t ON c.database = t.database AND c.table = t.table_name "
        << "SETTINGS "
        << "  allow_experimental_analyzer = 1, "
        << "  format_template = "
        << "'SELECT \\'{database}\\', \\'{table_name}\\', \\'{column_name}\\', "
        << "\\'{column_type}\\', ${{col:Identifier}}, ${{table:Identifier}}, ${{db:Identifier}}'";

    try {
        // 执行查询
        client.Execute(sql.str());
        
        std::cout << "Successfully analyzed " << tables.size() 
                  << " tables in database: " << database << "\n";
        std::cout << "Results written to field_empty_stats at: " 
                  << currentDateTime() << std::endl;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }

    return 0;
}

使用说明

  1. 创建结果表
sql 复制代码
CREATE TABLE IF NOT EXISTS field_empty_stats (
    id UInt32 AUTO_INCREMENT,
    database String,
    table_name String,
    column_name String,
    column_type String,
    empty_count UInt64,
    total_rows UInt64,
    empty_percent Float64,
    run_time DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY (database, table_name, column_name, run_time);
  1. 编译程序
bash 复制代码
g++ -std=c++17 -o field_analyzer field_analyzer.cpp -lclickhouse-cpp-lib
  1. 运行程序
bash 复制代码
./field_analyzer your_database table1 table2 table3

处理逻辑说明

字段类型 空值判定逻辑 统计内容
String TRIM(IFNULL(column, '')) = '' 空字符串
数值类型 IFNULL(column, 0) = 0 零值
其他类型 column IS NULL NULL值

结果表结构

列名 类型 说明
id UInt32 自增ID
database String 数据库名称
table_name String 表名称
column_name String 字段名称
column_type String 字段类型
empty_count UInt64 空值/零值/NULL记录数
total_rows UInt64 表总行数
empty_percent Float64 空值百分比(保留两位小数)
run_time DateTime 分析执行时间

注意事项

  1. 确保Clickhouse服务器运行在默认端口9000(或修改代码中的端口)
  2. 需要安装clickhouse-cpp客户端库
  3. 程序自动处理以下数据类型:
    • 字符串类型:空字符串统计
    • 数值类型:零值统计
    • 其他类型:NULL值统计
  4. 结果表使用MergeTree引擎,按分析时间排序
  5. 百分比计算保留两位小数

这个解决方案实现了:

  • 动态分析指定数据库的多个表
  • 自动识别字段类型并应用相应的空值逻辑
  • 结果写入包含自增ID和时间戳的统计表
  • 命令行参数指定数据库和表名
  • 完整的错误处理机制
相关推荐
2301_1472583691 小时前
7月2日作业
java·linux·服务器
格调UI成品2 小时前
预警系统安全体系构建:数据加密、权限分级与误报过滤方案
大数据·运维·网络·数据库·安全·预警
2301_803554523 小时前
c++中类的前置声明
java·开发语言·c++
xuanzdhc5 小时前
Linux 基础IO
linux·运维·服务器
愚润求学5 小时前
【Linux】网络基础
linux·运维·网络
bantinghy6 小时前
Linux进程单例模式运行
linux·服务器·单例模式
小和尚同志7 小时前
29.4k!使用 1Panel 来管理你的服务器吧
linux·运维
帽儿山的枪手7 小时前
为什么Linux需要3种NAT地址转换?一探究竟
linux·网络协议·安全
shadon1789 天前
回答 如何通过inode client的SSLVPN登录之后,访问需要通过域名才能打开的服务
linux
小米里的大麦9 天前
014 Linux 2.6内核进程调度队列(了解)
linux·运维·驱动开发