二百七十三、Kettle——ClickHouse中增量导入数据准确性统计表数据(1天1次)

一、目的

在数据质量模块,需要对原始数据的准确性进行统计

二、Hive中原有代码

2.1 表结构

复制代码
--42、数据准确性统计表 dwd_data_accuracy
create  table  if not exists  hurys_db.dwd_data_accuracy(
    data_type               int        comment '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源',
    device_no               string     comment '设备编号',
    field_name              string     comment '字段名',
    data_unreasonable_rate  float      comment '数据不合理率',
    data_null_rate          float      comment '数据空值率'
)
comment '数据准确性统计表'
partitioned by (day string)
stored as orc
;

2.2 SQL代码

复制代码
insert  overwrite  table  hurys_db.dwd_data_accuracy  partition(day)
select
       t1.data_type,
       t1.device_no,
       t1.field_name,
       round((sum(case when t1.field_value is not null then 1 else 0 end)/t2.count_device_all),2)  data_unreasonable_rate,
       round((sum(case when t1.field_value is null then 1 else 0 end)/t2.count_device_all),2) data_null_rate ,
       t1.day
from hurys_db.dwd_data_clean_record_queue as t1
left join (select
                device_no,
                day,
                count(device_no) count_device_all
           from hurys_db.ods_queue
           where day='2024-09-04'
           group by device_no, day
          ) as  t2
on t2.device_no=t1.device_no and t2.day=t1.day
where t2.count_device_all is not null
group by t1.data_type, t1.device_no, t1.field_name, t2.count_device_all, t1.day

三、ClickHouse中现有代码

3.1 表结构

复制代码
--42、八大类基础数据准确性统计表(长期存储)
create  table  if not exists  hurys_jw.dwd_data_accuracy(
    data_type               Int32            comment '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源',
    device_no               String           comment '设备编号',
    field_name              String           comment '字段名',
    data_unreasonable_rate  Decimal(10, 6)   comment '数据不合理率',
    data_null_rate          Decimal(10, 6)   comment '数据空值率',
    day                     Date             comment '日期'
)
ENGINE = MergeTree
PARTITION BY day
PRIMARY KEY day
ORDER BY day
SETTINGS index_granularity = 8192;

3.2 SQL代码

复制代码
--静态排队
select
       data_type,
       device_no,
       field_name,
       round(count_field_unreasonable / count_device_all,6) data_unreasonable_rate,
       round(count_field_null / count_device_all,6) data_null_rate,
       cast(day as String) day
from (select
       t1.data_type,
       t1.device_no,
       t1.field_name,
       sum(case when field_name is not null  then 1 else 0 end) count_field_unreasonable,
       sum(case when field_name is null  then 1 else 0 end) count_field_null,
       t2.count_device_all,
       t1.day
from hurys_jw.dwd_data_clean_record_queue as t1
left join (select
                device_no,
                DATE(create_time) day,
                count(device_no) count_device_all
           from hurys_jw.ods_queue
           where day='2024-10-22'
           group by device_no, day
          ) as  t2
on t2.device_no=t1.device_no and t2.day=t1.day
where t2.count_device_all > 0
group by t1.data_type, t1.device_no, t1.field_name, t2.count_device_all, t1.day)
;

3.3 Kettle任务

3.3.1 newtime

3.3.2 替换NULL值

3.3.3 静态排队

select

data_type,

device_no,

field_name,

round(count_field_unreasonable / count_device_all,6) data_unreasonable_rate,

round(count_field_null / count_device_all,6) data_null_rate,

cast(day as String) day

from (select

t1.data_type,

t1.device_no,

t1.field_name,

sum(case when field_name is not null then 1 else 0 end) count_field_unreasonable,

sum(case when field_name is null then 1 else 0 end) count_field_null,

t2.count_device_all,

t1.day

from hurys_jw.dwd_data_clean_record_queue as t1

left join (select

device_no,

DATE(create_time) day,

count(device_no) count_device_all

from hurys_jw.ods_queue

where day > ?

group by device_no, day

) as t2

on t2.device_no=t1.device_no and t2.day=t1.day

where t2.count_device_all > 0

group by t1.data_type, t1.device_no, t1.field_name, t2.count_device_all, t1.day)

;

其他clickhouse输入控件代码类似

3.3.4 字段选择

3.3.5 clickhouse输出

3.3.6 执行任务

3.3.7 海豚调度(1天1次)

搞定!就是Hive中原有SQL语句和ClickHouse现有SQL语句很大不同,改造起来有点烦,尤其碰上管卡!

相关推荐
天道有情战天下1 天前
ClickHouse使用Docker部署
clickhouse·docker·容器
冷雨夜中漫步2 天前
ClickHouse常见问题——ClickHouseKeeper配置listen_host后不生效
java·数据库·clickhouse
qq_339191142 天前
docker 启动一个clickhouse , docker 创建ck数据库
clickhouse·docker·容器
Kookoos5 天前
ABP + ClickHouse 实时 OLAP:物化视图与写入聚合
clickhouse·c#·linq·abp vnext·实时olap
喂完待续6 天前
【Big Data】AI赋能的ClickHouse 2.0:从JIT编译到LLM查询优化,下一代OLAP引擎进化路径
大数据·数据库·clickhouse·数据分析·olap·big data·序列晋升
阿里云大数据AI技术14 天前
鹰角网络基于阿里云 EMR Serverless StarRocks 的实时分析工程实践
starrocks·clickhouse·阿里云·emr·实时分析
更深兼春远16 天前
flink+clinkhouse安装部署
大数据·clickhouse·flink
Sais_Z20 天前
ClickHouse的学习与了解
数据库·clickhouse
风中凌乱23 天前
ClickHouse-Backup的安装与部署
clickhouse
风中凌乱23 天前
clickhouse集群的安装与部署
clickhouse