二百七十四、Kettle——ClickHouse中对错误数据表进行数据修复(实时)

一、目的

在完成数据清洗任务和错误数据任务之后,需要根据修复规则对错误数据进行修复。

二、Hive中原有代码

复制代码
-- Repairs out-of-range static queue metrics for one day of error rows and
-- appends the repaired rows to hurys_db.dwd_queue (dynamic partition on day).
-- Repair rule: a metric outside its accepted range is replaced by the average
-- of the readings found 7, 14 and 21 days earlier for the same device and
-- lane. Metrics already inside the range are kept unchanged.
-- Accepted ranges: queue_count 0 to 100, queue_len / queue_head / queue_tail 0 to 500.
-- NOTE(review): the WHERE filter on bad.day rejects rows in which the RIGHT JOIN
-- found no error row, so the outer join effectively behaves as an inner join.
INSERT INTO TABLE hurys_db.dwd_queue PARTITION (day)
SELECT
    bad.id,
    bad.device_no,
    bad.source_device_type,
    bad.sn,
    bad.model,
    bad.create_time,
    bad.lane_no,
    bad.lane_type,
    CASE
        WHEN bad.queue_count BETWEEN 0 AND 100 THEN bad.queue_count
        ELSE prior_avg.avg_queue_count
    END AS queue_count,
    CASE
        WHEN bad.queue_len BETWEEN 0 AND 500 THEN bad.queue_len
        ELSE prior_avg.avg_queue_len
    END AS queue_len,
    CASE
        WHEN bad.queue_head BETWEEN 0 AND 500 THEN bad.queue_head
        ELSE prior_avg.avg_queue_head
    END AS queue_head,
    CASE
        WHEN bad.queue_tail BETWEEN 0 AND 500 THEN bad.queue_tail
        ELSE prior_avg.avg_queue_tail
    END AS queue_tail,
    -- Last column feeds the dynamic partition.
    bad.day
FROM hurys_db.dwd_queue_error AS bad
RIGHT JOIN (
    -- One row per error reading (device, error timestamp, lane) holding the
    -- averaged historical metrics.
    SELECT
        hist.device_no,
        hist.create_time,
        hist.lane_no,
        round(avg(hist.queue_count), 0) AS avg_queue_count,
        round(avg(hist.queue_len), 2)   AS avg_queue_len,
        round(avg(hist.queue_head), 2)  AS avg_queue_head,
        round(avg(hist.queue_tail), 2)  AS avg_queue_tail
    FROM (
        -- Readings taken 7 days before each error row. The IS NOT NULL filter
        -- drops error rows that have no reading at that offset.
        -- NOTE(review): the concat/date_sub/substr expression presumably rebuilds
        -- a date-plus-time string shifted back by N days - confirm the format
        -- of create_time.
        SELECT
            good.device_no,
            good.create_time AS start_time,
            err.create_time,
            good.lane_no,
            good.queue_count,
            good.queue_len,
            good.queue_head,
            good.queue_tail
        FROM hurys_db.dwd_queue AS good
        RIGHT JOIN hurys_db.dwd_queue_error AS err
            ON  err.device_no = good.device_no
            AND err.lane_no = good.lane_no
            AND concat(date_sub(err.create_time, 7), substr(err.create_time, 11, 10)) = good.create_time
        WHERE good.device_no IS NOT NULL

        UNION ALL

        -- Readings taken 14 days before each error row.
        SELECT
            good.device_no,
            good.create_time AS start_time,
            err.create_time,
            good.lane_no,
            good.queue_count,
            good.queue_len,
            good.queue_head,
            good.queue_tail
        FROM hurys_db.dwd_queue AS good
        RIGHT JOIN hurys_db.dwd_queue_error AS err
            ON  err.device_no = good.device_no
            AND err.lane_no = good.lane_no
            AND concat(date_sub(err.create_time, 14), substr(err.create_time, 11, 10)) = good.create_time
        WHERE good.device_no IS NOT NULL

        UNION ALL

        -- Readings taken 21 days before each error row.
        SELECT
            good.device_no,
            good.create_time AS start_time,
            err.create_time,
            good.lane_no,
            good.queue_count,
            good.queue_len,
            good.queue_head,
            good.queue_tail
        FROM hurys_db.dwd_queue AS good
        RIGHT JOIN hurys_db.dwd_queue_error AS err
            ON  err.device_no = good.device_no
            AND err.lane_no = good.lane_no
            AND concat(date_sub(err.create_time, 21), substr(err.create_time, 11, 10)) = good.create_time
        WHERE good.device_no IS NOT NULL
    ) AS hist
    GROUP BY
        hist.device_no,
        hist.create_time,
        hist.lane_no
) AS prior_avg
    ON  bad.device_no = prior_avg.device_no
    AND bad.create_time = prior_avg.create_time
    AND bad.lane_no = prior_avg.lane_no
WHERE bad.day = '2024-09-04'
;

三、ClickHouse中现有代码

复制代码
-- 43. Static queue field data repair
--     Repair strategy: replace a bad metric with the average of the readings
--     taken at the same time in each of the previous three weeks.
-- Accepted ranges: queue_count 0 to 100, queue_len / queue_head / queue_tail 0 to 500.
-- The single positional placeholder at the end is the inclusive lower bound
-- for a3.day.
-- NOTE(review): the WHERE filter on a3.day runs after the RIGHT JOIN. Whether
-- unmatched a2 rows can survive it depends on how missing a3 columns are
-- filled (join_use_nulls setting) - confirm.
SELECT
    a3.id,
    a3.device_no,
    a3.source_device_type,
    a3.sn,
    a3.model,
    a3.create_time,
    a3.lane_no,
    a3.lane_type,
    CASE
        WHEN a3.queue_count BETWEEN 0 AND 100 THEN a3.queue_count
        ELSE a2.avg_queue_count
    END AS queue_count,
    -- The averaged replacements are cast to Decimal(10,2) in the ELSE branch.
    CASE
        WHEN a3.queue_len BETWEEN 0 AND 500 THEN a3.queue_len
        ELSE cast(a2.avg_queue_len as Decimal(10,2))
    END AS queue_len,
    CASE
        WHEN a3.queue_head BETWEEN 0 AND 500 THEN a3.queue_head
        ELSE cast(a2.avg_queue_head as Decimal(10,2))
    END AS queue_head,
    CASE
        WHEN a3.queue_tail BETWEEN 0 AND 500 THEN a3.queue_tail
        ELSE cast(a2.avg_queue_tail as Decimal(10,2))
    END AS queue_tail,
    cast(a3.day as String) AS day
FROM hurys_jw.dwd_queue_error AS a3
RIGHT JOIN (
    -- One row per error reading (device, error timestamp, lane) holding the
    -- averaged historical metrics.
    SELECT
        device_no,
        start_time,
        lane_no,
        round(avg(queue_count), 0) AS avg_queue_count,
        round(avg(queue_len), 2)   AS avg_queue_len,
        round(avg(queue_head), 2)  AS avg_queue_head,
        round(avg(queue_tail), 2)  AS avg_queue_tail
    FROM (
        -- Readings recorded exactly 7 days before each error row.
        SELECT
            hist.device_no,
            err.create_time AS start_time,
            err.create_time_7 AS create_time,
            hist.lane_no,
            hist.queue_count,
            hist.queue_len,
            hist.queue_head,
            hist.queue_tail
        FROM hurys_jw.dwd_queue AS hist
        INNER JOIN (
            SELECT
                device_no,
                lane_no,
                create_time,
                (create_time - INTERVAL 7 DAY)  AS create_time_7,
                (create_time - INTERVAL 14 DAY) AS create_time_14,
                (create_time - INTERVAL 21 DAY) AS create_time_21
            FROM hurys_jw.dwd_queue_error
        ) AS err
            ON  err.device_no = hist.device_no
            AND err.lane_no = hist.lane_no
            AND err.create_time_7 = hist.create_time
        WHERE hist.device_no IS NOT NULL

        UNION ALL

        -- Readings recorded exactly 14 days before each error row.
        SELECT
            hist.device_no,
            err.create_time AS start_time,
            err.create_time_14 AS create_time,
            hist.lane_no,
            hist.queue_count,
            hist.queue_len,
            hist.queue_head,
            hist.queue_tail
        FROM hurys_jw.dwd_queue AS hist
        INNER JOIN (
            SELECT
                device_no,
                lane_no,
                create_time,
                (create_time - INTERVAL 7 DAY)  AS create_time_7,
                (create_time - INTERVAL 14 DAY) AS create_time_14,
                (create_time - INTERVAL 21 DAY) AS create_time_21
            FROM hurys_jw.dwd_queue_error
        ) AS err
            ON  err.device_no = hist.device_no
            AND err.lane_no = hist.lane_no
            AND err.create_time_14 = hist.create_time
        WHERE hist.device_no IS NOT NULL

        UNION ALL

        -- Readings recorded exactly 21 days before each error row.
        SELECT
            hist.device_no,
            err.create_time AS start_time,
            err.create_time_21 AS create_time,
            hist.lane_no,
            hist.queue_count,
            hist.queue_len,
            hist.queue_head,
            hist.queue_tail
        FROM hurys_jw.dwd_queue AS hist
        INNER JOIN (
            SELECT
                device_no,
                lane_no,
                create_time,
                (create_time - INTERVAL 7 DAY)  AS create_time_7,
                (create_time - INTERVAL 14 DAY) AS create_time_14,
                (create_time - INTERVAL 21 DAY) AS create_time_21
            FROM hurys_jw.dwd_queue_error
        ) AS err
            ON  err.device_no = hist.device_no
            AND err.lane_no = hist.lane_no
            AND err.create_time_21 = hist.create_time
        WHERE hist.device_no IS NOT NULL
    )
    WHERE lane_no IS NOT NULL
    GROUP BY
        device_no,
        start_time,
        lane_no
) AS a2
    ON  a3.device_no = a2.device_no
    AND a3.create_time = a2.start_time
    AND a3.lane_no = a2.lane_no
WHERE a3.day >= ?
;

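注意:Hive中原有SQL语句和ClickHouse现有SQL语句有很大不同:ClickHouse中用 create_time - interval N day 计算前三周的同期时间(Hive中用 concat + date_sub + substr 拼接),内层用 inner join 代替 right join,修复值需要 cast 为 Decimal(10,2),day 字段需要 cast 为 String,并且日期条件由固定日期改为参数化的 a3.day >= ?。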

四、Kettle任务

方框标记是修复数据,高频率执行

下面其他几个是修复记录任务

4.1 newtime

4.2 替换NULL值

4.3 clickhouse输入

-- Query for the ClickHouse input step: returns repaired static queue rows.
-- A metric outside its accepted range (queue_count 0 to 100, queue_len /
-- queue_head / queue_tail 0 to 500) is replaced by the average of the readings
-- recorded 7, 14 and 21 days earlier for the same device and lane.
-- The single positional placeholder at the end is the inclusive lower bound
-- for a3.day.
-- NOTE(review): the WHERE filter on a3.day runs after the RIGHT JOIN. Whether
-- unmatched a2 rows can survive it depends on how missing a3 columns are
-- filled (join_use_nulls setting) - confirm.
SELECT
    a3.id,
    a3.device_no,
    a3.source_device_type,
    a3.sn,
    a3.model,
    a3.create_time,
    a3.lane_no,
    a3.lane_type,
    CASE
        WHEN a3.queue_count BETWEEN 0 AND 100 THEN a3.queue_count
        ELSE a2.avg_queue_count
    END AS queue_count,
    -- The averaged replacements are cast to Decimal(10,2) in the ELSE branch.
    CASE
        WHEN a3.queue_len BETWEEN 0 AND 500 THEN a3.queue_len
        ELSE cast(a2.avg_queue_len as Decimal(10,2))
    END AS queue_len,
    CASE
        WHEN a3.queue_head BETWEEN 0 AND 500 THEN a3.queue_head
        ELSE cast(a2.avg_queue_head as Decimal(10,2))
    END AS queue_head,
    CASE
        WHEN a3.queue_tail BETWEEN 0 AND 500 THEN a3.queue_tail
        ELSE cast(a2.avg_queue_tail as Decimal(10,2))
    END AS queue_tail,
    cast(a3.day as String) AS day
FROM hurys_jw.dwd_queue_error AS a3
RIGHT JOIN (
    -- One row per error reading (device, error timestamp, lane) holding the
    -- averaged historical metrics.
    SELECT
        device_no,
        start_time,
        lane_no,
        round(avg(queue_count), 0) AS avg_queue_count,
        round(avg(queue_len), 2)   AS avg_queue_len,
        round(avg(queue_head), 2)  AS avg_queue_head,
        round(avg(queue_tail), 2)  AS avg_queue_tail
    FROM (
        -- Readings recorded exactly 7 days before each error row.
        SELECT
            hist.device_no,
            err.create_time AS start_time,
            err.create_time_7 AS create_time,
            hist.lane_no,
            hist.queue_count,
            hist.queue_len,
            hist.queue_head,
            hist.queue_tail
        FROM hurys_jw.dwd_queue AS hist
        INNER JOIN (
            SELECT
                device_no,
                lane_no,
                create_time,
                (create_time - INTERVAL 7 DAY)  AS create_time_7,
                (create_time - INTERVAL 14 DAY) AS create_time_14,
                (create_time - INTERVAL 21 DAY) AS create_time_21
            FROM hurys_jw.dwd_queue_error
        ) AS err
            ON  err.device_no = hist.device_no
            AND err.lane_no = hist.lane_no
            AND err.create_time_7 = hist.create_time
        WHERE hist.device_no IS NOT NULL

        UNION ALL

        -- Readings recorded exactly 14 days before each error row.
        SELECT
            hist.device_no,
            err.create_time AS start_time,
            err.create_time_14 AS create_time,
            hist.lane_no,
            hist.queue_count,
            hist.queue_len,
            hist.queue_head,
            hist.queue_tail
        FROM hurys_jw.dwd_queue AS hist
        INNER JOIN (
            SELECT
                device_no,
                lane_no,
                create_time,
                (create_time - INTERVAL 7 DAY)  AS create_time_7,
                (create_time - INTERVAL 14 DAY) AS create_time_14,
                (create_time - INTERVAL 21 DAY) AS create_time_21
            FROM hurys_jw.dwd_queue_error
        ) AS err
            ON  err.device_no = hist.device_no
            AND err.lane_no = hist.lane_no
            AND err.create_time_14 = hist.create_time
        WHERE hist.device_no IS NOT NULL

        UNION ALL

        -- Readings recorded exactly 21 days before each error row.
        SELECT
            hist.device_no,
            err.create_time AS start_time,
            err.create_time_21 AS create_time,
            hist.lane_no,
            hist.queue_count,
            hist.queue_len,
            hist.queue_head,
            hist.queue_tail
        FROM hurys_jw.dwd_queue AS hist
        INNER JOIN (
            SELECT
                device_no,
                lane_no,
                create_time,
                (create_time - INTERVAL 7 DAY)  AS create_time_7,
                (create_time - INTERVAL 14 DAY) AS create_time_14,
                (create_time - INTERVAL 21 DAY) AS create_time_21
            FROM hurys_jw.dwd_queue_error
        ) AS err
            ON  err.device_no = hist.device_no
            AND err.lane_no = hist.lane_no
            AND err.create_time_21 = hist.create_time
        WHERE hist.device_no IS NOT NULL
    )
    WHERE lane_no IS NOT NULL
    GROUP BY
        device_no,
        start_time,
        lane_no
) AS a2
    ON  a3.device_no = a2.device_no
    AND a3.create_time = a2.start_time
    AND a3.lane_no = a2.lane_no
WHERE a3.day >= ?
;

4.4 字段选择

4.5 clickhouse输出

4.6 执行SQL脚本

由于是对每一天的错误数据进行修复,因此每次修复任务执行完成后,需要删除该分区的错误数据;因为每次执行清洗任务之后,都会先重新执行错误数据任务!

4.7 执行任务

4.8 海豚调度

注意:该任务需要排在"DWD层静态排队数据清洗"和"DWD层静态排队错误数据"这两个任务之后执行。

相关推荐
Jiutwo2 天前
ClickHouse 入门
数据库·clickhouse
飞奔的屎壳郎4 天前
DM适配连接kettle迁移工具(资源库+数据源配置)
数据库·etl·kettle·dm
neeef_se5 天前
clickhouse-介绍、安装、数据类型、sql
数据库·sql·clickhouse
zhangjin12228 天前
Apache Hop从入门到精通 第二课 Apache Hop 核心概念/术语
kettle·apache hop·apache hop 核心概念·apache hop 术语
昊昊该干饭了8 天前
数仓建模(五)选择数仓技术栈:Hive & ClickHouse & 其它
hive·hadoop·clickhouse
大饼酥9 天前
ClickHouse大数据准实时更新
clickhouse
柚几哥哥9 天前
从 MySQL 到 ClickHouse 的迁移与优化——支持上亿级数据量的复杂检索
数据库·mysql·clickhouse
程序员老石9 天前
ClickHouse-CPU、内存参数设置
android·java·clickhouse
程序员老石11 天前
Clickhouse基础(一)
java·开发语言·clickhouse
柚几哥哥12 天前
SpringBoot多数据源架构实现
spring boot·后端·clickhouse·架构·数据库架构