二百七十四、Kettle——ClickHouse中对错误数据表进行数据修复(实时)

一、目的

在完成数据清洗任务和错误数据任务之后,需要根据修复规则对错误数据表中的数据进行修复。

二、Hive中原有代码

复制代码
-- Repairs rows of hurys_db.dwd_queue_error and loads them into hurys_db.dwd_queue
-- (dynamic partition on day, supplied by the last select column).
--
-- Repair rule: a metric that lies inside its valid range is kept as is; otherwise
-- it is replaced by the average of the values found in hurys_db.dwd_queue for the
-- same device_no and lane_no at the timestamps 7, 14 and 21 days earlier.
--
-- Error rows with no history row at any of the three offsets are not inserted:
-- every join below is an inner join. The earlier form used right joins followed
-- by a filter on the null-supplied side, which yields exactly the same rows.
insert into table hurys_db.dwd_queue partition (day)
select
    a3.id,
    a3.device_no,
    a3.source_device_type,
    a3.sn,
    a3.model,
    a3.create_time,
    a3.lane_no,
    a3.lane_type,
    -- valid ranges: queue_count 0..100, queue_len / queue_head / queue_tail 0..500
    case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
    case when a3.queue_len   between 0 and 500 then a3.queue_len   else a2.avg_queue_len   end as queue_len,
    case when a3.queue_head  between 0 and 500 then a3.queue_head  else a2.avg_queue_head  end as queue_head,
    case when a3.queue_tail  between 0 and 500 then a3.queue_tail  else a2.avg_queue_tail  end as queue_tail,
    a3.day
from hurys_db.dwd_queue_error as a3
inner join (
    -- one row per error record key: averages over the history rows that were found
    select
        a1.device_no,
        a1.create_time,
        a1.lane_no,
        round(avg(a1.queue_count), 0) as avg_queue_count,
        round(avg(a1.queue_len), 2)   as avg_queue_len,
        round(avg(a1.queue_head), 2)  as avg_queue_head,
        round(avg(a1.queue_tail), 2)  as avg_queue_tail
    from (
        -- History lookup, one branch per look-back offset. create_time here is the
        -- timestamp of the error record, not of the history row.
        -- NOTE(review): concat(date_sub(...), substr(..., 11, 10)) assumes create_time
        -- is a 'yyyy-MM-dd HH:mm:ss' style string so that the substring starting at
        -- position 11 is the time-of-day part; confirm against the table schema.
        select
            t1.device_no,
            t2.create_time,
            t1.lane_no,
            t1.queue_count,
            t1.queue_len,
            t1.queue_head,
            t1.queue_tail
        from hurys_db.dwd_queue_error as t2
        inner join hurys_db.dwd_queue as t1
            on  t2.device_no = t1.device_no
            and t2.lane_no = t1.lane_no
            and concat(date_sub(t2.create_time, 7), substr(t2.create_time, 11, 10)) = t1.create_time
        union all
        select
            t1.device_no,
            t2.create_time,
            t1.lane_no,
            t1.queue_count,
            t1.queue_len,
            t1.queue_head,
            t1.queue_tail
        from hurys_db.dwd_queue_error as t2
        inner join hurys_db.dwd_queue as t1
            on  t2.device_no = t1.device_no
            and t2.lane_no = t1.lane_no
            and concat(date_sub(t2.create_time, 14), substr(t2.create_time, 11, 10)) = t1.create_time
        union all
        select
            t1.device_no,
            t2.create_time,
            t1.lane_no,
            t1.queue_count,
            t1.queue_len,
            t1.queue_head,
            t1.queue_tail
        from hurys_db.dwd_queue_error as t2
        inner join hurys_db.dwd_queue as t1
            on  t2.device_no = t1.device_no
            and t2.lane_no = t1.lane_no
            and concat(date_sub(t2.create_time, 21), substr(t2.create_time, 11, 10)) = t1.create_time
    ) as a1
    group by a1.device_no, a1.create_time, a1.lane_no
) as a2
    on  a3.device_no = a2.device_no
    and a3.create_time = a2.create_time
    and a3.lane_no = a2.lane_no
where a3.day = '2024-09-04'
;

三、ClickHouse中现有代码

复制代码
-- 43. Static queue data: repair of out-of-range fields
-- Repair strategy: an out-of-range value is replaced by the average of the values
-- found in hurys_jw.dwd_queue for the same device_no and lane_no at the timestamps
-- 7, 14 and 21 days before the error record.
-- Error rows without any matching history row are not returned (inner joins).
-- The query takes exactly one bind placeholder: the lower bound for a3.day.
select
    a3.id,
    a3.device_no,
    a3.source_device_type,
    a3.sn,
    a3.model,
    a3.create_time,
    a3.lane_no,
    a3.lane_type,
    -- valid ranges: queue_count 0..100, queue_len / queue_head / queue_tail 0..500
    case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
    case when a3.queue_len   between 0 and 500 then a3.queue_len   else cast(a2.avg_queue_len  as Decimal(10,2)) end as queue_len,
    case when a3.queue_head  between 0 and 500 then a3.queue_head  else cast(a2.avg_queue_head as Decimal(10,2)) end as queue_head,
    case when a3.queue_tail  between 0 and 500 then a3.queue_tail  else cast(a2.avg_queue_tail as Decimal(10,2)) end as queue_tail,
    cast(a3.day as String) as day
from hurys_jw.dwd_queue_error as a3
inner join (
    -- one row per error record key: averages over the history rows that were found
    select
        device_no,
        start_time,
        lane_no,
        round(avg(queue_count), 0) as avg_queue_count,
        round(avg(queue_len), 2)   as avg_queue_len,
        round(avg(queue_head), 2)  as avg_queue_head,
        round(avg(queue_tail), 2)  as avg_queue_tail
    from (
        -- start_time is the timestamp of the error record being repaired
        select
            hist.device_no,
            err.create_time as start_time,
            hist.lane_no,
            hist.queue_count,
            hist.queue_len,
            hist.queue_head,
            hist.queue_tail
        from hurys_jw.dwd_queue as hist
        inner join (
            -- arrayJoin expands every error record into three rows, one per
            -- look-back offset, so a single join replaces three unioned joins
            select
                device_no,
                lane_no,
                create_time,
                create_time - toIntervalDay(arrayJoin([7, 14, 21])) as history_time
            from hurys_jw.dwd_queue_error
        ) as err
            on  err.device_no = hist.device_no
            and err.lane_no = hist.lane_no
            and err.history_time = hist.create_time
    )
    where lane_no is not null
    group by device_no, start_time, lane_no
) as a2
    on  a3.device_no = a2.device_no
    and a3.create_time = a2.start_time
    and a3.lane_no = a2.lane_no
where a3.day >= ?
;

注意:Hive中原有SQL语句和ClickHouse现有SQL语句有很大不同,主要体现在日期偏移的写法、平均值的类型转换(Decimal(10,2))、day字段转换为String,以及用 ? 参数代替固定日期。

四、Kettle任务

方框标记的是数据修复任务,需要高频率执行

下面其他几个是修复记录任务

4.1 newtime

4.2 替换NULL值

4.3 clickhouse输入

-- Kettle ClickHouse input: repaired rows of hurys_jw.dwd_queue_error.
-- An out-of-range value is replaced by the average of the values found in
-- hurys_jw.dwd_queue for the same device_no and lane_no at the timestamps
-- 7, 14 and 21 days before the error record.
-- Error rows without any matching history row are not returned (inner joins).
-- The query takes exactly one bind placeholder: the lower bound for a3.day.
select
    a3.id,
    a3.device_no,
    a3.source_device_type,
    a3.sn,
    a3.model,
    a3.create_time,
    a3.lane_no,
    a3.lane_type,
    -- valid ranges: queue_count 0..100, queue_len / queue_head / queue_tail 0..500
    case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
    case when a3.queue_len between 0 and 500 then a3.queue_len else cast(a2.avg_queue_len as Decimal(10,2)) end as queue_len,
    case when a3.queue_head between 0 and 500 then a3.queue_head else cast(a2.avg_queue_head as Decimal(10,2)) end as queue_head,
    case when a3.queue_tail between 0 and 500 then a3.queue_tail else cast(a2.avg_queue_tail as Decimal(10,2)) end as queue_tail,
    cast(a3.day as String) as day
from hurys_jw.dwd_queue_error as a3
inner join (
    -- one row per error record key: averages over the history rows that were found
    select
        device_no,
        start_time,
        lane_no,
        round(avg(queue_count), 0) as avg_queue_count,
        round(avg(queue_len), 2) as avg_queue_len,
        round(avg(queue_head), 2) as avg_queue_head,
        round(avg(queue_tail), 2) as avg_queue_tail
    from (
        -- start_time is the timestamp of the error record being repaired
        select
            hist.device_no,
            err.create_time as start_time,
            hist.lane_no,
            hist.queue_count,
            hist.queue_len,
            hist.queue_head,
            hist.queue_tail
        from hurys_jw.dwd_queue as hist
        inner join (
            -- arrayJoin expands every error record into three rows, one per
            -- look-back offset, so a single join replaces three unioned joins
            select
                device_no,
                lane_no,
                create_time,
                create_time - toIntervalDay(arrayJoin([7, 14, 21])) as history_time
            from hurys_jw.dwd_queue_error
        ) as err
            on  err.device_no = hist.device_no
            and err.lane_no = hist.lane_no
            and err.history_time = hist.create_time
    )
    where lane_no is not null
    group by device_no, start_time, lane_no
) as a2
    on  a3.device_no = a2.device_no
    and a3.create_time = a2.start_time
    and a3.lane_no = a2.lane_no
where a3.day >= ?
;

4.4 字段选择

4.5 clickhouse输出

4.6 执行SQL脚本

由于是按天(分区)对错误数据进行修复,因此每次修复执行完成后,需要先删除该分区的错误数据;因为每次执行清洗之后,都会先重新执行错误数据任务。

4.7 执行任务

4.8 海豚调度

注意:该任务需要排在"DWD层静态排队数据清洗"和"DWD层静态排队错误数据"两个任务之后执行

相关推荐
吹老师个人app编程教学13 小时前
clickhouse-题库
clickhouse
fusugongzi13 小时前
clickhouse一直重启,日志提示structure needs cleaning
clickhouse
arnold6615 小时前
深入探索 ClickHouse:性能优化之道
clickhouse·性能优化
孤独天狼2 天前
Clickhouse(Centos)
clickhouse
Favor_Yang2 天前
C# 连接ClickHouse 数据库
数据库·clickhouse·c#
fusugongzi3 天前
clickhouse优化记录
clickhouse
吹老师个人app编程教学3 天前
clickhouse-副本和分片
java·服务器·clickhouse
天地风雷水火山泽4 天前
二百七十九、ClickHouse——用Kettle对DWD层清洗数据进行增量补全
clickhouse
天地风雷水火山泽4 天前
二百八十、ClickHouse——用Kettle对DWD层补全的清洗数据进行记录
大数据·clickhouse·kettle
wending-Y4 天前
clickhouse 查询优化思路
clickhouse