二百七十四、Kettle——ClickHouse中对错误数据表进行数据修复(实时)

一、目的

在完成数据清洗并筛选出错误数据之后,需要根据修复规则对错误数据进行修复。

二、Hive中原有代码

复制代码
-- Repairs out-of-range queue metrics from hurys_db.dwd_queue_error and loads the
-- repaired rows into hurys_db.dwd_queue using a dynamic partition on day.
-- A metric outside its accepted range (queue_count 0..100, the others 0..500) is
-- replaced by the rounded average of the readings found in dwd_queue for the same
-- device and lane 7, 14 and 21 days before the error row.
-- The look-back timestamp is built as the date N days earlier concatenated with
-- characters 11-20 of create_time (presumably the time-of-day part - confirm format).
insert into table hurys_db.dwd_queue partition (day)
select
    err.id,
    err.device_no,
    err.source_device_type,
    err.sn,
    err.model,
    err.create_time,
    err.lane_no,
    err.lane_type,
    case
        when err.queue_count between 0 and 100 then err.queue_count
        else hist_avg.avg_queue_count
    end as queue_count,
    case
        when err.queue_len between 0 and 500 then err.queue_len
        else hist_avg.avg_queue_len
    end as queue_len,
    case
        when err.queue_head between 0 and 500 then err.queue_head
        else hist_avg.avg_queue_head
    end as queue_head,
    case
        when err.queue_tail between 0 and 500 then err.queue_tail
        else hist_avg.avg_queue_tail
    end as queue_tail,
    err.day
from hurys_db.dwd_queue_error as err
-- Only error rows that have at least one historical reading are kept.
inner join (
    -- One row per error key (device, error time, lane) with the averaged history.
    select
        hist.device_no,
        hist.create_time,
        hist.lane_no,
        round(avg(hist.queue_count), 0) as avg_queue_count,
        round(avg(hist.queue_len), 2)   as avg_queue_len,
        round(avg(hist.queue_head), 2)  as avg_queue_head,
        round(avg(hist.queue_tail), 2)  as avg_queue_tail
    from (
        -- Readings taken 7 days before each error row.
        select
            q.device_no,
            e.create_time,
            q.lane_no,
            q.queue_count,
            q.queue_len,
            q.queue_head,
            q.queue_tail
        from hurys_db.dwd_queue as q
        inner join hurys_db.dwd_queue_error as e
            on  e.device_no = q.device_no
            and e.lane_no = q.lane_no
            and concat(date_sub(e.create_time, 7), substr(e.create_time, 11, 10)) = q.create_time

        union all

        -- Readings taken 14 days before each error row.
        select
            q.device_no,
            e.create_time,
            q.lane_no,
            q.queue_count,
            q.queue_len,
            q.queue_head,
            q.queue_tail
        from hurys_db.dwd_queue as q
        inner join hurys_db.dwd_queue_error as e
            on  e.device_no = q.device_no
            and e.lane_no = q.lane_no
            and concat(date_sub(e.create_time, 14), substr(e.create_time, 11, 10)) = q.create_time

        union all

        -- Readings taken 21 days before each error row.
        select
            q.device_no,
            e.create_time,
            q.lane_no,
            q.queue_count,
            q.queue_len,
            q.queue_head,
            q.queue_tail
        from hurys_db.dwd_queue as q
        inner join hurys_db.dwd_queue_error as e
            on  e.device_no = q.device_no
            and e.lane_no = q.lane_no
            and concat(date_sub(e.create_time, 21), substr(e.create_time, 11, 10)) = q.create_time
    ) as hist
    group by
        hist.device_no,
        hist.create_time,
        hist.lane_no
) as hist_avg
    on  err.device_no = hist_avg.device_no
    and err.create_time = hist_avg.create_time
    and err.lane_no = hist_avg.lane_no
where err.day = '2024-09-04'
;

三、ClickHouse中现有代码

复制代码
-- 43. Repair of static queue fields
-- Repair strategy: a metric outside its accepted range (queue_count 0..100, the
-- others 0..500) is replaced by the rounded average of the readings in
-- hurys_jw.dwd_queue for the same device and lane exactly 7, 14 and 21 days
-- before the error row.
-- The query takes one positional parameter, the earliest day to repair, which is
-- bound in the final where clause.
select
    a3.id,
    a3.device_no,
    a3.source_device_type,
    a3.sn,
    a3.model,
    a3.create_time,
    a3.lane_no,
    a3.lane_type,
    case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
    case when a3.queue_len   between 0 and 500 then a3.queue_len   else cast(a2.avg_queue_len  as Decimal(10,2)) end as queue_len,
    case when a3.queue_head  between 0 and 500 then a3.queue_head  else cast(a2.avg_queue_head as Decimal(10,2)) end as queue_head,
    case when a3.queue_tail  between 0 and 500 then a3.queue_tail  else cast(a2.avg_queue_tail as Decimal(10,2)) end as queue_tail,
    cast(a3.day as String) as day
from hurys_jw.dwd_queue_error as a3
-- Every a2 key originates from an error row, so an inner join returns the same
-- rows as the former right join while stating the intent directly.
inner join (
    -- One row per error key (device, error time, lane) with the averaged history.
    select
        t1.device_no                  as device_no,
        t2.start_time                 as start_time,
        t1.lane_no                    as lane_no,
        round(avg(t1.queue_count), 0) as avg_queue_count,
        round(avg(t1.queue_len), 2)   as avg_queue_len,
        round(avg(t1.queue_head), 2)  as avg_queue_head,
        round(avg(t1.queue_tail), 2)  as avg_queue_tail
    from hurys_jw.dwd_queue as t1
    inner join (
        -- arrayJoin emits each error row three times, once per look-back offset,
        -- which replaces three copy-pasted union all branches with a single join.
        select
            device_no,
            lane_no,
            create_time as start_time,
            create_time - toIntervalDay(arrayJoin([7, 14, 21])) as ref_time
        from hurys_jw.dwd_queue_error
    ) as t2
        on  t2.device_no = t1.device_no
        and t2.lane_no = t1.lane_no
        and t2.ref_time = t1.create_time
    group by
        t1.device_no,
        t2.start_time,
        t1.lane_no
) as a2
    on  a3.device_no = a2.device_no
    and a3.create_time = a2.start_time
    and a3.lane_no = a2.lane_no
where a3.day >= ?
;

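注意:Hive中原有SQL语句和ClickHouse现有SQL语句有很大不同。主要区别:Hive中通过 concat(date_sub(...), substr(...)) 拼接字符串得到前N天的同期时间,ClickHouse中直接对 create_time 做 interval 日期运算;ClickHouse中需要把平均值显式转换为 Decimal(10,2);另外日期条件由写死的日期改为参数 ?(a3.day >= ?)。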

四、Kettle任务

方框标记的是修复数据任务,需要高频率执行

下面其他几个是修复记录任务

4.1 newtime

4.2 替换NULL值

4.3 clickhouse输入

-- Kettle table-input query: returns repaired rows for hurys_jw.dwd_queue_error.
-- A metric outside its accepted range (queue_count 0..100, the others 0..500) is
-- replaced by the rounded average of the readings in hurys_jw.dwd_queue for the
-- same device and lane exactly 7, 14 and 21 days before the error row.
-- The query takes one positional parameter, the earliest day to repair, which is
-- bound in the final where clause.
select
    a3.id,
    a3.device_no,
    a3.source_device_type,
    a3.sn,
    a3.model,
    a3.create_time,
    a3.lane_no,
    a3.lane_type,
    case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
    case when a3.queue_len between 0 and 500 then a3.queue_len else cast(a2.avg_queue_len as Decimal(10,2)) end as queue_len,
    case when a3.queue_head between 0 and 500 then a3.queue_head else cast(a2.avg_queue_head as Decimal(10,2)) end as queue_head,
    case when a3.queue_tail between 0 and 500 then a3.queue_tail else cast(a2.avg_queue_tail as Decimal(10,2)) end as queue_tail,
    cast(a3.day as String) as day
from hurys_jw.dwd_queue_error as a3
-- Every a2 key originates from an error row, so an inner join returns the same
-- rows as the former right join while stating the intent directly.
inner join (
    -- One row per error key (device, error time, lane) with the averaged history.
    select
        t1.device_no                  as device_no,
        t2.start_time                 as start_time,
        t1.lane_no                    as lane_no,
        round(avg(t1.queue_count), 0) as avg_queue_count,
        round(avg(t1.queue_len), 2)   as avg_queue_len,
        round(avg(t1.queue_head), 2)  as avg_queue_head,
        round(avg(t1.queue_tail), 2)  as avg_queue_tail
    from hurys_jw.dwd_queue as t1
    inner join (
        -- arrayJoin emits each error row three times, once per look-back offset,
        -- which replaces three copy-pasted union all branches with a single join.
        select
            device_no,
            lane_no,
            create_time as start_time,
            create_time - toIntervalDay(arrayJoin([7, 14, 21])) as ref_time
        from hurys_jw.dwd_queue_error
    ) as t2
        on  t2.device_no = t1.device_no
        and t2.lane_no = t1.lane_no
        and t2.ref_time = t1.create_time
    group by
        t1.device_no,
        t2.start_time,
        t1.lane_no
) as a2
    on  a3.device_no = a2.device_no
    and a3.create_time = a2.start_time
    and a3.lane_no = a2.lane_no
where a3.day >= ?
;

4.4 字段选择

4.5 clickhouse输出

4.6 执行SQL脚本

由于是按天对错误数据进行修复,因此每次修复执行完成后,需要删除该分区的错误数据;因为每次执行清洗任务之后,都会先执行错误数据任务!

4.7 执行任务

4.8 海豚调度

注意:该任务需要调度在“DWD层静态排队数据清洗”和“DWD层静态排队错误数据”两个任务之后执行

相关推荐
risc1234564 天前
【ClickHouse】RollingBitmap
clickhouse
斯特凡今天也很帅4 天前
clickhouse如何查看操作记录,从日志来查看写入是否成功
数据库·clickhouse
袖清暮雨9 天前
ClickHouse讲解
大数据·数据库·数据仓库·clickhouse·oracle
江枫渔火L10 天前
使用clickhouse的ReplacingMergeTree引擎表做活跃玩家信息表
数据库·clickhouse
潇凝子潇12 天前
Doris ClickHouse Greenplum 对比
clickhouse·doris·greenplum
递归尽头是星辰14 天前
ClickHouse核心优势分析与场景实战
大数据·数据仓库·clickhouse·实时分析·实时查询
鲁尼的小宝贝15 天前
基于Flink的数据中台管理平台
java·大数据·clickhouse·flink·yarn
问道飞鱼19 天前
【大数据知识】今天聊聊Clickhouse部署方案
大数据·clickhouse·部署
Fireworkitte22 天前
ClickHouse详解
clickhouse
知其_所以然23 天前
使用docker安装clickhouse集群
clickhouse·docker·容器