一、目的
在完成数据修复后,需要生成修复记录
二、Hive中原有代码
2.1 表结构
--52、数据补全以及数据修复记录表 dwd_data_correction_record
create table if not exists hurys_db.dwd_data_correction_record(
data_type int comment '数据类型 1:转向比,2:统计,3:评价,4:区域,6:静态排队,7:动态排队',
device_no string comment '设备编号',
id string comment '唯一ID',
create_time timestamp comment '创建时间',
record_type int comment '记录类型 0:补全,1:修复'
)
comment '数据补全以及数据修复记录表'
partitioned by (day string)
stored as orc
;
2.2 SQL代码
--6 静态排队数据修复记录
insert into table hurys_db.dwd_data_correction_record partition(day)
select
'6' data_type,
t1.device_no,
t1.id,
t1.create_time,
'1' record_type,
t1.day
from hurys_db.dwd_queue_error as t1
right join hurys_db.dwd_queue as t2
on t1.id=t2.id and t1.device_no=t2.device_no
where t1.id is not null and t1.day='2024-09-04'
;
三、ClickHouse中现有代码
3.1 表结构
--52、数据补全以及数据修复记录表 dwd_data_correction_record
create table if not exists hurys_jw.dwd_data_correction_record(
data_type Int32 comment '数据类型 1:转向比,2:统计,3:评价,4:区域,6:静态排队,7:动态排队',
device_no String comment '设备编号',
id String comment '唯一ID',
create_time DateTime comment '创建时间',
record_type Int32 comment '记录类型 0:补全,1:修复',
day Date comment '日期'
)
ENGINE = MergeTree
PARTITION BY day
PRIMARY KEY (day,id)
ORDER BY (day,id)
SETTINGS index_granularity = 8192;
3.2 SQL代码
--6 静态排队数据修复记录
select
'6' data_type,
t1.device_no,
t1.id,
t1.create_time,
'1' record_type,
cast(t1.day as String) day
from hurys_jw.dwd_queue_error as t1
inner join hurys_jw.dwd_queue as t2
on t1.id=t2.id and t1.device_no=t2.device_no
where t1.id is not null --and t1.create_time > ?
group by t1.device_no, t1.id, t1.create_time, t1.day
;
四、Kettle任务
由于修复记录必须是数据完成修复后执行,但是又不能每天执行一次,因为数据修复任务最后会删除错误数据表当天分区数据
4.1 newtime 2
4.2 替换NULL值 2
4.3 clickhouse输入 2
select
'6' data_type,
t1.device_no,
t1.create_time,
'1' record_type,
cast(t1.day as String) day
from hurys_jw.dwd_queue_error as t1
inner join hurys_jw.dwd_queue as t2
on t1.id=t2.id and t1.device_no=t2.device_no
where t1.id is not null and t1.create_time > ?
group by t1.device_no, t1.id, t1.create_time, t1.day
;
4.4 字段选择 2
4.5 clickhouse输出 2
4.6 执行任务
修复记录和数据修复任务放在一个kettle任务里