一、目的
在数据质量模块中,需要对原始数据的重复性进行统计。
Hive中原有的SQL语句与ClickHouse中现有的SQL语句有很大不同。
二、Hive中原有代码
2.1 表结构
-- 41. Duplication statistics table for the eight categories of base data
--     (event data and event resources are not subject to the duplication check).
-- One row per data type and device; the table is partitioned by the string column day.
CREATE TABLE IF NOT EXISTS hurys_db.dwd_data_duplicate (
    data_type      INT    COMMENT '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源',
    device_no      STRING COMMENT '设备编号',
    data_duplicate FLOAT  COMMENT '数据重复率'
)
COMMENT '数据重复性统计表'
PARTITIONED BY (day STRING)
STORED AS ORC
;
2.2 SQL代码
-- Duplicate rate for data_type '6', written into the dynamic partition "day".
-- For each (device_no, day) the result is:
--   rows belonging to a repeated (device_no, create_time, lane_no) combination
--   divided by all rows of that device on that day, rounded to 2 decimals.
-- Devices without any repeated combination produce no output row.
INSERT OVERWRITE TABLE hurys_db.dwd_data_duplicate PARTITION (day)
SELECT
    '6'                               AS data_type,
    device_no,
    round(sum(num) / count_num, 2)    AS data_duplicate,
    day
FROM (
    -- Step 2: keep only the key combinations that occur more than once;
    -- num is the number of rows in each such combination.
    SELECT
        device_no,
        create_time,
        lane_no,
        count(*)                      AS num,
        count_num,
        day
    FROM (
        -- Step 1: attach the per-device, per-day row total to every source row.
        SELECT
            device_no,
            create_time,
            lane_no,
            count(device_no) OVER (PARTITION BY device_no, day) AS count_num,
            day
        FROM hurys_db.ods_queue
        WHERE day = '2024-09-04'
    ) AS rows_with_total
    GROUP BY
        device_no,
        create_time,
        lane_no,
        count_num,
        day
    HAVING count(*) > 1
) AS repeated_groups
GROUP BY
    device_no,
    count_num,
    day;
三、ClickHouse中现有代码
3.1 表结构
-- 41. Duplication statistics table for the eight categories of base data (long-term storage).
-- ClickHouse version: day is a regular Date column that also serves as the
-- partition key, the primary key and the sorting key.
CREATE TABLE IF NOT EXISTS hurys_jw.dwd_data_duplicate
(
    data_type      Int32          COMMENT '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源',
    device_no      String         COMMENT '设备编号',
    data_duplicate Decimal(10, 2) COMMENT '数据重复率',
    day            Date           COMMENT '日期'
)
ENGINE = MergeTree
PARTITION BY day
PRIMARY KEY day
ORDER BY day
SETTINGS index_granularity = 8192;
3.2 SQL代码
-- Duplicate rate for data_type '6' in ClickHouse, for a single day.
-- For each (device_no, day) the result is:
--   rows belonging to a repeated (device_no, create_time, lane_no) combination
--   divided by all rows of that device on that day, rounded to 2 decimals.
-- Here day is derived from create_time instead of being read as a partition column.
SELECT
    '6'                               AS data_type,
    device_no,
    round(sum(num) / count_num, 2)    AS data_duplicate,
    day
FROM (
    -- Step 2: keep only the key combinations that occur more than once;
    -- num is the number of rows in each such combination.
    SELECT
        device_no,
        create_time,
        lane_no,
        count(*)                      AS num,
        count_num,
        day
    FROM (
        -- Step 1: derive the calendar day and attach the per-device, per-day row total.
        SELECT
            device_no,
            create_time,
            lane_no,
            count(device_no) OVER (PARTITION BY device_no, DATE(create_time)) AS count_num,
            DATE(create_time)         AS day
        FROM hurys_jw.ods_queue
        -- NOTE(review): day presumably resolves to the DATE(create_time) alias above;
        -- confirm against the ods_queue schema in case it also has a day column.
        WHERE day = '2024-10-22' -- the incremental variant filters with: WHERE day > ?
    ) AS rows_with_total
    GROUP BY
        device_no,
        create_time,
        lane_no,
        count_num,
        day
    HAVING count(*) > 1
) AS repeated_groups
GROUP BY
    device_no,
    count_num,
    day;
3.3 Kettle任务
3.3.1 newtime
3.3.2 替换NULL值
3.3.3 clickhouse输入1
-- Kettle "clickhouse input 1": incremental duplicate rate for data_type '6'.
-- The single ? placeholder is bound by the job; only days strictly greater
-- than that value are processed. day is returned as a String.
-- For each (device_no, day) the result is:
--   rows belonging to a repeated (device_no, create_time, lane_no) combination
--   divided by all rows of that device on that day, rounded to 2 decimals.
SELECT
    '6'                               AS data_type,
    device_no,
    round(sum(num) / count_num, 2)    AS data_duplicate,
    cast(day AS String)               AS day
FROM (
    -- Step 2: keep only the key combinations that occur more than once;
    -- num is the number of rows in each such combination.
    SELECT
        device_no,
        create_time,
        lane_no,
        count(*)                      AS num,
        count_num,
        day
    FROM (
        -- Step 1: derive the calendar day and attach the per-device, per-day row total.
        SELECT
            device_no,
            create_time,
            lane_no,
            count(device_no) OVER (PARTITION BY device_no, DATE(create_time)) AS count_num,
            DATE(create_time)         AS day
        FROM hurys_jw.ods_queue
        -- NOTE(review): day presumably resolves to the DATE(create_time) alias above;
        -- confirm against the ods_queue schema in case it also has a day column.
        WHERE day > ?
    ) AS rows_with_total
    GROUP BY
        device_no,
        create_time,
        lane_no,
        count_num,
        day
    HAVING count(*) > 1
) AS repeated_groups
GROUP BY
    device_no,
    count_num,
    day
;
其他clickhouse输入控件代码类似
3.3.4 字段选择
3.3.5 clickhouse输出
3.3.6 执行任务
3.3.7 海豚调度(1天1次)
ClickHouse的SQL语句与Hive真的有好多地方不一样,尤其是函数!