一、Purpose
Record, in Hive, the abnormal values found in the cleaned fields of each type of business data.
For example, in the figure: the upper part shows the raw data, and the lower part shows the per-radar, per-day records of abnormal fields.
二、Implementation Steps
(一)Create the table
create table if not exists dws_data_clean_record_queue(
data_type int comment '1: turning ratio, 2: statistics, 3: evaluation, 4: area, 5: passing vehicle, 6: static queue, 7: dynamic queue, 8: trajectory, 9: event data, 10: event resource',
device_no string comment 'device number',
create_time timestamp comment 'creation time',
field_name string comment 'field name',
field_value string comment 'field value'
)
comment 'static queue data cleaning record table'
partitioned by (day string)
stored as orc
;
(二)SQL
1、Option 1: lighter to run, but the code is more complex
with t3 as(
select
device_no,
case when device_no is null then CONCAT('device_no:','null') END AS device_no_value,
create_time,
case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END AS lane_no_value,
case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING)) END AS queue_len_value,
case when queue_head < 0 or queue_head > 500 then CONCAT('queue_head:', CAST(queue_head AS STRING)) END AS queue_head_value,
case when queue_tail < 0 or queue_tail > 500 then CONCAT('queue_tail:', CAST(queue_tail AS STRING)) END AS queue_tail_value,
case when queue_count < 0 or queue_count > 100 then CONCAT('queue_count:', CAST(queue_count AS STRING)) END AS queue_count_value,
concat_ws(',',
case when device_no is null then CONCAT('device_no:','null') end ,
case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END ,
case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING)) END,
case when queue_head < 0 or queue_head > 500 then CONCAT('queue_head:', CAST(queue_head AS STRING)) END,
case when queue_tail < 0 or queue_tail > 500 then CONCAT('queue_tail:', CAST(queue_tail AS STRING)) END,
case when queue_count < 0 or queue_count > 100 then CONCAT('queue_count:', CAST(queue_count AS STRING)) END
) AS kv_pairs ,
day
from (select
t1.device_no,
substr(create_time,1,19) create_time ,
get_json_object(list_json,'$.laneNo') lane_no,
get_json_object(list_json,'$.queueLen') queue_len,
get_json_object(list_json,'$.queueHead') queue_head,
get_json_object(list_json,'$.queueTail') queue_tail,
get_json_object(list_json,'$.queueCount') queue_count,
date(t1.create_time) day
from (
select
get_json_object(queue_json,'$.deviceNo') device_no,
get_json_object(queue_json,'$.createTime') create_time,
get_json_object(queue_json,'$.queueList') queue_list
from hurys_dc_ods.ods_queue
where day='2024-05-15'
) as t1
lateral view explode(split(regexp_replace(regexp_replace(queue_list,
'\\[|\\]','') ,
'\\}\\,\\{','\\}\\;\\{'),
'\\;')
)list_queue as list_json
) as t2
)
insert overwrite table hurys_dc_dws.dws_data_clean_record_queue partition(day)
select
'6' data_type,
t3.device_no,
create_time,
split(pair, ':')[0] AS field_name,
split(pair, ':')[1] AS field_value,
day
from t3
lateral view explode(split(t3.kv_pairs , ',')) exploded_table AS pair
where device_no_value is not null or queue_len_value is not null or lane_no_value is not null
or queue_head_value is not null or queue_tail_value is not null or queue_count_value is not null
;
Notes
(1) t1 parses the first level of the raw JSON (deviceNo, createTime, queueList).
(2) t2 parses the second level of the JSON and extracts all of the checked fields.
(3) t3 uses CASE WHEN and CONCAT to stitch each checked field's name together with its abnormal value; most importantly, concat_ws combines all of these into a single key-value string, kv_pairs.
(4) Finally, lateral view explode(split(...)) blows the key-value string apart, and each pair is split into field_name and field_value (a minimal sketch of this pattern follows these notes).
(5) In addition, the final WHERE clause requires that at least one of the checked fields is non-null, so only rows that actually contain an abnormal value are kept.
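To make notes (3) and (4) concrete, here is a minimal, self-contained sketch of the same pattern on a made-up literal JSON string. No table is needed; the CTE names and the thresholds (a single upper bound per field) are simplified for illustration only.
with demo as (
select '[{"laneNo":1,"queueLen":600},{"laneNo":2,"queueLen":30}]' as queue_list
),
exploded as (
-- step 1: turn the JSON array into one JSON object per row (same regexp_replace + split + explode trick as t2)
select list_json
from demo
lateral view explode(split(regexp_replace(regexp_replace(queue_list,'\\[|\\]',''),'\\}\\,\\{','\\}\\;\\{'),'\\;')) l as list_json
),
flagged as (
-- step 2: keep only abnormal fields and join them into one kv_pairs string (as in t3)
select concat_ws(',',
case when get_json_object(list_json,'$.laneNo') > 255 then concat('lane_no:', get_json_object(list_json,'$.laneNo')) end,
case when get_json_object(list_json,'$.queueLen') > 500 then concat('queue_len:', get_json_object(list_json,'$.queueLen')) end
) as kv_pairs
from exploded
)
-- step 3: explode kv_pairs and split each pair into field_name and field_value
select split(pair,':')[0] as field_name,
split(pair,':')[1] as field_value
from flagged
lateral view explode(split(kv_pairs,',')) e as pair
where kv_pairs <> ''
;
-- expected output: one row, field_name = 'queue_len', field_value = '600'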
2、Option 2: heavier on resources to run, but the code is simple
with t2 as(
select
t1.device_no,
substr(create_time,1,19) create_time ,
get_json_object(list_json,'$.laneNo') lane_no,
get_json_object(list_json,'$.queueLen') queue_len,
get_json_object(list_json,'$.queueHead') queue_head,
get_json_object(list_json,'$.queueTail') queue_tail,
get_json_object(list_json,'$.queueCount') queue_count,
date(t1.create_time) day
from (
select
get_json_object(queue_json,'$.deviceNo') device_no,
get_json_object(queue_json,'$.createTime') create_time,
get_json_object(queue_json,'$.queueList') queue_list
from hurys_dc_ods.ods_queue
where day='2024-05-15'
) as t1
lateral view explode(split(regexp_replace(regexp_replace(queue_list,
'\\[|\\]','') ,
'\\}\\,\\{','\\}\\;\\{'),
'\\;')
)list_queue as list_json
)
insert overwrite table hurys_dc_dws.dws_data_clean_record_queue partition(day)
select
'6' data_type,
device_no,
create_time,
'device_no' field_name ,
case when device_no is null then device_no END AS field_value ,
day
from t2
where device_no is null
union all
select
'6' data_type,
device_no,
create_time,
'lane_no' field_name ,
case when lane_no < 0 or lane_no >255 then lane_no END AS field_value ,
day
from t2
where lane_no < 0 or lane_no >255
union all
select
'6' data_type,
device_no,
create_time,
'queue_len' field_name ,
case when queue_len < 0 or queue_len > 500 then queue_len END AS field_value ,
day
from t2
where queue_len < 0 or queue_len > 500
union all
select
'6' data_type,
device_no,
create_time,
'queue_head' field_name ,
case when queue_head < 0 or queue_head > 500 then queue_head END AS field_value ,
day
from t2
where queue_head < 0 or queue_head > 500
union all
select
'6' data_type,
device_no,
create_time,
'queue_tail' field_name ,
case when queue_tail < 0 or queue_tail > 500 then queue_tail END AS field_value ,
day
from t2
where queue_tail < 0 or queue_tail > 500
union all
select
'6' data_type,
device_no,
create_time,
'queue_count' field_name ,
case when queue_count < 0 or queue_count > 100 then queue_count END AS field_value ,
day
from t2
where queue_count < 0 or queue_count > 100
;
(1) Characteristic: one UNION ALL branch per checked field; a brute-force approach.
(三)Check the table results
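For a quick look at the records written above, a sample query can be used (this sketch assumes the same 2024-05-15 partition as the SQL above):
select data_type, device_no, create_time, field_name, field_value, day
from hurys_dc_dws.dws_data_clean_record_queue
where day = '2024-05-15'
limit 10;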
It took a whole day to finally get this working; another small step forward. Keep it up!
Continued on May 22, 2024
(四)T+1 insert with a DolphinScheduler task
#! /bin/bash
source /etc/profile
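# nowdate: today's date (yyyyMMdd, not used below); yesdate: yesterday's date (yyyy-MM-dd), used as the T+1 partition value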
nowdate=`date --date='0 days ago' "+%Y%m%d"`
yesdate=`date -d yesterday +%Y-%m-%d`
hive -e "
use hurys_dc_dws;
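-- session settings: disable vectorized execution and enable non-strict dynamic partitioning
-- so the INSERT can write the day partition derived from the data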
set hive.vectorized.execution.enabled=false;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set hive.exec.max.dynamic.partitions=2000;
with t3 as(
select
device_no,
case when device_no is null then CONCAT('device_no:','null') END AS device_no_value,
create_time,
case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END AS lane_no_value,
case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING)) END AS queue_len_value,
case when queue_head < 0 or queue_head > 500 then CONCAT('queue_head:', CAST(queue_head AS STRING)) END AS queue_head_value,
case when queue_tail < 0 or queue_tail > 500 then CONCAT('queue_tail:', CAST(queue_tail AS STRING)) END AS queue_tail_value,
case when queue_count < 0 or queue_count > 100 then CONCAT('queue_count:', CAST(queue_count AS STRING)) END AS queue_count_value,
concat_ws(',',
case when device_no is null then CONCAT('device_no:','null') end ,
case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END ,
case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING)) END,
case when queue_head < 0 or queue_head > 500 then CONCAT('queue_head:', CAST(queue_head AS STRING)) END,
case when queue_tail < 0 or queue_tail > 500 then CONCAT('queue_tail:', CAST(queue_tail AS STRING)) END,
case when queue_count < 0 or queue_count > 100 then CONCAT('queue_count:', CAST(queue_count AS STRING)) END
) AS kv_pairs ,
day
from (select
t1.device_no,
substr(create_time,1,19) create_time ,
get_json_object(list_json,'$.laneNo') lane_no,
get_json_object(list_json,'$.queueLen') queue_len,
get_json_object(list_json,'$.queueHead') queue_head,
get_json_object(list_json,'$.queueTail') queue_tail,
get_json_object(list_json,'$.queueCount') queue_count,
date(t1.create_time) day
from (
select
get_json_object(queue_json,'$.deviceNo') device_no,
get_json_object(queue_json,'$.createTime') create_time,
get_json_object(queue_json,'$.queueList') queue_list
from hurys_dc_ods.ods_queue
where day= '$yesdate'
) as t1
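-- note: the regex backslashes are doubled here (\\\\ instead of \\) because this SQL sits inside
-- a double-quoted bash string; bash collapses \\\\ to \\ before Hive parses the statement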
lateral view explode(split(regexp_replace(regexp_replace(queue_list,
'\\\\[|\\\\]','') ,
'\\\\}\\\\,\\\\{','\\\\}\\\\;\\\\{'),
'\\\\;')
)list_queue as list_json
) as t2
where day = '$yesdate'
)
insert overwrite table hurys_dc_dws.dws_data_clean_record_queue partition(day)
select
'6' data_type,
t3.device_no,
create_time,
split(pair, ':')[0] AS field_name,
split(pair, ':')[1] AS field_value,
day
from t3
lateral view explode(split(t3.kv_pairs , ',')) exploded_table AS pair
where device_no_value is not null or queue_len_value is not null or lane_no_value is not null
or queue_head_value is not null or queue_tail_value is not null or queue_count_value is not null
"
(五)DolphinScheduler task execution and table data validation
1、DolphinScheduler task execution
2、Table data validation
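As a sketch of the validation step, one option is to count the records per field in the newly written partition (the 2024-05-21 date matches the partition noted below):
select field_name, count(*) as record_cnt
from hurys_dc_dws.dws_data_clean_record_queue
where day = '2024-05-21'
group by field_name;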
The data for the 2024-05-21 partition is there!!!