二百三十七、Hive——DWS层生成每个清洗字段的异常情况记录

一、目的

在Hive中对每种业务数据的清洗字段的异常数据进行记录

例如这张图,上面是原始数据,下面是每台雷达每天的异常字段的记录

二、实施步骤

(一)建表

复制代码
-- DWS-layer cleaning-record table: one row per anomalous field value
-- detected while cleaning a business-data record; partitioned by day
-- so each daily T+1 run overwrites only its own partition.
CREATE TABLE IF NOT EXISTS dws_data_clean_record_queue (
    data_type   int       COMMENT '1:转向比,2:统计,3:评价,4:区域,5:过车,6:静态排队,7:动态排队,8:轨迹,9:事件数据,10:事件资源',
    device_no   string    COMMENT '设备编号',
    create_time timestamp COMMENT '创建时间',
    field_name  string    COMMENT '字段名',
    field_value string    COMMENT '字段值'
)
COMMENT '静态排队数据清洗记录表'
PARTITIONED BY (day string)
STORED AS ORC;

(二)SQL

1、方法一:运行比较节省资源,但代码比较复杂

复制代码
-- Variant 1 (kv_pairs): record anomalous static-queue fields (data_type = 6).
--   t1: first-level JSON parse of the raw queue message.
--   t2: explode the queueList JSON array (one row per lane entry) and parse
--       each checked field; all values come back as strings from
--       get_json_object and are compared numerically via implicit cast.
--   t3: concat_ws joins the 'field:value' text of every FAILED range check
--       into kv_pairs; concat_ws skips NULL arguments, so kv_pairs is ''
--       when the row is clean.
-- Finally kv_pairs is exploded back into one (field_name, field_value) row
-- per anomaly. The six *_value helper columns of the first draft duplicated
-- the CASE logic verbatim; they and the six-way IS NOT NULL filter are
-- replaced by the equivalent kv_pairs <> '' test.
with t3 as (
select
       device_no,
       create_time,
       concat_ws(',',
                case when device_no is null then CONCAT('device_no:','null') end,
                case when lane_no < 0 or lane_no > 255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) end,
                case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING)) end,
                case when queue_head < 0 or queue_head > 500 then CONCAT('queue_head:', CAST(queue_head AS STRING)) end,
                case when queue_tail < 0 or queue_tail > 500 then CONCAT('queue_tail:', CAST(queue_tail AS STRING)) end,
                case when queue_count < 0 or queue_count > 100 then CONCAT('queue_count:', CAST(queue_count AS STRING)) end
                ) AS kv_pairs,
       day
from (select
        t1.device_no,
        substr(create_time,1,19)                        create_time,
        get_json_object(list_json,'$.laneNo')           lane_no,
        get_json_object(list_json,'$.queueLen')         queue_len,
        get_json_object(list_json,'$.queueHead')        queue_head,
        get_json_object(list_json,'$.queueTail')        queue_tail,
        get_json_object(list_json,'$.queueCount')       queue_count,
        date(t1.create_time)                            day
    from (
    select
       get_json_object(queue_json,'$.deviceNo')         device_no,
       get_json_object(queue_json,'$.createTime')       create_time,
       get_json_object(queue_json,'$.queueList')        queue_list
    from hurys_dc_ods.ods_queue
        where day='2024-05-15'
        ) as t1
-- Turn the JSON array string into one row per element: strip '[' and ']',
-- rewrite the '},{' element separators to '};{', then split on ';'.
lateral view explode(split(regexp_replace(regexp_replace(queue_list,
                                                '\\[|\\]','') ,
                        '\\}\\,\\{','\\}\\;\\{'),
                  '\\;')
          )list_queue as list_json
    )  as t2
        )
insert  overwrite  table  hurys_dc_dws.dws_data_clean_record_queue partition(day)
select
    '6' data_type,
    t3.device_no,
    create_time,
    split(pair, ':')[0] AS field_name,
    split(pair, ':')[1] AS field_value,
    day
from t3
lateral view explode(split(t3.kv_pairs , ',')) exploded_table AS pair
-- kv_pairs = '' means every check passed; keep only anomalous rows
-- (equivalent to the original chain of *_value IS NOT NULL tests).
where t3.kv_pairs <> ''
;

注意

(1)t1是解析JSON一级原始数据

(2)t2是解析JSON二级原始数据,得到所有的字段

(3)t3是利用case when和CONCAT,对每个检测字段的字段名和异常值进行拼接。最重要的是,利用concat_ws生成各种检测字段的键值对kv_pairs

(4)最后,则是利用lateral view explode(split)对键值对进行炸开,然后切分每个字段,形成field_name和field_value

(5)另外,最后用where指定条件,要求键值对里面的字段至少有一个非空,即该行确实存在异常字段

2、运行比较耗资源、但代码简单

复制代码
-- Variant 2 ("simple code, heavier to run"): one UNION ALL branch per
-- checked field. Each branch keeps only rows where that field fails its
-- range check and emits the offending value directly — under the branch's
-- WHERE, the original CASE expression always reduces to the bare column.
WITH t2 AS (
    SELECT
        t1.device_no,
        substr(create_time, 1, 19)                  AS create_time,
        get_json_object(list_json, '$.laneNo')      AS lane_no,
        get_json_object(list_json, '$.queueLen')    AS queue_len,
        get_json_object(list_json, '$.queueHead')   AS queue_head,
        get_json_object(list_json, '$.queueTail')   AS queue_tail,
        get_json_object(list_json, '$.queueCount')  AS queue_count,
        date(t1.create_time)                        AS day
    FROM (
        SELECT
            get_json_object(queue_json, '$.deviceNo')   AS device_no,
            get_json_object(queue_json, '$.createTime') AS create_time,
            get_json_object(queue_json, '$.queueList')  AS queue_list
        FROM hurys_dc_ods.ods_queue
        WHERE day = '2024-05-15'
    ) AS t1
    -- split the JSON array string into one row per element
    LATERAL VIEW explode(split(regexp_replace(regexp_replace(queue_list,
                                                    '\\[|\\]', ''),
                                    '\\}\\,\\{', '\\}\\;\\{'),
                               '\\;')) list_queue AS list_json
)
INSERT OVERWRITE TABLE hurys_dc_dws.dws_data_clean_record_queue PARTITION(day)
SELECT '6' AS data_type, device_no, create_time,
       'device_no' AS field_name, device_no AS field_value, day
FROM t2
WHERE device_no IS NULL
UNION ALL
SELECT '6', device_no, create_time, 'lane_no', lane_no, day
FROM t2
WHERE lane_no < 0 OR lane_no > 255
UNION ALL
SELECT '6', device_no, create_time, 'queue_len', queue_len, day
FROM t2
WHERE queue_len < 0 OR queue_len > 500
UNION ALL
SELECT '6', device_no, create_time, 'queue_head', queue_head, day
FROM t2
WHERE queue_head < 0 OR queue_head > 500
UNION ALL
SELECT '6', device_no, create_time, 'queue_tail', queue_tail, day
FROM t2
WHERE queue_tail < 0 OR queue_tail > 500
UNION ALL
SELECT '6', device_no, create_time, 'queue_count', queue_count, day
FROM t2
WHERE queue_count < 0 OR queue_count > 100
;

(1)特点:每个字段union判断 笨方法

(三)查看表结果

花了一天时间终于搞定,又前进了一小步!加油啊,少年

2024年5月22日续写

(四)海豚调度任务T+1插入

#! /bin/bash
# T+1 DolphinScheduler job: for yesterday's partition, record every
# out-of-range field of the static-queue data (data_type = 6) into
# hurys_dc_dws.dws_data_clean_record_queue.
# Fixes: removed a stray blog paste artifact line that sat inside the
# hive -e string (it would have broken the SQL), and dropped the unused
# nowdate variable.

source /etc/profile

# Yesterday in yyyy-MM-dd: selects the ODS source partition and, via
# date(create_time), the DWS target partition.
yesdate=`date -d yesterday +%Y-%m-%d`

# NOTE: inside this double-quoted string every regex backslash must be
# written four times so that Hive receives a doubled backslash after the
# shell strips one level of escaping.
hive -e "

use hurys_dc_dws;

set hive.vectorized.execution.enabled=false;

set hive.exec.dynamic.partition=true;

set hive.exec.dynamic.partition.mode=nonstrict;

set hive.exec.max.dynamic.partitions.pernode=1000;

set hive.exec.max.dynamic.partitions=2000;

-- t1: first-level JSON parse; t2: explode queueList into one row per lane
-- entry; t3: concat_ws joins the text of every failed range check into
-- kv_pairs (NULL checks are skipped), then kv_pairs is exploded into one
-- (field_name, field_value) row per anomaly.
with t3 as(

select
       device_no,
       case when device_no is null then CONCAT('device_no:','null')  END AS device_no_value,
       create_time,
       case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END AS lane_no_value,
       case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING))  END AS queue_len_value,
       case when queue_head < 0 or queue_head > 500 then  CONCAT('queue_head:', CAST(queue_head AS STRING))  END AS queue_head_value,
       case when queue_tail < 0 or queue_tail > 500 then  CONCAT('queue_tail:', CAST(queue_tail AS STRING))  END AS queue_tail_value,
       case when queue_count < 0 or queue_count > 100  then  CONCAT('queue_count:', CAST(queue_count AS STRING))  END AS queue_count_value,
       concat_ws(',',
                case when device_no is null then CONCAT('device_no:','null') end ,
                case when lane_no < 0 or lane_no >255 then CONCAT('lane_no:', CAST(lane_no AS STRING)) END ,
                case when queue_len < 0 or queue_len > 500 then CONCAT('queue_len:', CAST(queue_len AS STRING))  END,
                case when queue_head < 0 or queue_head > 500 then  CONCAT('queue_head:', CAST(queue_head AS STRING))  END,
                case when queue_tail < 0 or queue_tail > 500 then  CONCAT('queue_tail:', CAST(queue_tail AS STRING))  END,
                case when queue_count < 0 or queue_count > 100  then  CONCAT('queue_count:', CAST(queue_count AS STRING))  END
                ) AS kv_pairs  ,
       day
from (select
        t1.device_no,
        substr(create_time,1,19)                        create_time ,
        get_json_object(list_json,'$.laneNo')           lane_no,
        get_json_object(list_json,'$.queueLen')         queue_len,
        get_json_object(list_json,'$.queueHead')        queue_head,
        get_json_object(list_json,'$.queueTail')        queue_tail,
        get_json_object(list_json,'$.queueCount')       queue_count,
        date(t1.create_time)                            day
    from (
    select
       get_json_object(queue_json,'$.deviceNo')         device_no,
       get_json_object(queue_json,'$.createTime')       create_time,
       get_json_object(queue_json,'$.queueList')        queue_list
    from hurys_dc_ods.ods_queue
        where day= '$yesdate'
        ) as t1
lateral view explode(split(regexp_replace(regexp_replace(queue_list,
                                                '\\\\[|\\\\]','') ,
                                 '\\\\}\\\\,\\\\{','\\\\}\\\\;\\\\{'),
                   '\\\\;')
          )list_queue as list_json
    )  as t2
where day = '$yesdate'
        )
insert  overwrite  table  hurys_dc_dws.dws_data_clean_record_queue partition(day)
select
    '6' data_type,
    t3.device_no,
    create_time,
    split(pair, ':')[0] AS field_name,
    split(pair, ':')[1] AS field_value,
    day
from t3
lateral view explode(split(t3.kv_pairs , ',')) exploded_table AS pair
where device_no_value is not null or queue_len_value is not null or lane_no_value is not null
or queue_head_value is not null or queue_tail_value is not null or queue_count_value is not null
"

(五)海豚任务执行以及表数据验证

1、海豚任务执行

2、表数据验证

2024-05-21表分区的数据已经有啦!!!

相关推荐
Lx35215 分钟前
如何正确选择Hadoop数据压缩格式:Gzip vs LZO vs Snappy
大数据·hadoop
让头发掉下来2 小时前
Hive 创建事务表的方法
大数据·hive·hadoop
Q_Q19632884752 小时前
python基于Hadoop的超市数据分析系统
开发语言·hadoop·spring boot·python·django·flask·node.js
计算机毕业设计木哥3 小时前
计算机毕设大数据选题推荐 基于spark+Hadoop+python的贵州茅台股票数据分析系统【源码+文档+调试】
大数据·hadoop·python·计算机网络·spark·课程设计
W.A委员会7 小时前
SpringMVC
数据仓库·hive·hadoop·spring
王小王-12315 小时前
基于Hadoop的全国农产品批发价格数据分析与可视化与价格预测研究
大数据·hive·hadoop·flume·hadoop农产品价格分析·农产品批发价格·农产品价格预测
计算机毕业设计木哥20 小时前
基于大数据spark的医用消耗选品采集数据可视化分析系统【Hadoop、spark、python】
大数据·hadoop·python·信息可视化·spark·课程设计
Lx3521 天前
MapReduce性能调优:从理论到实践的经验总结
大数据·hadoop·后端
TG_yunshuguoji2 天前
华为云数据仓库服务核心优势指南
数据仓库·云原生·华为云·数据库服务
典学长编程2 天前
JavaWeb从入门到精通!第二天!(Servlet)
数据仓库·servlet·javaweb