How to Do Scheduled Traffic Checks with Paimon? --- Try Tag Tables

```mermaid
graph TD
    A["Source: network traffic"] --> B["Flink pre-aggregation layer"]
    B --> C["Paimon Aggregation Table"]
    C --> D["Flink streaming read"]
    D --> E["Real-time alert processing"]
    E --> F["Elasticsearch"]
    B --> G["Paimon Append Table"]
    G --> H["Batch analytics"]
    H --> F
```

Main table (the raw traffic source)

```sql
-- Create the source table
CREATE TABLE source_table (
    dt TIMESTAMP(3),
    sip STRING,
    sport INT,
    dip STRING,
    dport INT,
    protocol STRING,
    flow_size BIGINT,
    WATERMARK FOR dt AS dt - INTERVAL '5' SECOND
) WITH (
    'connector' = 'kafka',
    'topic' = 'network-flow',
    'properties.bootstrap.servers' = 'localhost:9092',
    'format' = 'json'
);
```
  
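If you want to exercise the pipeline locally before wiring up Kafka, a datagen variant of the source can stand in for the real topic. This is only a sketch: the table name, rate, and value ranges below are illustrative assumptions, not part of the original setup.

```sql
-- Hypothetical local test source using the datagen connector
-- (table name, rate, and ranges are assumptions for illustration)
CREATE TEMPORARY TABLE source_table_test (
    dt TIMESTAMP(3),
    sip STRING,
    sport INT,
    dip STRING,
    dport INT,
    protocol STRING,
    flow_size BIGINT,
    WATERMARK FOR dt AS dt - INTERVAL '5' SECOND
) WITH (
    'connector' = 'datagen',
    'rows-per-second' = '100',
    'fields.sip.length' = '12',          -- random strings, not real IPs
    'fields.dip.length' = '12',
    'fields.protocol.length' = '3',
    'fields.sport.min' = '1024',
    'fields.sport.max' = '65535',
    'fields.dport.min' = '1',
    'fields.dport.max' = '1024',
    'fields.flow_size.min' = '100',
    'fields.flow_size.max' = '1500000'
);
```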
1. Create the daily traffic aggregation tables

Create a table, aggregated by day, to store the total daily traffic:

```sql
-- Create the daily traffic summary table
CREATE TABLE daily_flow_summary (
    dt_date STRING,
    total_flow_bytes BIGINT,
    last_check_time TIMESTAMP(3),
    PRIMARY KEY (dt_date) NOT ENFORCED
) WITH (
    'merge-engine' = 'aggregation',
    'fields.total_flow_bytes.aggregate-function' = 'sum',
    'fields.last_check_time.aggregate-function' = 'last_value',
    'changelog-producer' = 'lookup'
);

-- Create the daily traffic detail table (stores the top-contributing records)
CREATE TABLE daily_flow_details (
    dt_date STRING,
    sip STRING,
    sport INT,
    dip STRING,
    dport INT,
    protocol STRING,
    flow_bytes BIGINT,
    PRIMARY KEY (dt_date, sip, sport, dip, dport, protocol) NOT ENFORCED
) WITH (
    'merge-engine' = 'aggregation',
    'fields.flow_bytes.aggregate-function' = 'sum',
    'changelog-producer' = 'lookup'
);
```
2. Data aggregation jobs

Aggregate the raw traffic data into the daily summaries:

```sql
-- Aggregate the total daily traffic
INSERT INTO daily_flow_summary
SELECT
    DATE_FORMAT(dt, 'yyyy-MM-dd') AS dt_date,
    SUM(flow_size) AS total_flow_bytes,
    CURRENT_TIMESTAMP AS last_check_time
FROM source_table
GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd');

-- Aggregate the daily detail data
INSERT INTO daily_flow_details
SELECT
    DATE_FORMAT(dt, 'yyyy-MM-dd') AS dt_date,
    sip, sport, dip, dport, protocol,
    SUM(flow_size) AS flow_bytes
FROM source_table
GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd'), sip, sport, dip, dport, protocol;
```
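Because the summary table is keyed by `dt_date`, a quick sanity check of today's running total is a plain batch query. The `SET` command below assumes you are working in the Flink SQL client.

```sql
-- Ad hoc verification of today's total, assuming the Flink SQL client
SET 'execution.runtime-mode' = 'batch';

SELECT dt_date, total_flow_bytes, last_check_time
FROM daily_flow_summary
WHERE dt_date = DATE_FORMAT(CURRENT_TIMESTAMP, 'yyyy-MM-dd');
```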
3. Alert check job (runs hourly)

Use Paimon's scheduled tag creation to implement the hourly check:

```sql
-- Create the alert configuration table
CREATE TABLE alert_config (
    config_key STRING,
    threshold_value BIGINT,
    PRIMARY KEY (config_key) NOT ENFORCED
);

-- Insert the traffic threshold (for example: 100 GB per day)
INSERT INTO alert_config VALUES ('daily_flow_threshold', 107374182400);

-- Create the alert result table
CREATE TABLE flow_alert_results (
    dt_date STRING,
    total_flow_bytes BIGINT,
    threshold_value BIGINT,
    alert_triggered BOOLEAN,
    top_contributors STRING, -- top contributing detail records, stored as JSON
    alert_time TIMESTAMP(3),
    PRIMARY KEY (dt_date, alert_time) NOT ENFORCED
) WITH (
    'connector' = 'elasticsearch-7',
    'hosts' = 'http://localhost:9200',
    'index' = 'daily-flow-alerts'
);
```
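`alert_config` is an ordinary primary-key table, so adjusting the threshold later is just another upsert on the same key: the new row replaces the old one. The 50 GB value below is only an example.

```sql
-- Example: lower the daily threshold to 50 GB (50 * 1024^3 bytes); the value is illustrative
INSERT INTO alert_config VALUES ('daily_flow_threshold', 53687091200);
```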
4. Alert check and detail extraction logic
```sql
-- Alert check job, executed every hour
INSERT INTO flow_alert_results
SELECT
    s.dt_date,
    s.total_flow_bytes,
    c.threshold_value,
    CASE WHEN s.total_flow_bytes >= c.threshold_value THEN true ELSE false END AS alert_triggered,
    CASE
        WHEN s.total_flow_bytes >= c.threshold_value THEN
            -- Take the top 10 contributing detail records and format them as JSON
            (SELECT LISTAGG(
                CONCAT('{"sip":"', sip, '","sport":', CAST(sport AS STRING),
                       ',"dip":"', dip, '","dport":', CAST(dport AS STRING),
                       ',"protocol":"', protocol, '","flow_bytes":', CAST(flow_bytes AS STRING), '}'),
                ','
            )
            FROM (
                SELECT sip, sport, dip, dport, protocol, flow_bytes,
                       ROW_NUMBER() OVER (ORDER BY flow_bytes DESC) AS rn
                FROM daily_flow_details d
                WHERE d.dt_date = s.dt_date
            ) t WHERE rn <= 10)
        ELSE NULL
    END AS top_contributors,
    CURRENT_TIMESTAMP AS alert_time
FROM daily_flow_summary s
CROSS JOIN alert_config c
WHERE c.config_key = 'daily_flow_threshold'
  AND s.dt_date = DATE_FORMAT(CURRENT_TIMESTAMP, 'yyyy-MM-dd')
  AND s.total_flow_bytes >= c.threshold_value; -- only insert an alert when the threshold is met
```
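Planner support for this correlated-subquery pattern varies between Flink versions, so it is worth verifying the top-10 extraction on its own before embedding it in the alert statement. The sketch below runs the extraction for one fixed day; the date literal is only an example.

```sql
-- Standalone test of the top-10 extraction for one (example) day
SELECT sip, sport, dip, dport, protocol, flow_bytes
FROM (
    SELECT sip, sport, dip, dport, protocol, flow_bytes,
           ROW_NUMBER() OVER (ORDER BY flow_bytes DESC) AS rn
    FROM daily_flow_details
    WHERE dt_date = '2024-01-15'
) t
WHERE rn <= 10;
```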
5. Scheduled job configuration
```sql
-- Set table properties to enable the automatic hourly check (via hourly tags)
ALTER TABLE daily_flow_summary SET (
    'tag.automatic-creation' = 'process-time',
    'tag.creation-period' = 'hourly',
    'tag.creation-delay' = '5 m'
);

-- Complete SQL for the scheduled check jobs
EXECUTE STATEMENT SET
BEGIN
    -- Continuously aggregate the data
    INSERT INTO daily_flow_summary
    SELECT
        DATE_FORMAT(dt, 'yyyy-MM-dd') AS dt_date,
        SUM(flow_size) AS total_flow_bytes,
        CURRENT_TIMESTAMP AS last_check_time
    FROM source_table
    GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd');

    INSERT INTO daily_flow_details
    SELECT
        DATE_FORMAT(dt, 'yyyy-MM-dd') AS dt_date,
        sip, sport, dip, dport, protocol,
        SUM(flow_size) AS flow_bytes
    FROM source_table
    GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd'), sip, sport, dip, dport, protocol;

    -- Alert check (streaming read of the summary table, triggered hourly)
    INSERT INTO flow_alert_results
    SELECT
        s.dt_date,
        s.total_flow_bytes,
        c.threshold_value,
        true AS alert_triggered,
        (SELECT CONCAT('[', LISTAGG(
            CONCAT('{"sip":"', sip, '","sport":', CAST(sport AS STRING),
                   ',"dip":"', dip, '","dport":', CAST(dport AS STRING),
                   ',"protocol":"', protocol, '","flow_bytes":', CAST(flow_bytes AS STRING), '}'),
            ','
        ), ']')
        FROM (
            SELECT sip, sport, dip, dport, protocol, flow_bytes,
                   ROW_NUMBER() OVER (ORDER BY flow_bytes DESC) AS rn
            FROM daily_flow_details d
            WHERE d.dt_date = s.dt_date
        ) t WHERE rn <= 10) AS top_contributors,
        CURRENT_TIMESTAMP AS alert_time
    FROM daily_flow_summary s
    CROSS JOIN alert_config c
    WHERE c.config_key = 'daily_flow_threshold'
      AND s.total_flow_bytes >= c.threshold_value;
END;
```
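Once automatic tag creation is on, every hourly tag is a frozen snapshot of `daily_flow_summary`, which is what makes an after-the-fact, hour-by-hour comparison possible. A minimal read of one tag might look like the following; the exact tag name format (`'yyyy-MM-dd HH'` here) depends on Paimon's tag period formatting, so treat the literal as an assumption and check the tags system table first.

```sql
-- List the tags that have been created so far
SELECT * FROM daily_flow_summary$tags;

-- Read the summary table as of a specific hourly tag
-- (the tag name literal below is an assumption; use a name from the query above)
SELECT dt_date, total_flow_bytes, last_check_time
FROM daily_flow_summary /*+ OPTIONS('scan.tag-name' = '2024-01-15 10') */;
```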