How to Use Paimon for Scheduled Traffic Checks? --- Try Tag Tables

```mermaid
graph TD
    A["Source: network traffic"] --> B["Flink pre-aggregation layer"]
    B --> C["Paimon Aggregation Table"]
    C --> D["Flink streaming read"]
    D --> E["Real-time alert processing"]
    E --> F["Elasticsearch"]
    B --> G["Paimon Append Table"]
    G --> H["Batch analysis"]
    H --> F
```
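The DDL below is Flink SQL and assumes a Paimon catalog has been created and set as the current catalog; the catalog name and warehouse path in the sketch are placeholders, and connector-backed tables (the Kafka source and the Elasticsearch sink) would then need to be registered as temporary tables or live in another catalog.

```sql
-- Minimal sketch: create and switch to a Paimon catalog (warehouse path is an assumption)
CREATE CATALOG paimon_catalog WITH (
    'type' = 'paimon',
    'warehouse' = 'file:///tmp/paimon'
);
USE CATALOG paimon_catalog;
```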

Main table

```sql
-- Create the source table
CREATE TABLE source_table (  
    dt TIMESTAMP(3),  
    sip STRING,  
    sport INT,  
    dip STRING,  
    dport INT,  
    protocol STRING,  
    flow_size BIGINT,  
    WATERMARK FOR dt AS dt - INTERVAL '5' SECOND  
) WITH (  
    'connector' = 'kafka',  
    'topic' = 'network-flow',  
    'properties.bootstrap.servers' = 'localhost:9092',  
    'format' = 'json'  
);
```
1. Create the daily traffic aggregation tables

Create a table aggregated by day to store the total daily traffic:

```sql
-- Create the daily traffic summary table
CREATE TABLE daily_flow_summary (  
    dt_date STRING,  
    total_flow_bytes BIGINT,  
    last_check_time TIMESTAMP(3),  
    PRIMARY KEY (dt_date) NOT ENFORCED  
) WITH (  
    'merge-engine' = 'aggregation',  
    'fields.total_flow_bytes.aggregate-function' = 'sum',  
    'fields.last_check_time.aggregate-function' = 'last_value',  
    'changelog-producer' = 'lookup'  
);  
  
-- Create the daily traffic detail table (stores the records that contribute the most traffic)
CREATE TABLE daily_flow_details (  
    dt_date STRING,  
    sip STRING,  
    sport INT,  
    dip STRING,  
    dport INT,  
    protocol STRING,  
    flow_bytes BIGINT,  
    PRIMARY KEY (dt_date, sip, sport, dip, dport, protocol) NOT ENFORCED  
) WITH (  
    'merge-engine' = 'aggregation',  
    'fields.flow_bytes.aggregate-function' = 'sum',  
    'changelog-producer' = 'lookup'  
);
```
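To make the aggregation merge engine concrete: rows written with the same primary key are merged column by column using the configured aggregate functions, so repeated partial writes for the same day accumulate instead of overwriting each other. A minimal sketch with invented values:

```sql
-- Two writes for the same day (invented values)
INSERT INTO daily_flow_summary VALUES ('2024-01-15', 100, TIMESTAMP '2024-01-15 01:00:00');
INSERT INTO daily_flow_summary VALUES ('2024-01-15', 200, TIMESTAMP '2024-01-15 02:00:00');
-- A subsequent read should show total_flow_bytes = 300 (sum) and last_check_time = 02:00 (last_value)
SELECT * FROM daily_flow_summary WHERE dt_date = '2024-01-15';
```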
2. Data aggregation jobs

Aggregate the raw traffic data into daily summaries:

```sql
-- Aggregate the total daily traffic
INSERT INTO daily_flow_summary  
SELECT   
    DATE_FORMAT(dt, 'yyyy-MM-dd') as dt_date,  
    SUM(flow_size) as total_flow_bytes,  
    CURRENT_TIMESTAMP as last_check_time  
FROM source_table  
GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd');  
  
-- Aggregate the daily detail data
INSERT INTO daily_flow_details  
SELECT   
    DATE_FORMAT(dt, 'yyyy-MM-dd') as dt_date,  
    sip, sport, dip, dport, protocol,  
    SUM(flow_size) as flow_bytes  
FROM source_table  
GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd'), sip, sport, dip, dport, protocol;
```
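Both INSERT statements above run as continuous streaming jobs. Paimon sinks only commit data when a checkpoint completes, so checkpointing must be enabled in the session; the interval below is just an example value:

```sql
-- Enable checkpointing so the Paimon writers commit periodically (interval is an example)
SET 'execution.checkpointing.interval' = '1 min';
```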
3. Alert check job (runs hourly)

Use Paimon's scheduled tag feature to implement the hourly check (the tag options themselves are configured in step 5):

```sql
-- Create the alert configuration table
CREATE TABLE alert_config (  
    config_key STRING,  
    threshold_value BIGINT,  
    PRIMARY KEY (config_key) NOT ENFORCED  
);  
  
-- Insert the traffic threshold (e.g. 100 GB per day: 100 * 1024^3 = 107374182400 bytes)
INSERT INTO alert_config VALUES ('daily_flow_threshold', 107374182400);  
  
-- Create the alert results table (Elasticsearch sink)
CREATE TABLE flow_alert_results (  
    dt_date STRING,  
    total_flow_bytes BIGINT,  
    threshold_value BIGINT,  
    alert_triggered BOOLEAN,  
    top_contributors STRING, -- top-contributing detail records stored as JSON
    alert_time TIMESTAMP(3),  
    PRIMARY KEY (dt_date, alert_time) NOT ENFORCED  
) WITH (  
    'connector' = 'elasticsearch-7',  
    'hosts' = 'http://localhost:9200',  
    'index' = 'daily-flow-alerts'  
);
```
4. Alert check and detail extraction logic
```sql
-- Alert check job, executed every hour
INSERT INTO flow_alert_results  
SELECT   
    s.dt_date,  
    s.total_flow_bytes,  
    c.threshold_value,  
    CASE WHEN s.total_flow_bytes >= c.threshold_value THEN true ELSE false END as alert_triggered,  
    CASE   
        WHEN s.total_flow_bytes >= c.threshold_value THEN  
            -- Take the top 10 contributing detail records and format them as a JSON array
            (SELECT CONCAT('[', LISTAGG(
                CONCAT('{"sip":"', sip, '","sport":', CAST(sport AS STRING), ',"dip":"', dip, '","dport":', CAST(dport AS STRING), ',"protocol":"', protocol, '","flow_bytes":', CAST(flow_bytes AS STRING), '}'),
                ','
            ), ']')
            FROM (
                SELECT sip, sport, dip, dport, protocol, flow_bytes,
                       ROW_NUMBER() OVER (ORDER BY flow_bytes DESC) as rn
                FROM daily_flow_details d
                WHERE d.dt_date = s.dt_date
            ) ranked WHERE rn <= 10)
        ELSE NULL   
    END as top_contributors,  
    CURRENT_TIMESTAMP as alert_time  
FROM daily_flow_summary s  
CROSS JOIN alert_config c  
WHERE c.config_key = 'daily_flow_threshold'  
  AND s.dt_date = DATE_FORMAT(CURRENT_TIMESTAMP, 'yyyy-MM-dd')  
  AND s.total_flow_bytes >= c.threshold_value; -- insert an alert only when the threshold is exceeded
```
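To actually run this check once per hour rather than as a long-running streaming job, one option is to submit the INSERT above as a short-lived batch job from an external scheduler (cron, Airflow, etc.); only the session setting is sketched here, and the scheduling itself is assumed to happen outside Flink:

```sql
-- Run the alert check as a batch job, e.g. invoked hourly by an external scheduler
SET 'execution.runtime-mode' = 'batch';
-- ...then submit the INSERT INTO flow_alert_results statement shown above
```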
5. Scheduled job configuration
```sql
-- Set table properties so that an hourly tag is created automatically
ALTER TABLE daily_flow_summary SET (  
    'tag.automatic-creation' = 'process-time',  
    'tag.creation-period' = 'hourly',  
    'tag.creation-delay' = '5 m'  
);  
  
-- Complete SQL for the scheduled check job
EXECUTE STATEMENT SET  
BEGIN  
    -- Continuously aggregate the data
    INSERT INTO daily_flow_summary  
    SELECT   
        DATE_FORMAT(dt, 'yyyy-MM-dd') as dt_date,  
        SUM(flow_size) as total_flow_bytes,  
        CURRENT_TIMESTAMP as last_check_time  
    FROM source_table  
    GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd');  
      
    INSERT INTO daily_flow_details    
    SELECT   
        DATE_FORMAT(dt, 'yyyy-MM-dd') as dt_date,  
        sip, sport, dip, dport, protocol,  
        SUM(flow_size) as flow_bytes  
    FROM source_table  
    GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd'), sip, sport, dip, dport, protocol;  
      
    -- Alert check (based on streaming reads, triggered hourly)
    INSERT INTO flow_alert_results  
    SELECT   
        s.dt_date,  
        s.total_flow_bytes,  
        c.threshold_value,  
        true as alert_triggered,  
        (SELECT CONCAT('[', LISTAGG(
            CONCAT('{"sip":"', sip, '","sport":', CAST(sport AS STRING), ',"dip":"', dip, '","dport":', CAST(dport AS STRING), ',"protocol":"', protocol, '","flow_bytes":', CAST(flow_bytes AS STRING), '}'),
            ','
        ), ']')
        FROM (
            SELECT sip, sport, dip, dport, protocol, flow_bytes,
                   ROW_NUMBER() OVER (ORDER BY flow_bytes DESC) as rn
            FROM daily_flow_details d
            WHERE d.dt_date = s.dt_date
        ) ranked WHERE rn <= 10) as top_contributors,
        CURRENT_TIMESTAMP as alert_time  
    FROM daily_flow_summary s  
    CROSS JOIN alert_config c  
    WHERE c.config_key = 'daily_flow_threshold'  
      AND s.total_flow_bytes >= c.threshold_value;  
END;
```
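With 'tag.automatic-creation' enabled as above, Paimon pins an hourly snapshot of daily_flow_summary as a tag, so each check can read a fixed, reproducible view of the data instead of whatever the streaming writers committed last. A sketch of time-traveling to such a tag (the tag name assumes the default hourly naming pattern and an invented date):

```sql
-- Query the snapshot pinned by an automatically created hourly tag (tag name is an assumed example)
SELECT * FROM daily_flow_summary /*+ OPTIONS('scan.tag-name' = '2024-01-15 10') */;
```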