%% Architecture of the network-flow alerting pipeline.
%% Hot path: traffic source -> Flink pre-aggregation -> Paimon aggregation
%% table -> Flink streaming read -> real-time alert evaluation -> Elasticsearch.
%% Cold path: the same pre-aggregation layer also lands data in a Paimon
%% append table for batch analytics, which writes to the same Elasticsearch.
graph TD
A["Source: 网络流量"] --> B["Flink预聚合层"]
B --> C["Paimon Aggregation Table"]
C --> D["Flink流式读取"]
D --> E["实时告警处理"]
E --> F["Elasticsearch"]
B --> G["Paimon Append Table"]
G --> H["批处理分析"]
H --> F
创建主表(源表)
sql
-- Kafka-backed source of raw network-flow events, one JSON record per flow.
CREATE TABLE source_table (
    dt        TIMESTAMP(3),  -- event time of the flow record
    sip       STRING,        -- source IP
    sport     INT,           -- source port
    dip       STRING,        -- destination IP
    dport     INT,           -- destination port
    protocol  STRING,
    flow_size BIGINT,        -- bytes carried by this flow record
    -- Tolerate up to 5 seconds of out-of-order events.
    WATERMARK FOR dt AS dt - INTERVAL '5' SECOND
) WITH (
    'connector' = 'kafka',
    'format' = 'json',
    'topic' = 'network-flow',
    'properties.bootstrap.servers' = 'localhost:9092'
);
- 创建每日流量聚合表
创建一个按天聚合的表来存储每日总流量:
sql
-- Paimon table keyed by calendar day. The aggregation merge-engine keeps a
-- running SUM of bytes per day, so readers always see the current daily total;
-- 'lookup' changelog production lets downstream jobs consume updates as a stream.
CREATE TABLE daily_flow_summary (
    dt_date          STRING,        -- 'yyyy-MM-dd' day key
    total_flow_bytes BIGINT,        -- running daily byte total
    last_check_time  TIMESTAMP(3),  -- timestamp of the latest contributing write
    PRIMARY KEY (dt_date) NOT ENFORCED
) WITH (
    'merge-engine' = 'aggregation',
    'changelog-producer' = 'lookup',
    'fields.total_flow_bytes.aggregate-function' = 'sum',
    'fields.last_check_time.aggregate-function' = 'last_value'
);
-- Paimon table holding per-(day, 5-tuple) running byte counts; used to rank
-- the biggest contributors when a daily alert fires.
CREATE TABLE daily_flow_details (
    dt_date    STRING,
    sip        STRING,
    sport      INT,
    dip        STRING,
    dport      INT,
    protocol   STRING,
    flow_bytes BIGINT,  -- running byte total for this day + 5-tuple
    PRIMARY KEY (dt_date, sip, sport, dip, dport, protocol) NOT ENFORCED
) WITH (
    'merge-engine' = 'aggregation',
    'changelog-producer' = 'lookup',
    'fields.flow_bytes.aggregate-function' = 'sum'
);
- 数据聚合作业
从原始流量数据聚合到每日汇总:
sql
-- Continuously roll raw flows up into one running total per calendar day.
-- The Paimon aggregation merge-engine sums successive partial results.
INSERT INTO daily_flow_summary
SELECT
    flow_day,
    SUM(bytes),
    CURRENT_TIMESTAMP
FROM (
    SELECT
        DATE_FORMAT(dt, 'yyyy-MM-dd') AS flow_day,
        flow_size AS bytes
    FROM source_table
) AS daily_rows
GROUP BY flow_day;
-- Roll raw flows up per (day, 5-tuple) so top talkers can be ranked later.
INSERT INTO daily_flow_details
SELECT
    keyed.flow_day,
    keyed.sip,
    keyed.sport,
    keyed.dip,
    keyed.dport,
    keyed.protocol,
    SUM(keyed.flow_size)
FROM (
    SELECT
        DATE_FORMAT(dt, 'yyyy-MM-dd') AS flow_day,
        sip, sport, dip, dport, protocol, flow_size
    FROM source_table
) AS keyed
GROUP BY keyed.flow_day, keyed.sip, keyed.sport, keyed.dip, keyed.dport, keyed.protocol;
- 告警检查作业(每小时执行)
使用Paimon的定时标签功能来实现每小时检查:
sql
-- Alert configuration table: one row per named threshold.
-- NOTE(review): no WITH clause — this relies on the session's current catalog
-- (presumably the Paimon catalog) to provide table storage; confirm that a
-- connector is not required in the target deployment.
CREATE TABLE alert_config (
config_key STRING,
threshold_value BIGINT,
PRIMARY KEY (config_key) NOT ENFORCED
);
-- Seed the daily traffic threshold: 107374182400 bytes = 100 GiB per day.
-- Explicit column list so the statement survives future schema additions.
INSERT INTO alert_config (config_key, threshold_value)
VALUES ('daily_flow_threshold', 107374182400);
-- Elasticsearch sink for fired alerts. The composite key (day, fire time)
-- keeps each hourly check as a distinct document.
CREATE TABLE flow_alert_results (
    dt_date          STRING,
    total_flow_bytes BIGINT,
    threshold_value  BIGINT,
    alert_triggered  BOOLEAN,
    top_contributors STRING,        -- JSON-formatted list of top contributors
    alert_time       TIMESTAMP(3),
    PRIMARY KEY (dt_date, alert_time) NOT ENFORCED
) WITH (
    'connector' = 'elasticsearch-7',
    'index' = 'daily-flow-alerts',
    'hosts' = 'http://localhost:9200'
);
- 告警检查和明细提取逻辑
sql
-- Hourly alert check: when today's total crosses the configured threshold,
-- emit one alert row enriched with the top-10 contributing 5-tuples as JSON.
--
-- Fixes vs. the original draft:
--   * Flink's CONCAT requires STRING arguments — numeric columns are CAST.
--   * The ranked derived table was missing its mandatory alias.
--   * The correlated scalar subquery in the SELECT list is decorrelated into
--     a LEFT JOIN (correlated subqueries are not supported in streaming
--     Flink SQL); ROW_NUMBER now partitions by dt_date, which reproduces the
--     per-day scoping that the correlation provided.
--   * alert_triggered is simply TRUE: the WHERE clause already enforces the
--     threshold, so the original CASE could never yield false.
INSERT INTO flow_alert_results
SELECT
    s.dt_date,
    s.total_flow_bytes,
    c.threshold_value,
    TRUE AS alert_triggered,
    top10.top_contributors,  -- NULL when no detail rows exist for the day
    CURRENT_TIMESTAMP AS alert_time
FROM daily_flow_summary s
CROSS JOIN alert_config c
LEFT JOIN (
    -- Top-10 contributors per day, rendered as comma-separated JSON objects.
    SELECT
        dt_date,
        LISTAGG(detail_json, ',') AS top_contributors
    FROM (
        SELECT
            dt_date,
            CONCAT('{"sip":"', sip,
                   '","sport":', CAST(sport AS STRING),
                   ',"dip":"', dip,
                   '","dport":', CAST(dport AS STRING),
                   ',"protocol":"', protocol,
                   '","flow_bytes":', CAST(flow_bytes AS STRING), '}') AS detail_json,
            ROW_NUMBER() OVER (PARTITION BY dt_date ORDER BY flow_bytes DESC) AS rn
        FROM daily_flow_details
    ) ranked
    WHERE rn <= 10
    GROUP BY dt_date
) top10 ON top10.dt_date = s.dt_date
WHERE c.config_key = 'daily_flow_threshold'
  AND s.dt_date = DATE_FORMAT(CURRENT_TIMESTAMP, 'yyyy-MM-dd')
  AND s.total_flow_bytes >= c.threshold_value;  -- insert only when threshold is exceeded
- 定时作业配置
sql
-- Enable Paimon automatic tag creation on the summary table so an hourly,
-- consistent snapshot is available for the alert check.
ALTER TABLE daily_flow_summary SET (
'tag.automatic-creation' = 'process-time', -- tags are cut on processing time, not watermark
'tag.creation-period' = 'hourly', -- one tag per hour
'tag.creation-delay' = '5 m' -- wait 5 minutes after the hour so in-flight writes land first
);
-- Deploy the continuously-running pieces as a single Flink job:
--   1) daily total aggregation, 2) per-5-tuple aggregation, 3) alert emission.
--
-- Fixes vs. the original draft: CONCAT arguments are CAST to STRING, the
-- ranked derived table gets its mandatory alias, and the correlated scalar
-- subquery is decorrelated into a LEFT JOIN (correlated subqueries are
-- unsupported in streaming Flink SQL); ROW_NUMBER partitions by dt_date to
-- keep the original per-day scope.
-- NOTE(review): the alert INSERT has no dedup guard, so a row is emitted on
-- every qualifying update of daily_flow_summary — confirm that is intended.
EXECUTE STATEMENT SET
BEGIN
-- 1) Running byte total per day.
INSERT INTO daily_flow_summary
SELECT
    DATE_FORMAT(dt, 'yyyy-MM-dd') AS dt_date,
    SUM(flow_size) AS total_flow_bytes,
    CURRENT_TIMESTAMP AS last_check_time
FROM source_table
GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd');
-- 2) Running per-(day, 5-tuple) byte totals.
INSERT INTO daily_flow_details
SELECT
    DATE_FORMAT(dt, 'yyyy-MM-dd') AS dt_date,
    sip, sport, dip, dport, protocol,
    SUM(flow_size) AS flow_bytes
FROM source_table
GROUP BY DATE_FORMAT(dt, 'yyyy-MM-dd'), sip, sport, dip, dport, protocol;
-- 3) Alert rows whenever a day's total crosses the configured threshold.
INSERT INTO flow_alert_results
SELECT
    s.dt_date,
    s.total_flow_bytes,
    c.threshold_value,
    TRUE AS alert_triggered,
    -- JSON array of the day's top-10 contributors; NULL when no details exist
    -- (CONCAT propagates the NULL from the missing join match).
    CONCAT('[', top10.contributors_json, ']') AS top_contributors,
    CURRENT_TIMESTAMP AS alert_time
FROM daily_flow_summary s
CROSS JOIN alert_config c
LEFT JOIN (
    SELECT
        dt_date,
        LISTAGG(detail_json, ',') AS contributors_json
    FROM (
        SELECT
            dt_date,
            CONCAT('{"sip":"', sip,
                   '","sport":', CAST(sport AS STRING),
                   ',"dip":"', dip,
                   '","dport":', CAST(dport AS STRING),
                   ',"protocol":"', protocol,
                   '","flow_bytes":', CAST(flow_bytes AS STRING), '}') AS detail_json,
            ROW_NUMBER() OVER (PARTITION BY dt_date ORDER BY flow_bytes DESC) AS rn
        FROM daily_flow_details
    ) ranked
    WHERE rn <= 10
    GROUP BY dt_date
) top10 ON top10.dt_date = s.dt_date
WHERE c.config_key = 'daily_flow_threshold'
  AND s.total_flow_bytes >= c.threshold_value;
END;