1. Project Overview and Business Background
1.1 Requirements Analysis for an E-commerce Real-Time Data Warehouse
The goal is to build an e-commerce analytics platform that supports real-time decision-making.
```sql
-- Business requirements matrix
CREATE TABLE business_requirements (
requirement_id STRING,
business_scenario STRING,
data_freshness STRING,
query_latency STRING,
data_volume STRING,
priority INT
) WITH ('connector' = 'blackhole');
INSERT INTO business_requirements VALUES
('REQ-001', 'Real-time dashboard monitoring', 'seconds', 'sub-second', 'high', 1),
('REQ-002', 'Real-time recommendations', 'seconds', 'milliseconds', 'very high', 1),
('REQ-003', 'Risk control / fraud detection', 'milliseconds', 'milliseconds', 'high', 1),
('REQ-004', 'Real-time operational reporting', 'minutes', 'seconds', 'medium', 2),
('REQ-005', 'User behavior analysis', 'hours', 'minutes', 'very high', 2),
('REQ-006', 'Inventory alerting', 'seconds', 'seconds', 'medium', 1);
-- Technology stack decisions
CREATE TABLE architecture_decision (
component STRING,
technology STRING,
version STRING,
rationale STRING
) WITH ('connector' = 'blackhole');
INSERT INTO architecture_decision VALUES
('Compute engine', 'Apache Flink', '1.16', 'Unified stream/batch processing, exactly-once semantics'),
('Message queue', 'Apache Kafka', '3.4', 'High throughput, durable, replayable'),
('Data store', 'Apache Doris', '1.2', 'OLAP analytics, high-concurrency queries'),
('Monitoring & alerting', 'Prometheus + Grafana', 'latest', 'Mature ecosystem, strong visualization'),
('Scheduler', 'DolphinScheduler', '3.1', 'Visual workflow scheduling');
```

2. Data Source Ingestion Layer
2.1 Real-Time Ingestion from Multiple Sources
Design of a unified ingestion pipeline.
```sql
-- 1. User behavior source (Kafka)
CREATE TABLE user_behavior_source (
user_id BIGINT,
item_id BIGINT,
category_id BIGINT,
behavior STRING, -- view, cart, buy, fav
event_time TIMESTAMP(3),
proc_time AS PROCTIME(),
WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND
) WITH (
'connector' = 'kafka',
'topic' = 'user-behavior',
'properties.bootstrap.servers' = 'kafka:9092',
'properties.group.id' = 'flink-real-time-warehouse',
'scan.startup.mode' = 'latest-offset',
'format' = 'json',
'json.fail-on-missing-field' = 'false',
'json.ignore-parse-errors' = 'true'
);
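
-- Upstream producers are often only at-least-once, so duplicate events are
-- common. A minimal dedup sketch using Flink's ROW_NUMBER deduplication
-- pattern (the dedup key is an assumption; adjust it to the real event key):
SELECT user_id, item_id, category_id, behavior, event_time
FROM (
  SELECT *,
    ROW_NUMBER() OVER (
      PARTITION BY user_id, item_id, behavior, event_time
      ORDER BY proc_time ASC) AS rn
  FROM user_behavior_source
) WHERE rn = 1;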
-- 2. Order transaction source
CREATE TABLE order_transaction_source (
order_id BIGINT,
user_id BIGINT,
item_id BIGINT,
order_amount DECIMAL(10,2),
order_status STRING, -- created, paid, shipped, completed, cancelled
province STRING,
city STRING,
order_time TIMESTAMP(3),
update_time TIMESTAMP(3),
proc_time AS PROCTIME(), -- processing-time attribute required by the lookup joins in 3.1
WATERMARK FOR order_time AS order_time - INTERVAL '10' SECOND
) WITH (
'connector' = 'kafka',
'topic' = 'order-transactions',
'properties.bootstrap.servers' = 'kafka:9092',
'format' = 'avro-confluent',
'avro-confluent.schema.registry.url' = 'http://schema-registry:8081'
);
-- 3. Product dimension table (JDBC lookup)
CREATE TABLE product_dimension (
product_id BIGINT,
product_name STRING,
category_id BIGINT,
category_name STRING,
brand STRING,
price DECIMAL(10,2),
stock_quantity INT,
status STRING,
update_time TIMESTAMP(3),
PRIMARY KEY (product_id) NOT ENFORCED
) WITH (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://mysql:3306/ecommerce',
'table-name' = 'products',
'username' = 'flink_user',
'password' = '${DB_PASSWORD}',
'lookup.cache' = 'PARTIAL',
'lookup.partial-cache.expire-after-write' = '30min',
'lookup.max-retries' = '3'
);
-- 4. User dimension table
CREATE TABLE user_dimension (
user_id BIGINT,
user_name STRING,
gender STRING,
age INT,
province STRING,
city STRING,
registration_date DATE,
vip_level INT,
last_login_time TIMESTAMP(3),
PRIMARY KEY (user_id) NOT ENFORCED
) WITH (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://mysql:3306/ecommerce',
'table-name' = 'users',
'username' = 'flink_user',
'password' = '${DB_PASSWORD}',
'lookup.cache' = 'PARTIAL',
'lookup.partial-cache.expire-after-write' = '1h'
);
```

2.2 Data Quality Monitoring
A pipeline that validates data quality in real time.
```sql
-- Data quality monitoring sink
CREATE TABLE data_quality_monitor (
source_topic STRING,
check_time TIMESTAMP(3),
total_records BIGINT,
valid_records BIGINT,
invalid_records BIGINT,
null_fields BIGINT,
format_errors BIGINT,
freshness_seconds BIGINT,
quality_score DOUBLE
) WITH (
'connector' = 'elasticsearch-7',
'hosts' = 'http://elasticsearch:9200',
'index' = 'data-quality-monitor'
);
-- Real-time data quality checks
INSERT INTO data_quality_monitor
SELECT
'user-behavior' AS source_topic,
TUMBLE_START(event_time, INTERVAL '1' MINUTE) AS check_time,
COUNT(*) AS total_records,
SUM(CASE
WHEN user_id IS NOT NULL
AND item_id IS NOT NULL
AND behavior IN ('view', 'cart', 'buy', 'fav')
THEN 1 ELSE 0
END) AS valid_records,
SUM(CASE
WHEN user_id IS NULL
OR item_id IS NULL
OR behavior NOT IN ('view', 'cart', 'buy', 'fav')
THEN 1 ELSE 0
END) AS invalid_records,
SUM(CASE
WHEN user_id IS NULL OR item_id IS NULL
THEN 1 ELSE 0
END) AS null_fields,
SUM(CASE
WHEN behavior NOT IN ('view', 'cart', 'buy', 'fav')
THEN 1 ELSE 0
END) AS format_errors,
CAST(AVG(TIMESTAMPDIFF(SECOND, event_time, CURRENT_TIMESTAMP)) AS BIGINT) AS freshness_seconds,
(SUM(CASE
WHEN user_id IS NOT NULL
AND item_id IS NOT NULL
AND behavior IN ('view', 'cart', 'buy', 'fav')
THEN 1 ELSE 0
END) * 100.0 / COUNT(*)) AS quality_score
FROM user_behavior_source
GROUP BY TUMBLE(event_time, INTERVAL '1' MINUTE);
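
-- Low-quality windows can feed an alert topic directly. A minimal sketch;
-- the 'quality-alerts' topic and the 95% threshold are assumptions:
CREATE TABLE quality_alert_sink (
  check_time TIMESTAMP(3),
  quality_score DOUBLE
) WITH (
  'connector' = 'kafka',
  'topic' = 'quality-alerts',
  'properties.bootstrap.servers' = 'kafka:9092',
  'format' = 'json'
);
INSERT INTO quality_alert_sink
SELECT
  TUMBLE_START(event_time, INTERVAL '1' MINUTE) AS check_time,
  SUM(CASE WHEN user_id IS NOT NULL AND item_id IS NOT NULL
       AND behavior IN ('view', 'cart', 'buy', 'fav') THEN 1 ELSE 0 END)
    * 100.0 / COUNT(*) AS quality_score
FROM user_behavior_source
GROUP BY TUMBLE(event_time, INTERVAL '1' MINUTE)
HAVING SUM(CASE WHEN user_id IS NOT NULL AND item_id IS NOT NULL
            AND behavior IN ('view', 'cart', 'buy', 'fav') THEN 1 ELSE 0 END)
    * 100.0 / COUNT(*) < 95.0;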
```

3. Real-Time ETL Layer
3.1 Data Cleansing and Standardization
The real-time data cleansing pipeline.
```sql
-- 1. Cleansed user behavior stream
CREATE TABLE cleaned_user_behavior (
user_id BIGINT,
item_id BIGINT,
category_id BIGINT,
behavior STRING,
event_time TIMESTAMP(3),
processed_time TIMESTAMP(3),
data_quality STRING, -- HIGH, MEDIUM, LOW
WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND -- needed by the event-time windows in 3.2
) WITH (
'connector' = 'kafka',
'topic' = 'cleaned-user-behavior',
'properties.bootstrap.servers' = 'kafka:9092',
'format' = 'avro'
);
INSERT INTO cleaned_user_behavior
SELECT
user_id,
item_id,
category_id,
-- Normalize behavior values
CASE
WHEN behavior = 'pv' THEN 'view'
WHEN behavior = 'add_to_cart' THEN 'cart'
WHEN behavior = 'purchase' THEN 'buy'
WHEN behavior = 'favorite' THEN 'fav'
ELSE behavior
END AS behavior,
event_time,
CURRENT_TIMESTAMP AS processed_time,
CASE
WHEN user_id IS NOT NULL AND item_id IS NOT NULL AND category_id IS NOT NULL THEN 'HIGH'
WHEN user_id IS NOT NULL AND item_id IS NOT NULL THEN 'MEDIUM'
ELSE 'LOW'
END AS data_quality
FROM user_behavior_source
WHERE behavior IS NOT NULL;
-- 2. Order cleansing and enrichment
CREATE TABLE enriched_orders (
order_id BIGINT,
user_id BIGINT,
item_id BIGINT,
product_name STRING,
category_name STRING,
brand STRING,
order_amount DECIMAL(10,2),
order_status STRING,
province STRING,
city STRING,
user_vip_level INT,
order_time TIMESTAMP(3),
update_time TIMESTAMP(3),
processing_time TIMESTAMP(3),
WATERMARK FOR order_time AS order_time - INTERVAL '10' SECOND -- needed by the event-time windows in 3.2
) WITH (
'connector' = 'kafka',
'topic' = 'enriched-orders',
'properties.bootstrap.servers' = 'kafka:9092',
'format' = 'avro'
);
-- Enrich orders by joining dimension tables
INSERT INTO enriched_orders
SELECT
o.order_id,
o.user_id,
o.item_id,
p.product_name,
p.category_name,
p.brand,
o.order_amount,
o.order_status,
o.province,
o.city,
u.vip_level AS user_vip_level,
o.order_time,
o.update_time,
CURRENT_TIMESTAMP AS processing_time
FROM order_transaction_source o
LEFT JOIN product_dimension FOR SYSTEM_TIME AS OF o.proc_time AS p
ON o.item_id = p.product_id
LEFT JOIN user_dimension FOR SYSTEM_TIME AS OF o.proc_time AS u
ON o.user_id = u.user_id
WHERE o.order_status IS NOT NULL;
```

3.2 Real-Time Metric Computation
Real-time aggregation of core business metrics.
```sql
-- 1. Real-time traffic metrics
CREATE TABLE realtime_traffic_metrics (
metric_time TIMESTAMP(3),
time_granularity STRING, -- MINUTE, HOUR, DAY
pv_count BIGINT,
uv_count BIGINT,
avg_session_duration DOUBLE,
bounce_rate DOUBLE
) WITH (
'connector' = 'doris',
'table.identifier' = 'realtime.realtime_traffic_metrics',
'fenodes' = 'doris-fe:8030',
'username' = 'flink',
'password' = '${DORIS_PASSWORD}'
);
-- Minute-level traffic statistics
INSERT INTO realtime_traffic_metrics
SELECT
window_start AS metric_time,
'MINUTE' AS time_granularity,
COUNT(*) AS pv_count,
COUNT(DISTINCT user_id) AS uv_count,
AVG(session_duration) AS avg_session_duration,
SUM(CASE WHEN page_count = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS bounce_rate
FROM (
SELECT
user_id,
TUMBLE_START(event_time, INTERVAL '1' MINUTE) AS window_start,
TUMBLE_END(event_time, INTERVAL '1' MINUTE) AS window_end,
COUNT(*) AS page_count,
CAST(TIMESTAMPDIFF(SECOND, MIN(event_time), MAX(event_time)) AS DOUBLE) AS session_duration
FROM cleaned_user_behavior
WHERE behavior = 'view'
GROUP BY
TUMBLE(event_time, INTERVAL '1' MINUTE),
user_id
)
GROUP BY window_start, window_end;
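
-- Since Flink 1.13, the same aggregation can be written with a windowing TVF,
-- the recommended replacement for the legacy GROUP BY TUMBLE syntax. A sketch
-- of the PV/UV part (illustrative; not wired to a sink):
SELECT
  window_start AS metric_time,
  'MINUTE' AS time_granularity,
  COUNT(*) AS pv_count,
  COUNT(DISTINCT user_id) AS uv_count
FROM TABLE(
  TUMBLE(TABLE cleaned_user_behavior, DESCRIPTOR(event_time), INTERVAL '1' MINUTES))
WHERE behavior = 'view'
GROUP BY window_start, window_end;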
-- 2. Real-time transaction metrics
CREATE TABLE realtime_transaction_metrics (
metric_time TIMESTAMP(3),
time_granularity STRING,
order_count BIGINT,
gmv DECIMAL(15,2),
aov DECIMAL(10,2),
paid_order_count BIGINT,
conversion_rate DOUBLE
) WITH (
'connector' = 'doris',
'table.identifier' = 'realtime.realtime_transaction_metrics',
'fenodes' = 'doris-fe:8030',
'username' = 'flink',
'password' = '${DORIS_PASSWORD}'
);
INSERT INTO realtime_transaction_metrics
SELECT
window_start AS metric_time,
'HOUR' AS time_granularity,
COUNT(*) AS order_count,
SUM(order_amount) AS gmv,
AVG(order_amount) AS aov,
SUM(CASE WHEN order_status = 'paid' THEN 1 ELSE 0 END) AS paid_order_count,
SUM(CASE WHEN order_status = 'paid' THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS conversion_rate
FROM (
SELECT
order_id,
order_amount,
order_status,
TUMBLE_START(order_time, INTERVAL '1' HOUR) AS window_start,
TUMBLE_END(order_time, INTERVAL '1' HOUR) AS window_end
FROM enriched_orders
WHERE order_time IS NOT NULL
)
GROUP BY window_start, window_end;
-- 3. Real-time user behavior funnel
CREATE TABLE user_behavior_funnel (
funnel_stage STRING,
user_count BIGINT,
conversion_rate DOUBLE,
dropoff_rate DOUBLE,
metric_time TIMESTAMP(3)
) WITH (
'connector' = 'doris',
'table.identifier' = 'realtime.user_behavior_funnel',
'fenodes' = 'doris-fe:8030',
'username' = 'flink',
'password' = '${DORIS_PASSWORD}'
);
INSERT INTO user_behavior_funnel
WITH user_behavior_sequence AS (
SELECT
user_id,
-- NOTE: Flink's LISTAGG does not support WITHIN GROUP (ORDER BY ...);
-- element order follows arrival order, so sort upstream if strict sequencing matters
LISTAGG(behavior, '->') AS behavior_sequence,
MAX(event_time) AS last_event_time
FROM cleaned_user_behavior
WHERE event_time > CURRENT_TIMESTAMP - INTERVAL '1' HOUR
GROUP BY user_id
)
SELECT
funnel_stage,
COUNT(DISTINCT user_id) AS user_count,
COUNT(DISTINCT user_id) * 100.0 / LAG(COUNT(DISTINCT user_id)) OVER (ORDER BY step) AS conversion_rate,
(LAG(COUNT(DISTINCT user_id)) OVER (ORDER BY step) - COUNT(DISTINCT user_id)) * 100.0 /
LAG(COUNT(DISTINCT user_id)) OVER (ORDER BY step) AS dropoff_rate,
CURRENT_TIMESTAMP AS metric_time
FROM (
SELECT 1 AS step, 'view' AS funnel_stage, user_id
FROM user_behavior_sequence WHERE behavior_sequence LIKE '%view%'
UNION ALL
SELECT 2, 'cart', user_id
FROM user_behavior_sequence WHERE behavior_sequence LIKE '%view%->cart%'
UNION ALL
SELECT 3, 'buy', user_id
FROM user_behavior_sequence WHERE behavior_sequence LIKE '%view%->cart%->buy%'
) GROUP BY funnel_stage, step;
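
-- Ordered funnels are also a natural fit for Flink's MATCH_RECOGNIZE (CEP)
-- clause, which avoids string matching on concatenated sequences. A minimal
-- sketch extracting complete view -> cart -> buy journeys within one hour
-- (X and Y are undefined pattern variables that absorb intermediate events):
SELECT user_id, view_time, buy_time
FROM cleaned_user_behavior
MATCH_RECOGNIZE (
  PARTITION BY user_id
  ORDER BY event_time
  MEASURES
    A.event_time AS view_time,
    C.event_time AS buy_time
  ONE ROW PER MATCH
  AFTER MATCH SKIP PAST LAST ROW
  PATTERN (A X*? B Y*? C) WITHIN INTERVAL '1' HOUR
  DEFINE
    A AS A.behavior = 'view',
    B AS B.behavior = 'cart',
    C AS C.behavior = 'buy'
);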
```

4. Real-Time Warehouse Application Layer
4.1 Real-Time Dashboard Data Model
Data marts designed for real-time monitoring.
```sql
-- 1. Real-time operations dashboard view
CREATE TABLE realtime_operation_dashboard (
update_time TIMESTAMP(3),
-- Traffic metrics
today_pv BIGINT,
today_uv BIGINT,
today_gmv DECIMAL(15,2),
today_order_count BIGINT,
-- Current-minute metrics
current_minute_pv BIGINT,
current_minute_uv BIGINT,
current_minute_gmv DECIMAL(15,2),
-- Growth vs. yesterday
pv_growth_rate DOUBLE,
uv_growth_rate DOUBLE,
gmv_growth_rate DOUBLE,
-- Top distributions
top_provinces MAP<STRING, BIGINT>,
top_categories MAP<STRING, BIGINT>
) WITH (
'connector' = 'doris',
'table.identifier' = 'dashboard.realtime_operation',
'fenodes' = 'doris-fe:8030',
'username' = 'flink',
'password' = '${DORIS_PASSWORD}'
);
-- Dashboard aggregation job
INSERT INTO realtime_operation_dashboard
SELECT
CURRENT_TIMESTAMP AS update_time,
-- Today cumulative
today_stats.pv AS today_pv,
today_stats.uv AS today_uv,
today_stats.gmv AS today_gmv,
today_stats.order_count AS today_order_count,
-- Current minute
current_minute_stats.pv AS current_minute_pv,
current_minute_stats.uv AS current_minute_uv,
current_minute_stats.gmv AS current_minute_gmv,
-- Growth vs. yesterday (NULLIF guards against division by zero)
(today_stats.pv - yesterday_stats.pv) * 100.0 / NULLIF(yesterday_stats.pv, 0) AS pv_growth_rate,
(today_stats.uv - yesterday_stats.uv) * 100.0 / NULLIF(yesterday_stats.uv, 0) AS uv_growth_rate,
(today_stats.gmv - yesterday_stats.gmv) * 100.0 / NULLIF(yesterday_stats.gmv, 0) AS gmv_growth_rate,
-- Distributions; MAP_AGG is not built into Flink SQL and assumes a registered UDAF
MAP_AGG(province, province_count) AS top_provinces,
MAP_AGG(category, category_count) AS top_categories
FROM (
-- Today's cumulative stats
SELECT
COUNT(*) AS pv,
COUNT(DISTINCT user_id) AS uv,
COALESCE(SUM(order_amount), 0) AS gmv,
COUNT(DISTINCT order_id) AS order_count
FROM (
SELECT user_id, CAST(NULL AS BIGINT) AS order_id, CAST(NULL AS DECIMAL(10,2)) AS order_amount
FROM cleaned_user_behavior
WHERE CAST(event_time AS DATE) = CURRENT_DATE
UNION ALL
SELECT user_id, order_id, order_amount
FROM enriched_orders
WHERE CAST(order_time AS DATE) = CURRENT_DATE
)
) today_stats
CROSS JOIN (
-- Yesterday's stats (same period)
SELECT
COUNT(*) AS pv,
COUNT(DISTINCT user_id) AS uv,
COALESCE(SUM(order_amount), 0) AS gmv
FROM (
SELECT user_id, CAST(NULL AS DECIMAL(10,2)) AS order_amount
FROM cleaned_user_behavior
WHERE CAST(event_time AS DATE) = CURRENT_DATE - INTERVAL '1' DAY
UNION ALL
SELECT user_id, order_amount
FROM enriched_orders
WHERE CAST(order_time AS DATE) = CURRENT_DATE - INTERVAL '1' DAY
)
) yesterday_stats
CROSS JOIN (
-- Current-minute stats
SELECT
COUNT(*) AS pv,
COUNT(DISTINCT user_id) AS uv,
COALESCE(SUM(order_amount), 0) AS gmv
FROM (
SELECT user_id, CAST(NULL AS DECIMAL(10,2)) AS order_amount
FROM cleaned_user_behavior
WHERE event_time >= CURRENT_TIMESTAMP - INTERVAL '1' MINUTE
UNION ALL
SELECT user_id, order_amount
FROM enriched_orders
WHERE order_time >= CURRENT_TIMESTAMP - INTERVAL '1' MINUTE
)
) current_minute_stats
CROSS JOIN (
-- Province distribution (top 10; in streaming mode Top-N should use ROW_NUMBER rather than ORDER BY/LIMIT)
SELECT province, COUNT(*) AS province_count
FROM enriched_orders
WHERE CAST(order_time AS DATE) = CURRENT_DATE
GROUP BY province
ORDER BY province_count DESC
LIMIT 10
) province_stats
CROSS JOIN (
-- 品类分布
SELECT category_name AS category, COUNT(*) AS category_count
FROM enriched_orders
WHERE CAST(order_time AS DATE) = CURRENT_DATE
GROUP BY category_name
ORDER BY category_count DESC
LIMIT 10
) category_stats;
-- 2. Real-time user profile updates
CREATE TABLE realtime_user_profile (
user_id BIGINT,
last_active_time TIMESTAMP(3),
total_pv BIGINT,
total_buy_count BIGINT,
total_gmv DECIMAL(15,2),
favorite_category STRING,
preferred_brand STRING,
avg_order_value DECIMAL(10,2),
recent_behaviors ARRAY<STRING>,
update_time TIMESTAMP(3),
PRIMARY KEY (user_id) NOT ENFORCED
) WITH (
'connector' = 'doris',
'table.identifier' = 'user_profile.realtime_user_profile',
'fenodes' = 'doris-fe:8030',
'username' = 'flink',
'password' = '${DORIS_PASSWORD}'
);
-- Incremental real-time profile computation
INSERT INTO realtime_user_profile
SELECT
user_id,
MAX(event_time) AS last_active_time,
COUNT(*) AS total_pv,
SUM(CASE WHEN behavior = 'buy' THEN 1 ELSE 0 END) AS total_buy_count,
COALESCE(SUM(order_amount), 0) AS total_gmv,
-- Favorite category / preferred brand; FIRST_VALUE with ORDER BY is not
-- built into Flink SQL, so these ordered aggregates assume custom UDAFs
FIRST_VALUE(category_name ORDER BY category_count DESC) AS favorite_category,
FIRST_VALUE(brand ORDER BY brand_count DESC) AS preferred_brand,
-- Average order value
CASE
WHEN SUM(CASE WHEN behavior = 'buy' THEN 1 ELSE 0 END) > 0
THEN COALESCE(SUM(order_amount), 0) / SUM(CASE WHEN behavior = 'buy' THEN 1 ELSE 0 END)
ELSE 0
END AS avg_order_value,
-- Recent behavior sequence; ordered, limited ARRAY_AGG likewise assumes a UDAF
ARRAY_AGG(behavior ORDER BY event_time DESC LIMIT 10) AS recent_behaviors,
CURRENT_TIMESTAMP AS update_time
FROM (
SELECT
u.user_id,
u.behavior,
u.event_time,
o.order_amount,
p.category_name,
p.brand,
COUNT(*) OVER (PARTITION BY u.user_id, p.category_name) AS category_count,
COUNT(*) OVER (PARTITION BY u.user_id, p.brand) AS brand_count
FROM cleaned_user_behavior u
LEFT JOIN enriched_orders o ON u.user_id = o.user_id AND u.item_id = o.item_id
LEFT JOIN product_dimension p ON u.item_id = p.product_id
WHERE u.event_time > CURRENT_TIMESTAMP - INTERVAL '7' DAY
)
GROUP BY user_id;
```

4.2 Real-Time Alerting and Monitoring
Real-time detection of business anomalies.
```sql
-- 1. Business alert events
CREATE TABLE business_alert_events (
alert_id STRING,
alert_type STRING,
alert_level STRING, -- CRITICAL, WARNING, INFO
alert_message STRING,
metric_value DOUBLE,
threshold_value DOUBLE,
trigger_time TIMESTAMP(3),
related_entities ARRAY<STRING>
) WITH (
'connector' = 'kafka',
'topic' = 'business-alerts',
'properties.bootstrap.servers' = 'kafka:9092',
'format' = 'json'
);
-- Alert on abnormal GMV drops
INSERT INTO business_alert_events
SELECT
MD5(CONCAT('gmv_drop_', CAST(CURRENT_TIMESTAMP AS STRING))) AS alert_id,
'GMV_ANOMALY' AS alert_type,
CASE
WHEN drop_rate > 0.3 THEN 'CRITICAL'
WHEN drop_rate > 0.2 THEN 'WARNING'
ELSE 'INFO'
END AS alert_level,
'GMV down ' || CAST(drop_rate * 100 AS STRING) || '% vs. the 30-day hourly average' AS alert_message,
current_gmv AS metric_value,
historical_avg_gmv AS threshold_value,
CURRENT_TIMESTAMP AS trigger_time,
ARRAY['all'] AS related_entities
FROM (
SELECT
cur.gmv AS current_gmv,
historical.avg_gmv AS historical_avg_gmv,
(historical.avg_gmv - cur.gmv) / historical.avg_gmv AS drop_rate
FROM (
-- GMV for the current hour
SELECT COALESCE(SUM(order_amount), 0) AS gmv
FROM enriched_orders
WHERE order_time >= FLOOR(CURRENT_TIMESTAMP TO HOUR)
) cur
CROSS JOIN (
-- Average hourly GMV over the past 30 days
SELECT AVG(gmv) AS avg_gmv
FROM (
SELECT FLOOR(order_time TO HOUR) AS hour_time,
SUM(order_amount) AS gmv
FROM enriched_orders
WHERE order_time >= CURRENT_TIMESTAMP - INTERVAL '30' DAY
GROUP BY FLOOR(order_time TO HOUR)
)
) historical
WHERE historical.avg_gmv > 0
)
WHERE drop_rate > 0.1; -- alert when the drop exceeds 10%
-- 2. Real-time inventory alerts
CREATE TABLE inventory_alert (
product_id BIGINT,
product_name STRING,
current_stock INT,
safety_stock INT,
alert_type STRING,
created_time TIMESTAMP(3)
) WITH (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://mysql:3306/alerts',
'table-name' = 'inventory_alerts'
);
INSERT INTO inventory_alert
SELECT
p.product_id,
p.product_name,
p.stock_quantity AS current_stock,
10 AS safety_stock, -- safety-stock threshold
CASE
WHEN p.stock_quantity <= 5 THEN 'CRITICAL'
WHEN p.stock_quantity <= 10 THEN 'WARNING'
ELSE 'NORMAL'
END AS alert_type,
CURRENT_TIMESTAMP AS created_time
FROM product_dimension p
WHERE p.stock_quantity <= 10; -- stock at or below the safety threshold
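
-- The JDBC scan above reads a one-off snapshot; for continuous inventory
-- alerting, a change-data-capture source re-emits every stock update. A
-- minimal sketch with the flink-cdc MySQL connector (requires the
-- flink-connector-mysql-cdc dependency; connection details are assumptions):
CREATE TABLE product_stock_cdc (
  product_id BIGINT,
  product_name STRING,
  stock_quantity INT,
  PRIMARY KEY (product_id) NOT ENFORCED
) WITH (
  'connector' = 'mysql-cdc',
  'hostname' = 'mysql',
  'port' = '3306',
  'username' = 'flink_user',
  'password' = '${DB_PASSWORD}',
  'database-name' = 'ecommerce',
  'table-name' = 'products'
);
-- The alert INSERT above can then select FROM product_stock_cdc instead of
-- product_dimension and fire on every change.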
```

5. Data Services and API Layer
5.1 Real-Time Data Services
Real-time data interfaces for applications.
```sql
-- 1. Real-time query service table
CREATE TABLE realtime_query_service (
query_id STRING,
user_id BIGINT,
query_type STRING,
parameters MAP<STRING, STRING>,
result STRING, -- JSON payload (Flink SQL has no JSON type)
query_time TIMESTAMP(3),
response_time_ms BIGINT
) WITH (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://mysql:3306/query_service',
'table-name' = 'realtime_queries'
);
-- UDF for real-time user behavior lookups
CREATE FUNCTION get_realtime_user_behavior AS 'com.ecommerce.udf.UserBehaviorFunction';
-- Data prepared for real-time recommendations
CREATE TABLE realtime_recommendation_data (
user_id BIGINT,
item_id BIGINT,
score DOUBLE,
reason STRING,
generated_time TIMESTAMP(3),
expire_time TIMESTAMP(3)
) WITH (
'connector' = 'doris',
'table.identifier' = 'recommendation.realtime_scores',
'fenodes' = 'doris-fe:8030',
'username' = 'flink',
'password' = '${DORIS_PASSWORD}'
);
-- Real-time recommendation score computation
INSERT INTO realtime_recommendation_data
SELECT
user_id,
item_id,
-- Combined scoring formula
(view_score * 0.3 + cart_score * 0.5 + buy_score * 0.8 + recency_score * 0.2) AS score,
'realtime_behavior' AS reason,
CURRENT_TIMESTAMP AS generated_time,
CURRENT_TIMESTAMP + INTERVAL '1' HOUR AS expire_time
FROM (
SELECT
user_id,
item_id,
-- View score (log-damped)
LOG(SUM(CASE WHEN behavior = 'view' THEN 1 ELSE 0 END) + 1) * 0.1 AS view_score,
-- Add-to-cart score
SUM(CASE WHEN behavior = 'cart' THEN 1 ELSE 0 END) * 0.3 AS cart_score,
-- Purchase score
SUM(CASE WHEN behavior = 'buy' THEN 1 ELSE 0 END) * 0.5 AS buy_score,
-- Time-decay (recency) score
EXP(-1 * (TIMESTAMPDIFF(SECOND, MAX(event_time), CURRENT_TIMESTAMP) / 3600.0)) AS recency_score
FROM cleaned_user_behavior
WHERE event_time > CURRENT_TIMESTAMP - INTERVAL '1' DAY
GROUP BY user_id, item_id
)
WHERE view_score + cart_score + buy_score > 0;
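
-- Serving usually needs only the top N items per user. Flink's streaming
-- Top-N pattern (ROW_NUMBER over a partition) maintains that list
-- incrementally. Sketch shown against the score table for brevity; reading
-- it back assumes the connector supports source scans:
SELECT user_id, item_id, score
FROM (
  SELECT user_id, item_id, score,
    ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY score DESC) AS rn
  FROM realtime_recommendation_data
) WHERE rn <= 20;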
```

6. Operations, Monitoring, and Governance
6.1 Data Pipeline Monitoring
End-to-end data quality and performance monitoring.
```sql
-- 1. Pipeline health monitoring
CREATE TABLE pipeline_health_monitor (
pipeline_name STRING,
checkpoint_id BIGINT,
checkpoint_duration BIGINT,
records_processed BIGINT,
records_lag BIGINT,
last_update_time TIMESTAMP(3),
status STRING
) WITH (
'connector' = 'elasticsearch-7',
'hosts' = 'http://elasticsearch:9200',
'index' = 'pipeline-health'
);
-- 2. Business data consistency checks
CREATE TABLE data_consistency_check (
check_id STRING,
check_item STRING,
source_count BIGINT,
target_count BIGINT,
discrepancy BIGINT,
check_time TIMESTAMP(3),
status STRING
) WITH ('connector' = 'jdbc'); -- url/table-name omitted for brevity
INSERT INTO data_consistency_check
SELECT
MD5(CONCAT('consistency_', check_item, '_', CAST(CURRENT_DATE AS STRING))) AS check_id,
check_item,
source_count,
target_count,
ABS(source_count - target_count) AS discrepancy,
CURRENT_TIMESTAMP AS check_time,
CASE
WHEN ABS(source_count - target_count) * 100.0 / NULLIF(source_count, 0) < 1 THEN 'PASS'
ELSE 'FAIL'
END AS status
FROM (
SELECT
'user_behavior' AS check_item,
source.count AS source_count,
target.count AS target_count
FROM (
SELECT COUNT(*) AS count
FROM user_behavior_source
WHERE event_time > CURRENT_TIMESTAMP - INTERVAL '1' HOUR
) source
CROSS JOIN (
SELECT COUNT(*) AS count
FROM cleaned_user_behavior
WHERE event_time > CURRENT_TIMESTAMP - INTERVAL '1' HOUR
) target
);
-- 3. Resource usage monitoring
CREATE TABLE resource_usage_monitor (
taskmanager_id STRING,
cpu_usage DOUBLE,
memory_usage DOUBLE,
network_usage DOUBLE,
checkpoint_size BIGINT,
measurement_time TIMESTAMP(3)
) WITH ('connector' = 'prometheus'); -- assumes a custom table connector; Flink ships Prometheus support as a metrics reporter, not a table connector
```

6.2 Disaster Recovery and Data Restoration
Disaster-recovery design for the data pipelines.
```sql
-- 1. Checkpoint status monitoring
CREATE TABLE checkpoint_monitor (
job_id STRING,
checkpoint_id BIGINT,
checkpoint_type STRING,
duration_ms BIGINT,
size_bytes BIGINT,
status STRING,
trigger_time TIMESTAMP(3)
) WITH ('connector' = 'jdbc');
-- 2. Data backfill pipeline
CREATE TABLE data_backfill_pipeline (
backfill_id STRING,
start_time TIMESTAMP(3),
end_time TIMESTAMP(3),
source_topic STRING,
target_table STRING,
records_processed BIGINT,
status STRING,
created_time TIMESTAMP(3)
) WITH ('connector' = 'jdbc');
-- Example backfill job
INSERT INTO data_backfill_pipeline
SELECT
'backfill_' || DATE_FORMAT(CURRENT_DATE, 'yyyyMMdd') AS backfill_id,
TIMESTAMP '2023-01-01 00:00:00' AS start_time,
CURRENT_TIMESTAMP AS end_time,
'user-behavior' AS source_topic,
'cleaned_user_behavior' AS target_table,
COUNT(*) AS records_processed,
'COMPLETED' AS status,
CURRENT_TIMESTAMP AS created_time
FROM user_behavior_source
WHERE event_time BETWEEN TIMESTAMP '2023-01-01 00:00:00' AND CURRENT_TIMESTAMP;
```

7. Deployment and Configuration
7.1 Production Environment Configuration
Configuration for a highly available production environment.
```yaml
# flink-conf.yaml production settings
# Checkpointing
execution.checkpointing.interval: 30000
execution.checkpointing.timeout: 600000
execution.checkpointing.min-pause: 5000
execution.checkpointing.max-concurrent-checkpoints: 1
execution.checkpointing.externalized-checkpoint-retention: RETAIN_ON_CANCELLATION
# State backend
state.backend: rocksdb
state.backend.incremental: true
state.checkpoints.dir: hdfs:///flink/checkpoints/real-time-warehouse
state.savepoints.dir: hdfs:///flink/savepoints/real-time-warehouse
# High availability
high-availability: zookeeper
high-availability.storageDir: hdfs:///flink/ha/real-time-warehouse
high-availability.zookeeper.quorum: zk1:2181,zk2:2181,zk3:2181
high-availability.cluster-id: real-time-warehouse-prod
# Memory
taskmanager.memory.process.size: 4096m
taskmanager.memory.managed.size: 1024m
taskmanager.numberOfTaskSlots: 4
# Network buffers
taskmanager.memory.network.min: 64m
taskmanager.memory.network.max: 256m
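# Restart strategy (commonly set alongside checkpointing; the values below
# are illustrative defaults, tune them to your failure budget)
restart-strategy: fixed-delay
restart-strategy.fixed-delay.attempts: 3
restart-strategy.fixed-delay.delay: 10 s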
```

7.2 Job Submission Script
Automated deployment script.
```bash
#!/bin/bash
# deploy-real-time-warehouse.sh
# Environment variables
FLINK_HOME="/opt/flink"
JOB_JAR="/opt/jobs/real-time-warehouse.jar"
CONFIG_FILE="/opt/configs/real-time-warehouse-config.yaml"
# Submit the real-time warehouse job
$FLINK_HOME/bin/flink run-application \
--target kubernetes-application \
-Dexecution.runtime-mode=STREAMING \
-Dparallelism.default=8 \
-Dstate.backend=rocksdb \
-Dstate.checkpoints.dir=hdfs:///flink/checkpoints/real-time-warehouse \
-c com.ecommerce.RealTimeWarehouseJob \
$JOB_JAR \
--config $CONFIG_FILE
# Health check
echo "Waiting for job to start..."
sleep 30
# Check job status
RUNNING_COUNT=$($FLINK_HOME/bin/flink list -r | grep -c "RealTimeWarehouseJob.*(RUNNING)")
if [ "$RUNNING_COUNT" -ge 1 ]; then
echo "✅ Real-time warehouse job deployed successfully"
else
echo "❌ Job deployment failed"
exit 1
fi
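
# Upgrades normally go through a savepoint rather than a cold restart.
# A minimal sketch; the awk field depends on `flink list` output formatting,
# and the savepoint path is an assumption:
JOB_ID=$($FLINK_HOME/bin/flink list -r | grep "RealTimeWarehouseJob" | awk '{print $4}')
$FLINK_HOME/bin/flink stop --savepointPath hdfs:///flink/savepoints/real-time-warehouse "$JOB_ID"
# ...deploy the new JAR, then resume from the written savepoint:
# $FLINK_HOME/bin/flink run-application --target kubernetes-application \
#   -s <savepoint-path> -c com.ecommerce.RealTimeWarehouseJob $JOB_JAR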
```

8. Summary
Project Outcomes
Business value
- Real-time decision support: second-level data latency enables real-time business decisions
- Better user experience: real-time recommendations and personalized services lift conversion rates
- Higher operational efficiency: automated monitoring and alerting reduce manual intervention
- A data-driven culture: a unified real-time data service for the entire company
Technical highlights
- Unified stream/batch architecture: one data-processing layer built on Flink SQL
- End-to-end exactly-once: guarantees data accuracy and consistency
- Horizontal scalability: absorbs rapid business growth
- Comprehensive monitoring: full-pipeline observability safeguards stability
Best practices
- Incremental build-out: start with core scenarios and expand step by step
- Data quality first: quality monitoring and alerting built into the pipelines
- Automated operations: mature CI/CD plus monitoring and alerting
- Documentation and standards: detailed development guidelines and user documentation
This real-time warehouse project gives the company end-to-end data-driven capability: from ingestion through real-time processing to data services, it closes the loop on a real-time data value chain.