1. DDL Fundamentals and Syntax
1.1 Why DDL Is Different in Stream Processing
Unlike DDL in traditional databases, Flink SQL DDL must address the special requirements of dynamic tables:
Core features of streaming DDL
- Connector configuration defines how data flows in and out
- Time attribute declarations enable event-time processing
- Watermark strategies handle out-of-order events
1.2 Basic DDL Syntax
sql
CREATE TABLE [IF NOT EXISTS] [catalog_name.][db_name.]table_name
(
{ <physical_column_definition> | <metadata_column_definition> | <computed_column_definition> }[ , ...n]
[ <watermark_definition> ]
[ <table_constraint> ][ , ...n]
)
[COMMENT table_comment]
[PARTITIONED BY (partition_column_name1, partition_column_name2, ...)]
[ <distribution> ]
WITH (key1=val1, key2=val2, ...)
[ LIKE source_table [( <like_options> )] | AS select_query ]
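A minimal concrete instance of this grammar, tying together the three streaming-specific features from 1.1 (connector configuration, an event-time column, and a watermark); the topic and server values below are placeholders:
sql
-- Minimal streaming table: connector + event time + watermark
CREATE TABLE user_clicks (
    user_id BIGINT,
    url VARCHAR(500),
    click_time TIMESTAMP(3),
    -- Event-time attribute with a 5-second out-of-orderness bound
    WATERMARK FOR click_time AS click_time - INTERVAL '5' SECOND
) WITH (
    'connector' = 'kafka',
    'topic' = 'user_clicks',
    'properties.bootstrap.servers' = 'localhost:9092',
    'format' = 'json'
);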
2. Column Definitions and Data Type Mapping
2.1 Physical Column Definitions
sql
-- Complete column definition examples
CREATE TABLE full_column_definition (
    -- Basic column definitions
    user_id BIGINT COMMENT 'User ID, primary key identifier',
    user_name VARCHAR(100) NOT NULL COMMENT 'User name, required field',
    -- Complex data types
    tags ARRAY<VARCHAR(20)> COMMENT 'Array of user tags',
    preferences MAP<VARCHAR, VARCHAR> COMMENT 'User preference settings',
    address ROW<
        province VARCHAR(10),
        city VARCHAR(10),
        detail VARCHAR(100)
    > COMMENT 'User address information',
    -- Numeric precision control
    balance DECIMAL(10, 2) COMMENT 'Account balance, precise to cents',
    score DOUBLE COMMENT 'User score, floating point'
) WITH (...);
2.2 Computed Columns and Metadata Columns
Computed columns (generated columns)
sql
-- Computed columns derived from other columns
CREATE TABLE computed_columns (
    price DOUBLE,
    quantity INT,
    -- Computed column: total amount
    total_amount AS price * quantity,
    event_time TIMESTAMP(3),
    -- Computed column: event date extracted from the timestamp
    event_date AS CAST(event_time AS DATE),
    json_data VARCHAR(500),
    -- Computed column: field extracted from JSON (JSON_VALUE requires Flink 1.16+)
    user_id AS JSON_VALUE(json_data, '$.user_id')
) WITH (...);
-- Computed columns can supply time attributes, but a primary key
-- must be declared on physical columns, not on computed columns
CREATE TABLE computed_time_attribute (
    first_name VARCHAR(50),
    last_name VARCHAR(50),
    full_name AS first_name || ' ' || last_name,
    processing_time AS PROCTIME()
) WITH (...);
Metadata columns
sql
-- Access connector metadata (syntax: name type METADATA [FROM 'key'] [VIRTUAL])
CREATE TABLE kafka_with_metadata (
    user_id BIGINT,
    message VARCHAR(500),
    -- Kafka metadata; reserved column names need backticks
    `topic` STRING METADATA VIRTUAL,
    `partition` BIGINT METADATA VIRTUAL,
    `offset` BIGINT METADATA VIRTUAL,
    -- the record timestamp, renamed to avoid the reserved word `timestamp`
    record_time TIMESTAMP_LTZ(3) METADATA FROM 'timestamp'
) WITH (
    'connector' = 'kafka',
    'topic' = 'user_events',
    'properties.bootstrap.servers' = 'localhost:9092',
    'format' = 'json'
);
3. Time Attributes and Watermark Strategies
3.1 Defining Processing Time
sql
-- Processing time (system time) is declared via a PROCTIME() computed column.
-- Note: a metadata column mapped from the Kafka 'timestamp' key carries the
-- record's event timestamp, not processing time.
CREATE TABLE processing_time_table (
    user_id BIGINT,
    proc_time AS PROCTIME()
) WITH (...);
-- Windowed aggregation on processing time (window TVFs are wrapped in TABLE())
SELECT
    window_start,
    COUNT(*) AS pv
FROM TABLE(TUMBLE(TABLE processing_time_table, DESCRIPTOR(proc_time), INTERVAL '1' HOUR))
GROUP BY window_start;
3.2 Event Time and Watermark Configuration
Basic watermark definition
sql
CREATE TABLE event_time_basic (
    event_time TIMESTAMP(3),
    -- Watermark strategy: event time minus 5 seconds
    WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND
) WITH (...);
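Two other common patterns, sketched below with the same placeholder connectors: an ascending watermark for sources whose timestamps arrive in order, and an event-time attribute derived from an epoch-millisecond BIGINT via a computed column (the column names are illustrative):
sql
-- Ascending timestamps (no out-of-orderness expected)
CREATE TABLE event_time_ascending (
    event_time TIMESTAMP(3),
    WATERMARK FOR event_time AS event_time
) WITH (...);
-- Event time derived from epoch milliseconds
CREATE TABLE event_time_from_epoch (
    ts_millis BIGINT,
    event_time AS TO_TIMESTAMP_LTZ(ts_millis, 3),
    WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND
) WITH (...);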
4. Primary Keys and Constraints
4.1 Primary Key Configuration
sql
-- Primary key definition (enables update and delete semantics)
CREATE TABLE upsert_kafka_table (
    user_id BIGINT,
    user_name VARCHAR(100),
    last_login TIMESTAMP(3),
    -- Primary key constraint (NOT ENFORCED marks it as a logical constraint)
    PRIMARY KEY (user_id) NOT ENFORCED
) WITH (
    'connector' = 'upsert-kafka',
    'topic' = 'user_profile',
    'properties.bootstrap.servers' = 'localhost:9092',
    'key.format' = 'json',
    'value.format' = 'json'
);
-- Composite primary key
CREATE TABLE composite_primary_key (
    tenant_id VARCHAR(20),
    resource_id VARCHAR(50),
    metric_value DOUBLE,
    PRIMARY KEY (tenant_id, resource_id) NOT ENFORCED
) WITH (...);
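With the primary key declared, writing an aggregation into the upsert-kafka table produces an update stream keyed by user_id. A minimal sketch, assuming a login_events source table exists:
sql
-- Latest login per user; the primary key turns this into upserts
INSERT INTO upsert_kafka_table
SELECT
    user_id,
    MAX(user_name) AS user_name,
    MAX(login_time) AS last_login
FROM login_events
GROUP BY user_id;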
5. Connector Configuration in Depth
5.1 Complete Kafka Connector Configuration
sql
-- Kafka source table
CREATE TABLE kafka_source_table (
    user_id BIGINT,
    event_type VARCHAR(20),
    event_time TIMESTAMP(3),
    WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND
) WITH (
    -- Basic connection settings
    'connector' = 'kafka',
    'topic' = 'user_events',
    'properties.bootstrap.servers' = 'localhost:9092',
    -- Consumer settings
    'properties.group.id' = 'flink-sql-consumer',
    'scan.startup.mode' = 'earliest-offset', -- also: latest-offset, timestamp, ...
    -- Format settings
    'format' = 'json',
    'json.fail-on-missing-field' = 'false',
    'json.ignore-parse-errors' = 'true',
    -- Performance tuning
    'scan.topic-partition-discovery.interval' = '1min',
    'properties.auto.offset.reset' = 'earliest'
);
-- Kafka sink table
CREATE TABLE kafka_sink_table (
    window_start TIMESTAMP(3),
    window_end TIMESTAMP(3),
    pv BIGINT
) WITH (
    'connector' = 'kafka',
    'topic' = 'pageview_stats',
    'properties.bootstrap.servers' = 'localhost:9092',
    -- Producer settings
    'sink.delivery-guarantee' = 'exactly-once', -- also: at-least-once, none
    'sink.transactional-id-prefix' = 'pageview-stats', -- needed for exactly-once
    'properties.transaction.timeout.ms' = '900000',
    'format' = 'json'
);
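A typical pipeline wires the two tables together with a windowed aggregation (note that exactly-once delivery also requires checkpointing to be enabled):
sql
-- Hourly page views from the source topic into the sink topic
INSERT INTO kafka_sink_table
SELECT window_start, window_end, COUNT(*) AS pv
FROM TABLE(TUMBLE(TABLE kafka_source_table, DESCRIPTOR(event_time), INTERVAL '1' HOUR))
GROUP BY window_start, window_end;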
5.2 JDBC Connector Configuration
sql
-- JDBC source table (dimension table)
CREATE TABLE jdbc_dim_table (
    product_id BIGINT,
    product_name VARCHAR(100),
    category VARCHAR(50),
    price DECIMAL(10,2),
    PRIMARY KEY (product_id) NOT ENFORCED
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/product_db',
    'table-name' = 'products',
    'username' = 'flink_user',
    'password' = 'flink_pass',
    -- Scan optimization; the partition column must be numeric, date, or timestamp
    'scan.partition.column' = 'product_id',
    'scan.partition.num' = '5',
    'scan.partition.lower-bound' = '1',
    'scan.partition.upper-bound' = '10000',
    'scan.fetch-size' = '1000'
);
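As a dimension table, this JDBC table is typically queried with a processing-time temporal (lookup) join. A sketch, assuming an orders stream that declares a proc_time AS PROCTIME() attribute:
sql
-- Enrich each order with product attributes looked up at processing time
SELECT
    o.order_id,
    o.product_id,
    d.product_name,
    d.price
FROM orders AS o
JOIN jdbc_dim_table FOR SYSTEM_TIME AS OF o.proc_time AS d
    ON o.product_id = d.product_id;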
-- JDBC sink table
CREATE TABLE jdbc_sink_table (
    user_id BIGINT,
    metric_name VARCHAR(50),
    metric_value DOUBLE,
    update_time TIMESTAMP(3),
    PRIMARY KEY (user_id, metric_name) NOT ENFORCED
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/metrics_db',
    'table-name' = 'user_metrics',
    'username' = 'flink_user',
    'password' = 'flink_pass',
    -- Write optimization
    'sink.buffer-flush.max-rows' = '1000',
    'sink.buffer-flush.interval' = '30s',
    'sink.max-retries' = '3'
);
6. Advanced DDL Features
6.1 Table Inheritance with the LIKE Clause
sql
-- Base table definition
CREATE TABLE base_log_table (
    log_id VARCHAR(50),
    log_time TIMESTAMP(3),
    level VARCHAR(10),
    message VARCHAR(1000),
    WATERMARK FOR log_time AS log_time - INTERVAL '5' SECOND
) WITH (
    'connector' = 'kafka',
    'topic' = 'logs',
    'properties.bootstrap.servers' = 'localhost:9092'
);
-- Derived table (the default LIKE behavior is INCLUDING ALL OVERWRITING OPTIONS,
-- so all columns and constraints are inherited and 'topic' is overridden)
CREATE TABLE error_log_table (
    error_code VARCHAR(20)
) WITH (
    'topic' = 'error_logs'
) LIKE base_log_table;
-- Selective inheritance
CREATE TABLE warn_log_table (
    -- Physical columns are always inherited; the options below drop
    -- the inherited watermark and connector configuration
    warn_count INT
) WITH (
    'connector' = 'kafka',
    'topic' = 'warn_logs',
    'properties.bootstrap.servers' = 'localhost:9092'
) LIKE base_log_table (
    EXCLUDING OPTIONS
    EXCLUDING WATERMARKS
);
6.2 Temporary and Session-Scoped Tables
sql
-- Temporary table (visible only within the current session)
CREATE TEMPORARY TABLE temp_results (
    window_time TIMESTAMP(3),
    metric_value DOUBLE
) WITH (
    -- filesystem connector so the table can be written and read back;
    -- 'blackhole' is sink-only and would make the final query fail
    'connector' = 'filesystem',
    'path' = 'file:///tmp/temp_results',
    'format' = 'json'
);
-- Use the temporary table for intermediate results
INSERT INTO temp_results
SELECT
    TUMBLE_START(event_time, INTERVAL '1' MINUTE) AS window_time,
    CAST(COUNT(*) AS DOUBLE) AS metric_value
FROM source_table
GROUP BY TUMBLE(event_time, INTERVAL '1' MINUTE);
-- Continue processing from the temporary table
SELECT AVG(metric_value) FROM temp_results;
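For purely logical intermediate results that never need external storage, a temporary view is often the better fit, since it stores the query rather than the data and needs no connector:
sql
-- Temporary view: no connector, recomputed as part of the downstream query
CREATE TEMPORARY VIEW windowed_counts AS
SELECT
    TUMBLE_START(event_time, INTERVAL '1' MINUTE) AS window_time,
    COUNT(*) AS cnt
FROM source_table
GROUP BY TUMBLE(event_time, INTERVAL '1' MINUTE);
SELECT AVG(cnt) FROM windowed_counts;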
7. Partitioned and Bucketed Tables
7.1 Filesystem Partitioned Tables
sql
-- Partitioned file table (partition columns must be declared in the schema)
CREATE TABLE partitioned_file_table (
    event_time TIMESTAMP(3),
    user_id BIGINT,
    event_type VARCHAR(20),
    metric_value DOUBLE,
    dt VARCHAR(10)
) PARTITIONED BY (event_type, dt) -- partitioned by event type and date
WITH (
    'connector' = 'filesystem',
    'path' = 'file:///data/events',
    'format' = 'parquet',
    'partition.default-name' = '__DEFAULT_PARTITION__',
    'sink.partition-commit.delay' = '1 h',
    'sink.partition-commit.policy.kind' = 'success-file'
);
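Partition values come from the rows themselves (dynamic partitioning). A sketch that derives dt from the event timestamp; the source table name is hypothetical:
sql
-- Each row is routed to the partition given by its event_type and dt values
INSERT INTO partitioned_file_table
SELECT
    event_time,
    user_id,
    event_type,
    metric_value,
    DATE_FORMAT(event_time, 'yyyy-MM-dd') AS dt
FROM raw_metric_events; -- hypothetical source table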
7.2 File Rolling (Bucketing) Configuration
sql
-- Rolling policies control when in-progress files are finalized.
-- Note: DataStream-style bucket assigner classes are not filesystem SQL
-- connector options; time-based "bucketing" in SQL is expressed through
-- partitioning as in 7.1.
CREATE TABLE bucketed_table (
    user_id BIGINT,
    event_time TIMESTAMP(3),
    data VARCHAR(100)
) WITH (
    'connector' = 'filesystem',
    'path' = 'file:///data/buckets',
    'format' = 'json',
    -- Roll over to a new file at 128 MB or after 1 hour, checked every minute
    'sink.rolling-policy.file-size' = '128MB',
    'sink.rolling-policy.rollover-interval' = '1 h',
    'sink.rolling-policy.check-interval' = '1min'
);
8. Table Properties and Configuration Management
8.1 Table Comments and Documentation
sql
-- Fully documented table
CREATE TABLE well_documented_table (
    user_id BIGINT COMMENT 'Unique user identifier, auto-increment primary key',
    user_name VARCHAR(100) NOT NULL COMMENT 'User name, unique index',
    register_time TIMESTAMP(3) COMMENT 'Registration time, used for lifecycle analysis',
    -- Watermark definitions do not accept a COMMENT clause
    WATERMARK FOR register_time AS register_time - INTERVAL '1' DAY
) COMMENT 'User profile table containing core user attributes'
WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/user_db',
    'table-name' = 'users'
);
8.2 Managing Configuration Parameters
sql
-- Tuning performance through table options
CREATE TABLE optimized_table (
    id BIGINT,
    data VARCHAR(1000)
) WITH (
    -- Required connector options (values illustrative)
    'connector' = 'kafka',
    'topic' = 'optimized_events',
    'properties.bootstrap.servers' = 'localhost:9092',
    'format' = 'json',
    -- Performance: properties.* entries are passed through to the Kafka client
    'properties.batch.size' = '16384',
    'properties.linger.ms' = '10',
    'properties.buffer.memory' = '33554432',
    -- Fault tolerance
    'properties.retries' = '3',
    'properties.retry.backoff.ms' = '100',
    -- Security
    'properties.security.protocol' = 'SASL_PLAINTEXT',
    'properties.sasl.mechanism' = 'PLAIN'
);
9. Table Management Operations
9.1 Altering Table Schemas
sql
-- Alter the table schema (ADD/MODIFY supported since Flink 1.17)
ALTER TABLE user_events
ADD device_type VARCHAR(20) COMMENT 'Device type';
ALTER TABLE user_events
MODIFY user_name VARCHAR(200) COMMENT 'Widened user name';
-- Change table options
ALTER TABLE user_events SET ('scan.startup.mode' = 'latest-offset');
-- Rename a table
ALTER TABLE old_table_name RENAME TO new_table_name;
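Flink 1.17+ can also drop schema components:
sql
-- Drop a column
ALTER TABLE user_events DROP device_type;
-- Drop the watermark definition
ALTER TABLE user_events DROP WATERMARK;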
9.2 Table Lifecycle Management
sql
-- Inspect table information
DESCRIBE user_events;
SHOW CREATE TABLE user_events;
-- Drop a table
DROP TABLE IF EXISTS user_events;
-- Truncate table data (Flink 1.18+, batch mode only; the target table must
-- implement the SupportsTruncate interface)
TRUNCATE TABLE temp_results;
10. Best Practices and Debugging Tips
10.1 DDL Design Patterns
sql
-- Pattern 1: separate connection configuration from business schema.
-- Note: the ${KAFKA_SERVERS:...} placeholder assumes an external templating
-- step (e.g. CI/CD substitution); Flink SQL does not expand it natively.
CREATE TABLE base_kafka_config (
    event_time TIMESTAMP(3) -- physical columns are always inherited via LIKE
) WITH (
    'connector' = 'kafka',
    'properties.bootstrap.servers' = '${KAFKA_SERVERS:localhost:9092}'
);
CREATE TABLE business_table (
    -- business-specific columns
    user_id BIGINT,
    event_type VARCHAR(20)
) WITH (
    'topic' = 'business_events'
) LIKE base_kafka_config (
    EXCLUDING ALL
    INCLUDING OPTIONS
);
-- Pattern 2: environment-specific configuration
CREATE TABLE dev_table (...) WITH (
    'properties.bootstrap.servers' = 'localhost:9092'
);
CREATE TABLE prod_table (...) WITH (
    'properties.bootstrap.servers' = 'kafka-prod-1:9092,kafka-prod-2:9092'
);
Mastering DDL is the foundation of building robust stream processing applications.