Chapter 16: Flink Integration in Depth
Introduction: The Perfect Companion for Real-Time Data Processing
The integration with Flink is Paimon's most important application scenario. This chapter explains how to read from and write to Paimon tables efficiently in Flink.
Part 1: The Paimon Table Store Connector
1.1 Dependency Configuration
xml
<dependency>
    <groupId>org.apache.paimon</groupId>
    <artifactId>paimon-flink-1.18</artifactId>
    <version>0.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-api-java-bridge</artifactId>
    <version>1.18.0</version>
</dependency>
Note that the Paimon Flink artifact is versioned per Flink minor release (paimon-flink-1.18 pairs with Flink 1.18.x), so pick the one matching your deployment. The bridge artifact provides the StreamTableEnvironment used in the examples below.
1.2 Creating a Catalog
java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

StreamExecutionEnvironment env =
    StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tEnv =
    StreamTableEnvironment.create(env);
// Register the Paimon catalog
tEnv.executeSql(
    "CREATE CATALOG paimon WITH (" +
    "  'type' = 'paimon'," +
    "  'warehouse' = '/path/to/paimon'" +
    ")");
// Make it the default catalog and database
tEnv.useCatalog("paimon");
tEnv.useDatabase("default");
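Once the catalog is registered you can verify it from the same session. A minimal sanity-check sketch (the SHOW statements are standard Flink SQL):
java
// List what the Paimon catalog currently manages
tEnv.executeSql("SHOW DATABASES").print();
tEnv.executeSql("SHOW TABLES").print();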
Part 2: Batch Processing with Flink
2.1 Batch Reads
java
// Create the table in the Paimon catalog; the catalog supplies storage
// under its warehouse path, so no connector options are needed
tEnv.executeSql(
    "CREATE TABLE orders (" +
    "  order_id BIGINT," +
    "  user_id BIGINT," +
    "  amount DECIMAL(10, 2)," +
    "  dt DATE" +
    ")");
// Batch query
Table result = tEnv.sqlQuery("SELECT * FROM orders");
result.execute().print();
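A bounded read requires batch execution mode; under the streaming environment above, the same query would run as an unbounded scan. A minimal sketch of a pure batch session, assuming the same catalog setup:
java
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

// A batch-mode table environment: queries over Paimon run as bounded scans
TableEnvironment batchEnv =
    TableEnvironment.create(EnvironmentSettings.inBatchMode());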
2.2 Batch Writes
java
// Insert data
tEnv.executeSql(
    "INSERT INTO orders SELECT * FROM staging_table");
// Batch update (requires batch mode and a primary-key table; this example
// assumes orders also carries status and created_at columns)
tEnv.executeSql(
    "UPDATE orders SET status = 'shipped' " +
    "WHERE created_at < NOW() - INTERVAL '7' DAY");
Part 3: Stream Processing with Flink
3.1 Streaming Reads (Changelog Stream)
java
StreamExecutionEnvironment env =
    StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tEnv =
    StreamTableEnvironment.create(env);
// Create the Paimon table
tEnv.executeSql("CREATE TABLE orders (...) WITH (...)");
// Consume the table as a changelog stream
DataStream<Row> stream = tEnv.toChangelogStream(
    tEnv.sqlQuery("SELECT * FROM orders"));
stream.print();
env.execute("Read Paimon Changelog");
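Each Row in the changelog carries a RowKind (+I, -U, +U, -D) that downstream operators can inspect. A minimal sketch of splitting out the upserts:
java
import org.apache.flink.types.Row;
import org.apache.flink.types.RowKind;

// Keep only inserts and post-update images; drop retractions and deletes
DataStream<Row> upserts = stream.filter(row ->
    row.getKind() == RowKind.INSERT || row.getKind() == RowKind.UPDATE_AFTER);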
3.2 Streaming Writes
java
// Source table (Kafka) -- temporary, because the Paimon catalog
// only persists Paimon tables
tEnv.executeSql(
    "CREATE TEMPORARY TABLE kafka_orders (" +
    "  order_id BIGINT," +
    "  amount DECIMAL(10, 2)," +
    "  event_time TIMESTAMP(3)," +
    "  ...," +
    "  WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND" +
    ") WITH (" +
    "  'connector' = 'kafka'," +
    "  'topic' = 'orders'," +
    "  'properties.bootstrap.servers' = 'localhost:9092'," +
    "  'format' = 'json'" +
    ")");
// Target table (Paimon); partition fields must be part of the primary key
tEnv.executeSql(
    "CREATE TABLE paimon_orders (" +
    "  order_id BIGINT," +
    "  amount DECIMAL(10, 2)," +
    "  ...," +
    "  dt DATE," +
    "  PRIMARY KEY (order_id, dt) NOT ENFORCED" +
    ") PARTITIONED BY (dt)");
// Streaming insert; executeSql submits the job itself,
// so no separate env.execute() call is needed
tEnv.executeSql(
    "INSERT INTO paimon_orders " +
    "SELECT " +
    "  order_id, amount, ..., CAST(event_time AS DATE) AS dt " +
    "FROM kafka_orders");
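Streaming writes only become visible when a Flink checkpoint completes, because Paimon commits one snapshot per checkpoint. Make sure checkpointing is enabled before submitting the INSERT:
java
// Paimon commits a snapshot per successful checkpoint; without
// checkpointing, a streaming INSERT never publishes its data
env.enableCheckpointing(60_000); // e.g. every 60 seconds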
Part 4: Special Operations
4.1 Upsert (Update-Insert)
java
// Catalog-managed Paimon table; the merge-engine options turn
// same-key writes into aggregation
tEnv.executeSql(
    "CREATE TABLE user_events (" +
    "  user_id BIGINT," +
    "  last_click_time BIGINT," +
    "  click_count BIGINT," +
    "  ...," +
    "  updated_at BIGINT," +
    "  PRIMARY KEY (user_id) NOT ENFORCED" +
    ") WITH (" +
    "  'merge-engine' = 'aggregation'," +
    "  'fields.click_count.aggregate-function' = 'sum'" +
    ")");
// Upsert semantics from a Kafka source
tEnv.executeSql(
    "CREATE TEMPORARY TABLE kafka_events (" +
    "  user_id BIGINT," +
    "  last_click_time BIGINT," +
    "  click_count BIGINT," +
    "  ...," +
    "  updated_at BIGINT," +
    "  PRIMARY KEY (user_id) NOT ENFORCED" +
    ") WITH (" +
    "  'connector' = 'upsert-kafka'," +
    "  'topic' = 'user-events'," +
    "  'properties.bootstrap.servers' = 'localhost:9092'," +
    "  'key.format' = 'json'," +
    "  'value.format' = 'json'" +
    ")");
// Writing directly yields upsert behavior
tEnv.executeSql(
    "INSERT INTO user_events SELECT * FROM kafka_events");
4.2 Dynamic Partitioning
java
tEnv.executeSql(
    "INSERT INTO orders " +
    "SELECT " +
    "  order_id, user_id, amount, " +
    // Rows are routed to the matching partition automatically
    "  CAST(event_time AS DATE) AS dt " +
    "FROM kafka_orders");
Part 5: Performance Tuning
5.1 Write Concurrency Configuration
sql
CREATE TABLE orders (...) WITH (
  'write-buffer-size' = '256 mb',     -- in-memory buffer before flushing to disk
  'write-buffer-spillable' = 'true',  -- allow spilling the write buffer
  'sink.parallelism' = '16'           -- parallelism of the Paimon sink
);
5.2 Read Optimization
java
// Tune the scan through dynamic table options (SQL hints);
// simple filters and projections are pushed down automatically.
// 'scan.mode' (e.g. 'latest') likewise controls where a streaming scan starts.
Table result = tEnv.sqlQuery(
    "SELECT order_id, amount FROM orders " +
    "/*+ OPTIONS('scan.parallelism' = '16') */ " +
    "WHERE dt = DATE '2024-01-01'");
Part 6: A Production-Grade Integration Architecture
6.1 Data Synchronization Architecture
text
Kafka source
    ↓
Flink transformation
    ├─ ETL processing
    ├─ Data cleansing
    └─ Business logic
    ↓
Paimon Table Store
    ├─ Real-time storage
    ├─ ACID guarantees
    └─ Multi-version support
    ↓
Downstream consumers (BI, ad-hoc queries, reports)
6.2 Configuration Example
java
import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class PaimonIngestionPipeline {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env =
            StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(60_000); // checkpoint every 60s; Paimon commits per checkpoint
        env.setStateBackend(new EmbeddedRocksDBStateBackend());
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        // Register the Paimon catalog
        tEnv.executeSql(
            "CREATE CATALOG paimon WITH (" +
            "  'type' = 'paimon'," +
            "  'warehouse' = 'file:///data/paimon'" +
            ")");
        tEnv.useCatalog("paimon");
        // Source table (Kafka) -- temporary, since the Paimon catalog
        // only persists Paimon tables
        tEnv.executeSql(
            "CREATE TEMPORARY TABLE orders_source (" +
            "  order_id STRING," +
            "  amount DECIMAL(10, 2)," +
            "  event_time BIGINT," +
            "  ts AS TO_TIMESTAMP_LTZ(event_time, 3)," +
            "  WATERMARK FOR ts AS ts - INTERVAL '5' SECOND" +
            ") WITH (" +
            "  'connector' = 'kafka'," +
            "  'topic' = 'orders'," +
            "  'properties.bootstrap.servers' = 'localhost:9092'," +
            "  'format' = 'json'" +
            ")");
        // Target table (Paimon, catalog-managed); the partition field
        // must be part of the primary key
        tEnv.executeSql(
            "CREATE TABLE IF NOT EXISTS orders_sink (" +
            "  order_id STRING," +
            "  amount DECIMAL(10, 2)," +
            "  event_time BIGINT," +
            "  dt DATE," +
            "  PRIMARY KEY (order_id, dt) NOT ENFORCED" +
            ") PARTITIONED BY (dt)");
        // Transform and write; executeSql submits the streaming job,
        // so no separate env.execute() call is needed
        tEnv.executeSql(
            "INSERT INTO orders_sink " +
            "SELECT " +
            "  order_id," +
            "  amount," +
            "  event_time," +
            "  CAST(TO_TIMESTAMP_LTZ(event_time, 3) AS DATE) AS dt " +
            "FROM orders_source");
    }
}
Summary
Advantages of Flink + Paimon
text
Real-time writes  ← Flink's strength
    ↓
Paimon storage (ACID, indexes, distributed)
    ↓
Real-time reads   ← Flink's strength
    ↓
Unified stream and batch processing
Next chapter: Chapter 17 covers cluster deployment and operations.