一、流批一体架构核心思想
1.1 什么是流批一体?
流批一体是一种架构范式,用同一套代码逻辑处理流式数据和批量数据,实现 Lambda 和 Kappa 架构的统一。
1.2 传统架构 vs 流批一体
text
复制
下载
传统Lambda架构:
┌─────────────────┐ ┌─────────────────┐
│ 批量处理层 │ │ 流处理层 │
│ (Hive/Spark) │ │ (Flink/Storm) │
├─────────────────┤ ├─────────────────┤
│ 批处理代码 │ │ 流处理代码 │ ← 两套代码!
│ 批处理状态 │ │ 流处理状态 │ ← 状态不一致!
└─────────────────┘ └─────────────────┘
│ │
└─────────┬───────────┘
↓
┌─────────────┐
│ 服务层 │ ← 需要合并结果!
└─────────────┘
流批一体架构:
┌─────────────────────────────────┐
│ 统一处理层 │
│ ┌─────────────────────────┐ │
│ │ 同一套业务逻辑 │ │ ← 一套代码!
│ │ 同一套状态管理 │ │ ← 状态统一!
│ └─────────────────────────┘ │
│ ▲ ▲ │
│ │ │ │
│ ┌─────┴─────┐ ┌─────┴─────┐ │
│ │ 流式数据源 │ │ 批量数据源 │ │
│ └───────────┘ └───────────┘ │
└─────────────────────────────────┘
↓
┌─────────────┐
│ 统一输出 │
└─────────────┘
二、核心实现模式
2.1 统一API层:Flink为例
java
复制
下载
/**
 * Unified data source abstraction: one source that can be exposed to either
 * a streaming or a batch execution environment.
 *
 * @param <T> element type produced by the source
 */
public abstract class UnifiedSource<T> {
    /** Exposes this source as a {@code DataStream} built on the given streaming environment. */
    public abstract DataStream<T> asStream(StreamExecutionEnvironment env);
    /** Exposes this source as a {@code DataSet} built on the given batch environment. */
    public abstract DataSet<T> asBatch(ExecutionEnvironment env);
}
/**
 * Unified processing logic: the same per-user business logic, offered for both
 * the DataStream and the DataSet APIs.
 *
 * <p>The element types are fixed to {@code Event} in and {@code Result} out,
 * matching {@code UnifiedKeyedProcessFunction}
 * ({@code KeyedProcessFunction<String, Event, Result>}). An unbounded type
 * variable cannot resolve {@code getUserId()} and cannot describe the output
 * type of that function.
 */
public class UnifiedProcessor {

    /**
     * Keys the events by user id and applies {@code UnifiedKeyedProcessFunction}.
     *
     * @param input stream of events
     * @return the operator producing one stream of results
     */
    public static SingleOutputStreamOperator<Result> process(
            DataStream<Event> input) {
        return input
            .keyBy(item -> item.getUserId())
            .process(new UnifiedKeyedProcessFunction());
    }

    /**
     * Groups the events by user id and applies {@code UnifiedGroupReduceFunction}.
     *
     * <p>NOTE(review): {@code UnifiedGroupReduceFunction} is not visible here; it is
     * assumed to be a {@code GroupReduceFunction<Event, Result>} — confirm.
     *
     * @param input data set of events
     * @return the reduced data set of results
     */
    public static DataSet<Result> process(
            DataSet<Event> input) {
        return input
            .groupBy(item -> item.getUserId())
            .reduceGroup(new UnifiedGroupReduceFunction());
    }
}
/**
 * Keyed process function that declares its state once for both execution modes.
 *
 * <p>NOTE(review): no {@code processElement} override is visible in this snippet —
 * confirm it is defined in the full source.
 */
public class UnifiedKeyedProcessFunction
        extends KeyedProcessFunction<String, Event, Result> {

    // State handles; assigned in open().
    private transient ValueState<Long> countState;
    private transient MapState<String, Double> sumState;

    /** Registers the "count" and "sum" descriptors and obtains their state handles. */
    @Override
    public void open(Configuration parameters) {
        countState = getRuntimeContext().getState(
                new ValueStateDescriptor<>("count", Long.class));
        sumState = getRuntimeContext().getMapState(
                new MapStateDescriptor<>("sum", String.class, Double.class));
    }
}
2.2 统一执行引擎实现
java
复制
下载
/**
 * Unified execution entry point: picks the execution path from a mode string.
 */
public class UnifiedExecutionEngine {

    /**
     * Runs the job in the requested mode.
     *
     * @param mode   one of {@code "streaming"}, {@code "batch"} or {@code "unified"} (case-insensitive)
     * @param config job configuration handed to the selected execution path
     * @throws IllegalArgumentException if {@code mode} is null or not one of the supported values
     */
    public void execute(String mode, JobConfig config) {
        if (mode == null) {
            throw new IllegalArgumentException("Execution mode must not be null");
        }
        // Locale.ROOT keeps the comparison stable under locales with special casing rules.
        switch (mode.toLowerCase(java.util.Locale.ROOT)) {
            case "streaming":
                executeStreaming(config);
                break;
            case "batch":
                executeBatch(config);
                break;
            case "unified":
                executeUnified(config);
                break;
            default:
                // Previously an unknown mode fell through and no job was started at all.
                throw new IllegalArgumentException("Unsupported execution mode: " + mode);
        }
    }

    /**
     * Builds and runs the unified pipeline: source -> UnifiedProcessor -> sink.
     *
     * @throws IllegalStateException wrapping any failure reported by job execution
     */
    private void executeUnified(JobConfig config) {
        // 1. Create the shared execution environment.
        StreamExecutionEnvironment env =
            StreamExecutionEnvironment.getExecutionEnvironment();

        // 2. Bounded input runs with the batch runtime. Checkpointing is not enabled
        //    here: Flink's BATCH execution mode does not use checkpoints.
        if (config.isBounded()) {
            env.setRuntimeMode(RuntimeExecutionMode.BATCH);
        }

        // 3. Create the unified source.
        UnifiedSource<Event> source = createUnifiedSource(config);
        DataStream<Event> dataStream = source.asStream(env);

        // 4. Apply the shared processing logic.
        DataStream<Result> resultStream = UnifiedProcessor.process(dataStream);

        // 5. Attach the unified sink and run.
        UnifiedSink<Result> sink = createUnifiedSink(config);
        resultStream.addSink(sink);
        try {
            env.execute("Unified Streaming-Batch Job");
        } catch (Exception e) {
            // env.execute declares a checked Exception; keep the cause for diagnosis.
            throw new IllegalStateException("Unified Streaming-Batch Job failed", e);
        }
    }
}
三、状态管理实现
3.1 统一状态存储抽象
java
复制
下载
/**
 * State store interface: access to keyed and operator state plus
 * snapshot and restore operations.
 */
public interface UnifiedStateStore {
    /**
     * Keyed state lookup.
     *
     * @param namespace  namespace the state belongs to
     * @param descriptor descriptor identifying the state
     */
    <K, V> KeyedState<K, V> getKeyedState(
        String namespace,
        StateDescriptor<K, V> descriptor);
    /**
     * Operator state lookup.
     *
     * @param name       name of the operator state
     * @param descriptor descriptor identifying the state
     */
    <S extends State> S getOperatorState(
        String name,
        StateDescriptor<?, S> descriptor);
    /**
     * State snapshot for the given checkpoint.
     *
     * @return a future yielding the snapshot
     */
    CompletableFuture<StateSnapshot> snapshot(
        long checkpointId,
        CheckpointOptions options);
    /** State restore from a previously taken snapshot. */
    void restore(StateSnapshot snapshot);
}
/**
 * Tiered state store: routes each keyed-state request to one of three storage tiers.
 */
public class TieredStateStore implements UnifiedStateStore {
    private final StateStorage localStorage;    // local memory / disk
    private final StateStorage remoteStorage;   // RocksDB / HDFS
    private final StateStorage externalStorage; // external store (MySQL / HBase)

    /**
     * Picks the tier for {@code descriptor} and delegates the lookup to it:
     * hot-and-small state goes to the local tier, large state to the remote tier,
     * everything else to the external tier.
     */
    @Override
    public <K, V> KeyedState<K, V> getKeyedState(
            String namespace,
            StateDescriptor<K, V> descriptor) {
        final StateStorage tier;
        if (isHotSmallState(descriptor)) {
            tier = localStorage;
        } else if (isLargeState(descriptor)) {
            tier = remoteStorage;
        } else {
            tier = externalStorage;
        }
        return tier.getKeyedState(namespace, descriptor);
    }
}
/**
 * State TTL management: builds TTL configurations and expires tracked entries in bulk.
 */
public class StateTTLManager {

    /**
     * Builds a TTL config that refreshes on create and write, never returns expired
     * values, and enables both background cleanup and cleanup on full snapshots.
     *
     * @param ttl time-to-live applied to the state
     */
    public static <T> StateTtlConfig buildTtlConfig(Duration ttl) {
        StateTtlConfig ttlConfig = StateTtlConfig.newBuilder(ttl)
            .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
            .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
            .cleanupInBackground()
            .cleanupFullSnapshot()
            .build();
        return ttlConfig;
    }

    /**
     * Removes, in place, every entry whose last-access timestamp is older than
     * {@code now - ttl}.
     *
     * @param stateLastAccessTime map from state key to last access time in epoch milliseconds
     * @param ttl                 maximum allowed age
     */
    public void batchExpireStates(
            Map<String, Long> stateLastAccessTime,
            Duration ttl) {
        final long cutoff = System.currentTimeMillis() - ttl.toMillis();
        // The values() view writes through to the map, so this drops the matching entries.
        stateLastAccessTime.values().removeIf(lastAccess -> lastAccess < cutoff);
    }
}
篇幅限制下面就只能给大家展示小册部分内容了。整理了一份核心面试笔记包括了:Java面试、Spring、JVM、MyBatis、Redis、MySQL、并发编程、微服务、Linux、Springboot、SpringCloud、MQ、Kafka
需要全套面试笔记及答案
【点击此处即可/免费获取】
3.2 流批统一状态管理器
java
复制
下载
/**
 * Keeps the streaming view, the batch view and the merged view of per-key
 * aggregates, together with a per-key version history.
 */
public class UnifiedStateManager {
    // Streaming state (updated in real time).
    private final MapState<String, WindowAggregate> streamingState;
    // Batch state (updated periodically).
    private final MapState<String, BatchAggregate> batchState;
    // Merged state (unified stream + batch view).
    private final MapState<String, UnifiedAggregate> unifiedState;
    // Version history per key.
    private final MapState<String, List<StateVersion>> stateVersions;

    /**
     * Merges the current streaming and batch aggregates of {@code key}.
     *
     * @return the result of {@code UnifiedAggregate.merge} on the two aggregates
     */
    public UnifiedAggregate computeUnifiedResult(String key) {
        WindowAggregate streamAgg = streamingState.get(key);
        BatchAggregate batchAgg = batchState.get(key);
        return UnifiedAggregate.merge(streamAgg, batchAgg);
    }

    /**
     * Applies {@code event} to the streaming or batch state of {@code key}, appends a
     * new entry to the key's version history and triggers the unified-state refresh.
     *
     * @param isStreaming {@code true} to update the streaming state, {@code false} for batch
     */
    public synchronized void updateState(
            String key,
            Event event,
            boolean isStreaming) {
        // Wall-clock milliseconds serve as the version number.
        long version = System.currentTimeMillis();
        if (isStreaming) {
            streamingState.put(key,
                updateStreamingState(key, event, version));
        } else {
            batchState.put(key,
                updateBatchState(key, event, version));
        }

        // Append to the history. The state holds a List<StateVersion> per key, so
        // storing a bare StateVersion neither type-checks nor preserves earlier
        // versions, which getStateAtTime relies on.
        List<StateVersion> history = stateVersions.get(key);
        if (history == null) {
            history = new java.util.ArrayList<>();
        }
        history.add(new StateVersion(version, isStreaming));
        stateVersions.put(key, history);

        // Refresh the unified view asynchronously.
        updateUnifiedStateAsync(key);
    }

    /**
     * Looks up the state of {@code key} as of {@code timestamp}.
     *
     * @return the state restored from the newest version whose timestamp is not after
     *         {@code timestamp}, or {@code null} if the key has no such version
     */
    public UnifiedAggregate getStateAtTime(String key, long timestamp) {
        List<StateVersion> versions = stateVersions.get(key);
        if (versions == null) {
            // Key has never been updated: no history to search.
            return null;
        }
        StateVersion targetVersion = versions.stream()
            .filter(v -> v.getTimestamp() <= timestamp)
            .max(Comparator.comparingLong(StateVersion::getTimestamp))
            .orElse(null);
        if (targetVersion == null) {
            return null;
        }
        return restoreHistoricalState(key, targetVersion);
    }
}
3.3 状态后端实现
java
复制
下载
/**
 * Unified state backend: delegates keyed-state creation to one of three
 * backends and resolves checkpoint pointers by URI scheme.
 *
 * <p>NOTE(review): the {@code (...)} argument lists below are placeholders from
 * the article, not compilable Java.
 */
public class UnifiedStateBackend implements StateBackend {
    private final MemoryStateBackend memoryBackend; // in-memory state
    private final RocksDBStateBackend rocksDBBackend; // on-disk state
    private final ExternalStateBackend externalBackend; // external state
    /**
     * Chooses the backend from {@code operatorIdentifier}: small state goes to the
     * memory backend, large state to RocksDB, anything else to an external keyed backend.
     */
    @Override
    public <K> AbstractKeyedStateBackend<K> createKeyedStateBackend(
        Environment env,
        JobID jobID,
        String operatorIdentifier,
        TypeSerializer<K> keySerializer,
        int numberOfKeyGroups,
        KeyGroupRange keyGroupRange,
        TaskKvStateRegistry kvStateRegistry,
        TtlTimeProvider ttlTimeProvider) {
        // Select the backend by state category.
        if (isSmallState(operatorIdentifier)) {
            return memoryBackend.createKeyedStateBackend(...);
        } else if (isLargeState(operatorIdentifier)) {
            return rocksDBBackend.createKeyedStateBackend(...);
        } else {
            return new ExternalKeyedStateBackend(...);
        }
    }
    /**
     * Resolves a checkpoint pointer to a storage location by its URI prefix
     * ({@code hdfs://}, {@code s3://} or {@code mysql://}).
     *
     * @throws IOException if the pointer uses none of the supported prefixes
     */
    @Override
    public CompletedCheckpointStorageLocation resolveCheckpoint(
        String pointer) throws IOException {
        // Multiple storage locations are supported.
        if (pointer.startsWith("hdfs://")) {
            return new HdfsCheckpointStorageLocation(pointer);
        } else if (pointer.startsWith("s3://")) {
            return new S3CheckpointStorageLocation(pointer);
        } else if (pointer.startsWith("mysql://")) {
            return new DatabaseCheckpointStorageLocation(pointer);
        }
        throw new IOException("Unsupported checkpoint storage: " + pointer);
    }
}
/**
 * External state backend: creates keyed backends that share one storage client
 * and one local byte cache.
 *
 * <p>NOTE(review): the {@code (...)} parameter list is a placeholder from the article;
 * the body references {@code keySerializer}, {@code numberOfKeyGroups} and
 * {@code keyGroupRange}, which presumably come from that elided list.
 */
public class ExternalStateBackend extends AbstractStateBackend {
    private final StateStorageClient storageClient;
    private final Cache<String, byte[]> localCache;
    /** Builds an {@code ExternalKeyedStateBackend} wired to this backend's client and cache. */
    @Override
    public ExternalKeyedStateBackend createKeyedStateBackend(...) {
        return new ExternalKeyedStateBackend(
            storageClient,
            localCache,
            keySerializer,
            numberOfKeyGroups,
            keyGroupRange
        );
    }
}
// 外部状态后端实现
public class ExternalKeyedStateBackend<K> extends AbstractKeyedStateBackend<K> {
private final StateStorageClient client;
private final Cache<String, byte[]> cache;
@Override
public <N> KvState<N> createState(
TypeSerializer<N> namespaceSerializer,
StateDescriptor<?, ?> stateDesc) {
String stateId = generateStateId(stateDesc);
return new ExternalKvState<>(
client,
cache,
stateId,
namespaceSerializer,
stateDesc.getSerializer(),
getKeySerializer()
);
}
// 异步状态访问
@Override
public CompletableFuture<Void> asyncSnapshot(
long checkpointId,
long timestamp,
CheckpointStreamFactory streamFactory,
CheckpointOptions checkpointOptions) {
return CompletableFuture.runAsync(() -> {
// 1. 本地状态序列化
byte[] localState = serializeLocalState();
// 2. 异步写入外部存储
CompletableFuture<Void> writeFuture =
client.writeAsync(checkpointId, localState);
// 3. 更新元数据
writeFuture.thenAccept(v ->
updateCheckpointMetadata(checkpointId, timestamp)
);
});
}
}
四、流批统一数据源
4.1 统一数据源实现
java
复制
下载
/**
 * Source factory: creates a source from a {@code SourceConfig} according to its type.
 */
public class UnifiedSourceFactory {

    /**
     * Creates the source matching {@code config.getType()}.
     *
     * @param config source configuration; its type selects the implementation
     * @param mode   runtime mode forwarded to the concrete factory method
     * @throws IllegalArgumentException if the configured type is not supported
     */
    public static <T> Source<T, ?, ?> createSource(
            SourceConfig config,
            RuntimeMode mode) {
        switch (config.getType()) {
            case "kafka":
                return createKafkaSource(config, mode);
            case "hdfs":
                return createHdfsSource(config, mode);
            case "jdbc":
                return createJdbcSource(config, mode);
            case "hybrid":
                return createHybridSource(config, mode);
            default:
                // Name the offending value so misconfiguration is diagnosable.
                throw new IllegalArgumentException(
                    "Unsupported source type: " + config.getType());
        }
    }

    /**
     * Hybrid source: reads the historical files first, then switches to Kafka.
     *
     * <p>{@code HybridSource.builder} takes the first source as its argument and later
     * sources are appended with {@code addSource(source)}; boundedness is a property
     * of each source, not an argument to the builder.
     *
     * <p>NOTE(review): the {@code FileSource} / {@code KafkaSource} constructions are
     * kept from the article as placeholders — the real connectors are created through
     * their own builders; confirm against the connector version in use.
     */
    private static <T> Source<T, ?, ?> createHybridSource(
            SourceConfig config,
            RuntimeMode mode) {
        // Phase 1: bounded read of historical data.
        Source<T, ?, ?> historical = new FileSource<>(config.getBatchPath());
        // Phase 2: unbounded read of real-time data.
        Source<T, ?, ?> realtime = new KafkaSource<>(config.getKafkaConfig());
        return HybridSource.builder(historical)
            .addSource(realtime)
            .build();
    }
}
/**
 * Unified data source interface: stream and batch views of the same data,
 * plus metadata and data-quality accessors.
 *
 * @param <T> element type produced by the source
 */
public interface UnifiedDataSource<T> {
    /** Exposes the source as a stream on the given streaming environment. */
    DataStream<T> asStream(StreamExecutionEnvironment env);
    /** Exposes the source as a batch data set on the given batch environment. */
    DataSet<T> asBatch(ExecutionEnvironment env);
    /** Returns the metadata describing this source. */
    DataSourceMetadata getMetadata();
    /** Runs a data-quality check and returns its metrics. */
    DataQualityMetrics checkQuality();
}
/**
 * Example implementation: a file source readable as a stream or as a batch.
 *
 * <p>NOTE(review): no {@code getMetadata} / {@code checkQuality} implementations are
 * visible in this snippet — confirm they exist in the full source.
 *
 * @param <T> element type produced by the source
 */
public class UnifiedFileSource<T> implements UnifiedDataSource<T> {
    private final String basePath;
    private final FileFormat format;
    private final TypeInformation<T> typeInfo;
    /**
     * Streaming read: reads {@code basePath} in {@code PROCESS_CONTINUOUSLY} mode,
     * i.e. the directory is monitored for changes.
     */
    @Override
    public DataStream<T> asStream(StreamExecutionEnvironment env) {
        return env.readFile(
            new StreamingFileSource<>(basePath, format),
            basePath,
            FileProcessingMode.PROCESS_CONTINUOUSLY,
            1000 // monitoring interval (presumably milliseconds — confirm)
        ).returns(typeInfo);
    }
    /** Batch read: reads the full contents under {@code basePath}. */
    @Override
    public DataSet<T> asBatch(ExecutionEnvironment env) {
        return env.readFile(
            new BulkFileSource<>(basePath, format),
            basePath
        ).returns(typeInfo);
    }
}
4.2 增量与全量统一读取
java
复制
下载
/**
 * Source wrapper that resumes from the last recorded position (incremental read)
 * and can also replay everything from the beginning (full read).
 *
 * <p>NOTE(review): {@code seek}, {@code nextRecord} and {@code getPosition} are called
 * on {@code baseSource}; they are kept as written in the article — confirm the actual
 * type of the wrapped source provides them.
 *
 * @param <T> record type
 */
public class IncrementalSource<T> implements SourceFunction<T> {
    private final StateStorage stateStorage;
    private final SourceFunction<T> baseSource;
    private volatile boolean isRunning = true;

    /**
     * Emits records starting from the stored {@code "last_position"}, if any, until
     * the base source is exhausted or {@link #cancel()} is called. The position is
     * stored after every record, committed periodically, and committed once more
     * when the loop ends.
     */
    @Override
    public void run(SourceContext<T> ctx) throws Exception {
        // 1. Resume from the last processed position, if one was recorded.
        String lastPosition = stateStorage.get("last_position");
        if (lastPosition != null) {
            baseSource.seek(lastPosition);
        }

        // 2. Read incrementally.
        while (isRunning) {
            T record = baseSource.nextRecord();
            if (record == null) {
                break;
            }
            // Emit and advance the position under the checkpoint lock so a checkpoint
            // never observes a record without its position, or vice versa.
            synchronized (ctx.getCheckpointLock()) {
                ctx.collect(record);
                stateStorage.put("last_position", baseSource.getPosition());
            }
            // Commit periodically.
            if (shouldCommitState()) {
                stateStorage.commit();
            }
        }

        // 3. Final commit: without it, positions recorded since the last periodic
        //    commit would be lost when the source finishes or is cancelled.
        stateStorage.commit();
    }

    /** Signals {@link #run} to stop after the record currently being processed. */
    @Override
    public void cancel() {
        isRunning = false;
    }

    /**
     * Full replay: rewinds the base source to {@code "beginning"}, clears the stored
     * position and runs again.
     */
    public void replayFullData(SourceContext<T> ctx) throws Exception {
        baseSource.seek("beginning");
        stateStorage.delete("last_position");
        run(ctx);
    }
}
五、处理逻辑统一
5.1 统一处理函数
java
复制
下载
/**
 * Base class for unified process functions: dispatches each element and each
 * timer to a batch-specific or a stream-specific hook depending on the runtime mode.
 *
 * <p>NOTE(review): {@code runtimeMode} is declared as {@code RuntimeMode} but compared
 * against {@code RuntimeExecutionMode} constants, and {@code handleBatchTimer} /
 * {@code handleStreamTimer} are not declared in this snippet — confirm the types and
 * where the hooks are defined.
 *
 * @param <IN>  input element type
 * @param <OUT> output element type
 */
public abstract class UnifiedProcessFunction<IN, OUT>
    extends ProcessFunction<IN, OUT> {
    // Runtime mode; assigned in open().
    protected RuntimeMode runtimeMode;
    // State storage; assigned in open().
    protected transient MapState<String, Object> state;
    /**
     * Obtains the {@code "state"} map state and reads the execution mode from the
     * execution config. Subclasses that override this must call it, otherwise
     * {@code runtimeMode} and {@code state} stay unset.
     */
    @Override
    public void open(Configuration parameters) {
        // Initialise state.
        MapStateDescriptor<String, Object> descriptor =
            new MapStateDescriptor<>("state", String.class, Object.class);
        state = getRuntimeContext().getMapState(descriptor);
        // Read the runtime mode.
        runtimeMode = getRuntimeContext().getExecutionConfig()
            .getExecutionMode();
    }
    /** Routes the element to the batch hook in BATCH mode, otherwise to the stream hook. */
    @Override
    public void processElement(IN value, Context ctx, Collector<OUT> out)
        throws Exception {
        if (runtimeMode == RuntimeExecutionMode.BATCH) {
            processBatchElement(value, ctx, out);
        } else {
            processStreamElement(value, ctx, out);
        }
    }
    /** Batch-mode handling of one element. */
    protected abstract void processBatchElement(
        IN value, Context ctx, Collector<OUT> out) throws Exception;
    /** Stream-mode handling of one element. */
    protected abstract void processStreamElement(
        IN value, Context ctx, Collector<OUT> out) throws Exception;
    /** Routes a fired timer to the batch or stream timer handler by runtime mode. */
    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<OUT> out)
        throws Exception {
        if (runtimeMode == RuntimeExecutionMode.BATCH) {
            // Batch mode may not need timers at all.
            handleBatchTimer(timestamp, ctx, out);
        } else {
            handleStreamTimer(timestamp, ctx, out);
        }
    }
}
/**
 * Concrete example: per-user order statistics, updated per element in streaming
 * mode and accumulated for end-of-input output in batch mode.
 *
 * <p>NOTE(review): {@code state} is declared in the base class as
 * {@code MapState<String, Object>}, yet its values are assigned to {@code OrderStats}
 * without a cast; and {@code finish()} uses {@code out}, which is not a parameter or
 * visible field here — confirm both against the full source.
 */
public class OrderStatisticsFunction
    extends UnifiedProcessFunction<OrderEvent, OrderStats> {
    /**
     * Streaming path: updates the user's stats, emits them immediately and registers
     * an event-time timer for the next window boundary.
     */
    @Override
    protected void processStreamElement(
        OrderEvent order, Context ctx, Collector<OrderStats> out)
        throws Exception {
        String userId = order.getUserId();
        long currentTime = ctx.timestamp();
        // Update the real-time state, creating it on first sight of the user.
        OrderStats stats = state.get(userId);
        if (stats == null) {
            stats = new OrderStats(userId);
        }
        stats.updateRealtime(order);
        state.put(userId, stats);
        // Emit right away.
        out.collect(stats);
        // Register the window timer.
        ctx.timerService().registerEventTimeTimer(
            getNextWindowTime(currentTime)
        );
    }
    /**
     * Batch path: accumulates into the user's stats without emitting; output is
     * deferred to {@code finish()}.
     */
    @Override
    protected void processBatchElement(
        OrderEvent order, Context ctx, Collector<OrderStats> out)
        throws Exception {
        String userId = order.getUserId();
        // Accumulate into the batch state.
        OrderStats stats = state.get(userId);
        if (stats == null) {
            stats = new OrderStats(userId);
        }
        stats.updateBatch(order);
        state.put(userId, stats);
        // Output happens at the end of the batch (in finish()).
    }
    /** In BATCH mode, emits the stats of every key held in {@code state}. */
    @Override
    public void finish() throws Exception {
        if (runtimeMode == RuntimeExecutionMode.BATCH) {
            // End of batch: emit all results.
            for (String userId : state.keys()) {
                OrderStats stats = state.get(userId);
                out.collect(stats);
            }
        }
    }
}
篇幅限制下面就只能给大家展示小册部分内容了。整理了一份核心面试笔记包括了:Java面试、Spring、JVM、MyBatis、Redis、MySQL、并发编程、微服务、Linux、Springboot、SpringCloud、MQ、Kafka
需要全套面试笔记及答案
【点击此处即可/免费获取】
5.2 窗口处理统一
java
复制
下载
/**
 * Unified window assigner: assigns tumbling, sliding or session windows
 * depending on the configured window type.
 *
 * @param <T> element type
 */
public class UnifiedWindowAssigner<T> extends WindowAssigner<T> {
    private final WindowType windowType;
    private final Duration size;
    private final Duration slide;

    /**
     * Returns the windows that {@code timestamp} falls into.
     *
     * @throws IllegalStateException if the window type is none of TUMBLING, SLIDING, SESSION
     */
    @Override
    public Collection<TimeWindow> assignWindows(
            T element,
            long timestamp,
            WindowAssignerContext context) {
        if (windowType == WindowType.TUMBLING) {
            // Tumbling: exactly one window of length `size`, aligned at offset 0.
            long sizeMs = size.toMillis();
            long begin = TimeWindow.getWindowStartWithOffset(timestamp, 0, sizeMs);
            return Collections.singletonList(new TimeWindow(begin, begin + size.toMillis()));
        }
        if (windowType == WindowType.SLIDING) {
            // Sliding: walk backwards from the latest slide-aligned start while the
            // window still covers the timestamp.
            List<TimeWindow> assigned = new ArrayList<>();
            long begin = TimeWindow.getWindowStartWithOffset(timestamp, 0, slide.toMillis());
            while (begin > timestamp - size.toMillis()) {
                assigned.add(new TimeWindow(begin, begin + size.toMillis()));
                begin -= slide.toMillis();
            }
            return assigned;
        }
        if (windowType == WindowType.SESSION) {
            // Session: windows are derived dynamically from the gap between events.
            return assignSessionWindows(element, timestamp);
        }
        throw new IllegalStateException("Unsupported window type");
    }
}
/**
 * Unified window trigger: decides per element whether to fire, based on event
 * time, processing time or an element count.
 *
 * <p>NOTE(review): {@code windowSize} used in the COUNT branch is not declared in this
 * snippet, and the count state is never reset after firing, so once the threshold is
 * reached every later element fires again — confirm both are intended.
 *
 * @param <T> element type
 */
public class UnifiedTrigger<T> extends Trigger<T, TimeWindow> {
    private final TriggerType triggerType;
    /**
     * Handles one element for {@code window}.
     *
     * @return {@code FIRE} when the trigger condition is met, otherwise {@code CONTINUE}
     */
    @Override
    public TriggerResult onElement(
        T element,
        long timestamp,
        TimeWindow window,
        TriggerContext ctx) throws Exception {
        if (triggerType == TriggerType.EVENT_TIME) {
            // Event time: fire at once if the watermark already passed the window end,
            // otherwise register a timer at the window's max timestamp.
            if (window.maxTimestamp() <= ctx.getCurrentWatermark()) {
                return TriggerResult.FIRE;
            } else {
                ctx.registerEventTimeTimer(window.maxTimestamp());
                return TriggerResult.CONTINUE;
            }
        } else if (triggerType == TriggerType.PROCESSING_TIME) {
            // Processing time: register a timer at the window's max timestamp.
            ctx.registerProcessingTimeTimer(window.maxTimestamp());
            return TriggerResult.CONTINUE;
        } else if (triggerType == TriggerType.COUNT) {
            // Count: increment the per-window counter and fire at the threshold.
            ValueState<Long> countState = ctx.getPartitionedState(
                new ValueStateDescriptor<>("count", Long.class));
            Long count = countState.value();
            if (count == null) {
                count = 0L;
            }
            count++;
            countState.update(count);
            if (count >= windowSize) {
                return TriggerResult.FIRE;
            }
            return TriggerResult.CONTINUE;
        }
        // Unknown trigger type: never fires from onElement.
        return TriggerResult.CONTINUE;
    }
}
/**
 * Unified window function: forwards each window evaluation to a streaming or a
 * batch implementation, chosen by the configured mode.
 */
public class UnifiedWindowFunction<IN, OUT, KEY, W extends Window>
        implements WindowFunction<IN, OUT, KEY, W> {
    private final WindowFunction<IN, OUT, KEY, W> streamingFunction;
    private final WindowFunction<IN, OUT, KEY, W> batchFunction;
    private final RuntimeMode mode;

    /** Applies the streaming function in STREAMING mode and the batch function otherwise. */
    @Override
    public void apply(KEY key, W window, Iterable<IN> input, Collector<OUT> out)
            throws Exception {
        WindowFunction<IN, OUT, KEY, W> delegate =
            (mode == RuntimeExecutionMode.STREAMING) ? streamingFunction : batchFunction;
        delegate.apply(key, window, input, out);
    }
}
六、状态一致性与容错
6.1 检查点与保存点统一
java
复制
下载
/**
 * Coordinates checkpoints for both execution modes: triggers snapshots,
 * finalizes them and builds a mode-specific checkpoint configuration.
 */
public class UnifiedCheckpointCoordinator {
    private final CheckpointStorage checkpointStorage;
    private final StateBackend stateBackend;
    private final Map<Long, CheckpointMetadata> checkpoints;

    /**
     * Triggers a checkpoint: prepare, snapshot all state asynchronously, finalize,
     * then clean up old checkpoints.
     *
     * <p>The returned future always completes: exceptionally if the snapshot fails or
     * if finalization throws, normally with the completed checkpoint otherwise.
     * Previously an exception from finalization was swallowed by the callback and
     * the future never completed.
     *
     * @return a future yielding the completed checkpoint
     */
    public CompletableFuture<CompletedCheckpoint> triggerCheckpoint(
            long checkpointId,
            CheckpointProperties props) {
        CompletableFuture<CompletedCheckpoint> future =
            new CompletableFuture<>();

        // 1. Prepare.
        prepareCheckpoint(checkpointId);

        // 2. Snapshot all state asynchronously.
        CompletableFuture<Void> snapshotFuture =
            snapshotAllStates(checkpointId);
        snapshotFuture.whenComplete((v, error) -> {
            if (error != null) {
                future.completeExceptionally(error);
                return;
            }
            // 3. Finalize; a failure here must reach the caller through the future.
            try {
                future.complete(finalizeCheckpoint(checkpointId, props));
            } catch (RuntimeException e) {
                future.completeExceptionally(e);
                return;
            }
            // 4. Clean up old checkpoints only after a successful finalization.
            cleanupOldCheckpoints();
        });
        return future;
    }

    /**
     * Builds the checkpoint configuration for {@code mode}: frequent exactly-once
     * checkpoints for streaming, a single at-least-once checkpoint with a long
     * timeout otherwise.
     */
    private CheckpointConfig createCheckpointConfig(RuntimeMode mode) {
        CheckpointConfig config = new CheckpointConfig();
        if (mode == RuntimeExecutionMode.STREAMING) {
            // Streaming: frequent checkpoints.
            config.setCheckpointInterval(60000); // 1 minute
            config.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
            config.setMinPauseBetweenCheckpoints(30000);
            config.setCheckpointTimeout(600000);
            config.setTolerableCheckpointFailureNumber(2);
        } else {
            // Batch: effectively no periodic checkpoints.
            config.setCheckpointInterval(Long.MAX_VALUE);
            config.setCheckpointingMode(CheckpointingMode.AT_LEAST_ONCE);
            config.setCheckpointTimeout(3600000); // 1 hour
        }
        return config;
    }
}
/**
 * Unified state recovery: restores a job either from a full checkpoint/savepoint
 * or from a set of per-operator state blobs.
 */
public class UnifiedStateRecovery {

    /**
     * Full recovery: loads the checkpoint named by {@code restorePath}, restores
     * operator state, then keyed state, then adjusts the runtime configuration.
     */
    public void recoverJob(JobGraph jobGraph, String restorePath) {
        RestoreInfo info = parseRestorePath(restorePath);
        CompletedCheckpoint loaded =
            checkpointStorage.loadCheckpoint(info.getCheckpointId());

        restoreOperatorStates(jobGraph, loaded);
        restoreKeyedStates(jobGraph, loaded);
        adjustRuntimeConfig(jobGraph, info);
    }

    /**
     * Partial recovery: for every entry, locates the operator's vertex, deserializes
     * the state bytes and restores them onto that vertex.
     *
     * @param partialStates serialized operator state keyed by operator id
     */
    public void partialStateRecovery(
            JobGraph jobGraph,
            Map<String, byte[]> partialStates) {
        for (Map.Entry<String, byte[]> stateEntry : partialStates.entrySet()) {
            JobVertex target = findJobVertex(jobGraph, stateEntry.getKey());
            restoreOperatorState(target, deserializeState(stateEntry.getValue()));
        }
    }
}
6.2 端到端一致性
java
复制
下载
/**
 * Two-phase-commit sink: writes each record inside a transaction and
 * prepares / commits / rolls back the pending transactions per checkpoint.
 *
 * <p>NOTE(review): a new transaction is opened for every record, and the
 * {@code @Override} annotations on the three phase methods assume a supertype that
 * declares them — confirm both against the full source.
 *
 * @param <IN> record type
 */
public class TwoPhaseCommitSink<IN> implements SinkFunction<IN> {
    private final XASink<IN> xaSink;
    // Transient, so it is null after deserialization; always access via pending().
    private transient List<Transaction> pendingTransactions;

    /** Opens a transaction, registers it as pending and writes the record into it. */
    @Override
    public void invoke(IN value, Context context) throws Exception {
        Transaction tx = xaSink.beginTransaction();
        pending().add(tx);
        xaSink.write(value, tx);
    }

    /** Pre-commit phase: prepares every pending transaction. */
    @Override
    public void prepareCommit(long checkpointId) throws Exception {
        for (Transaction tx : pending()) {
            xaSink.prepareCommit(tx);
        }
    }

    /**
     * Commit phase: commits every pending transaction. Each transaction is removed
     * as soon as it is committed, so a failure part-way through does not cause the
     * already-committed ones to be committed again on retry.
     */
    @Override
    public void commit(long checkpointId) throws Exception {
        java.util.Iterator<Transaction> it = pending().iterator();
        while (it.hasNext()) {
            xaSink.commit(it.next());
            it.remove();
        }
    }

    /**
     * Abort phase: rolls back every pending transaction, removing each one once its
     * rollback has succeeded.
     */
    @Override
    public void abort(long checkpointId) throws Exception {
        java.util.Iterator<Transaction> it = pending().iterator();
        while (it.hasNext()) {
            xaSink.rollback(it.next());
            it.remove();
        }
    }

    /** Returns the pending-transaction list, creating it on first use. */
    private List<Transaction> pending() {
        if (pendingTransactions == null) {
            pendingTransactions = new java.util.ArrayList<>();
        }
        return pendingTransactions;
    }
}
/**
 * Idempotent sink: skips records whose id has already been recorded as
 * processed, and forwards the rest to a delegate sink.
 *
 * @param <IN> record type
 */
public class IdempotentSink<IN> implements SinkFunction<IN> {
    private final StateStorage stateStorage;
    private final SinkFunction<IN> delegate;

    /**
     * Writes {@code value} through the delegate unless its record id is already
     * stored; after a write, stores the id with the current time.
     */
    @Override
    public void invoke(IN value, Context context) throws Exception {
        String recordId = generateRecordId(value, context);

        // Already seen: nothing to do.
        if (stateStorage.exists(recordId)) {
            log.debug("Record {} already processed, skipping", recordId);
            return;
        }

        // First sight: write, then remember the id.
        delegate.invoke(value, context);
        stateStorage.put(recordId, System.currentTimeMillis());
    }

    /**
     * Deletes every stored record id whose timestamp is older than {@code now - retention}.
     */
    public void cleanupExpiredRecords(Duration retention) {
        final long cutoff = System.currentTimeMillis() - retention.toMillis();
        stateStorage.scan((id, processedAt) -> {
            if (processedAt < cutoff) {
                stateStorage.delete(id);
            }
        });
    }
}
七、实战案例:电商实时分析系统
7.1 系统架构
java
复制
下载
/**
 * E-commerce analytics job: one pipeline definition that runs in streaming mode
 * by default, or in batch mode when started with {@code --batch}.
 */
public class EcommerceAnalyticsSystem {
    // Unified source configuration: historical files plus the live Kafka topic.
    private static final SourceConfig SOURCE_CONFIG = SourceConfig.builder()
        .type("hybrid")
        .batchPath("hdfs:///data/history/orders")
        .streamConfig(KafkaConfig.builder()
            .bootstrapServers("kafka:9092")
            .topic("orders")
            .groupId("ecommerce-analytics")
            .build())
        .build();
    // Unified state configuration.
    private static final StateConfig STATE_CONFIG = StateConfig.builder()
        .backend("rocksdb")
        .checkpointDir("hdfs:///checkpoints")
        .incrementalCheckpoints(true)
        .ttl(Duration.ofDays(7))
        .build();

    /**
     * Builds the pipeline (filter, enrich, key by user, unified processing, hourly
     * window aggregation, JDBC sink) and executes it.
     *
     * @param args command-line arguments; {@code --batch} selects batch execution
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment; args decide the runtime mode.
        StreamExecutionEnvironment env = createUnifiedEnv(args);

        // 2. Create the unified source.
        // NOTE(review): createSource is declared to return Source<T, ?, ?> and
        // getRuntimeMode() is not shown on the environment — confirm these types
        // line up in the full source.
        UnifiedDataSource<OrderEvent> source =
            UnifiedSourceFactory.createSource(SOURCE_CONFIG, env.getRuntimeMode());

        // 3. Build the processing pipeline.
        DataStream<OrderEvent> orders = source.asStream(env);
        DataStream<OrderStats> stats = orders
            // Data cleansing: drop non-positive amounts.
            .filter(order -> order.getAmount() > 0)
            // Field enrichment.
            .map(order -> enrichOrder(order))
            // Per-user partitioning.
            .keyBy(order -> order.getUserId())
            // Unified processing.
            .process(new UnifiedOrderProcessor())
            // Hourly window aggregation.
            .windowAll(TumblingEventTimeWindows.of(Duration.ofHours(1)))
            .process(new HourlyAggregationFunction());

        // 4. Output.
        stats.addSink(new UnifiedSinkFactory()
            .createSink(SinkType.JDBC, getJdbcConfig()));

        // 5. Run.
        env.execute("Ecommerce Unified Analytics");
    }

    /**
     * Creates the execution environment.
     *
     * <p>Takes the command-line arguments explicitly: they are a parameter of
     * {@code main} and not otherwise visible here, and an array has no
     * {@code contains} method. Checkpointing is configured only for streaming
     * execution, because Flink's BATCH mode does not use checkpoints.
     *
     * @param args command-line arguments; {@code --batch} selects batch execution
     */
    private static StreamExecutionEnvironment createUnifiedEnv(String[] args)
            throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment
            .getExecutionEnvironment();

        // Runtime mode from the command line.
        boolean batch = java.util.Arrays.asList(args).contains("--batch");
        env.setRuntimeMode(batch
            ? RuntimeExecutionMode.BATCH
            : RuntimeExecutionMode.STREAMING);

        // State backend.
        StateBackend stateBackend = new RocksDBStateBackend(
            "hdfs:///state-backend", true);
        env.setStateBackend(stateBackend);

        // Checkpointing (streaming only).
        if (!batch) {
            env.enableCheckpointing(60000);
            env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30000);
            env.getCheckpointConfig().setCheckpointTimeout(600000);
            env.getCheckpointConfig().setTolerableCheckpointFailureNumber(2);
        }

        // Restart strategy: 3 attempts, 10 seconds apart.
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(
            3, Time.of(10, TimeUnit.SECONDS)));
        return env;
    }
}
/**
 * Unified order processor: maintains user profiles and hourly statistics, emitting
 * per element in streaming mode and at end of input in batch mode.
 */
public class UnifiedOrderProcessor
        extends UnifiedProcessFunction<OrderEvent, OrderStats> {
    // State handles; assigned in open().
    private transient MapState<String, UserProfile> userProfiles;
    private transient ValueState<HourlyStats> hourlyStats;
    private transient ListState<OrderEvent> pendingOrders;

    /**
     * Initialises the base class and then this class's own state handles.
     *
     * <p>The call to {@code super.open} is required: the base class assigns
     * {@code runtimeMode} there. Without it {@code runtimeMode} stays {@code null},
     * every element takes the streaming path and {@code finish()} never emits.
     */
    @Override
    public void open(Configuration parameters) {
        super.open(parameters);

        MapStateDescriptor<String, UserProfile> userDesc =
            new MapStateDescriptor<>("user_profiles",
                String.class, UserProfile.class);
        userProfiles = getRuntimeContext().getMapState(userDesc);

        ValueStateDescriptor<HourlyStats> hourlyDesc =
            new ValueStateDescriptor<>("hourly_stats", HourlyStats.class);
        hourlyStats = getRuntimeContext().getState(hourlyDesc);

        ListStateDescriptor<OrderEvent> pendingDesc =
            new ListStateDescriptor<>("pending_orders", OrderEvent.class);
        pendingOrders = getRuntimeContext().getListState(pendingDesc);
    }

    /**
     * Streaming path: updates the user's profile and the hourly stats, emits the
     * combined result immediately and runs the alert check.
     */
    @Override
    protected void processStreamElement(OrderEvent order, Context ctx,
            Collector<OrderStats> out) throws Exception {
        String userId = order.getUserId();

        // Update the user profile, creating it on first sight.
        UserProfile profile = userProfiles.get(userId);
        if (profile == null) {
            profile = new UserProfile(userId);
        }
        profile.updateRealtime(order);
        userProfiles.put(userId, profile);

        // Update the hourly statistics.
        HourlyStats stats = hourlyStats.value();
        if (stats == null) {
            stats = new HourlyStats(getCurrentHour());
        }
        stats.addOrder(order);
        hourlyStats.update(stats);

        // Emit right away.
        OrderStats result = new OrderStats(userId, profile, stats);
        out.collect(result);

        // Check whether an alert must be raised.
        checkAlerts(order, profile);
    }

    /**
     * Batch path: buffers the order and updates the hourly stats without emitting.
     */
    @Override
    protected void processBatchElement(OrderEvent order, Context ctx,
            Collector<OrderStats> out) throws Exception {
        // Accumulate; nothing is emitted here.
        pendingOrders.add(order);

        HourlyStats stats = hourlyStats.value();
        if (stats == null) {
            stats = new HourlyStats(getCurrentHour());
        }
        stats.addOrder(order);
        hourlyStats.update(stats);
    }

    /**
     * In BATCH mode, emits one result per buffered order.
     *
     * <p>NOTE(review): {@code out} is not a parameter or a visible field of this
     * method — confirm where the collector comes from in the full source.
     */
    @Override
    public void finish() throws Exception {
        if (runtimeMode == RuntimeExecutionMode.BATCH) {
            for (OrderEvent order : pendingOrders.get()) {
                String userId = order.getUserId();
                UserProfile profile = userProfiles.get(userId);
                OrderStats result = new OrderStats(userId, profile,
                    hourlyStats.value());
                out.collect(result);
            }
        }
    }

    /**
     * Removes user profiles that have been inactive for more than 30 days.
     *
     * <p>Removal goes through the state iterator: removing entries from the map
     * state while looping over {@code keys()} modifies the collection being iterated.
     */
    private void cleanupExpiredStates() throws Exception {
        long expireTime = System.currentTimeMillis() - Duration.ofDays(30).toMillis();
        java.util.Iterator<java.util.Map.Entry<String, UserProfile>> entries =
            userProfiles.iterator();
        while (entries.hasNext()) {
            UserProfile profile = entries.next().getValue();
            if (profile != null && profile.getLastActiveTime() < expireTime) {
                entries.remove();
            }
        }
    }
}
八、性能优化
8.1 状态访问优化
java
复制
下载
/**
 * State access helpers: cache-first bulk reads, write-behind updates and
 * gzip compression of serialized state.
 */
public class StateAccessOptimizer {
    // Local cache in front of the remote state storage.
    private final Cache<String, Object> stateCache;

    /**
     * Bulk read: serves what it can from the cache, fetches the remaining keys from
     * the state storage in one call, and caches what was fetched.
     *
     * @return the cached values plus everything the storage returned for the misses
     */
    public Map<String, Object> batchGetStates(Collection<String> keys) {
        Map<String, Object> found = new HashMap<>();
        List<String> misses = new ArrayList<>();

        // 1. Cache pass.
        for (String key : keys) {
            Object cached = stateCache.getIfPresent(key);
            if (cached == null) {
                misses.add(key);
                continue;
            }
            found.put(key, cached);
        }
        if (misses.isEmpty()) {
            return found;
        }

        // 2. One remote call for all misses, then backfill the cache.
        Map<String, Object> fetched = stateStorage.batchGet(misses);
        found.putAll(fetched);
        stateCache.putAll(fetched);
        return found;
    }

    /**
     * Updates the cache synchronously and writes to the state storage asynchronously.
     *
     * @return a future tracking the remote write
     */
    public CompletableFuture<Void> asyncUpdateState(
            String key, Object value) {
        stateCache.put(key, value);
        return CompletableFuture.runAsync(() -> stateStorage.put(key, value));
    }

    /**
     * Serializes {@code state} with Java serialization and gzips the bytes.
     *
     * @return the compressed bytes
     * @throws IOException if serialization or compression fails
     */
    public byte[] compressState(Object state) throws IOException {
        byte[] compressed;
        try (ByteArrayOutputStream buffer = new ByteArrayOutputStream();
             GZIPOutputStream gzipOut = new GZIPOutputStream(buffer);
             ObjectOutputStream objectOut = new ObjectOutputStream(gzipOut)) {
            objectOut.writeObject(state);
            objectOut.flush();
            // finish() writes the gzip trailer so the buffer holds a complete stream.
            gzipOut.finish();
            compressed = buffer.toByteArray();
        }
        return compressed;
    }
}
/**
 * State partitioning helpers: key-to-partition mapping, hot-key detection
 * and hot-key splitting.
 */
public class StatePartitioner {

    /**
     * Maps a key to a partition index in {@code [0, numPartitions)}.
     *
     * @param key           key to partition; {@code null} hashes as 0
     * @param numPartitions number of partitions, must be positive
     * @return the partition index
     * @throws IllegalArgumentException if {@code numPartitions} is not positive
     */
    public static <K> int getPartition(K key, int numPartitions) {
        if (numPartitions <= 0) {
            throw new IllegalArgumentException(
                "numPartitions must be positive: " + numPartitions);
        }
        int hashCode = (key == null) ? 0 : key.hashCode();
        // Fold the high bits into the low bits to reduce collisions.
        hashCode = hashCode ^ (hashCode >>> 16);
        // Clear the sign bit so the remainder is non-negative.
        return (hashCode & Integer.MAX_VALUE) % numPartitions;
    }

    /**
     * Returns the keys whose share of all accesses is strictly greater than
     * {@code threshold}.
     *
     * <p>The total is accumulated as a {@code long}: an {@code int} sum overflows once
     * the counts add up to more than {@code Integer.MAX_VALUE}, which turns every
     * share negative and hides all hot keys.
     *
     * @param accessCounts access count per key; {@code null} counts are ignored
     * @param threshold    share of total accesses (0.0 to 1.0) above which a key is hot
     * @return a new map holding the hot keys and their counts; empty if there were no accesses
     */
    public static <K> Map<K, Integer> detectHotKeys(
            Map<K, Integer> accessCounts,
            double threshold) {
        Map<K, Integer> hotKeys = new HashMap<>();
        long totalAccess = 0L;
        for (Integer count : accessCounts.values()) {
            if (count != null) {
                totalAccess += count;
            }
        }
        if (totalAccess <= 0L) {
            return hotKeys;
        }
        for (Map.Entry<K, Integer> entry : accessCounts.entrySet()) {
            Integer count = entry.getValue();
            if (count == null) {
                continue;
            }
            double accessRate = (double) count / totalAccess;
            if (accessRate > threshold) {
                hotKeys.put(entry.getKey(), count);
            }
        }
        return hotKeys;
    }

    /**
     * Splits a hot key into {@code splits} sub-keys named {@code <key>_0 .. <key>_(splits-1)}.
     *
     * <p>The sub-keys are {@code String}s. The returned list is only type-correct when
     * {@code K} is {@code String} (or a supertype of it); for any other {@code K} the
     * caller gets a {@code ClassCastException} on element access.
     *
     * @param hotKey key to split, must not be {@code null}
     * @param splits number of sub-keys; values below 1 yield an empty list
     * @return the generated sub-keys in index order
     */
    public static <K> List<K> splitHotKey(K hotKey, int splits) {
        if (hotKey == null) {
            throw new NullPointerException("hotKey");
        }
        List<K> splitKeys = new ArrayList<>(Math.max(splits, 0));
        String prefix = hotKey.toString() + "_";
        for (int i = 0; i < splits; i++) {
            // Unchecked by design: see the method contract above.
            @SuppressWarnings("unchecked")
            K splitKey = (K) (prefix + i);
            splitKeys.add(splitKey);
        }
        return splitKeys;
    }
}
篇幅限制下面就只能给大家展示小册部分内容了。整理了一份核心面试笔记包括了:Java面试、Spring、JVM、MyBatis、Redis、MySQL、并发编程、微服务、Linux、Springboot、SpringCloud、MQ、Kafka
需要全套面试笔记及答案
【点击此处即可/免费获取】
九、监控与运维
9.1 状态监控
java
复制
下载
/**
 * State monitor: REST endpoints for state metrics, on-demand snapshots and
 * cleanup of expired state.
 */
@RestController
public class StateMonitorController {
    @Autowired
    private StateBackend stateBackend;
    /** Collects the current state metrics: sizes, operation rates, cache hit rate and latencies. */
    @GetMapping("/api/state/metrics")
    public StateMetrics getStateMetrics() {
        StateMetrics metrics = new StateMetrics();
        // State size
        metrics.setTotalStateSize(getTotalStateSize());
        metrics.setKeyedStateSize(getKeyedStateSize());
        metrics.setOperatorStateSize(getOperatorStateSize());
        // Access rate
        metrics.setReadOpsPerSecond(getReadOps());
        metrics.setWriteOpsPerSecond(getWriteOps());
        // Cache hit rate
        metrics.setCacheHitRate(getCacheHitRate());
        // Latency
        metrics.setAverageReadLatency(getAvgReadLatency());
        metrics.setAverageWriteLatency(getAvgWriteLatency());
        return metrics;
    }
    /**
     * Serializes all state, stores it in the state backend under an id taken from the
     * current time in milliseconds, and returns the id together with the byte length.
     *
     * <p>NOTE(review): this endpoint writes a snapshot but is mapped as GET — confirm
     * that is intended.
     */
    @GetMapping("/api/state/snapshot")
    public StateSnapshot takeStateSnapshot() {
        // Snapshot id
        long snapshotId = System.currentTimeMillis();
        // Serialize the state
        byte[] snapshotData = serializeAllStates();
        // Store the snapshot
        stateBackend.storeSnapshot(snapshotId, snapshotData);
        return new StateSnapshot(snapshotId, snapshotData.length);
    }
    /**
     * Asks the state backend to clean up state older than {@code ttl} and reports
     * the number of cleaned keys and the elapsed time in milliseconds.
     */
    @PostMapping("/api/state/cleanup")
    public CleanupResult cleanupExpiredStates(@RequestParam Duration ttl) {
        CleanupResult result = new CleanupResult();
        long startTime = System.currentTimeMillis();
        // Clean up expired state
        int cleanedKeys = stateBackend.cleanupExpired(ttl);
        result.setCleanedKeys(cleanedKeys);
        result.setDuration(System.currentTimeMillis() - startTime);
        result.setSuccess(true);
        return result;
    }
}
/**
 * State alerting: periodically compares state metrics against thresholds and
 * sends an alert for each violation.
 *
 * <p>NOTE(review): {@code stateMonitor}, {@code sendAlert} and the threshold constants
 * are not declared in this snippet — confirm where they are defined.
 */
@Component
public class StateAlertManager {
    /** Runs at a fixed rate of 60000 ms and checks size, read latency and cache hit rate. */
    @Scheduled(fixedRate = 60000) // check once per minute
    public void checkStateAlerts() {
        StateMetrics metrics = stateMonitor.getStateMetrics();
        // State size above the maximum
        if (metrics.getTotalStateSize() > MAX_STATE_SIZE) {
            sendAlert("状态大小超出阈值: " + metrics.getTotalStateSize());
        }
        // Read latency above the maximum
        if (metrics.getAverageReadLatency() > MAX_LATENCY) {
            sendAlert("状态读取延迟过高: " + metrics.getAverageReadLatency());
        }
        // Cache hit rate below the minimum
        if (metrics.getCacheHitRate() < MIN_CACHE_HIT_RATE) {
            sendAlert("缓存命中率过低: " + metrics.getCacheHitRate());
        }
    }
}
十、总结
10.1 流批一体架构核心价值
-
代码统一:一套代码处理流批场景
-
状态统一:流批共享状态,结果一致
-
运维统一:统一监控、告警、调试
-
成本优化:资源复用,减少重复计算
10.2 关键实现技术
-
统一API层:抽象流批差异
-
分层状态存储:根据访问模式选择存储介质
-
增量-全量融合:历史数据批量+实时数据流式
-
一致性保证:检查点、事务、幂等性
10.3 最佳实践
-
状态设计:
-
区分冷热数据,热数据放内存
-
设置合理的TTL
-
监控状态大小和访问模式
-
-
资源管理:
-
流批作业资源隔离
-
动态资源调整
-
状态存储分级
-
-
容错设计:
-
定期状态快照
-
多版本状态管理
-
优雅降级策略
-
流批一体是现代大数据架构的必然趋势,通过统一的状态管理和处理逻辑,可以显著降低系统复杂度,提高开发效率和运维便利性。