Flink Runtime 开发指南
开发环境设置
1. 环境要求
- Java: JDK 8/11/17 (推荐 JDK 11)
- Maven: 3.6+
- IDE: IntelliJ IDEA 或 Eclipse
- Git: 版本控制
2. 项目导入
# Clone the project
git clone https://github.com/apache/flink.git
cd flink
# Compile the project
mvn clean compile -DskipTests
# Import into the IDE
# In IntelliJ IDEA: File -> Open -> select the flink directory
3. 模块结构
flink-runtime/
├── src/
│ ├── main/java/org/apache/flink/runtime/
│ │ ├── jobmaster/ # 作业管理器
│ │ ├── taskexecutor/ # 任务执行器
│ │ ├── resourcemanager/ # 资源管理器
│ │ ├── scheduler/ # 调度器
│ │ ├── executiongraph/ # 执行图
│ │ ├── state/ # 状态管理
│ │ ├── checkpoint/ # 检查点
│ │ ├── net/ # 网络通信
│ │ ├── highavailability/ # 高可用性
│ │ └── ... # 其他组件
│ └── test/java/ # 测试代码
├── pom.xml # Maven 配置
└── README.md # 模块说明
代码规范
1. 命名规范
// Class names: PascalCase
// NOTE(review): this snippet only illustrates naming; the constructor that would
// assign the final field and the method's return statement are elided.
public class JobMaster {
// Constants: UPPER_SNAKE_CASE
public static final String DEFAULT_JOB_NAME = "default";
// Field names: camelCase
private final JobMasterGateway gateway;
// Method names: camelCase
public CompletableFuture<JobResult> submitJob(JobGraph jobGraph) {
// Implementation omitted
}
}
2. 包结构规范
org.apache.flink.runtime.{component}/
├── {Component}.java # 主要实现类
├── {Component}Gateway.java # RPC 网关接口
├── {Component}Configuration.java # 配置类
├── {Component}Service.java # 服务接口
├── event/ # 事件类
├── exceptions/ # 异常类
└── utils/ # 工具类
3. 异常处理
/**
 * Exception type for failures raised by the JobMaster component.
 *
 * <p>Both constructors simply forward their arguments to {@code FlinkException}.
 */
public class JobMasterException extends FlinkException {

    /**
     * Explicit serial version so the serialized form does not depend on a
     * compiler-generated identifier.
     */
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception carrying only a message.
     *
     * @param message description of the failure
     */
    public JobMasterException(String message) {
        super(message);
    }

    /**
     * Creates an exception carrying a message and the underlying cause.
     *
     * @param message description of the failure
     * @param cause the failure that triggered this exception
     */
    public JobMasterException(String message, Throwable cause) {
        super(message, cause);
    }
}
核心组件开发
1. JobMaster 开发
基本结构
/**
 * Skeleton of a JobMaster service showing the start / close lifecycle.
 *
 * <p>{@link #closeAsync()} completes the termination future, so callers that wait on
 * the returned future are released once shutdown has run.
 */
public class JobMaster implements JobMasterService {

    private final JobMasterGateway gateway;
    private final JobMasterConfiguration configuration;

    /** Completed by {@link #closeAsync()} once the shutdown logic has run. */
    private final CompletableFuture<Void> terminationFuture;

    /**
     * @param configuration settings for this JobMaster
     * @param gateway gateway associated with this JobMaster
     */
    public JobMaster(
            JobMasterConfiguration configuration,
            JobMasterGateway gateway) {
        this.configuration = configuration;
        this.gateway = gateway;
        this.terminationFuture = new CompletableFuture<>();
    }

    /**
     * Starts the service.
     *
     * @return an already-completed future (the start-up logic is a placeholder)
     */
    @Override
    public CompletableFuture<Void> start() {
        // Start-up logic goes here.
        return CompletableFuture.completedFuture(null);
    }

    /**
     * Shuts the service down.
     *
     * @return the termination future, completed once shutdown has finished
     */
    @Override
    public CompletableFuture<Void> closeAsync() {
        // Shutdown logic goes here.
        // Without this call the future would never complete and every caller waiting on
        // closeAsync() would hang. complete() is a no-op on an already-completed future,
        // so calling closeAsync() repeatedly is safe.
        terminationFuture.complete(null);
        return terminationFuture;
    }
}
状态管理
// Lifecycle states a JobMaster can be in.
public enum JobMasterState {
CREATED,
RUNNING,
SUSPENDED,
FAILED,
FINISHED
}
/** Current lifecycle state; starts out as {@code CREATED}. */
private volatile JobMasterState state = JobMasterState.CREATED;

/**
 * Replaces the current state with {@code newState} and logs the transition.
 *
 * @param newState the state to switch to
 */
private void transitionTo(JobMasterState newState) {
    // Capture the previous value first so the log line shows both ends of the transition.
    final JobMasterState previous = state;
    state = newState;
    LOG.info("JobMaster state transition: {} -> {}", previous, newState);
}
2. TaskExecutor 开发
任务提交
/**
 * Example TaskExecutor that tracks the tasks it has been asked to run.
 */
public class TaskExecutor {

    /** Tasks that were accepted and have not failed during start-up, keyed by attempt ID. */
    private final Map<ExecutionAttemptID, Task> runningTasks = new ConcurrentHashMap<>();

    /**
     * Registers a new task for the given deployment descriptor and starts it.
     *
     * <p>A second submission with an attempt ID that is already tracked is rejected
     * instead of silently replacing the tracked task. If starting the task fails, the
     * entry is removed again so the map does not accumulate tasks that never ran.
     *
     * @param tdd deployment descriptor of the task to run
     * @param jobMasterId ID of the JobMaster submitting the task
     * @param timeout not used by this example implementation
     * @return a future that yields an {@code Acknowledge} once the task has started, or
     *     completes exceptionally if the attempt ID is already registered or start-up fails
     */
    public CompletableFuture<Acknowledge> submitTask(
            TaskDeploymentDescriptor tdd,
            JobMasterId jobMasterId,
            Time timeout) {
        final ExecutionAttemptID attemptId = tdd.getExecutionAttemptId();
        final Task task = new Task(tdd, jobMasterId);

        if (runningTasks.putIfAbsent(attemptId, task) != null) {
            CompletableFuture<Acknowledge> duplicate = new CompletableFuture<>();
            duplicate.completeExceptionally(
                    new IllegalStateException("Task " + attemptId + " is already registered"));
            return duplicate;
        }

        try {
            return task.startTask()
                    .whenComplete((ignored, failure) -> {
                        if (failure != null) {
                            // Two-argument remove: only drop the entry if it is still this task.
                            runningTasks.remove(attemptId, task);
                        }
                    })
                    .thenApply(ignored -> Acknowledge.get());
        } catch (RuntimeException e) {
            // startTask() failed before it could hand back a future.
            runningTasks.remove(attemptId, task);
            throw e;
        }
    }
}
资源管理
/**
 * Example wrapper around the memory and network-buffer resources of a TaskExecutor.
 */
public class TaskExecutorResourceManager {

    private final MemoryManager memoryManager;
    private final NetworkBufferPool networkBufferPool;

    /**
     * Forwards a memory request to the memory manager.
     *
     * @param request the memory request to forward
     * @return the future produced by {@code memoryManager.requestMemory(request)}
     */
    public CompletableFuture<Boolean> requestMemory(MemoryRequest request) {
        final CompletableFuture<Boolean> outcome = memoryManager.requestMemory(request);
        return outcome;
    }
}
3. ResourceManager 开发
资源分配
/**
 * Example ResourceManager that records TaskExecutor registrations.
 */
public class ResourceManager {

    /** Registrations received so far, keyed by the resource ID of the TaskExecutor. */
    private final Map<ResourceID, TaskExecutorRegistration> registeredTaskExecutors =
            new ConcurrentHashMap<>();

    /**
     * Stores the registration under its resource ID and reports success.
     *
     * @param registration registration sent by a TaskExecutor
     * @return an already-completed future holding a {@code TaskExecutorRegistrationSuccess}
     */
    public CompletableFuture<RegistrationResponse> registerTaskExecutor(
            TaskExecutorRegistration registration) {
        final ResourceID resourceId = registration.getResourceId();
        registeredTaskExecutors.put(resourceId, registration);

        final RegistrationResponse success = new TaskExecutorRegistrationSuccess(resourceId);
        return CompletableFuture.completedFuture(success);
    }
}
RPC 开发
1. 网关接口定义
// RPC gateway interface declaring the asynchronous operations of the JobMaster.
// Every method returns a CompletableFuture and takes a timeout as its last parameter
// (presumably the bound on the remote call - confirm against the RPC framework).
public interface JobMasterGateway extends RpcGateway {
// Submits the given job graph; the future yields the JobResult.
CompletableFuture<JobResult> submitJob(
JobGraph jobGraph,
Time timeout);
// Requests cancellation; the future yields an Acknowledge.
CompletableFuture<Acknowledge> cancelJob(
Time timeout);
// Requests the next input split for the given vertex and execution attempt.
CompletableFuture<SerializedInputSplit> requestNextInputSplit(
JobVertexID vertexID,
ExecutionAttemptID executionAttempt,
Time timeout);
}
2. RPC 实现
// Thin RPC-side wrapper that forwards submitJob to a JobMasterGateway.
// NOTE(review): submitJob is marked @Override, so the supertype must declare it.
// In Apache Flink, RpcService looks like an interface without such a method - confirm
// whether this should implement JobMasterGateway (or extend an RPC endpoint) instead.
public class JobMasterRpcService extends RpcService {
// Gateway that receives the forwarded calls.
private final JobMasterGateway gateway;
public JobMasterRpcService(JobMasterGateway gateway) {
this.gateway = gateway;
}
// Delegates unchanged to gateway.submitJob(jobGraph, timeout).
@Override
public CompletableFuture<JobResult> submitJob(
JobGraph jobGraph,
Time timeout) {
return gateway.submitJob(jobGraph, timeout);
}
}
状态管理开发
1. 状态后端接口
// State backend abstraction used in this guide.
public interface StateBackend {
// Creates the checkpoint storage for the job with the given ID.
CheckpointStorage createCheckpointStorage(JobID jobId);
// Returns a StateBackend derived from the given configuration
// (presumably a reconfigured copy of this backend - confirm with implementations).
StateBackend createSerializedCopy(Configuration config);
}
2. 状态快照
/**
 * Example context describing one state snapshot.
 */
public class StateSnapshotContext {

    private final long checkpointId;
    private final long checkpointTimestamp;

    /**
     * Takes a snapshot of the state.
     *
     * @return an already-completed future holding a fresh {@code StateSnapshotResult}
     */
    public CompletableFuture<StateSnapshotResult> snapshotState() {
        // The snapshot logic goes here.
        final StateSnapshotResult snapshot = new StateSnapshotResult();
        return CompletableFuture.completedFuture(snapshot);
    }
}
测试开发
1. 单元测试
/** Starts a JobMaster against a mocked gateway and expects it to reach RUNNING. */
@Test
public void testJobMasterStart() throws Exception {
    // Arrange
    final JobMasterGateway mockedGateway = mock(JobMasterGateway.class);
    final JobMasterConfiguration defaults = JobMasterConfiguration.newConfiguration();
    final JobMaster jobMaster = new JobMaster(defaults, mockedGateway);

    // Act: wait at most ten seconds for start-up to finish.
    jobMaster.start().get(10, TimeUnit.SECONDS);

    // Assert
    assertThat(jobMaster.getState()).isEqualTo(JobMasterState.RUNNING);
}
2. 集成测试
/**
 * Submits a test job to a MiniCluster and expects it to finish successfully.
 *
 * <p>The cluster is opened in a try-with-resources block so it is shut down even when
 * the job future times out or the assertion fails.
 */
@Test
public void testJobSubmission() throws Exception {
    try (MiniCluster miniCluster = new MiniCluster(configuration)) {
        miniCluster.start();

        JobGraph jobGraph = createTestJobGraph();
        CompletableFuture<JobResult> jobResultFuture =
                miniCluster.submitJob(jobGraph);

        // Wait at most thirty seconds for the job to finish.
        JobResult jobResult = jobResultFuture.get(30, TimeUnit.SECONDS);
        assertThat(jobResult.isSuccess()).isTrue();
    }
}
3. 性能测试
/**
 * Measures how many operations per second the loop body achieves.
 *
 * <p>Timing uses {@code System.nanoTime()} and the elapsed time is clamped to at least
 * one nanosecond, so a loop that finishes in under a millisecond no longer causes a
 * division by zero. The result is expressed in operations per second, matching the
 * threshold below.
 */
@Test
public void testThroughput() {
    final int operations = 1000;

    long startNanos = System.nanoTime();
    // Run the logic under test.
    for (int i = 0; i < operations; i++) {
        // Perform one operation.
    }
    long elapsedNanos = Math.max(1L, System.nanoTime() - startNanos);

    // operations / seconds == operations * 1e9 / nanoseconds
    long throughput = operations * 1_000_000_000L / elapsedNanos;
    assertThat(throughput).isGreaterThan(100); // at least 100 operations per second
}
配置管理
1. 配置类定义
/**
 * Immutable set of timeouts used by the JobMaster.
 *
 * <p>Instances are obtained through {@link #newConfiguration()}.
 */
public class JobMasterConfiguration {

    private final Time rpcTimeout;
    private final Time slotRequestTimeout;
    private final Time slotIdleTimeout;

    /**
     * Assigns the three timeouts; called only by the factory method.
     */
    private JobMasterConfiguration(
            Time rpcTimeout,
            Time slotRequestTimeout,
            Time slotIdleTimeout) {
        this.rpcTimeout = rpcTimeout;
        this.slotRequestTimeout = slotRequestTimeout;
        this.slotIdleTimeout = slotIdleTimeout;
    }

    /**
     * Creates a configuration with the default timeouts: 10 s for RPC, 5 s for slot
     * requests and 30 s for idle slots.
     *
     * @return a new configuration holding the defaults
     */
    public static JobMasterConfiguration newConfiguration() {
        return new JobMasterConfiguration(
                Time.seconds(10),
                Time.seconds(5),
                Time.seconds(30));
    }

    /** @return the RPC timeout */
    public Time getRpcTimeout() {
        return rpcTimeout;
    }

    /** @return the slot request timeout */
    public Time getSlotRequestTimeout() {
        return slotRequestTimeout;
    }

    /** @return the slot idle timeout */
    public Time getSlotIdleTimeout() {
        return slotIdleTimeout;
    }
}
2. 配置验证
/**
 * Validation helpers for configuration objects.
 */
public class ConfigurationValidator {

    /**
     * Checks that the RPC timeout of the given configuration is strictly positive.
     *
     * @param config the configuration to check
     * @throws IllegalArgumentException if the RPC timeout size is zero or negative
     */
    public static void validateJobMasterConfiguration(JobMasterConfiguration config) {
        final boolean rpcTimeoutIsPositive = config.getRpcTimeout().getSize() > 0;
        if (!rpcTimeoutIsPositive) {
            throw new IllegalArgumentException("RPC timeout must be positive");
        }
    }
}
日志和监控
1. 日志配置
// Logging example: one static logger per class, parameterized messages.
public class JobMaster {
// Logger named after the class it belongs to.
private static final Logger LOG = LoggerFactory.getLogger(JobMaster.class);
// Logs before and after the start-up logic.
// NOTE(review): the configuration field referenced below is not declared in this snippet.
public void start() {
LOG.info("Starting JobMaster with configuration: {}", configuration);
// Start-up logic goes here
LOG.info("JobMaster started successfully");
}
}
2. 指标收集
/**
 * Registers and updates the metrics exposed by the JobMaster.
 */
public class JobMasterMetrics {

    private final Counter jobSubmissions;
    private final Gauge<Integer> runningJobs;

    /**
     * Registers the {@code job_submissions} counter and the {@code running_jobs} gauge
     * on the given metric group.
     *
     * @param metricGroup group the metrics are registered on
     */
    public JobMasterMetrics(MetricGroup metricGroup) {
        this.jobSubmissions = metricGroup.counter("job_submissions");

        // The gauge reads the running-job count each time it is queried.
        final Gauge<Integer> runningJobsGauge = () -> getRunningJobCount();
        this.runningJobs = metricGroup.gauge("running_jobs", runningJobsGauge);
    }

    /** Adds one to the job submission counter. */
    public void incrementJobSubmissions() {
        jobSubmissions.inc();
    }
}
性能优化
1. 内存优化
/**
 * Fixed-capacity write buffer backed by a direct (off-heap) {@link ByteBuffer}.
 */
public class MemoryOptimizedBuffer {

    private final ByteBuffer buffer;

    /**
     * Allocates the backing direct buffer.
     *
     * @param size capacity in bytes
     * @throws IllegalArgumentException if {@code size} is negative
     */
    public MemoryOptimizedBuffer(int size) {
        // Direct allocation keeps the bytes outside the Java heap.
        final ByteBuffer offHeap = ByteBuffer.allocateDirect(size);
        this.buffer = offHeap;
    }

    /**
     * Appends all of {@code data} at the buffer's current position.
     *
     * @param data bytes to append
     * @throws java.nio.BufferOverflowException if fewer than {@code data.length} bytes remain
     */
    public void write(byte[] data) {
        buffer.put(data, 0, data.length);
    }
}
2. 并发优化
/**
 * Example executor that runs tasks asynchronously on a dedicated executor service.
 */
public class ConcurrentTaskExecutor {

    private final ExecutorService executorService;
    private final ConcurrentHashMap<ExecutionAttemptID, Task> tasks;

    /**
     * Schedules {@code task.execute()} on the executor service.
     *
     * @param task the task to run
     * @return a future that completes when the task's {@code execute()} call returns
     */
    public CompletableFuture<Void> submitTask(Task task) {
        final Runnable work = task::execute;
        return CompletableFuture.runAsync(work, executorService);
    }
}
故障处理
1. 异常处理
// Example of guarding job submission with a circuit breaker.
public class FaultTolerantJobMaster {
private final CircuitBreaker circuitBreaker;
// Runs the submission through circuitBreaker.runSupplier.
// NOTE(review): the types of runSupplier and doSubmitJob are not visible here - confirm
// that the supplier's result matches the declared CompletableFuture<JobResult>.
public CompletableFuture<JobResult> submitJob(JobGraph jobGraph) {
return circuitBreaker.runSupplier(() -> {
try {
return doSubmitJob(jobGraph);
} catch (Exception e) {
// Log the failure, then rethrow it wrapped so the original cause is preserved.
LOG.error("Failed to submit job", e);
throw new JobSubmissionException("Job submission failed", e);
}
});
}
}
2. 重试机制
/**
 * Runs an asynchronous operation and retries it when it fails.
 */
public class RetryableOperation {

    /**
     * Invokes {@code operation} and, if the resulting future fails, invokes it again up
     * to {@code maxRetries} more times.
     *
     * <p>When every attempt fails, the returned future completes exceptionally with the
     * failure of the last attempt; it never completes normally with {@code null} in that
     * case. A supplier that throws, or that returns {@code null} instead of a future, is
     * treated as a failed attempt and is retried like any other failure.
     *
     * @param operation supplier producing a new future for each attempt
     * @param maxRetries number of additional attempts after the first one; values of zero
     *     or below mean "no retry"
     * @param <T> result type of the operation
     * @return a future holding the result of the first successful attempt, or the failure
     *     of the last attempt
     * @throws NullPointerException if {@code operation} is {@code null}
     */
    public <T> CompletableFuture<T> executeWithRetry(
            Supplier<CompletableFuture<T>> operation,
            int maxRetries) {
        if (operation == null) {
            throw new NullPointerException("operation");
        }

        CompletableFuture<T> attempt;
        try {
            attempt = operation.get();
            if (attempt == null) {
                attempt = failedFuture(
                        new NullPointerException("operation returned a null future"));
            }
        } catch (RuntimeException e) {
            // A supplier that throws counts as a failed attempt.
            attempt = failedFuture(e);
        }

        return attempt
                .<CompletableFuture<T>>handle((result, failure) -> {
                    if (failure == null) {
                        return CompletableFuture.completedFuture(result);
                    }
                    if (maxRetries > 0) {
                        return executeWithRetry(operation, maxRetries - 1);
                    }
                    // Retries exhausted: surface the failure instead of swallowing it.
                    return RetryableOperation.<T>failedFuture(failure);
                })
                .thenCompose(next -> next);
    }

    /** Builds a future that is already completed exceptionally (Java 8 compatible). */
    private static <T> CompletableFuture<T> failedFuture(Throwable failure) {
        CompletableFuture<T> failed = new CompletableFuture<>();
        failed.completeExceptionally(failure);
        return failed;
    }
}
代码审查清单
1. 功能检查
2. 代码质量
3. 架构设计
常见问题
1. 内存泄漏
// Problem: resources are never released
// This class is the deliberate anti-pattern; the code is intentionally left as is.
public class ResourceLeakExample {
private final List<ByteBuffer> buffers = new ArrayList<>();
// Buffers are only ever added; the class offers no way to remove them.
public void addBuffer(ByteBuffer buffer) {
buffers.add(buffer); // may cause a memory leak
}
}
// Solution: manage the resource lifecycle explicitly
/**
 * Tracks buffers and lets the owner drop all of them in one call.
 */
public class ResourceManagementExample {

    private final List<ByteBuffer> buffers = new ArrayList<>();

    /**
     * Starts tracking the given buffer.
     *
     * @param buffer the buffer to track
     */
    public void addBuffer(ByteBuffer buffer) {
        this.buffers.add(buffer);
    }

    /** Drops every tracked buffer reference; safe to call more than once. */
    public void cleanup() {
        this.buffers.clear();
    }
}
2. 线程安全
// Problem: implementation that is not thread-safe
// This class is the deliberate anti-pattern; the code is intentionally left as is.
public class ThreadUnsafeExample {
private int counter = 0;
public void increment() {
counter++; // not an atomic operation
}
}
// Solution: use a thread-safe data structure
/**
 * Counter backed by an {@link AtomicInteger}, so each increment is a single atomic step.
 */
public class ThreadSafeExample {

    private final AtomicInteger counter = new AtomicInteger();

    /** Atomically adds one to the counter. */
    public void increment() {
        counter.getAndIncrement();
    }
}
最佳实践
1. 异步编程
/**
 * Examples of asynchronous programming with {@link CompletableFuture}.
 */
public class AsyncBestPractices {

    /**
     * Runs {@code process(input)} asynchronously.
     *
     * @param input value handed to {@code process}
     * @return a future holding the value returned by {@code process(input)}
     */
    public CompletableFuture<Result> processAsync(Input input) {
        // The processing logic runs inside the supplier.
        return CompletableFuture.supplyAsync(() -> process(input));
    }

    /**
     * Returns a result without blocking the caller.
     *
     * @return an already-completed future holding a new {@code Result}
     */
    public CompletableFuture<Result> nonBlockingOperation() {
        final Result ready = new Result();
        return CompletableFuture.completedFuture(ready);
    }
}
2. 资源管理
// Example of scoping a resource with try-with-resources.
public class ResourceManagement {
// Use try-with-resources
// The resource is closed automatically when the try block exits; any Exception raised
// while processing or closing is logged and not rethrown.
public void processWithResource() {
try (AutoCloseableResource resource = new AutoCloseableResource()) {
resource.process();
} catch (Exception e) {
LOG.error("Processing failed", e);
}
}
}
3. 配置管理
/**
 * Example of assembling a configuration with the builder pattern.
 */
public class ConfigurationBestPractices {

    /**
     * Builder that collects the settings and creates the {@code Configuration}.
     *
     * <p>Unset values keep their defaults: a timeout of 10 seconds and 3 retries.
     */
    public static class Builder {

        private Time timeout = Time.seconds(10);
        private int maxRetries = 3;

        /**
         * Overrides the default timeout.
         *
         * @param timeout the timeout to use
         * @return this builder, for chaining
         */
        public Builder setTimeout(Time timeout) {
            this.timeout = timeout;
            return this;
        }

        /**
         * Overrides the default number of retries. Without this setter the value passed
         * to the configuration could never differ from the default.
         *
         * @param maxRetries the maximum number of retries to use
         * @return this builder, for chaining
         */
        public Builder setMaxRetries(int maxRetries) {
            this.maxRetries = maxRetries;
            return this;
        }

        /**
         * Creates the configuration from the collected values.
         *
         * @return a new {@code Configuration} built from the timeout and retry count
         */
        public Configuration build() {
            return new Configuration(timeout, maxRetries);
        }
    }
}
参考资料