I. Core Mechanics of Consumer Group Rebalance
1. Rebalance Triggers
mermaid
graph TB
A[Rebalance trigger event] --> B{Trigger type}
B --> C[New consumer joins]
B --> D[Consumer crashes or leaves]
B --> E[Partition count changes]
B --> F[Coordinator change]
C --> G[Group membership change]
D --> G
E --> H[Topic metadata change]
F --> I[Coordinator failover]
G --> J[Trigger rebalance]
H --> J
I --> J
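All of these triggers surface on the client through the same callback interface. A minimal, self-contained sketch (broker address, group id, and topic are placeholders) that logs every rebalance this consumer takes part in:
java
import java.time.Duration;
import java.util.Collection;
import java.util.List;
import java.util.Properties;
import org.apache.kafka.clients.consumer.ConsumerRebalanceListener;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;

public class RebalanceObserver {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");   // placeholder broker
        props.put("group.id", "rebalance-demo");            // placeholder group id
        props.put("key.deserializer",
            "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer",
            "org.apache.kafka.common.serialization.StringDeserializer");

        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(List.of("demo-topic"), new ConsumerRebalanceListener() {
                @Override
                public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
                    // Fires at the start of a rebalance, before partitions move away
                    System.out.println("Revoked: " + partitions);
                }
                @Override
                public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
                    // Fires once this member's new assignment is known
                    System.out.println("Assigned: " + partitions);
                }
            });
            while (true) {
                consumer.poll(Duration.ofMillis(500)); // rebalances happen inside poll()
            }
        }
    }
}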
2. Coordinator Core Architecture
java
// The GroupCoordinator manages consumer group state (simplified sketch of the broker-side logic)
public class GroupCoordinator {
    // Group metadata, keyed by group id
    private final ConcurrentHashMap<String, GroupMetadata> groups =
        new ConcurrentHashMap<>();
    // Heartbeat tracking
    private final Heartbeat heartbeat;
    // Group state machine
    public enum GroupState {
        EMPTY,                 // no members
        PREPARING_REBALANCE,   // rebalance in progress, collecting JoinGroup requests
        COMPLETING_REBALANCE,  // members joined, awaiting SyncGroup
        STABLE,                // assignment distributed, steady state
        DEAD                   // group removed
    }
    /**
     * Handle a consumer's JoinGroup request
     */
    public JoinGroupResponse handleJoinGroup(JoinGroupRequest request) {
        String groupId = request.groupId();
        String memberId = request.memberId();
        int sessionTimeout = request.sessionTimeout();
        // 1. Fetch or create the group metadata
        GroupMetadata group = groups.computeIfAbsent(groupId,
            id -> new GroupMetadata(id, GroupState.EMPTY));
        synchronized (group) {
            // 2. Validate the session timeout
            if (sessionTimeout < group.rebalanceTimeout() / 2) {
                throw new InvalidSessionTimeoutException();
            }
            // 3. Build the member's metadata
            MemberMetadata member = new MemberMetadata(
                memberId,
                groupId,
                request.groupInstanceId().orElse(null),
                request.clientId(),
                request.clientHost(),
                sessionTimeout,
                request.protocolType(),
                request.supportedProtocols()
            );
            // 4. Update the group state
            if (group.is(GroupState.EMPTY) ||
                group.is(GroupState.DEAD) ||
                member.isNewMember()) {
                // A new consumer joined: trigger a rebalance
                group.transitionTo(GroupState.PREPARING_REBALANCE);
                group.add(member);
            } else if (group.is(GroupState.STABLE)) {
                // An existing consumer re-joined
                if (member.matches(group.get(memberId))) {
                    // Metadata unchanged: just refresh the member
                    group.updateMember(member);
                } else {
                    // Subscription/metadata changed: trigger a rebalance
                    group.transitionTo(GroupState.PREPARING_REBALANCE);
                    group.updateMember(member);
                }
            }
            // 5. Park the response until all members join or the rebalance times out
            CompletableFuture<JoinGroupResponse> responseFuture =
                new CompletableFuture<>();
            group.awaitingJoinCallback = new DelayedJoin(
                group,
                responseFuture,
                group.rebalanceTimeout()
            );
            // 6. If every member has already joined, complete immediately
            if (group.allMembersJoined()) {
                completeJoinGroup(group, responseFuture);
            }
            // Simplified: the real broker completes this asynchronously
            return responseFuture.join();
        }
    }
}
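This state machine is also visible from outside the broker: the Admin client reports each group's state (Empty, PreparingRebalance, CompletingRebalance, Stable, Dead) along with its members. A small sketch, assuming a local broker and a group named my-group:
java
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ExecutionException;
import org.apache.kafka.clients.admin.Admin;
import org.apache.kafka.clients.admin.AdminClientConfig;
import org.apache.kafka.clients.admin.ConsumerGroupDescription;
import org.apache.kafka.clients.admin.MemberDescription;

public class GroupInspector {
    public static void main(String[] args) throws ExecutionException, InterruptedException {
        Properties props = new Properties();
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        try (Admin admin = Admin.create(props)) {
            ConsumerGroupDescription desc = admin
                .describeConsumerGroups(List.of("my-group"))
                .describedGroups().get("my-group").get();
            // state() reflects the coordinator state machine described above
            System.out.println("state=" + desc.state());
            for (MemberDescription m : desc.members()) {
                System.out.printf("member=%s client=%s host=%s partitions=%s%n",
                    m.consumerId(), m.clientId(), m.host(),
                    m.assignment().topicPartitions());
            }
        }
    }
}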
II. The Complete Rebalance Flow
1. Rebalance State Transitions
java
public class GroupMetadata {
    private String groupId;
    private GroupState state;
    private String protocol;
    private Map<String, MemberMetadata> members = new HashMap<>();
    private int generationId = 0; // generation, incremented on every rebalance
    /**
     * Rebalance state machine
     */
    public synchronized void transitionTo(GroupState targetState) {
        GroupState previousState = this.state; // capture before mutating, for accurate logging
        switch (this.state) {
            case EMPTY:
            case STABLE:
                if (targetState == GroupState.PREPARING_REBALANCE) {
                    this.state = targetState;
                    this.generationId++;
                }
                break;
            case PREPARING_REBALANCE:
                if (targetState == GroupState.COMPLETING_REBALANCE) {
                    this.state = targetState;
                }
                break;
            case COMPLETING_REBALANCE:
                if (targetState == GroupState.STABLE) {
                    this.state = targetState;
                }
                break;
            default:
                throw new IllegalStateException(
                    "Invalid transition from " + this.state + " to " + targetState);
        }
        log.info("Group {} transitioned from {} to {}",
            groupId, previousState, this.state);
    }
    /**
     * The full rebalance flow, from the consumer's point of view
     */
    public void performRebalance() {
        // Phase 1: locate the coordinator
        Node coordinator = findCoordinator(groupId);
        // Phase 2: join the group
        JoinGroupResponse joinResponse = joinGroup(coordinator);
        if (joinResponse.error() == Errors.NONE) {
            SyncGroupResponse syncResponse;
            // Phase 3: the leader consumer computes the assignment
            if (joinResponse.isLeader()) {
                Map<String, List<TopicPartition>> assignment =
                    performPartitionAssignment(
                        joinResponse.members(),
                        joinResponse.groupProtocol()
                    );
                // Phase 4: push the assignment via SyncGroup
                syncResponse = syncGroup(coordinator, assignment);
            } else {
                // Followers send an empty SyncGroup and wait for their share
                syncResponse = syncGroup(coordinator, null);
            }
            // Phase 5: apply the assignment and start consuming
            applyAssignment(syncResponse.assignment());
            // Phase 6: start heartbeating to keep the session alive
            startHeartbeat(coordinator);
        }
    }
}
2. The JoinGroup Protocol
java
// Key fields of JoinGroupRequest
public class JoinGroupRequest extends AbstractRequest {
    private final String groupId;
    private final int sessionTimeout;
    private final int rebalanceTimeout;
    private final String memberId;
    private final String protocolType;      // "consumer"
    private final List<Protocol> protocols; // supported assignment strategies
    public static class Protocol {
        private final String name;     // strategy name: range, roundrobin, sticky
        private final byte[] metadata; // serialized subscription info
    }
}
// Key fields of JoinGroupResponse
public class JoinGroupResponse extends AbstractResponse {
    private final Errors error;
    private final int generationId;      // generation
    private final String groupProtocol;  // the chosen assignment strategy
    private final String leaderId;       // leader consumer's member id
    private final String memberId;       // this consumer's member id
    private final List<MemberInfo> members; // group members (non-empty only for the leader)
}
III. Partition Assignment Strategies in Detail
1. RangeAssignor (Default Strategy)
java
public class RangeAssignor implements PartitionAssignor {
    @Override
    public Map<String, List<TopicPartition>> assign(
            Map<String, Integer> partitionsPerTopic,
            Map<String, Subscription> subscriptions) {
        Map<String, List<TopicPartition>> assignment = new HashMap<>();
        for (String memberId : subscriptions.keySet()) {
            assignment.put(memberId, new ArrayList<>());
        }
        // Assign each topic independently
        for (Map.Entry<String, Integer> entry : partitionsPerTopic.entrySet()) {
            String topic = entry.getKey();
            int numPartitions = entry.getValue();
            // Collect the consumers subscribed to this topic
            List<String> consumersForTopic = new ArrayList<>();
            for (Map.Entry<String, Subscription> subscriptionEntry :
                    subscriptions.entrySet()) {
                if (subscriptionEntry.getValue().topics().contains(topic)) {
                    consumersForTopic.add(subscriptionEntry.getKey());
                }
            }
            // Sort consumers lexicographically
            Collections.sort(consumersForTopic);
            int numConsumers = consumersForTopic.size();
            if (numConsumers == 0) continue;
            // Compute how many partitions each consumer gets
            int partitionsPerConsumer = numPartitions / numConsumers;
            int consumersWithExtraPartition = numPartitions % numConsumers;
            // Hand out the partitions; partitions(topic, n) is a helper building [p0..p(n-1)]
            List<TopicPartition> partitions = partitions(topic, numPartitions);
            int position = 0;
            for (int i = 0; i < numConsumers; i++) {
                String consumer = consumersForTopic.get(i);
                int extra = (i < consumersWithExtraPartition) ? 1 : 0;
                int numPartitionsForConsumer = partitionsPerConsumer + extra;
                // Each consumer receives a contiguous range
                if (numPartitionsForConsumer > 0) {
                    List<TopicPartition> assignedPartitions =
                        partitions.subList(position, position + numPartitionsForConsumer);
                    assignment.get(consumer).addAll(assignedPartitions);
                    position += numPartitionsForConsumer;
                }
            }
        }
        return assignment;
    }
    // Example: a topic with 7 partitions and 3 consumers
    // Resulting assignment:
    // C1: [p0, p1, p2]  // 3 partitions
    // C2: [p3, p4]      // 2 partitions
    // C3: [p5, p6]      // 2 partitions
}
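Because each topic is assigned independently, the first consumers in the sorted list collect the extra partition of every topic, so the imbalance compounds as topics are added. A dependency-free calculation of that skew for two 7-partition topics and 3 consumers:
java
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class RangeSkewDemo {
    public static void main(String[] args) {
        List<String> consumers = List.of("C1", "C2", "C3");   // already sorted
        List<String> topics = List.of("orders", "payments");  // 7 partitions each
        int numPartitions = 7;

        Map<String, Integer> counts = new TreeMap<>();
        for (String topic : topics) {
            int per = numPartitions / consumers.size();   // 2
            int extra = numPartitions % consumers.size(); // 1
            for (int i = 0; i < consumers.size(); i++) {
                // The first `extra` consumers get one more partition on EVERY topic
                int n = per + (i < extra ? 1 : 0);
                counts.merge(consumers.get(i), n, Integer::sum);
            }
        }
        System.out.println(counts); // {C1=6, C2=4, C3=4}
    }
}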
2. RoundRobinAssignor
java
public class RoundRobinAssignor implements PartitionAssignor {
    @Override
    public Map<String, List<TopicPartition>> assign(
            Map<String, Integer> partitionsPerTopic,
            Map<String, Subscription> subscriptions) {
        Map<String, List<TopicPartition>> assignment = new HashMap<>();
        // Check whether every consumer has an identical subscription
        Set<String> allSubscribedTopics = new HashSet<>();
        for (Subscription subscription : subscriptions.values()) {
            allSubscribedTopics.addAll(subscription.topics());
        }
        boolean allConsumersHaveSameSubscription = true;
        for (Subscription subscription : subscriptions.values()) {
            // compare as sets: topics() is a List
            if (!new HashSet<>(subscription.topics()).equals(allSubscribedTopics)) {
                allConsumersHaveSameSubscription = false;
                break;
            }
        }
        if (allConsumersHaveSameSubscription) {
            // Identical subscriptions: one global round-robin pass
            // 1. Collect all partitions
            List<TopicPartition> allPartitions = new ArrayList<>();
            for (Map.Entry<String, Integer> entry : partitionsPerTopic.entrySet()) {
                String topic = entry.getKey();
                int numPartitions = entry.getValue();
                for (int i = 0; i < numPartitions; i++) {
                    allPartitions.add(new TopicPartition(topic, i));
                }
            }
            // 2. Sort by topic, then partition number (TopicPartition is not Comparable)
            allPartitions.sort(Comparator.comparing(TopicPartition::topic)
                .thenComparingInt(TopicPartition::partition));
            // 3. Collect and sort all consumers
            List<String> consumers = new ArrayList<>(subscriptions.keySet());
            Collections.sort(consumers);
            // 4. Deal the partitions out round-robin
            for (int i = 0; i < allPartitions.size(); i++) {
                String consumer = consumers.get(i % consumers.size());
                assignment.computeIfAbsent(consumer, k -> new ArrayList<>())
                    .add(allPartitions.get(i));
            }
        } else {
            // Subscriptions differ: round-robin within each topic
            for (Map.Entry<String, Integer> entry : partitionsPerTopic.entrySet()) {
                String topic = entry.getKey();
                int numPartitions = entry.getValue();
                // Consumers subscribed to this topic
                List<String> consumersForTopic = new ArrayList<>();
                for (Map.Entry<String, Subscription> subscriptionEntry :
                        subscriptions.entrySet()) {
                    if (subscriptionEntry.getValue().topics().contains(topic)) {
                        consumersForTopic.add(subscriptionEntry.getKey());
                    }
                }
                Collections.sort(consumersForTopic);
                if (consumersForTopic.isEmpty()) continue;
                // Round-robin this topic's partitions
                for (int partition = 0; partition < numPartitions; partition++) {
                    String consumer = consumersForTopic.get(
                        partition % consumersForTopic.size());
                    assignment.computeIfAbsent(consumer, k -> new ArrayList<>())
                        .add(new TopicPartition(topic, partition));
                }
            }
        }
        return assignment;
    }
    // Example: a topic with 7 partitions, 3 consumers with the same subscription
    // Resulting assignment:
    // C1: [p0, p3, p6]
    // C2: [p1, p4]
    // C3: [p2, p5]
}
3. StickyAssignor (Sticky Assignment)
java
public class StickyAssignor implements PartitionAssignor {
    // protected so the CooperativeStickyAssignor subclass below can reuse it
    protected Map<String, List<TopicPartition>> currentAssignment = new HashMap<>();
    @Override
    public Map<String, List<TopicPartition>> assign(
            Map<String, Integer> partitionsPerTopic,
            Map<String, Subscription> subscriptions) {
        Map<String, List<TopicPartition>> assignment = new HashMap<>();
        // Phase 1: keep as much of the previous assignment as possible
        Map<String, Set<TopicPartition>> consumerToCurrentPartitions =
            new HashMap<>();
        // Collect each surviving consumer's current partitions
        for (Map.Entry<String, List<TopicPartition>> entry :
                currentAssignment.entrySet()) {
            String consumer = entry.getKey();
            List<TopicPartition> partitions = entry.getValue();
            // Only consumers still in the group keep their partitions
            if (subscriptions.containsKey(consumer)) {
                Subscription subscription = subscriptions.get(consumer);
                Set<String> subscribedTopics = new HashSet<>(subscription.topics());
                // Keep only the partitions the consumer still subscribes to
                Set<TopicPartition> validPartitions = partitions.stream()
                    .filter(p -> subscribedTopics.contains(p.topic()))
                    .collect(Collectors.toSet());
                consumerToCurrentPartitions.put(consumer, validPartitions);
            }
        }
        // New members start with an empty set so they take part in balancing
        for (String consumer : subscriptions.keySet()) {
            consumerToCurrentPartitions.putIfAbsent(consumer, new HashSet<>());
        }
        // Phase 2: find the partitions that still need an owner
        Set<TopicPartition> allPartitions = new HashSet<>();
        for (Map.Entry<String, Integer> entry : partitionsPerTopic.entrySet()) {
            String topic = entry.getKey();
            int numPartitions = entry.getValue();
            for (int i = 0; i < numPartitions; i++) {
                allPartitions.add(new TopicPartition(topic, i));
            }
        }
        // Partitions already owned
        Set<TopicPartition> alreadyAssigned = consumerToCurrentPartitions.values()
            .stream()
            .flatMap(Set::stream)
            .collect(Collectors.toSet());
        // Partitions left to hand out
        Set<TopicPartition> partitionsToAssign = new HashSet<>(allPartitions);
        partitionsToAssign.removeAll(alreadyAssigned);
        // Phase 3: balance
        balanceAssignment(assignment, consumerToCurrentPartitions, partitionsToAssign);
        // Remember this assignment for the next rebalance
        this.currentAssignment = assignment;
        return assignment;
    }
    /**
     * Greedy balancing: always hand the next partition to the consumer
     * that currently owns the fewest
     */
    private void balanceAssignment(Map<String, List<TopicPartition>> assignment,
            Map<String, Set<TopicPartition>> currentPartitions,
            Set<TopicPartition> partitionsToAssign) {
        // 1. Seed the result with the retained partitions
        for (String consumer : currentPartitions.keySet()) {
            assignment.put(consumer, new ArrayList<>(currentPartitions.get(consumer)));
        }
        // 2. Hand out the unassigned partitions, smallest owner first
        List<Map.Entry<String, List<TopicPartition>>> sortedConsumers =
            new ArrayList<>(assignment.entrySet());
        Iterator<TopicPartition> partitionIterator = partitionsToAssign.iterator();
        while (partitionIterator.hasNext()) {
            // Pick the consumer with the fewest partitions right now
            sortedConsumers.sort(Comparator.comparingInt(e -> e.getValue().size()));
            sortedConsumers.get(0).getValue().add(partitionIterator.next());
        }
        // 3. Sort each consumer's partitions for stable output
        Comparator<TopicPartition> byTopicThenPartition =
            Comparator.comparing(TopicPartition::topic)
                .thenComparingInt(TopicPartition::partition);
        for (List<TopicPartition> partitions : assignment.values()) {
            partitions.sort(byTopicThenPartition);
        }
    }
    // Why StickyAssignor:
    // 1. Minimizes partition movement
    // 2. Keeps the assignment balanced
    // 3. Reduces rebalance overhead
}
4. CooperativeStickyAssignor (Cooperative Sticky)
java
public class CooperativeStickyAssignor extends StickyAssignor {
    @Override
    public Map<String, List<TopicPartition>> assign(
            Map<String, Integer> partitionsPerTopic,
            Map<String, Subscription> subscriptions) {
        // Snapshot the previous assignment before super.assign() overwrites it
        Map<String, List<TopicPartition>> previousAssignment =
            new HashMap<>(currentAssignment);
        // Compute the target (sticky) assignment
        Map<String, List<TopicPartition>> assignment = super.assign(
            partitionsPerTopic, subscriptions);
        // Under incremental rebalancing a consumer does not drop all its
        // partitions up front; the move happens in two phases:
        //   Phase 1: keep current partitions, receive only newly added ones
        //   Phase 2: partitions that changed owner are revoked and reassigned
        //            in a follow-up rebalance
        return incrementalRebalanceAssignment(previousAssignment, assignment);
    }
    private Map<String, List<TopicPartition>> incrementalRebalanceAssignment(
            Map<String, List<TopicPartition>> previousAssignment,
            Map<String, List<TopicPartition>> newAssignment) {
        Map<String, List<TopicPartition>> incrementalAssignment = new HashMap<>();
        for (Map.Entry<String, List<TopicPartition>> entry : newAssignment.entrySet()) {
            String consumer = entry.getKey();
            List<TopicPartition> newPartitions = entry.getValue();
            List<TopicPartition> currentPartitions = previousAssignment.getOrDefault(
                consumer, Collections.emptyList());
            // Partitions this consumer gains
            List<TopicPartition> addedPartitions = new ArrayList<>(newPartitions);
            addedPartitions.removeAll(currentPartitions);
            // Partitions it must give up (owned now, absent from the new plan)
            List<TopicPartition> revokedPartitions = new ArrayList<>(currentPartitions);
            revokedPartitions.removeAll(newPartitions);
            // Phase 1 hands out only the additions
            incrementalAssignment.put(consumer, addedPartitions);
            // Revocations are deferred to the follow-up rebalance
            if (!revokedPartitions.isEmpty()) {
                log.info("Consumer {} will revoke partitions {} in next rebalance",
                    consumer, revokedPartitions);
            }
        }
        return incrementalAssignment;
    }
}
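From the application's point of view, the practical consequence is that the listener callbacks become incremental under the cooperative protocol: onPartitionsAssigned delivers only newly added partitions and onPartitionsRevoked only those actually migrating away (often none). A sketch of a listener that maintains its owned set correctly under both protocols:
java
import java.util.Collection;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.kafka.clients.consumer.ConsumerRebalanceListener;
import org.apache.kafka.common.TopicPartition;

public class IncrementalAwareListener implements ConsumerRebalanceListener {
    // The full set of partitions this consumer currently owns
    private final Set<TopicPartition> owned = ConcurrentHashMap.newKeySet();

    @Override
    public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
        // Eager protocol: all owned partitions; cooperative: only the ones
        // moving away. Removing (not clearing) is correct under both.
        owned.removeAll(partitions);
    }

    @Override
    public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
        // Cooperative protocol: only newly added partitions, so merge them
        // into the owned set instead of replacing it
        owned.addAll(partitions);
    }

    @Override
    public void onPartitionsLost(Collection<TopicPartition> partitions) {
        // Ownership already transferred elsewhere; do not commit for these
        owned.removeAll(partitions);
    }

    public Set<TopicPartition> ownedPartitions() {
        return Set.copyOf(owned);
    }
}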
IV. Incremental Rebalance
1. EAGER vs COOPERATIVE Rebalancing
java
// Note: the real Java client has no standalone "rebalance.protocol" consumer
// config; the protocol (EAGER or COOPERATIVE) follows from the assignors set
// via partition.assignment.strategy. The constants below are illustrative only.
public class ConsumerConfig {
    public static final String REBALANCE_PROTOCOL_CONFIG = "rebalance.protocol";
    public static final String REBALANCE_PROTOCOL_DOC =
        "The rebalance protocol to use: 'eager' or 'cooperative'";
    // Cooperative rebalancing became available in Kafka 2.4 via
    // CooperativeStickyAssignor; the out-of-the-box default remained eager
    public static final String DEFAULT_REBALANCE_PROTOCOL = "eager";
}
// Eager rebalance (the classic protocol)
public class EagerRebalanceProtocol implements RebalanceProtocol {
    @Override
    public void performRebalance(ConsumerRebalanceListener listener) {
        // 1. Every consumer stops consuming and gives up all partitions
        listener.onPartitionsRevoked(currentAssignment);
        // 2. Wait until every consumer has rejoined
        waitForAllConsumers();
        // 3. Recompute the assignment
        Map<String, List<TopicPartition>> newAssignment = assignPartitions();
        // 4. Everyone starts consuming their new partitions
        listener.onPartitionsAssigned(newAssignment);
    }
}
// Incremental (cooperative) rebalance
public class CooperativeRebalanceProtocol implements RebalanceProtocol {
    @Override
    public void performRebalance(ConsumerRebalanceListener listener) {
        // Phase 1: keep consuming current partitions;
        // revoke only the ones that must move
        List<TopicPartition> toRevoke = calculatePartitionsToRevoke();
        if (!toRevoke.isEmpty()) {
            listener.onPartitionsRevoked(toRevoke);
            revokePartitions(toRevoke);
        }
        // Phase 2: rejoin the group
        JoinGroupResponse joinResponse = joinGroup();
        // Phase 3: fetch the new assignment
        SyncGroupResponse syncResponse = syncGroup();
        List<TopicPartition> newlyAssigned = syncResponse.assignment();
        // Phase 4: add the new partitions to the current set
        currentAssignment.addAll(newlyAssigned);
        listener.onPartitionsAssigned(newlyAssigned);
        // Benefits:
        // 1. Shorter stop-the-world window
        // 2. Consumption continues during the rebalance
        // 3. Less duplicate consumption
    }
}
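In the Apache Kafka Java client there is no separate protocol switch: the coordinator picks the highest rebalance protocol supported by every member's advertised assignors. That is also why migrating a live group from eager to cooperative takes two rolling restarts, sketched below with the real client classes:
java
import java.util.List;
import java.util.Properties;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.CooperativeStickyAssignor;
import org.apache.kafka.clients.consumer.RangeAssignor;

public class AssignorNegotiation {
    // Rolling bounce #1: every member advertises both strategies, so the
    // coordinator can still pick "range" while old members remain
    static Properties upgradeStep1() {
        Properties props = new Properties();
        props.put(ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG,
                List.of(CooperativeStickyAssignor.class.getName(),
                        RangeAssignor.class.getName()));
        return props;
    }

    // Rolling bounce #2: once all members advertise cooperative-sticky,
    // drop range; the coordinator now selects the cooperative protocol
    static Properties upgradeStep2() {
        Properties props = new Properties();
        props.put(ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG,
                List.of(CooperativeStickyAssignor.class.getName()));
        return props;
    }
}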
2. Configuration and Monitoring
java
@Configuration
public class KafkaConsumerConfig {
    @Bean
    public Map<String, Object> consumerConfigs() {
        Map<String, Object> props = new HashMap<>();
        // Basic settings
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        props.put(ConsumerConfig.GROUP_ID_CONFIG, "my-group");
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
            StringDeserializer.class);
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
            StringDeserializer.class);
        // Rebalance settings: choosing the assignor also selects the
        // cooperative protocol (there is no separate protocol config)
        props.put(ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG,
            // cooperative sticky assignor (recommended)
            "org.apache.kafka.clients.consumer.CooperativeStickyAssignor");
        // Session and heartbeat
        props.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 45000);   // 45 s
        props.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 3000); // 3 s
        // max.poll.interval.ms also serves as the rebalance timeout
        props.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 300000); // 5 min
        // Automatic offset commits
        props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true);
        props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 5000);
        // Where to start when no committed offset exists
        props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
        return props;
    }
    @Bean
    public ConsumerFactory<String, String> consumerFactory() {
        return new DefaultKafkaConsumerFactory<>(consumerConfigs());
    }
    @Bean
    public ConcurrentKafkaListenerContainerFactory<String, String>
            kafkaListenerContainerFactory() {
        ConcurrentKafkaListenerContainerFactory<String, String> factory =
            new ConcurrentKafkaListenerContainerFactory<>();
        factory.setConsumerFactory(consumerFactory());
        // Number of concurrent consumers
        factory.setConcurrency(3);
        // Rebalance listener
        factory.getContainerProperties().setConsumerRebalanceListener(
            new ConsumerRebalanceListener() {
                @Override
                public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
                    log.info("Partitions revoked: {}", partitions);
                    // 1. Commit offsets
                    commitOffsets();
                    // 2. Clean up per-partition state
                    cleanupState(partitions);
                }
                @Override
                public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
                    log.info("Partitions assigned: {}", partitions);
                    // 1. Initialize per-partition state
                    initializeState(partitions);
                    // 2. Resume from the committed offsets
                    seekToCommittedOffsets(partitions);
                }
                @Override
                public void onPartitionsLost(Collection<TopicPartition> partitions) {
                    log.warn("Partitions lost: {}", partitions);
                    // Handle lost partitions (typically after a long GC pause
                    // or a network outage)
                }
            });
        // Batch listener
        factory.setBatchListener(true);
        return factory;
    }
}
V. Performance Tuning and Troubleshooting
1. Avoiding Frequent Rebalances
java
public class RebalanceOptimizer {
    // Common causes of rebalances and how to address them.
    // (KafkaConsumer does not expose its configs at runtime,
    // so assume they are passed in alongside the consumer here.)
    public void diagnoseAndFixRebalanceIssues(KafkaConsumer<?, ?> consumer,
            Map<String, Object> configs) {
        // 1. Session timeout set too short
        int sessionTimeout = (int) configs.get("session.timeout.ms");
        if (sessionTimeout < 30000) {
            log.warn("Session timeout {}ms is too short, recommend at least 30000ms",
                sessionTimeout);
            // Fix: increase session.timeout.ms
        }
        // 2. Heartbeat interval too long
        int heartbeatInterval = (int) configs.get("heartbeat.interval.ms");
        if (heartbeatInterval > 3000) {
            log.warn("Heartbeat interval {}ms is too long, recommend at most 3000ms",
                heartbeatInterval);
            // Fix: decrease heartbeat.interval.ms
        }
        // 3. Max poll interval too short
        int maxPollInterval = (int) configs.get("max.poll.interval.ms");
        if (maxPollInterval < 300000) {
            log.warn("Max poll interval {}ms may cause frequent rebalance",
                maxPollInterval);
            // Fix: increase max.poll.interval.ms or reduce max.poll.records
        }
        // 4. Message processing takes too long
        monitorConsumerProcessingTime(consumer);
        // 5. Unstable network
        monitorNetworkLatency();
    }
    /**
     * Example of a tuned configuration
     */
    public Map<String, Object> getOptimizedConfig() {
        Map<String, Object> config = new HashMap<>();
        // For high-volume processing
        config.put("session.timeout.ms", 45000);    // 45 s
        config.put("heartbeat.interval.ms", 3000);  // 3 s
        config.put("max.poll.interval.ms", 300000); // 5 min
        config.put("max.poll.records", 500);        // at most 500 records per poll
        config.put("fetch.min.bytes", 1048576);     // 1 MB
        config.put("fetch.max.wait.ms", 500);       // 500 ms
        config.put("partition.assignment.strategy",
            "org.apache.kafka.clients.consumer.CooperativeStickyAssignor");
        // Disable auto-commit so the application controls when offsets are committed
        config.put("enable.auto.commit", false);
        return config;
    }
}
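When individual batches can legitimately take longer than max.poll.interval.ms, raising the timeout is not the only option: the consumer can pause all partitions and keep polling while the work happens on another thread, so it stays in the group without fetching more records. A sketch of that pattern (processBatchAsync is a hypothetical hand-off to your own worker pool):
java
import java.time.Duration;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class PausePollLoop {
    static void run(KafkaConsumer<String, String> consumer) {
        consumer.subscribe(List.of("slow-topic")); // placeholder topic
        while (true) {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(200));
            if (!records.isEmpty()) {
                // Hand the batch to a worker and stop fetching more
                consumer.pause(consumer.assignment());
                CompletableFuture<Void> done = processBatchAsync(records); // hypothetical
                // Keep polling: paused partitions return no records, but each
                // poll() resets the max.poll.interval.ms timer and keeps the
                // member alive in the group. Note: a rebalance can hand over
                // new, un-paused partitions; production code should re-pause
                // in onPartitionsAssigned.
                while (!done.isDone()) {
                    consumer.poll(Duration.ofMillis(200));
                }
                consumer.resume(consumer.assignment());
                consumer.commitSync();
            }
        }
    }

    static CompletableFuture<Void> processBatchAsync(ConsumerRecords<String, String> records) {
        return CompletableFuture.runAsync(() -> { /* process the records */ });
    }
}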
2. Monitoring Rebalance Metrics
java
@RestController
public class RebalanceMonitorController {
    @Autowired
    private KafkaConsumer<?, ?> consumer;
    @GetMapping("/metrics/rebalance")
    public Map<String, Object> getRebalanceMetrics() {
        Map<String, Object> metrics = new HashMap<>();
        // 1. Rebalance metrics (exposed under consumer-coordinator-metrics)
        MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer();
        try {
            ObjectName objectName = new ObjectName(
                "kafka.consumer:type=consumer-coordinator-metrics,client-id=*");
            Set<ObjectName> beanNames = mBeanServer.queryNames(objectName, null);
            for (ObjectName beanName : beanNames) {
                // Rebalance counters and latency
                metrics.put("rebalance.rate.per.hour",
                    mBeanServer.getAttribute(beanName, "rebalance-rate-per-hour"));
                metrics.put("rebalance.latency.avg",
                    mBeanServer.getAttribute(beanName, "rebalance-latency-avg"));
                metrics.put("rebalance.latency.max",
                    mBeanServer.getAttribute(beanName, "rebalance-latency-max"));
                metrics.put("rebalance.total",
                    mBeanServer.getAttribute(beanName, "rebalance-total"));
                // Heartbeat metrics
                metrics.put("heartbeat.rate",
                    mBeanServer.getAttribute(beanName, "heartbeat-rate"));
                metrics.put("heartbeat.response.time.max",
                    mBeanServer.getAttribute(beanName, "heartbeat-response-time-max"));
            }
        } catch (Exception e) {
            log.error("Failed to collect rebalance metrics", e);
        }
        // 2. Current assignment
        Set<TopicPartition> assignments = consumer.assignment();
        metrics.put("assigned.partitions", assignments.size());
        metrics.put("assignment.details", assignments);
        // 3. Group metadata
        metrics.put("group.id", consumer.groupMetadata().groupId());
        metrics.put("generation.id", consumer.groupMetadata().generationId());
        metrics.put("member.id", consumer.groupMetadata().memberId());
        return metrics;
    }
    @GetMapping("/rebalance/history")
    public List<RebalanceEvent> getRebalanceHistory() {
        // Pull historical rebalance events from logs or a monitoring system
        return rebalanceHistoryService.getRecentEvents(100);
    }
    public static class RebalanceEvent {
        private long timestamp;
        private String groupId;
        private String trigger;    // consumer-joined, consumer-left, etc.
        private long duration;     // rebalance duration in ms
        private int generationId;
        private Set<String> members;
        private Map<String, List<Integer>> assignment;
    }
}
3. Advanced Tuning Strategies
java
public class AdvancedRebalanceOptimization {
    /**
     * Static group membership (Kafka 2.3+)
     * Avoids rebalances caused by rolling restarts
     */
    public void configureStaticMembership() {
        Map<String, Object> config = new HashMap<>();
        // Give every consumer instance a unique AND stable id:
        // it must survive restarts, or static membership is useless
        config.put("group.instance.id", "consumer-1-host1");
        // The session timeout bounds how long the member may stay offline
        config.put("session.timeout.ms", 45000);
        // While a static member is briefly offline its partitions are held;
        // reconnecting within session.timeout.ms restores the old assignment
    }
    /**
     * Graceful consumer shutdown.
     * wakeup() is the only thread-safe KafkaConsumer method: call it from a
     * shutdown hook, then finish up in the polling thread.
     */
    public void gracefulShutdown(KafkaConsumer<?, ?> consumer) {
        // In the shutdown hook (another thread):
        consumer.wakeup();
        // In the polling thread, the next poll() throws WakeupException:
        try {
            // ... poll loop ...
        } catch (WakeupException e) {
            // 1. Commit final offsets
            consumer.commitSync();
            // 2. close() sends a LeaveGroup request (for dynamic members),
            //    so the coordinator learns of the departure immediately
            //    instead of waiting out session.timeout.ms
            consumer.close(Duration.ofSeconds(30));
        }
    }
    /**
     * Handling rebalance storms
     */
    public void handleRebalanceStorm() throws InterruptedException {
        // Cause: many consumers joining or leaving at once
        // Option 1: stagger startup with a random delay per instance
        long randomDelay = ThreadLocalRandom.current().nextLong(0, 30000);
        Thread.sleep(randomDelay);
        // Option 2: split consumers across different group ids
        // Option 3: harden the coordinator side, e.g. raise
        // offsets.topic.replication.factor so the group coordinator
        // survives broker failures
    }
}
VI. Production Best Practices
1. Consumer Configuration Template
yaml
# application-kafka.yaml
spring:
  kafka:
    consumer:
      # Basics
      bootstrap-servers: ${KAFKA_BOOTSTRAP_SERVERS:localhost:9092}
      group-id: ${CONSUMER_GROUP:my-application}
      # Serialization
      key-deserializer: org.apache.kafka.common.serialization.StringDeserializer
      value-deserializer: org.apache.kafka.common.serialization.StringDeserializer
      # Consumption
      auto-offset-reset: latest
      enable-auto-commit: false        # commit manually for precise control
      max-poll-records: 500
      # Kafka-native settings without first-class Spring keys go under "properties"
      properties:
        # Rebalancing: the assignor choice also selects the cooperative protocol
        partition.assignment.strategy: org.apache.kafka.clients.consumer.CooperativeStickyAssignor
        # Session and heartbeat
        session.timeout.ms: 45000
        heartbeat.interval.ms: 3000
        max.poll.interval.ms: 300000
        # Static membership: the id must be stable across restarts
        # (e.g. a StatefulSet pod name), not randomly generated
        group.instance.id: ${HOSTNAME:consumer-1}
        # Fetching
        fetch.min.bytes: 1048576         # 1 MB
        fetch.max.wait.ms: 500
        # Connections
        connections.max.idle.ms: 540000  # 9 minutes
        receive.buffer.bytes: 65536
        send.buffer.bytes: 131072
    # Security (shared by all clients)
    security:
      protocol: ${KAFKA_SECURITY_PROTOCOL:PLAINTEXT}
    ssl:
      trust-store-location: ${KAFKA_TRUSTSTORE_LOCATION:}
      trust-store-password: ${KAFKA_TRUSTSTORE_PASSWORD:}
    properties:
      sasl.mechanism: ${KAFKA_SASL_MECHANISM:}
      sasl.jaas.config: ${KAFKA_JAAS_CONFIG:}
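A minimal listener wired against this template; "order-events" is a placeholder topic, and the List parameter matches the batch mode enabled on the container factory earlier:
java
import java.util.List;
import org.springframework.kafka.annotation.KafkaListener;
import org.springframework.stereotype.Component;

@Component
public class OrderEventsListener {
    // Batch listener matching factory.setBatchListener(true) from the
    // container factory above
    @KafkaListener(topics = "order-events", groupId = "${CONSUMER_GROUP:my-application}")
    public void onMessages(List<String> messages) {
        messages.forEach(m -> {
            // Process each record; keep this fast or hand off to a worker,
            // so max.poll.interval.ms is never exceeded
        });
    }
}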
2. Rebalance Listener Best Practices
java
@Component
@Slf4j
public class RobustRebalanceListener implements ConsumerRebalanceListener {
    private final OffsetCommitService offsetCommitService;
    private final StateManager stateManager;
    private final MetricsCollector metricsCollector;
    // Per-partition processing state
    private final Map<TopicPartition, ProcessingState> processingState =
        new ConcurrentHashMap<>();
    @Override
    public void onPartitionsRevoked(Collection<TopicPartition> partitions) {
        log.info("Partitions being revoked: {}", partitions);
        long startTime = System.currentTimeMillis();
        try {
            // 1. Stop accepting new messages
            pauseMessageProcessing();
            // 2. Finish the in-flight messages
            completeInFlightMessages(partitions);
            // 3. Persist processing state
            saveProcessingState(partitions);
            // 4. Commit offsets (synchronously, to be sure they land)
            offsetCommitService.commitSync(partitions);
            // 5. Clean up local state
            cleanupLocalState(partitions);
            metricsCollector.recordRebalanceEvent("partitions_revoked",
                partitions.size(), System.currentTimeMillis() - startTime);
        } catch (Exception e) {
            log.error("Error during partition revocation", e);
            // Record the error but do not rethrow, so the rebalance is not stalled
            metricsCollector.recordRebalanceError(e);
        }
    }
    @Override
    public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
        log.info("Partitions assigned: {}", partitions);
        long startTime = System.currentTimeMillis();
        try {
            // 1. Initialize per-partition state
            initializePartitions(partitions);
            // 2. Restore state from durable storage
            restoreProcessingState(partitions);
            // 3. Seek to the right offsets
            seekToAppropriateOffsets(partitions);
            // 4. Resume processing
            resumeMessageProcessing();
            metricsCollector.recordRebalanceEvent("partitions_assigned",
                partitions.size(), System.currentTimeMillis() - startTime);
        } catch (Exception e) {
            log.error("Error during partition assignment", e);
            metricsCollector.recordRebalanceError(e);
            // Rethrow: a failed assignment usually indicates a serious problem
            throw new RuntimeException("Failed to initialize partitions", e);
        }
    }
    @Override
    public void onPartitionsLost(Collection<TopicPartition> partitions) {
        log.warn("Partitions lost (likely due to timeout): {}", partitions);
        // Lost partitions usually mean the member timed out of the group.
        // Record the lost partitions and offsets for later recovery.
        for (TopicPartition partition : partitions) {
            long lastCommittedOffset = offsetCommitService.getLastCommittedOffset(partition);
            log.error("Partition {} lost at offset {}", partition, lastCommittedOffset);
            // Record to a dead-letter store or error log
            recordLostPartition(partition, lastCommittedOffset);
        }
        // Clean up state, but do NOT try to commit offsets:
        // ownership has already moved to another member
        cleanupLocalState(partitions);
    }
    /**
     * Offset commit strategy: commit asynchronously in normal operation,
     * confirm synchronously (with a timeout) at rebalance boundaries
     */
    private static class OffsetCommitService {
        private KafkaConsumer<?, ?> consumer; // injected elsewhere
        public void commitSync(Collection<TopicPartition> partitions) {
            // 1. Fire an async commit for the current batch
            consumer.commitAsync((offsets, exception) -> {
                if (exception != null) {
                    log.error("Failed to commit offsets", exception);
                    // Remember the failed offsets and retry later
                    saveFailedOffsets(offsets);
                }
            });
            // 2. Then block until the commits complete (bounded by a timeout)
            try {
                consumer.commitSync(Duration.ofSeconds(5));
            } catch (TimeoutException e) {
                log.warn("Offset commit timeout, will retry");
                // Persist the pending offsets, e.g. to a database
                savePendingOffsets(partitions);
            }
        }
    }
}
3. Large-Scale Cluster Optimization
java
@Configuration
public class LargeScaleKafkaConfig {
    @Bean
    @Primary
    public ConsumerFactory<String, String> largeScaleConsumerFactory() {
        Map<String, Object> configs = new HashMap<>();
        // Settings tuned for a large cluster
        configs.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG,
            "kafka-1:9092,kafka-2:9092,kafka-3:9092");
        // Static membership to reduce restart-induced rebalances
        configs.put(ConsumerConfig.GROUP_INSTANCE_ID_CONFIG,
            generateStaticMemberId());
        // Larger socket buffers
        configs.put(ConsumerConfig.RECEIVE_BUFFER_CONFIG, 131072); // 128 KB
        configs.put(ConsumerConfig.SEND_BUFFER_CONFIG, 131072);
        // Fetch tuning
        configs.put(ConsumerConfig.FETCH_MAX_BYTES_CONFIG, 52428800); // 50 MB
        configs.put(ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG, 1048576); // 1 MB
        // Metadata refresh
        configs.put(ConsumerConfig.METADATA_MAX_AGE_CONFIG, 300000); // 5 minutes
        // Prefer several consumer instances over one consumer owning every partition
        configs.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1000);
        return new DefaultKafkaConsumerFactory<>(configs);
    }
    /**
     * Generate the static member id.
     * It must be unique per instance but STABLE across restarts:
     * a random component would defeat static membership entirely.
     */
    private String generateStaticMemberId() {
        // In Kubernetes, HOSTNAME is the pod name (stable for StatefulSets)
        String containerId = System.getenv("HOSTNAME");
        if (containerId != null && !containerId.isEmpty()) {
            return containerId;
        }
        try {
            return InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            throw new IllegalStateException(
                "Cannot derive a stable group.instance.id", e);
        }
    }
    /**
     * Partition warm-up: stagger consumer start-up
     */
    @Bean
    public SmartLifecycle partitionWarmup() {
        return new SmartLifecycle() {
            private boolean running = false;
            @Override
            public void start() {
                // Delay after startup so not all consumers join at once
                log.info("Starting partition warmup...");
                try {
                    Thread.sleep(ThreadLocalRandom.current().nextLong(10000, 30000));
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
                running = true;
            }
            @Override
            public void stop() {
                // Graceful shutdown
                log.info("Stopping partition warmup...");
                running = false;
            }
            @Override
            public boolean isRunning() {
                return running;
            }
        };
    }
}
VII. Summary and Best Practices
Key takeaways:
1. Rebalance mechanisms:
- Eager rebalance: stop everything, then reassign (the classic protocol)
- Incremental rebalance: incremental reassignment with far shorter pauses (Kafka 2.4+, recommended)
2. Choosing an assignment strategy:
- RangeAssignor: the classic default; contiguous per-topic ranges, can skew across topics
- RoundRobinAssignor: global round-robin; requires identical subscriptions
- StickyAssignor: minimizes partition movement between rebalances
- CooperativeStickyAssignor: cooperative protocol plus stickiness (first choice for production)
3. Key configuration:
properties
# Settings you should always tune
session.timeout.ms=45000
heartbeat.interval.ms=3000
max.poll.interval.ms=300000
partition.assignment.strategy=org.apache.kafka.clients.consumer.CooperativeStickyAssignor
# Optional optimizations
group.instance.id=<unique-id>   # static membership
enable.auto.commit=false        # manual commits
Production best practices:
1. Avoid frequent rebalances:
- Set session.timeout.ms and max.poll.interval.ms to realistic values
- Use static membership (group.instance.id)
- Make sure the consumption logic cannot block indefinitely
2. Protect data consistency:
- Commit offsets synchronously in onPartitionsRevoked
- Make message processing idempotent
- Record rebalance events for monitoring and troubleshooting
3. Optimize performance:
- Use batch processing and asynchronous commits
- Size fetch.min.bytes to the data volume
- Monitor rebalance latency and frequency
4. Monitor and alert:
bash
# Key metrics to watch
rebalance-rate-per-hour       # rebalances per hour
rebalance-latency-avg         # average rebalance latency
heartbeat-response-time-max   # max heartbeat response time
assigned-partitions           # number of currently assigned partitions
5. Handle failures:
- Implement a thorough rebalance listener
- Handle lost partitions (onPartitionsLost)
- Record uncommitted offsets so they can be recovered
With sensible configuration and tuning, the impact of consumer rebalances can be reduced dramatically, keeping consumption stable and performant.