Sarama消费组初始化阻塞问题排查
Gopher们在和Kafka打交道的时候,有没有使用过Sarama这个库呢?正好,我们团队的服务用了这个库。
Sarama库的简单介绍:
📖 简介:Sarama is an MIT-licensed Go client library for Apache Kafka.
💻 开发语言:Golang
✨ Stars:12K
在一次发布服务的过程中,我遇到了一个问题:
在一段代码中,服务连续启动了几个ConsumerGroup,但是前面几个都正常启动,唯独最后一个ConsumerGroup一直没有启动成功。
伪代码如下:
go
log.println("Consumer1 Starting")
NewConsumer1()
log.println("Consumer2 Starting")
NewConsumer2()
log.println("Consumer3 Starting") // 这一行日志死活没打出来
NewConsumer3()
在深入NewConsumer
方法的内部实现后,我发现了一个可能会引发阻塞的代码:
go
// Consume starts the consumer-group loop for this SyncConsumer and blocks the
// caller until the first session's Setup has closed the ready channel.
//
// NOTE(review): if the first ConsumerGroup.Consume call fails, the retry loop
// below replaces consumerGroupHandler.ready with a fresh channel, but the
// final `<-consumerGroupHandler.ready` still waits on the ORIGINAL channel
// captured at struct creation — the caller blocks forever. This is the
// deadlock this article diagnoses.
func (sc SyncConsumer) Consume(ctx context.Context, handleFunc func(message *sarama.ConsumerMessage) error) {
	// Wrap the user handler so a panic during message processing is recovered
	// and logged with a stack trace instead of killing the consume goroutine.
	// Note: after a recovered panic the wrapper returns a nil error.
	handler := func(message *sarama.ConsumerMessage) error {
		defer func() {
			if err := recover(); err != nil {
				var buf bytes.Buffer
				// NOTE(review): "painc" is a typo for "panic" in this log message.
				buf.WriteString(fmt.Sprintf("painc: %s\n\n", err))
				stack := make([]byte, 4096)
				stack = stack[:runtime.Stack(stack, false)]
				buf.Write(stack)
				glog.Errorf("Consume handler func recover %s", buf.String())
			}
		}()
		return handleFunc(message)
	}
	consumerGroupHandler := syncConsumerGroupHandler{
		SyncConsumer: sc,
		ready:        make(chan struct{}), // closed by the handler's Setup once a session starts
		handler:      handler,
		ctx:          ctx,
	}
	// Log asynchronous consumer-group errors.
	// NOTE(review): this select runs only once, so at most the first group
	// error is logged before the goroutine exits.
	go func() {
		select {
		case groupErr := <-consumerGroupHandler.ConsumerGroup.Errors():
			glog.Errorf("consumerGroupHandler errors %s %v", sc.cfg.Topic, groupErr)
		case <-ctx.Done():
			return
		}
	}()
	go func() {
		defer sc.Close()
		for {
			// Consume must be called in a loop: it returns on every rebalance.
			// Unlike the official example, an error here retries after 1s
			// instead of exiting.
			if err := sc.ConsumerGroup.Consume(ctx, []string{sc.cfg.Topic}, consumerGroupHandler); err != nil {
				glog.Infof("Consume: %+v", err)
				time.Sleep(1 * time.Second)
			}
			if ctx.Err() != nil {
				log.Println("new consumer ctx err", ctx.Err())
				return
			}
			// Recreate the channel so the next session's Setup can close it
			// again (closing a closed channel panics). The receive below does
			// NOT observe this new value — see the NOTE in the doc comment.
			consumerGroupHandler.ready = make(chan struct{})
		}
	}()
	// Block until Setup closes ready — or forever, if the first Consume failed.
	<-consumerGroupHandler.ready
}
如果初始化ConsumeGroup不成功,是不是会阻塞在这一行代码<-consumerGroupHandler.ready
呢?
初始化的方法中之所以加入channel,当然是为了主线程能够知道这次初始化是否完成了。但为什么这么写?可能就得去探究一下Sarama库的使用姿势了。
Sarama ConsumerGroup Init Example
来看看官方给的示例(代码有点长):
go
package main
// SIGUSR1 toggle the pause/resume consumption
import (
"context"
"errors"
"flag"
"log"
"os"
"os/signal"
"strings"
"sync"
"syscall"
"github.com/IBM/sarama"
)
// Sarama configuration options, populated from command-line flags in init().
var (
	brokers  = ""    // comma-separated Kafka bootstrap broker list (-brokers, required)
	version  = ""    // Kafka cluster version string (-version)
	group    = ""    // consumer group id (-group, required)
	topics   = ""    // comma-separated topic list (-topics, required)
	assignor = ""    // partition assignment strategy: range, roundrobin or sticky (-assignor)
	oldest   = true  // start from the oldest offset when no committed offset exists (-oldest)
	verbose  = false // enable sarama's internal logging (-verbose)
)
// init registers and parses the command-line flags, then validates that all
// required flags were provided, panicking with a descriptive message for the
// first missing one (brokers, then topics, then group — same order and same
// messages as before).
func init() {
	flag.StringVar(&brokers, "brokers", "", "Kafka bootstrap brokers to connect to, as a comma separated list")
	flag.StringVar(&group, "group", "", "Kafka consumer group definition")
	flag.StringVar(&version, "version", sarama.DefaultVersion.String(), "Kafka cluster version")
	flag.StringVar(&topics, "topics", "", "Kafka topics to be consumed, as a comma separated list")
	flag.StringVar(&assignor, "assignor", "range", "Consumer group partition assignment strategy (range, roundrobin, sticky)")
	flag.BoolVar(&oldest, "oldest", true, "Kafka consumer consume initial offset from oldest")
	flag.BoolVar(&verbose, "verbose", false, "Sarama logging")
	flag.Parse()

	// Table-driven required-flag validation; panic messages are unchanged.
	required := []struct {
		value   string
		message string
	}{
		{brokers, "no Kafka bootstrap brokers defined, please set the -brokers flag"},
		{topics, "no topics given to be consumed, please set the -topics flag"},
		{group, "no Kafka consumer group defined, please set the -group flag"},
	}
	for _, r := range required {
		if r.value == "" {
			panic(r.message)
		}
	}
}
// main wires flags into a sarama config, starts a consumer group, waits for
// the first session to be ready, then runs until cancelled or signalled.
func main() {
	keepRunning := true
	log.Println("Starting a new Sarama consumer")
	if verbose {
		// Route sarama's internal logging to stdout when -verbose is set.
		sarama.Logger = log.New(os.Stdout, "[sarama] ", log.LstdFlags)
	}
	// Note: the string flag `version` is shadowed here by the parsed KafkaVersion.
	version, err := sarama.ParseKafkaVersion(version)
	if err != nil {
		log.Panicf("Error parsing Kafka version: %v", err)
	}
	/**
	 * Construct a new Sarama configuration.
	 * The Kafka cluster version has to be defined before the consumer/producer is initialized.
	 */
	config := sarama.NewConfig()
	config.Version = version
	switch assignor {
	case "sticky":
		config.Consumer.Group.Rebalance.GroupStrategies = []sarama.BalanceStrategy{sarama.NewBalanceStrategySticky()}
	case "roundrobin":
		config.Consumer.Group.Rebalance.GroupStrategies = []sarama.BalanceStrategy{sarama.NewBalanceStrategyRoundRobin()}
	case "range":
		config.Consumer.Group.Rebalance.GroupStrategies = []sarama.BalanceStrategy{sarama.NewBalanceStrategyRange()}
	default:
		log.Panicf("Unrecognized consumer group partition assignor: %s", assignor)
	}
	if oldest {
		config.Consumer.Offsets.Initial = sarama.OffsetOldest
	}
	/**
	 * Setup a new Sarama consumer group
	 */
	consumer := Consumer{
		ready: make(chan bool), // closed by Setup to signal the first session is up
	}
	ctx, cancel := context.WithCancel(context.Background())
	client, err := sarama.NewConsumerGroup(strings.Split(brokers, ","), group, config)
	if err != nil {
		log.Panicf("Error creating consumer group client: %v", err)
	}
	consumptionIsPaused := false
	wg := &sync.WaitGroup{}
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			// `Consume` should be called inside an infinite loop, when a
			// server-side rebalance happens, the consumer session will need to be
			// recreated to get the new claims
			if err := client.Consume(ctx, strings.Split(topics, ","), &consumer); err != nil {
				if errors.Is(err, sarama.ErrClosedConsumerGroup) {
					return
				}
				// The official example exits hard on any other consume error.
				log.Panicf("Error from consumer: %v", err)
			}
			// check if context was cancelled, signaling that the consumer should stop
			if ctx.Err() != nil {
				return
			}
			// Recreate the channel so Setup can close it again after the next
			// rebalance; closing the same channel twice would panic.
			consumer.ready = make(chan bool)
		}
	}()
	<-consumer.ready // Await till the consumer has been set up
	log.Println("Sarama consumer up and running!...")
	// SIGUSR1 toggles pause/resume; SIGINT/SIGTERM end the run loop.
	sigusr1 := make(chan os.Signal, 1)
	signal.Notify(sigusr1, syscall.SIGUSR1)
	sigterm := make(chan os.Signal, 1)
	signal.Notify(sigterm, syscall.SIGINT, syscall.SIGTERM)
	for keepRunning {
		select {
		case <-ctx.Done():
			log.Println("terminating: context cancelled")
			keepRunning = false
		case <-sigterm:
			log.Println("terminating: via signal")
			keepRunning = false
		case <-sigusr1:
			toggleConsumptionFlow(client, &consumptionIsPaused)
		}
	}
	// Shutdown order: cancel the consume loop, wait for it, then close the client.
	cancel()
	wg.Wait()
	if err = client.Close(); err != nil {
		log.Panicf("Error closing client: %v", err)
	}
}
// toggleConsumptionFlow flips the pause state of the whole consumer group:
// if currently paused it resumes all partitions, otherwise it pauses them.
// *isPaused is updated to reflect the new state.
func toggleConsumptionFlow(client sarama.ConsumerGroup, isPaused *bool) {
	paused := *isPaused
	if !paused {
		client.PauseAll()
		log.Println("Pausing consumption")
	} else {
		client.ResumeAll()
		log.Println("Resuming consumption")
	}
	*isPaused = !paused
}
// Consumer represents a Sarama consumer group consumer
type Consumer struct {
	// ready is closed by Setup to tell main the session has started; it is
	// re-made after every rebalance so it can be closed again.
	ready chan bool
}
// Setup is run at the beginning of a new session, before ConsumeClaim
func (consumer *Consumer) Setup(sarama.ConsumerGroupSession) error {
	// Mark the consumer as ready. Closing (rather than sending on) the channel
	// unblocks every waiter at once; a channel must not be closed twice, which
	// is why it is recreated between sessions.
	close(consumer.ready)
	return nil
}
// Cleanup is run at the end of a session, once all ConsumeClaim goroutines have exited
func (consumer *Consumer) Cleanup(sarama.ConsumerGroupSession) error {
	// Nothing to release in this example.
	return nil
}
// ConsumeClaim must start a consumer loop of ConsumerGroupClaim's Messages().
// Once the Messages() channel is closed, the Handler must finish its processing
// loop and exit.
func (consumer *Consumer) ConsumeClaim(session sarama.ConsumerGroupSession, claim sarama.ConsumerGroupClaim) error {
	// NOTE:
	// Do not move the code below to a goroutine.
	// The `ConsumeClaim` itself is called within a goroutine, see:
	// https://github.com/IBM/sarama/blob/main/consumer_group.go#L27-L29
	for {
		select {
		// Should return when `session.Context()` is done.
		// If not, will raise `ErrRebalanceInProgress` or `read tcp <ip>:<port>: i/o timeout` when kafka rebalance. see:
		// https://github.com/IBM/sarama/issues/1192
		case <-session.Context().Done():
			return nil
		case msg, ok := <-claim.Messages():
			if !ok {
				log.Printf("message channel was closed")
				return nil
			}
			log.Printf("Message claimed: value = %s, timestamp = %v, topic = %s", string(msg.Value), msg.Timestamp, msg.Topic)
			session.MarkMessage(msg, "")
		}
	}
}
消费组初始化的过程中,需要传入一个Handler接口用于处理消息(主要的消费逻辑在这里实现)。同时该接口还有一个Setup方法,这个方法会在消费组初始化后和消费逻辑正式运行之前调用。
go
type ConsumerGroupHandler interface {
// Setup is run at the beginning of a new session, before ConsumeClaim.
Setup(ConsumerGroupSession) error
// Cleanup is run at the end of a session, once all ConsumeClaim goroutines have exited
// but before the offsets are committed for the very last time.
Cleanup(ConsumerGroupSession) error
// ConsumeClaim must start a consumer loop of ConsumerGroupClaim's Messages().
// Once the Messages() channel is closed, the Handler must finish its processing
// loop and exit.
ConsumeClaim(ConsumerGroupSession, ConsumerGroupClaim) error
}
在上面初始化的例子中,我们可以看到Setup方法中关闭channel,在外层中监听channel可以得到一个退出的信号,从而知道了"哦!初始化完成了!"。
又有一个问题:为什么要重新初始化channel呢?
go
go func() {
defer wg.Done()
for {
// `Consume` should be called inside an infinite loop, when a
// server-side rebalance happens, the consumer session will need to be
// recreated to get the new claims
if err := client.Consume(ctx, strings.Split(topics, ","), &consumer); err != nil {
if errors.Is(err, sarama.ErrClosedConsumerGroup) {
return
}
log.Panicf("Error from consumer: %v", err)
}
// check if context was cancelled, signaling that the consumer should stop
if ctx.Err() != nil {
return
}
consumer.ready = make(chan bool) // 为什么要这么做?
}
}()
因为每次消费组重平衡的时候,client.Consume
方法都会退出,所以sarama官方的例子中要求Consume should be called inside an infinite loop
;每次重平衡完成,Setup方法都会被再次调用,如果不重新初始化channel,同一个channel会被关闭两次,在Golang中会引起panic!
故障排除
现在再来回顾下我们服务中的这段代码:
go
go func() {
defer sc.Close()
for {
if err := sc.ConsumerGroup.Consume(ctx, []string{sc.cfg.Topic}, consumerGroupHandler); err != nil {
glog.Infof("Consume: %+v", err)
time.Sleep(1 * time.Second)
}
if ctx.Err() != nil {
log.Println("new consumer ctx err", ctx.Err())
return
}
consumerGroupHandler.ready = make(chan struct{})
}
}()
<-consumerGroupHandler.ready
显然这代码参考了官方的例子。
不一样的地方是,官方中,如果初始化失败,会直接退出;但服务中的代码,如果失败,会重新进入下一次循环。
如果第一次初始化失败,ready channel会被重新创建,但外层等待channel关闭信号实际上还是在等待老的channel。这就造成了一个deadlock问题。
go
ready := make(chan struct{})
go func() {
time.Sleep(500 * time.Millisecond)
ready = make(chan struct{}) // 重新赋值!
fmt.Println("ready channel replaced")
time.Sleep(500 * time.Millisecond)
close(ready) // 第一次 close
}()
<-ready // 等待的是原来的 channel
fmt.Println("Ready received")
上面这段代码运行后得到的结果是:
text
ready channel replaced
fatal error: all goroutines are asleep - deadlock!
goroutine 1 [chan receive]:
main.newChan()
/Users/ryan/develop/my-project/go-first/chan_new.go:24 +0xb4
main.main()
/Users/ryan/develop/my-project/go-first/main.go:32 +0x1c
exit status 2
至此,阻塞问题的原因真相大白!
问题修复
