KafkaUtils

1. java代码

复制代码
package com.test;

import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.serialization.SerializationSchema;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.OffsetResetStrategy;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.ByteArraySerializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;

import java.util.Properties;

/**
 * Kafka utility class dedicated to Flink 1.18 (official new connector API).
 * Core: KafkaSource (consume) + KafkaSink (produce).
 *
 * <p>All factory methods share the broker list in {@link #BOOTSTRAP_SERVERS}.
 * Exactly-once sinks require Flink checkpointing to be enabled on the job.
 */
public final class KafkaUtils {
    /** Broker list, maintained in one place for all sources/sinks. */
    public static final String BOOTSTRAP_SERVERS = "10.129.243.106:9092,10.129.243.107:9092,10.129.243.108:9092";
    /** Default consumer group (pass a job-specific one to avoid clashes between tasks). */
    public static final String DEFAULT_CONSUMER_GROUP = "flink118_kafka_group";
    /**
     * Default producer transaction timeout. Must be longer than the Flink
     * checkpoint interval (recommended ~50% larger) and must not exceed the
     * broker's {@code transaction.max.timeout.ms}.
     */
    public static final int TRANSACTION_TIMEOUT_MS = 900000;

    /** Utility class: no instances. */
    private KafkaUtils() {}

    // ======================== Consumer side: KafkaSource (recommended since Flink 1.14) ========================
    /**
     * Generic String consumer source (most common: consumes String messages directly).
     *
     * @param topic   topic to consume
     * @param groupId consumer group (required, avoids clashes between jobs)
     * @return KafkaSource of String
     */
    public static KafkaSource<String> getStringKafkaSource(String topic, String groupId) {
        return getKafkaSource(topic, groupId, new SimpleStringSchema());
    }

    /**
     * Consumer source with a custom deserializer (supports custom object types).
     *
     * @param topic        topic to consume
     * @param groupId      consumer group
     * @param deserializer custom value deserializer
     * @return KafkaSource of T
     */
    public static <T> KafkaSource<T> getKafkaSource(String topic, String groupId, DeserializationSchema<T> deserializer) {
        // High-throughput consumer tuning.
        Properties consumerProps = new Properties();
        // Core settings.
        consumerProps.setProperty(ConsumerConfig.GROUP_ID_CONFIG, groupId);
        // NOTE(review): KafkaSource deserializes through the schema passed to
        // setDeserializer below; these two entries are ignored by Flink and are
        // kept only for documentation value.
        consumerProps.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        consumerProps.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        // Throughput tuning: batch fetches, fewer network round trips.
        consumerProps.setProperty(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, "2000");  // up to 2000 records per poll
        consumerProps.setProperty(ConsumerConfig.FETCH_MIN_BYTES_CONFIG, "204800"); // wait for at least 200KB per fetch
        consumerProps.setProperty(ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG, "500");  // but never wait longer than 500ms
        // Connection stability.
        consumerProps.setProperty(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "30000");
        consumerProps.setProperty(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, "10000");

        return KafkaSource.<T>builder()
                .setBootstrapServers(BOOTSTRAP_SERVERS)
                .setTopics(topic)
                .setProperties(consumerProps)
                // Offset strategy: resume from committed offsets, fall back to LATEST
                // when none exist (production default); use earliest() for testing.
                .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.LATEST))
                // Value-only deserialization via the supplied schema.
                .setDeserializer(KafkaRecordDeserializationSchema.valueOnly(deserializer))
                // Partition discovery is enabled by default; tune via
                // partition.discovery.interval.ms in consumerProps if needed.
                .build();
    }


    // ======================== Producer side: KafkaSink (recommended since Flink 1.14) ========================
    /**
     * Generic String sink (most common: sends String messages directly).
     *
     * @param topic target topic
     * @return KafkaSink of String
     */
    public static KafkaSink<String> getStringKafkaSink(String topic) {
        return getKafkaSink(topic, new SimpleStringSchema());
    }

    /**
     * Exactly-once sink with a custom serializer (supports custom object types).
     * Sized for roughly 50k records/s. Requires Flink checkpointing; the
     * producer transaction timeout must stay within the broker's
     * {@code transaction.max.timeout.ms}.
     *
     * @param topic      target topic
     * @param serializer custom value serializer
     * @return KafkaSink of T
     */
    public static <T> KafkaSink<T> getKafkaSink(String topic, SerializationSchema<T> serializer) {
        // High throughput + exactly-once producer configuration.
        Properties producerProps = new Properties();
        // NOTE(review): KafkaSink serializes through the record serializer set
        // below; these two entries are ignored by Flink and kept for clarity.
        producerProps.setProperty(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class.getName());
        producerProps.setProperty(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class.getName());
        // Throughput tuning (the critical part).
        producerProps.setProperty(ProducerConfig.BATCH_SIZE_CONFIG, "163840");         // 160KB batches
        producerProps.setProperty(ProducerConfig.LINGER_MS_CONFIG, "5");               // trade 5ms latency for batching
        producerProps.setProperty(ProducerConfig.BUFFER_MEMORY_CONFIG, "33554432");    // 32MB send buffer
        producerProps.setProperty(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, "10485760"); // 10MB max request
        // Exactly-once prerequisites (works together with Flink checkpoints).
        producerProps.setProperty(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true"); // idempotent producer
        producerProps.setProperty(ProducerConfig.ACKS_CONFIG, "all");                // idempotence requires acks=all
        // Must not exceed the broker's transaction.max.timeout.ms
        // (3600000 in the cluster's server.properties).
        producerProps.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_MS_CONFIG, "3600000");
        // With idempotence enabled, up to 5 in-flight requests still preserve
        // ordering and out-throughput a single in-flight request.
        producerProps.setProperty(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, "5");
        // Retry tuning.
        producerProps.setProperty(ProducerConfig.RETRIES_CONFIG, "3");
        producerProps.setProperty(ProducerConfig.RETRY_BACKOFF_MS_CONFIG, "1000");

        return KafkaSink.<T>builder()
                .setBootstrapServers(BOOTSTRAP_SERVERS)
                .setKafkaProducerConfig(producerProps)
                // Value serialization via the supplied schema.
                .setRecordSerializer(KafkaRecordSerializationSchema.<T>builder()
                        .setTopic(topic)
                        .setValueSerializationSchema(serializer)
                        .build())
                // FIX: EXACTLY_ONCE requires a transactional-id prefix (unique per
                // job/topic); without it KafkaSinkBuilder.build() fails.
                .setTransactionalIdPrefix("flink118-" + topic)
                // Exactly-once semantics; switch to AT_LEAST_ONCE for higher
                // throughput when a few duplicates are acceptable.
                .setDeliveryGuarantee(DeliveryGuarantee.EXACTLY_ONCE)
                .build();
    }

    /**
     * High-throughput sink that trades a little consistency for raw throughput
     * (for non-critical data such as log collection). Provides at-least-once
     * delivery, not exactly-once.
     *
     * @param topic target topic
     * @return KafkaSink of String
     */
    public static KafkaSink<String> getHighThroughputKafkaSink(String topic) {
        Properties producerProps = new Properties();
        producerProps.setProperty(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        producerProps.setProperty(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        // Aggressive throughput tuning.
        producerProps.setProperty(ProducerConfig.BATCH_SIZE_CONFIG, "327680");      // 320KB batches
        producerProps.setProperty(ProducerConfig.LINGER_MS_CONFIG, "10");           // batch for up to 10ms
        producerProps.setProperty(ProducerConfig.BUFFER_MEMORY_CONFIG, "67108864"); // 64MB send buffer
        producerProps.setProperty(ProducerConfig.ACKS_CONFIG, "1");                 // acks=1: highest throughput
        producerProps.setProperty(ProducerConfig.RETRIES_CONFIG, "1");
        producerProps.setProperty(ProducerConfig.COMPRESSION_TYPE_CONFIG, "lz4");   // lz4 compression cuts network IO

        return KafkaSink.<String>builder()
                .setBootstrapServers(BOOTSTRAP_SERVERS)
                .setKafkaProducerConfig(producerProps)
                .setRecordSerializer(KafkaRecordSerializationSchema.builder()
                        .setTopic(topic)
                        .setValueSerializationSchema(new SimpleStringSchema())
                        .build())
                // FIX: the original declared EXACTLY_ONCE, which contradicts
                // acks=1 / no idempotence / no transactional-id prefix and would
                // fail at build time. This sink deliberately offers
                // at-least-once semantics in exchange for maximum throughput.
                .setDeliveryGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
                .build();
    }
}

2. server.properties

三台节点的 server.properties 中互不相同的配置项如下(其余配置三台保持一致):

broker.id=0

listeners=PLAINTEXT://10.100.1.1:9092

advertised.listeners=PLAINTEXT://10.100.1.1:9092

broker.id=1

listeners=PLAINTEXT://10.100.1.2:9092

advertised.listeners=PLAINTEXT://10.100.1.2:9092

broker.id=2

listeners=PLAINTEXT://10.100.1.3:9092

advertised.listeners=PLAINTEXT://10.100.1.3:9092

复制代码
# ---- Per-node identity (these three lines differ on each broker; see the variants above) ----
broker.id=0
listeners=PLAINTEXT://10.100.1.1:9092
advertised.listeners=PLAINTEXT://10.100.1.1:9092
# ---- Network / IO thread pools ----
num.network.threads=3
num.io.threads=8
# Socket buffers and the maximum accepted request size (100MB)
socket.send.buffer.bytes=102400
socket.receive.buffer.bytes=102400
socket.request.max.bytes=104857600
# ---- Log storage ----
log.dirs=/data/kafka/data
num.partitions=8
num.recovery.threads.per.data.dir=1
# ---- Internal-topic replication (3-node cluster: replication factor 3, min ISR 2) ----
offsets.topic.replication.factor=3
transaction.state.log.replication.factor=3
transaction.state.log.min.isr=2
# ---- Retention: 7 days, 1GB segments, cleanup check every 5 minutes ----
log.retention.hours=168
log.segment.bytes=1073741824
log.retention.check.interval.ms=300000
# ---- ZooKeeper ensemble (chrooted under /kafka) ----
zookeeper.connect=yy1:2181,yy2:2181,yy3:2181/kafka
zookeeper.connection.timeout.ms=18000
group.initial.rebalance.delay.ms=0
# Topic management: auto-create on first use, allow deletion
auto.create.topics.enable=true
delete.topic.enable=true
# Upper bound for client-side transaction.timeout.ms (1 hour);
# producer transaction timeouts must not exceed this value
transaction.max.timeout.ms=3600000

3. kafka集群启动脚本

kafka-cluster.sh 内容如下(注意:脚本中的 ssh -p 2022 需按实际 SSH 端口修改;若使用默认端口 22,可去掉 -p 参数)

复制代码
#!/bin/bash
# Batch start/stop helper for a Kafka cluster.
# Kafka cluster node list (replace with your hostnames/IPs).
NODES=("yy1" "yy2" "yy3")
# Kafka install directory on every node.
KAFKA_HOME="/opt/kafka"
# Broker configuration used at startup.
CONFIG_FILE="${KAFKA_HOME}/config/server.properties"
# SSH port (22 is the sshd default; change to match your environment).
SSH_PORT=2022

# Require exactly one argument: start or stop.
if [ $# -ne 1 ]; then
    echo "使用方法: $0 [start|stop]"
    exit 1
fi
COMMAND=$1

# Fan the command out to every node.
case $COMMAND in
    start)
        echo "===== 开始批量启动Kafka集群 ====="
        for node in "${NODES[@]}"; do
            echo "启动${node}..."
            ssh -p "${SSH_PORT}" "$node" "${KAFKA_HOME}/bin/kafka-server-start.sh -daemon ${CONFIG_FILE}"
        done
        echo "===== Kafka集群启动命令已下发 ====="
        ;;
    stop)
        echo "===== 开始批量停止Kafka集群 ====="
        for node in "${NODES[@]}"; do
            echo "停止${node}..."
            ssh -p "${SSH_PORT}" "$node" "${KAFKA_HOME}/bin/kafka-server-stop.sh"
        done
        echo "===== Kafka集群停止命令已下发 ====="
        ;;
    *)
        # FIX: the original was truncated here — missing ';;' and 'esac',
        # and it did not exit non-zero on an invalid argument.
        echo "无效参数!仅支持: start|stop"
        exit 1
        ;;
esac

chmod 775 kafka-cluster.sh

sh kafka-cluster.sh start

sh kafka-cluster.sh stop

提示:如果 Linux 服务器的 SSH 端口已从默认的 22 改为其他端口,把脚本中的 -p 2022 换成实际端口;如果仍使用默认端口 22,直接去掉 -p 2022 即可。

相关推荐
洛豳枭薰5 小时前
消息队列关键问题描述
kafka·rabbitmq·rocketmq
lucky67075 小时前
Spring Boot集成Kafka:最佳实践与详细指南
spring boot·kafka·linq
袁煦丞 cpolar内网穿透实验室7 小时前
远程调试内网 Kafka 不再求运维!cpolar 内网穿透实验室第 791 个成功挑战
运维·分布式·kafka·远程工作·内网穿透·cpolar
岁岁种桃花儿7 小时前
CentOS7 彻底卸载所有JDK/JRE + 重新安装JDK8(实操完整版,解决kafka/jps报错)
java·开发语言·kafka
闻哥13 小时前
Kafka高吞吐量核心揭秘:四大技术架构深度解析
java·jvm·面试·kafka·rabbitmq·springboot
indexsunny1 天前
互联网大厂Java面试实战:Spring Boot微服务在电商场景中的应用与挑战
java·spring boot·redis·微服务·kafka·spring security·电商
TTBIGDATA1 天前
【Atlas】Ambari 中 开启 Kerberos + Ranger 后 Atlas Hook 无权限访问 Kafka Topic:ATLAS_HOOK
大数据·kafka·ambari·linq·ranger·knox·bigtop
岁岁种桃花儿1 天前
Kafka从入门到上天系列第一篇:kafka的安装和启动
大数据·中间件·kafka
TTBIGDATA2 天前
【Atlas】Atlas Hook 消费 Kafka 报错:GroupAuthorizationException
hadoop·分布式·kafka·ambari·hdp·linq·ranger