1. java代码
package com.test;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.serialization.SerializationSchema;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.OffsetResetStrategy;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.ByteArraySerializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import java.util.Properties;
/**
* Flink 1.18 专属Kafka工具类(官方新版API)
* 核心:KafkaSource(消费) + KafkaSink(生产)
*/
/**
 * Kafka utility class dedicated to Flink 1.18 (official new-style connector API).
 * Core pieces: {@code KafkaSource} (consume) + {@code KafkaSink} (produce).
 *
 * <p>Thread-safety: all methods are stateless static factories and may be
 * called from any thread.
 */
public class KafkaUtils {

    // Broker list, kept in one place for the whole project.
    // NOTE(review): server.properties in this document uses 10.100.1.x —
    // confirm which address set is correct for your environment.
    public static final String BOOTSTRAP_SERVERS =
            "10.129.243.106:9092,10.129.243.107:9092,10.129.243.108:9092";

    // Default consumer group (callers can pass their own groupId to override).
    public static final String DEFAULT_CONSUMER_GROUP = "flink118_kafka_group";

    // Producer transaction timeout. Must be LARGER than the Flink checkpoint
    // interval (recommend ~50% larger) and must NOT exceed the broker's
    // transaction.max.timeout.ms (3600000 in the matching server.properties).
    public static final int TRANSACTION_TIMEOUT_MS = 900000;

    // Utility class: no instances.
    private KafkaUtils() {}

    // ==================== Consumer side: KafkaSource (Flink 1.14+) ====================

    /**
     * Plain string consumer source (most common case: String messages).
     *
     * @param topic   topic to consume
     * @param groupId consumer group id (required, avoids clashes between jobs)
     * @return KafkaSource&lt;String&gt;
     */
    public static KafkaSource<String> getStringKafkaSource(String topic, String groupId) {
        return getKafkaSource(topic, groupId, new SimpleStringSchema());
    }

    /**
     * Consumer source with a custom value deserializer (custom POJOs etc.).
     *
     * @param topic        topic to consume
     * @param groupId      consumer group id
     * @param deserializer value deserialization schema
     * @return KafkaSource&lt;T&gt;
     */
    public static <T> KafkaSource<T> getKafkaSource(
            String topic, String groupId, DeserializationSchema<T> deserializer) {
        // High-throughput consumer configuration.
        Properties consumerProps = new Properties();
        consumerProps.setProperty(ConsumerConfig.GROUP_ID_CONFIG, groupId);
        // FIX: key/value deserializer classes are intentionally NOT set here —
        // Flink's KafkaSource always installs ByteArrayDeserializer internally
        // and overrides (with warnings) anything configured by the user.
        // Throughput tuning: batch fetches, fewer network round trips.
        consumerProps.setProperty(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, "2000");  // up to 2000 records per poll
        consumerProps.setProperty(ConsumerConfig.FETCH_MIN_BYTES_CONFIG, "204800"); // wait for >=200KB to batch fetches
        consumerProps.setProperty(ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG, "500");  // but never wait longer than 500ms
        // Connection / group-membership tuning.
        consumerProps.setProperty(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "30000");
        consumerProps.setProperty(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, "10000");
        return KafkaSource.<T>builder()
                .setBootstrapServers(BOOTSTRAP_SERVERS)
                .setTopics(topic)
                .setProperties(consumerProps)
                // Offset strategy: committed offsets first, fall back to LATEST
                // (common in production); use earliest() for testing.
                .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.LATEST))
                .setDeserializer(KafkaRecordDeserializationSchema.valueOnly(deserializer))
                // Partition discovery can be enabled/tuned via the consumer
                // property "partition.discovery.interval.ms" if topics may grow
                // partitions — verify the default for your exact Flink version.
                .build();
    }

    // ==================== Producer side: KafkaSink (Flink 1.14+) ====================

    /**
     * Plain string producer sink (most common case: String messages).
     *
     * @param topic topic to produce to
     * @return KafkaSink&lt;String&gt;
     */
    public static KafkaSink<String> getStringKafkaSink(String topic) {
        return getKafkaSink(topic, new SimpleStringSchema());
    }

    /**
     * Producer sink with a custom value serializer, EXACTLY_ONCE semantics.
     * Requires Flink checkpointing to be enabled on the job.
     *
     * @param topic      topic to produce to
     * @param serializer value serialization schema
     * @return KafkaSink&lt;T&gt;
     */
    public static <T> KafkaSink<T> getKafkaSink(String topic, SerializationSchema<T> serializer) {
        // High-throughput + exactly-once producer configuration (~50k msgs/s target).
        Properties producerProps = new Properties();
        // FIX: key/value serializer classes are intentionally NOT set here —
        // Flink's KafkaSink always installs ByteArraySerializer internally.
        // Throughput tuning (the important part).
        producerProps.setProperty(ProducerConfig.BATCH_SIZE_CONFIG, "163840");     // 160KB batches
        producerProps.setProperty(ProducerConfig.LINGER_MS_CONFIG, "5");           // trade 5ms latency for batching
        producerProps.setProperty(ProducerConfig.BUFFER_MEMORY_CONFIG, "33554432"); // 32MB send buffer
        producerProps.setProperty(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, "10485760"); // 10MB max request
        // Exactly-once prerequisites (combined with Flink checkpointing).
        producerProps.setProperty(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true");
        producerProps.setProperty(ProducerConfig.ACKS_CONFIG, "all"); // idempotence requires acks=all
        // FIX: use the shared constant instead of a duplicated magic number;
        // must stay <= the broker's transaction.max.timeout.ms.
        producerProps.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG,
                String.valueOf(TRANSACTION_TIMEOUT_MS));
        producerProps.setProperty(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, "5"); // max allowed with idempotence; ordering preserved
        // Retry tuning.
        producerProps.setProperty(ProducerConfig.RETRIES_CONFIG, "3");
        producerProps.setProperty(ProducerConfig.RETRY_BACKOFF_MS_CONFIG, "1000");
        return KafkaSink.<T>builder()
                .setBootstrapServers(BOOTSTRAP_SERVERS)
                .setKafkaProducerConfig(producerProps)
                .setRecordSerializer(KafkaRecordSerializationSchema.<T>builder()
                        .setTopic(topic)
                        .setValueSerializationSchema(serializer)
                        .build())
                // Exactly-once semantics; use AT_LEAST_ONCE for higher
                // throughput when a few duplicates are acceptable.
                .setDeliveryGuarantee(DeliveryGuarantee.EXACTLY_ONCE)
                // FIX: EXACTLY_ONCE requires a transactional-id prefix — without
                // it the KafkaSink fails at runtime. Topic-scoped to avoid
                // clashes between jobs writing to different topics.
                .setTransactionalIdPrefix("flink118-sink-" + topic)
                .build();
    }

    /**
     * High-throughput producer sink (trades a little consistency for maximum
     * throughput — e.g. log collection and other non-critical data).
     *
     * @param topic topic to produce to
     * @return KafkaSink&lt;String&gt;
     */
    public static KafkaSink<String> getHighThroughputKafkaSink(String topic) {
        Properties producerProps = new Properties();
        // Aggressive throughput configuration.
        producerProps.setProperty(ProducerConfig.BATCH_SIZE_CONFIG, "327680");      // 320KB batches
        producerProps.setProperty(ProducerConfig.LINGER_MS_CONFIG, "10");           // batch for up to 10ms
        producerProps.setProperty(ProducerConfig.BUFFER_MEMORY_CONFIG, "67108864"); // 64MB send buffer
        producerProps.setProperty(ProducerConfig.ACKS_CONFIG, "1");                 // leader-only ack, highest throughput
        producerProps.setProperty(ProducerConfig.RETRIES_CONFIG, "1");
        producerProps.setProperty(ProducerConfig.COMPRESSION_TYPE_CONFIG, "lz4");   // lz4 compression, less network IO
        return KafkaSink.<String>builder()
                .setBootstrapServers(BOOTSTRAP_SERVERS)
                .setKafkaProducerConfig(producerProps)
                .setRecordSerializer(KafkaRecordSerializationSchema.builder()
                        .setTopic(topic)
                        .setValueSerializationSchema(new SimpleStringSchema())
                        .build())
                // FIX: was EXACTLY_ONCE, which contradicts acks=1/non-transactional
                // config (idempotence needs acks=all, EXACTLY_ONCE needs a
                // transactional-id prefix) and the documented intent of this
                // method. AT_LEAST_ONCE matches "throughput over consistency".
                .setDeliveryGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
                .build();
    }
}
2. server.properties
不同的地方
broker.id=0
listeners=PLAINTEXT://10.100.1.1:9092
advertised.listeners=PLAINTEXT://10.100.1.1:9092
broker.id=1
listeners=PLAINTEXT://10.100.1.2:9092
advertised.listeners=PLAINTEXT://10.100.1.2:9092
broker.id=2
listeners=PLAINTEXT://10.100.1.3:9092
advertised.listeners=PLAINTEXT://10.100.1.3:9092
# (原文此处误重复了 broker.id=0 的三行配置,已去重)
# 以下为各节点完全相同的通用配置
num.network.threads=3
num.io.threads=8
socket.send.buffer.bytes=102400
socket.receive.buffer.bytes=102400
socket.request.max.bytes=104857600
log.dirs=/data/kafka/data
num.partitions=8
num.recovery.threads.per.data.dir=1
offsets.topic.replication.factor=3
transaction.state.log.replication.factor=3
transaction.state.log.min.isr=2
log.retention.hours=168
log.segment.bytes=1073741824
log.retention.check.interval.ms=300000
zookeeper.connect=yy1:2181,yy2:2181,yy3:2181/kafka
zookeeper.connection.timeout.ms=18000
group.initial.rebalance.delay.ms=0
auto.create.topics.enable=true
delete.topic.enable=true
transaction.max.timeout.ms=3600000
3. kafka集群启动脚本
kafka-cluster.sh 内容如下(注意:脚本中的 ssh -p 2022 需按实际 sshd 端口修改;若使用默认端口 22,去掉 -p 2022 即可)
#!/bin/bash
# kafka-cluster.sh — start/stop every Kafka broker in the cluster over SSH.
#
# Usage (after installing):
#   chmod 775 kafka-cluster.sh
#   ./kafka-cluster.sh start
#   ./kafka-cluster.sh stop
#
# NOTE: adjust "ssh -p 2022" to your actual sshd port; if the default
# port 22 is used, drop "-p 2022" entirely.

# Kafka cluster node list (replace with your hostnames/IPs).
NODES=("yy1" "yy2" "yy3")
# Kafka installation path (same on every node).
KAFKA_HOME="/opt/kafka"
# Broker configuration file path.
CONFIG_FILE="${KAFKA_HOME}/config/server.properties"

# Argument check: exactly one of start|stop.
if [ $# -ne 1 ]; then
    echo "使用方法: $0 [start|stop]"
    exit 1
fi
COMMAND=$1

# Dispatch the command to every node.
case $COMMAND in
    start)
        echo "===== 开始批量启动Kafka集群 ====="
        for node in "${NODES[@]}"; do
            echo "启动${node}..."
            # -daemon detaches the broker so ssh returns immediately.
            ssh -p 2022 "$node" "${KAFKA_HOME}/bin/kafka-server-start.sh -daemon ${CONFIG_FILE}"
        done
        echo "===== Kafka集群启动命令已下发 ====="
        ;;
    stop)
        echo "===== 开始批量停止Kafka集群 ====="
        for node in "${NODES[@]}"; do
            echo "停止${node}..."
            ssh -p 2022 "$node" "${KAFKA_HOME}/bin/kafka-server-stop.sh"
        done
        echo "===== Kafka集群停止命令已下发 ====="
        ;;
    *)
        # FIX: the original was missing ';;' and 'esac' here (and the usage
        # commands had leaked into the script body), making it a syntax error.
        echo "无效参数!仅支持: start|stop"
        exit 1
        ;;
esac
提示: 如果Linux服务器的默认端口22已修改,把脚本中-p 2022 换成指定的,如果使用默认的端口22,
把 -p 2022 去掉即可