This Spark Streaming setup gives at-least-once semantics: in testing, killing the job with `kill -15 <SparkSubmit PID>` while data is being processed can still lead to duplicate consumption, because the batch may finish before its offsets are committed.
Two ways to avoid duplicate consumption are shown below:
1. Rely on the external components Nacos and Redis for idempotency
Nacos lets you register a listener on a configuration and pushes the latest value whenever it changes. Combined with a Spark broadcast variable, the updated configuration is rebroadcast to the executors: before a planned restart the flag switches the Redis deduplication logic on to prevent duplicate consumption, and after the restart it switches the logic off again to save resources. The results land in the MySQL table below, whose composite unique key and `ON DUPLICATE KEY UPDATE` clause accumulate the counters:
```sql
CREATE TABLE `kafka_consumer_offset_test` (
`company_id` varchar(36) DEFAULT NULL,
`result_date` date DEFAULT NULL,
`group_id` varchar(36) DEFAULT NULL,
`s_id` varchar(36) DEFAULT NULL ,
`source` int DEFAULT NULL ,
`channel_flag` varchar(100) ,
`total_times` int DEFAULT '0' ,
`total_score` int DEFAULT '0' ,
UNIQUE KEY `PRI_KEY` (`company_id`,`result_date`,`group_id`,`s_id`,`source`,`channel_flag`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
```
```scala
// Assumed imports for this snippet; YamlTool, NaCosConstant and MysqlToolV2 are project-specific helpers that are not shown.
import java.nio.charset.Charset
import java.sql.Connection
import java.util
import java.util.concurrent.Executor

import com.alibaba.fastjson.{JSON, JSONObject, JSONPath}
import com.alibaba.nacos.api.config.listener.Listener
import org.apache.commons.io.IOUtils
import org.apache.commons.lang3.StringUtils
import org.apache.commons.lang3.time.DateFormatUtils
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Level
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.yaml.snakeyaml.Yaml

object SparkStreamingTest {
@volatile private var instance: Broadcast[String] = _
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.set("spark.eventLog.enabled", "false") //关闭写历史状态
.set("spark.streaming.stopGracefullyOnShutdown", "true")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.kryoserializer.buffer", "20240")
.set("spark.streaming.kafka.maxRatePerPartition", "10")
.setMaster("local[2]")
.setAppName("myTest")
val sc = new SparkContext(conf)
sc.setLogLevel(Level.WARN.toString)
val ssc = new StreamingContext(sc, Seconds(5))
val topic = "evaluation"
val kafkaBootstrapServers = "x.x.x.x:9092"
// prepare the Kafka parameters
val kafkaParams = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaBootstrapServers,
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
ConsumerConfig.GROUP_ID_CONFIG -> "1test",
ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest",
ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
)
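// ENABLE_AUTO_COMMIT is false, so Kafka offsets only advance through the explicit commitAsync below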
if (instance == null) {
synchronized {
if (instance == null) {
instance = sc.broadcast("0")
}
}
}
YamlTool.configService.addListener(NaCosConstant.NACOS_DATA_ID, NaCosConstant.GROUP_BIG_DATA, new Listener {
override def getExecutor: Executor = null
override def receiveConfigInfo(s: String): Unit = {
try {
val inputStream = IOUtils.toInputStream(s, Charset.forName("UTF-8"))
val yaml = new Yaml
val value: util.HashMap[String, Object] = yaml.load[java.util.HashMap[String, Object]](inputStream)
inputStream.close()
val nObject: JSONObject = new JSONObject(value)
val cache_topic_info: AnyRef = JSONPath.eval(nObject, "$.spark.job.cache_topic_info")
println("cache_topic_info:" + cache_topic_info)
instance.unpersist()
instance = sc.broadcast(cache_topic_info.toString)
} catch {
case e: Exception => e.printStackTrace()
}
}
})
val topicArray = StringUtils.split(topic, ",")
val kafkaDS = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topicArray, kafkaParams))
kafkaDS.foreachRDD { rdd =>
if (!rdd.isEmpty()) {
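// capture this batch's offset ranges up front; they are committed only after processing succeeds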
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
try {
rdd.foreachPartition(iterator => {
val cacheValue = instance.value
var connection: Connection = null
try {
connection = MysqlToolV2.getConnection()
iterator.foreach(consumer => {
val topic = consumer.topic()
val partition = consumer.partition()
val offset = consumer.offset()
val data: String = consumer.value()
val redisKey = s"${topic}_${partition}_${offset}"
println(redisKey)
println(s"实时配置变化 cacheValue:$cacheValue- \n" + data)
if (cacheValue.equals("1")) {
println(s"启动redis判断是否重复消费 存储redisKey:${redisKey}")
insertMysql(connection, data)
} else {
println("关闭redis判断是否重复消费!")
insertMysql(connection, data)
}
})
} catch {
case e: Exception => e.printStackTrace()
} finally {
MysqlToolV2.closeConnection(connection)
}
})
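// commit the offsets only after every partition of this batch has been written to MySQL (at-least-once)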
kafkaDS.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
} catch {
case e: Exception => e.printStackTrace()
}
}
}
ssc.start()
try {
ssc.awaitTermination()
} finally {
println("Spark Streaming应用已终止!")
}
}
def insertMysql(connection: Connection, data: String) = {
try {
val nObject: JSONObject = JSON.parseObject(data)
val companyId = nObject.getString("companyId")
val datetime = nObject.getLongValue("datetime")
val resultDay = DateFormatUtils.format(datetime, "yyyy-MM-dd")
val groupId = nObject.getString("groupId")
val s_Id = nObject.getString("sId")
val source = nObject.getIntValue("source")
val channelFlag = nObject.getString("channelFlag")
val totalTimes = 1
val score = nObject.getIntValue("score")
val sql =
"""
|insert into kafka_consumer_offset_test(`company_id`,`result_date`,`group_id`,`s_id`,`source`,`channel_flag`,`total_times`,`total_score`) values ( ?, ?, ?, ?, ?, ?, ?, ?) on duplicate key update total_times=total_times+?,total_score=total_score+?""".stripMargin
val statement = connection.prepareStatement(sql)
statement.setString(1, companyId)
statement.setString(2, resultDay)
statement.setString(3, groupId)
statement.setString(4, s_Id)
statement.setInt(5, source)
statement.setString(6, channelFlag)
statement.setInt(7, totalTimes)
statement.setInt(8, score)
statement.setInt(9, totalTimes)
statement.setInt(10, score)
statement.executeUpdate()
statement.close()
} catch {
case e: Exception => e.printStackTrace()
}
}
}
```
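The listener above expects the Nacos YAML to contain a `spark.job.cache_topic_info` entry; setting it to "1" before a planned restart enables the deduplication branch. The listing itself only logs the flag, though; the actual Redis check is left out. A minimal sketch of what that check could look like, assuming a Jedis client and the same `topic_partition_offset` key format (`redisKey`), with `RedisDedup` and `alreadyProcessed` being hypothetical names:

```scala
import redis.clients.jedis.Jedis
import redis.clients.jedis.params.SetParams

// Hypothetical helper: claim the redisKey with SET NX plus a TTL.
// A null reply means the key already existed, i.e. the record was processed before.
object RedisDedup {
  def alreadyProcessed(jedis: Jedis, redisKey: String, ttlSeconds: Int = 24 * 3600): Boolean = {
    val reply = jedis.set(redisKey, "1", SetParams.setParams().nx().ex(ttlSeconds))
    reply == null
  }
}
```

In the `cacheValue == "1"` branch the record would then be skipped when `alreadyProcessed` returns true, and the keys expire on their own after the TTL so Redis does not grow without bound.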
2. Add a shutdown hook so that the executors stop consuming data while the application is shutting down
```scala
// Assumed imports for this snippet; MysqlToolV2 is a project-specific helper that is not shown.
import java.sql.Connection
import java.util.concurrent.atomic.AtomicBoolean

import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.commons.lang3.StringUtils
import org.apache.commons.lang3.time.DateFormatUtils
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Level
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object SparkConsumerTest {
// flag indicating whether the application is shutting down
private val isShuttingDown = new AtomicBoolean(false)
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.set("spark.eventLog.enabled", "false") //关闭写历史状态
.set("spark.streaming.stopGracefullyOnShutdown", "true")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.kryoserializer.buffer", "20240")
.set("spark.streaming.kafka.maxRatePerPartition", "10")
.setMaster("local[2]")
.setAppName("myTest")
val sc = new SparkContext(conf)
sc.setLogLevel(Level.WARN.toString)
val ssc = new StreamingContext(sc, Seconds(5))
// register the JVM shutdown hook
registerShutdownHook(ssc)
val topic = "evaluation"
val kafkaBootstrapServers = "x.x.x.x:9092"
// prepare the Kafka parameters
val kafkaParams = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaBootstrapServers,
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
ConsumerConfig.GROUP_ID_CONFIG -> "test",
ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", // earliest/latest
ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
)
val topicArray = StringUtils.split(topic, ",")
val kafkaDS: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topicArray, kafkaParams))
kafkaDS.foreachRDD { rdd =>
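// once shutdown has started, new batches are skipped; their offsets stay uncommitted and are re-read after restart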
if (!rdd.isEmpty() && !isShuttingDown.get()) {
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
try {
rdd.foreachPartition(iterator => {
var connection: Connection = null
try {
connection = MysqlToolV2.getConnection()
iterator.foreach(consumer => {
val data: String = consumer.value()
insertMysql(connection, data)
})
} catch {
case e: Exception => e.printStackTrace()
} finally {
MysqlToolV2.closeConnection(connection)
}
})
kafkaDS.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
} catch {
case e: Exception => e.printStackTrace()
}
} else if (isShuttingDown.get()) {
println("应用正在关闭,跳过当前批次处理")
}
}
ssc.start()
try {
ssc.awaitTermination()
} finally {
println("Spark Streaming应用已终止!")
}
}
def insertMysql(connection: Connection, data: String) = {
try {
val nObject: JSONObject = JSON.parseObject(data)
val companyId = nObject.getString("companyId")
val datetime = nObject.getLongValue("datetime")
val resultDay = DateFormatUtils.format(datetime, "yyyy-MM-dd")
val groupId = nObject.getString("groupId")
val sId = nObject.getString("sId")
val source = nObject.getIntValue("source")
val channelFlag = nObject.getString("channelFlag")
val totalTimes = 1
val score = nObject.getIntValue("score")
val sql =
"""
|insert into kafka_consumer_offset_test(`company_id`,`result_date`,`group_id`,`s_id`,`source`,`channel_flag`,`total_times`,`total_score`) values ( ?, ?, ?, ?, ?, ?, ?, ?) on duplicate key update total_times=total_times+?,total_score=total_score+?""".stripMargin
val statement = connection.prepareStatement(sql)
statement.setString(1, companyId)
statement.setString(2, resultDay)
statement.setString(3, groupId)
statement.setString(4, sId)
statement.setInt(5, source)
statement.setString(6, channelFlag)
statement.setInt(7, totalTimes)
statement.setInt(8, score)
statement.setInt(9, totalTimes)
statement.setInt(10, score)
statement.executeUpdate()
statement.close()
} catch {
case e: Exception => e.printStackTrace()
}
}
/**
 * Register the JVM shutdown hook
 */
private def registerShutdownHook(ssc: StreamingContext): Unit = {
Runtime.getRuntime.addShutdownHook(new Thread(() => {
println("收到关闭信号,开始执行优雅关闭...")
if (isShuttingDown.compareAndSet(false, true)) {
println("设置关闭标志为true")
try {
// wait briefly so the in-flight batch can finish
Thread.sleep(5000)
} catch {
case e: Exception =>
  println("Error while shutting down: " + e.getMessage)
}
} else {
println("关闭标志已设置,无需重复操作")
}
}))
}
}
```
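Since `spark.streaming.stopGracefullyOnShutdown` is already set to `true`, Spark registers its own shutdown hook that stops the StreamingContext gracefully; the extra hook above only flips `isShuttingDown` so that batches arriving during shutdown are skipped. A `kill -15 <SparkSubmit PID>` (SIGTERM) triggers the hooks, while `kill -9` bypasses them entirely. When that flag is not set, a common variant is to stop the context explicitly from the hook; a minimal sketch, with `GracefulStop` and `registerGracefulStopHook` being hypothetical names:

```scala
import org.apache.spark.streaming.StreamingContext

// Sketch: explicit graceful stop from a shutdown hook
// (only needed when spark.streaming.stopGracefullyOnShutdown is not enabled).
object GracefulStop {
  def registerGracefulStopHook(ssc: StreamingContext): Unit = {
    Runtime.getRuntime.addShutdownHook(new Thread(() => {
      println("SIGTERM received, stopping StreamingContext gracefully...")
      // wait for in-flight batches to finish, then stop the SparkContext as well
      ssc.stop(stopSparkContext = true, stopGracefully = true)
    }))
  }
}
```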