Spark Streaming consuming Kafka: duplicate data consumption when the job is restarted

Spark Streaming provides at-least-once semantics. In testing, even when the job is stopped with `kill -15 <SparkSubmit PID>`, data can still be consumed twice after a restart, because the Kafka offsets may not have been committed before the shutdown.

Below are two ways to avoid the duplicate consumption:

1. Rely on the external components Nacos and Redis for idempotence

Nacos exposes a configuration-listener callback that pushes the latest configuration whenever it changes. Combined with a Spark broadcast variable, the driver can forward the new configuration to the executors: before a planned restart, switch on the Redis deduplication logic to prevent duplicate consumption; after the restart, switch it off again to save resources. The results are written into a MySQL table with INSERT ... ON DUPLICATE KEY UPDATE, so a replayed record would be counted twice unless it is deduplicated first:

CREATE TABLE `kafka_consumer_offset_test` (
  `company_id` varchar(36) DEFAULT NULL,
  `result_date` date DEFAULT NULL,
  `group_id` varchar(36) DEFAULT NULL,
  `s_id` varchar(36) DEFAULT NULL,
  `source` int DEFAULT NULL,
  `channel_flag` varchar(100) DEFAULT NULL,
  `total_times` int DEFAULT '0',
  `total_score` int DEFAULT '0',
  UNIQUE KEY `PRI_KEY` (`company_id`,`result_date`,`group_id`,`s_id`,`source`,`channel_flag`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

Scala implementation (MysqlToolV2, YamlTool and NaCosConstant are project-internal helpers):
object SparkStreamingTest {
  @volatile private var instance: Broadcast[String] = _


  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.eventLog.enabled", "false") //关闭写历史状态
      .set("spark.streaming.stopGracefullyOnShutdown", "true")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryoserializer.buffer", "20240")
      .set("spark.streaming.kafka.maxRatePerPartition", "10")
      .setMaster("local[2]")
      .setAppName("myTest")


    val sc = new SparkContext(conf)
    sc.setLogLevel(Level.WARN.toString)
    val ssc = new StreamingContext(sc, Seconds(5))
    val topic = "evaluation"

    val kafkaBootstrapServers = "x.x.x.x:9092"
    // Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaBootstrapServers,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.GROUP_ID_CONFIG -> "1test",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", 
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )

    // Lazily create the broadcast variable once (double-checked locking)
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = sc.broadcast("0")
        }
      }
    }

    // Nacos listener: whenever the remote config changes, re-broadcast the new value
    YamlTool.configService.addListener(NaCosConstant.NACOS_DATA_ID, NaCosConstant.GROUP_BIG_DATA, new Listener {
      override def getExecutor: Executor = null

      override def receiveConfigInfo(s: String): Unit = {
        try {
          val inputStream = IOUtils.toInputStream(s, Charset.forName("UTF-8"))
          val yaml = new Yaml
          val value: util.HashMap[String, Object] = yaml.load[java.util.HashMap[String, Object]](inputStream)
          inputStream.close()
          val nObject: JSONObject = new JSONObject(value)
          val cache_topic_info: AnyRef = JSONPath.eval(nObject.toString(), "$.spark.job.cache_topic_info")
          println("cache_topic_info:" + cache_topic_info)

          // Drop the old broadcast and publish the new value; the next micro-batch picks it up
          instance.unpersist()
          instance = sc.broadcast(cache_topic_info.toString)

        } catch {
          case e: Exception => e.printStackTrace()
        }
      }
    })

    val topicArray = StringUtils.split(topic, ",")
    val kafkaDS = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topicArray, kafkaParams))
    kafkaDS.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        // Capture the current broadcast handle on the driver so the closure ships it to the executors;
        // reading the object field directly inside foreachPartition only works in local mode
        val cacheConfig = instance

        try {
          rdd.foreachPartition(iterator => {
            val cacheValue = cacheConfig.value
            var connection: Connection = null

            try {
              connection = MysqlToolV2.getConnection()

              iterator.foreach(consumer => {
                val topic = consumer.topic()
                val partition = consumer.partition()
                val offset = consumer.offset()
                val data: String = consumer.value()

                val redisKey = s"${topic}_${partition}_${offset}"

                println(redisKey)
                println(s"实时配置变化 cacheValue:$cacheValue- \n" + data)
                if (cacheValue.equals("1")) {
                  println(s"启动redis判断是否重复消费 存储redisKey:${redisKey}")
                  insertMysql(connection, data)
                } else {
                  println("关闭redis判断是否重复消费!")
                  insertMysql(connection, data)
                }
              })

            } catch {
              case e: Exception => e.printStackTrace()
            } finally {
              MysqlToolV2.closeConnection(connection)
            }


          })


          kafkaDS.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
        } catch {
          case e: Exception => e.printStackTrace()
        }

      }

    }


    ssc.start()
    try {
      ssc.awaitTermination()
    } finally {
      println("Spark Streaming应用已终止!")
    }

  }


  def insertMysql(connection: Connection, data: String) = {
    try {
      val nObject: JSONObject = JSON.parseObject(data)
      val companyId = nObject.getString("companyId")
      val datetime = nObject.getLongValue("datetime")
      val resultDay = DateFormatUtils.format(datetime, "yyyy-MM-dd")
      val groupId = nObject.getString("groupId")
      val sId = nObject.getString("sId")
      val source = nObject.getIntValue("source")
      val channelFlag = nObject.getString("channelFlag")
      val totalTimes = 1
      val score = nObject.getIntValue("score")

      // Upsert: the composite unique key folds repeated keys into an accumulation,
      // so a replayed record is double-counted unless it is deduplicated upstream
      val sql =
        """
          |insert into kafka_consumer_offset_test(`company_id`,`result_date`,`group_id`,`s_id`,`source`,`channel_flag`,`total_times`,`total_score`)
          |values (?, ?, ?, ?, ?, ?, ?, ?)
          |on duplicate key update total_times=total_times+?, total_score=total_score+?""".stripMargin

      val statement = connection.prepareStatement(sql)
      statement.setString(1, companyId)
      statement.setString(2, resultDay)
      statement.setString(3, groupId)
      statement.setString(4, sId)
      statement.setInt(5, source)
      statement.setString(6, channelFlag)
      statement.setInt(7, totalTimes)
      statement.setInt(8, score)
      statement.setInt(9, totalTimes)
      statement.setInt(10, score)

      statement.executeUpdate()
      statement.close()

    } catch {
      case e: Exception => e.printStackTrace()
    }
  }

}
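The listing above only builds and logs redisKey; the actual Redis lookup is left out. Below is a minimal sketch of that check, assuming a Jedis client is available on the classpath (the RedisDedup object and the one-day TTL are illustrative, not part of the original code):

import redis.clients.jedis.Jedis
import redis.clients.jedis.params.SetParams

object RedisDedup {
  // Returns true if this topic_partition_offset was already processed.
  // SET ... NX EX writes the key only if it does not exist yet, so the first
  // run "claims" the offset and a replay after a restart is detected as a duplicate.
  def isDuplicate(jedis: Jedis, redisKey: String, ttlSeconds: Int = 24 * 3600): Boolean = {
    val reply = jedis.set(redisKey, "1", SetParams.setParams().nx().ex(ttlSeconds))
    reply == null || !"OK".equals(reply)
  }
}

Inside iterator.foreach, when cacheValue equals "1", the write would then become if (!RedisDedup.isDuplicate(jedis, redisKey)) insertMysql(connection, data), with the jedis instance taken from a connection pool per partition and closed in the finally block.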

2. Register a shutdown hook so that no further batches are processed while the job is stopping

Scala implementation:
object SparkConsumerTest {
  // flag marking whether the application is shutting down
  private val isShuttingDown = new AtomicBoolean(false)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.eventLog.enabled", "false") //关闭写历史状态
      .set("spark.streaming.stopGracefullyOnShutdown", "true")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryoserializer.buffer", "20240")
      .set("spark.streaming.kafka.maxRatePerPartition", "10")
      .setMaster("local[2]")
      .setAppName("myTest")


    val sc = new SparkContext(conf)
    sc.setLogLevel(Level.WARN.toString)
    val ssc = new StreamingContext(sc, Seconds(5))
    // register the JVM shutdown hook
    registerShutdownHook(ssc)
    val topic = "evaluation"

    val kafkaBootstrapServers = "x.x.x.x:9092"
    // Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaBootstrapServers,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.GROUP_ID_CONFIG -> "test",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", // earliest/latest
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )




    val topicArray = StringUtils.split(topic, ",")
    val kafkaDS: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topicArray, kafkaParams))

    kafkaDS.foreachRDD { rdd =>
      if (!rdd.isEmpty() && !isShuttingDown.get()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

        try {
          rdd.foreachPartition(iterator => {
            var connection: Connection = null

            try {
              connection = MysqlToolV2.getConnection()

              iterator.foreach(consumer => {
                val data: String = consumer.value()
                insertMysql(connection, data)
              })

            } catch {
              case e: Exception => e.printStackTrace()
            } finally {
              MysqlToolV2.closeConnection(connection)
            }

          })


          kafkaDS.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
        } catch {
          case e: Exception => e.printStackTrace()
        }

      } else if (isShuttingDown.get()) {
        println("应用正在关闭,跳过当前批次处理")
      }

    }


    ssc.start()
    try {
      ssc.awaitTermination()
    } finally {
      println("Spark Streaming应用已终止!")
    }
  }



  def insertMysql(connection: Connection, data: String) = {
    try {
      val nObject: JSONObject = JSON.parseObject(data)
      val companyId = nObject.getString("companyId")
      val datetime = nObject.getLongValue("datetime")
      val resultDay = DateFormatUtils.format(datetime, "yyyy-MM-dd")
      val groupId = nObject.getString("groupId")
      val sId = nObject.getString("sId")
      val source = nObject.getIntValue("source")
      val channelFlag = nObject.getString("channelFlag")
      val totalTimes = 1
      val score = nObject.getIntValue("score")

      val sql =
        """
          |insert into kafka_consumer_offset_test(`company_id`,`result_date`,`group_id`,`s_id`,`source`,`channel_flag`,`total_times`,`total_score`)
          |values (?, ?, ?, ?, ?, ?, ?, ?)
          |on duplicate key update total_times=total_times+?, total_score=total_score+?""".stripMargin

      val statement = connection.prepareStatement(sql)
      statement.setString(1, companyId)
      statement.setString(2, resultDay)
      statement.setString(3, groupId)
      statement.setString(4, sId)
      statement.setInt(5, source)
      statement.setString(6, channelFlag)
      statement.setInt(7, totalTimes)
      statement.setInt(8, score)
      statement.setInt(9, totalTimes)
      statement.setInt(10, score)

      statement.executeUpdate()
      statement.close()

    } catch {
      case e: Exception => e.printStackTrace()
    }
  }

  /**
   * Register a JVM shutdown hook that marks the application as shutting down
   */
  private def registerShutdownHook(ssc: StreamingContext): Unit = {
    Runtime.getRuntime.addShutdownHook(new Thread(() => {
      println("Shutdown signal received, starting graceful shutdown...")

      if (isShuttingDown.compareAndSet(false, true)) {
        println("Shutdown flag set to true")

        try {
          // Give the current batch a short window to finish; the graceful stop of `ssc`
          // itself is driven by spark.streaming.stopGracefullyOnShutdown=true
          Thread.sleep(5000)
        } catch {
          case e: Exception =>
            println(s"Error during shutdown: ${e.getMessage}")
        }
      } else {
        println("Shutdown flag already set, nothing to do")
      }
    }))
  }
}
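As a variant, the hook could stop the StreamingContext explicitly instead of only setting the flag. A rough sketch, assumed to live inside SparkConsumerTest next to isShuttingDown; note that with spark.streaming.stopGracefullyOnShutdown=true Spark registers its own stop hook, so only one of the two mechanisms should be used:

  // Alternative: stop the context from the hook and wait for in-flight batches to finish
  private def registerExplicitStopHook(ssc: StreamingContext): Unit = {
    Runtime.getRuntime.addShutdownHook(new Thread(() => {
      if (isShuttingDown.compareAndSet(false, true)) {
        // Graceful stop: finish processing the received data, then release the SparkContext
        ssc.stop(stopSparkContext = true, stopGracefully = true)
      }
    }))
  }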