flink自定义process,使用状态求历史总和(scala)

es idea maven 依赖

<dependency>

<groupId>org.apache.flink</groupId>

<artifactId>flink-connector-elasticsearch7_2.11</artifactId>

<version>1.11.1</version>

</dependency>

import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
import org.apache.flink.streaming.connectors.elasticsearch7.{ElasticsearchSink, RestClientFactory}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.flink.util.Collector
import org.apache.http.HttpHost
import org.apache.http.client.config.RequestConfig
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.elasticsearch.action.DocWriteRequest
import org.elasticsearch.client.{Requests, RestClientBuilder}

import java.time.Duration
import java.util.Properties

object Test {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //需要状态开启下面的配置
    //env.setStateBackend(new RocksDBStateBackend(s"hdfs://${namenodeID}", true))//hdfs 作为状态后端
    //env.enableCheckpointing(10 * 60 * 1000L)
    //env.getCheckpointConfig.setCheckpointTimeout(10 * 60 * 1000L)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) //事件时间
    val props = new Properties
    props.setProperty("bootstrap.servers", "host:6667") //有些是9092端口
    props.setProperty("group.id", "groupId")
    props.setProperty("retries", "10")
    props.setProperty("retries.backoff.ms", "100")
    props.put(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG, "60000")
    //是否配置了权限,有的话加上下面的配置
    // props.setProperty("sasl.jaas.config","org.apache.kafka.common.security.plain.PlainLoginModule required username='' password='';")
    //props.setProperty("security.protocol", "SASL_PLAINTEXT");
    // props.setProperty("sasl.mechanism", "PLAIN")
    val httpHosts = new java.util.ArrayList[HttpHost]
    httpHosts.add(new HttpHost("esIPOne", 9200, "http"))
    httpHosts.add(new HttpHost("esIPTwo", 9200, "http"))
    httpHosts.add(new HttpHost("esIPThree", 9200, "http"))

    val esSinkBuilder = new ElasticsearchSink.Builder[ResultBean](httpHosts, new ElasticsearchSinkFunction[ResultBean] {
      def process(element: ResultBean, ctx: RuntimeContext, indexer: RequestIndexer) {
        val json = new java.util.HashMap[String, Any]
        json.put("@timestamp", element.ts)
        json.put("data", element.data)
        json.put("sum", element.sum)
        val rqst = Requests.indexRequest()
          .index("indexName")
          .id(element.id)
          .source(json)
          .opType(DocWriteRequest.OpType.INDEX)

        indexer.add(rqst)
      }
    })
    setESConf(esSinkBuilder, 5000)
    val myConsumer = new FlinkKafkaConsumer[DemoBean]("topicName", new DemoKafka(), props)
      .setStartFromEarliest() //从什么时间开始读

    val source = env
      .addSource(myConsumer)
      .uid("source-data")
      .name("数据源")
      .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness[DemoBean](Duration.ofSeconds(1))
        .withTimestampAssigner(new SerializableTimestampAssigner[DemoBean] {
          override def extractTimestamp(element: DemoBean, recordTimestamp: Long): Long = element.ts
        }).withIdleness(Duration.ofSeconds(5)))
      .uid("water-marks")
      .name("注册水位线")

    source
      .keyBy(k => k.id)
      .process(new DemoProcess())
      .uid("demo-process")
      .name("process 示例")
      .addSink(esSinkBuilder.build())
      .uid("es-sink")
      .name("数据写入es")

    env.execute("任务名")
  }

  private class DemoKafka() extends KafkaDeserializationSchema[DemoBean] {
    override def isEndOfStream(t: DemoBean): Boolean = false

    override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): DemoBean = {
      val value = new String(consumerRecord.value())
      val list = value.split("\t")
      DemoBean(list(0), list(1), list(2).toInt, list(3).toLong)
    }

    override def getProducedType: TypeInformation[DemoBean] = TypeExtractor.getForClass(classOf[DemoBean])
  }

  private class DemoProcess extends KeyedProcessFunction[String, DemoBean, ResultBean] {
    private var hisSumState: ValueState[Int] = _

    override def open(parameters: Configuration): Unit = {
      hisSumState = getRuntimeContext.getState(new ValueStateDescriptor("his-sum", classOf[Int]))
    }

    override def processElement(data: DemoBean, ctx: KeyedProcessFunction[String, DemoBean, ResultBean]#Context, out: Collector[ResultBean]): Unit = {
      val his = if (hisSumState.value() == null) 0 else hisSumState.value()
      val now = data.value
      hisSumState.update(now)
      out.collect(ResultBean(data.id, data.data, his + now, data.value))
    }
  }

  def setESConf[T](esSinkBuilder: ElasticsearchSink.Builder[T], numMaxActions: Int) {
    esSinkBuilder.setBulkFlushMaxActions(numMaxActions)
    esSinkBuilder.setBulkFlushMaxSizeMb(10)
    esSinkBuilder.setBulkFlushInterval(10000)
    esSinkBuilder.setBulkFlushBackoff(true)
    esSinkBuilder.setBulkFlushBackoffDelay(2)
    esSinkBuilder.setBulkFlushBackoffRetries(3)
    esSinkBuilder.setRestClientFactory(new RestClientFactory {
      override def configureRestClientBuilder(restClientBuilder: RestClientBuilder): Unit = {
        restClientBuilder.setRequestConfigCallback(new RestClientBuilder.RequestConfigCallback() {
          override def customizeRequestConfig(requestConfigBuilder: RequestConfig.Builder): RequestConfig.Builder = {
            requestConfigBuilder.setConnectTimeout(12000)
            requestConfigBuilder.setSocketTimeout(90000)
          }
        })
      }
    })
  }

  private case class DemoBean(id: String, data: String, value: Int, ts: Long)

  private case class ResultBean(id: String, data: String, sum: Int, ts: Long)
}
相关推荐
电子手信2 分钟前
知识中台在多语言客户中的应用
大数据·人工智能·自然语言处理·数据挖掘·知识图谱
天冬忘忧21 分钟前
Kafka 数据倾斜:原因、影响与解决方案
分布式·kafka
隔着天花板看星星25 分钟前
Kafka-Consumer理论知识
大数据·分布式·中间件·kafka
holywangle27 分钟前
解决Flink读取kafka主题数据无报错无数据打印的重大发现(问题已解决)
大数据·flink·kafka
隔着天花板看星星28 分钟前
Kafka-副本分配策略
大数据·分布式·中间件·kafka
Lorin 洛林1 小时前
Hadoop 系列 MapReduce:Map、Shuffle、Reduce
大数据·hadoop·mapreduce
DolphinScheduler社区1 小时前
大数据调度组件之Apache DolphinScheduler
大数据
SelectDB技术团队1 小时前
兼顾高性能与低成本,浅析 Apache Doris 异步物化视图原理及典型场景
大数据·数据库·数据仓库·数据分析·doris
我一直在流浪1 小时前
Kafka - 消费者程序仅消费一半分区消息的问题
分布式·kafka
panpantt3212 小时前
【参会邀请】第二届大数据与数据挖掘国际会议(BDDM 2024)邀您相聚江城!
大数据·人工智能·数据挖掘