Flink 自定义 KeyedProcessFunction:使用状态(ValueState)按 key 求历史总和(Scala)

Elasticsearch 连接器的 Maven 依赖(添加到 IDEA 工程的 pom.xml;下面的代码还用到 Kafka 连接器,需另外引入 flink-connector-kafka_2.11)

<dependency>

<groupId>org.apache.flink</groupId>

<artifactId>flink-connector-elasticsearch7_2.11</artifactId>

<version>1.11.1</version>

</dependency>

import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
import org.apache.flink.streaming.connectors.elasticsearch7.{ElasticsearchSink, RestClientFactory}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.flink.util.Collector
import org.apache.http.HttpHost
import org.apache.http.client.config.RequestConfig
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.elasticsearch.action.DocWriteRequest
import org.elasticsearch.client.{Requests, RestClientBuilder}

import java.time.Duration
import java.util.Properties

object Test {
  /**
   * Entry point: builds and runs the streaming job.
   *
   * Pipeline: Kafka topic -> event-time watermarks -> keyBy(id) -> DemoProcess -> Elasticsearch.
   * Host names, topic, group id and index name look like placeholders that must be replaced
   * for a real deployment.
   *
   * @param args command-line arguments (not read)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Enable the settings below when state has to be checkpointed
    //env.setStateBackend(new RocksDBStateBackend(s"hdfs://${namenodeID}", true)) // HDFS as the state backend
    //env.enableCheckpointing(10 * 60 * 1000L)
    //env.getCheckpointConfig.setCheckpointTimeout(10 * 60 * 1000L)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) // event time

    val props = new Properties
    props.setProperty("bootstrap.servers", "host:6667") // some clusters use port 9092
    props.setProperty("group.id", "groupId")
    // The consumer's backoff key is "retry.backoff.ms". The previous "retries.backoff.ms" is not a
    // Kafka setting, and "retries" is a producer-only setting, so neither had any effect here.
    props.setProperty(ConsumerConfig.RETRY_BACKOFF_MS_CONFIG, "100")
    props.setProperty(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG, "60000")
    // If the cluster requires authentication, add the settings below
    // props.setProperty("sasl.jaas.config","org.apache.kafka.common.security.plain.PlainLoginModule required username='' password='';")
    //props.setProperty("security.protocol", "SASL_PLAINTEXT");
    // props.setProperty("sasl.mechanism", "PLAIN")

    val httpHosts = new java.util.ArrayList[HttpHost]
    httpHosts.add(new HttpHost("esIPOne", 9200, "http"))
    httpHosts.add(new HttpHost("esIPTwo", 9200, "http"))
    httpHosts.add(new HttpHost("esIPThree", 9200, "http"))

    val esSinkBuilder = new ElasticsearchSink.Builder[ResultBean](httpHosts, new ElasticsearchSinkFunction[ResultBean] {
      // Turns one result into an index request; the document id is the record id, so a later
      // result for the same id replaces the earlier document.
      override def process(element: ResultBean, ctx: RuntimeContext, indexer: RequestIndexer): Unit = {
        val json = new java.util.HashMap[String, Any]
        json.put("@timestamp", element.ts)
        json.put("data", element.data)
        json.put("sum", element.sum)
        val rqst = Requests.indexRequest()
          .index("indexName")
          .id(element.id)
          .source(json)
          .opType(DocWriteRequest.OpType.INDEX)

        indexer.add(rqst)
      }
    })
    setESConf(esSinkBuilder, 5000)

    val myConsumer = new FlinkKafkaConsumer[DemoBean]("topicName", new DemoKafka(), props)
      .setStartFromEarliest() // where to start reading from

    val source = env
      .addSource(myConsumer)
      .uid("source-data")
      .name("数据源")
      // Event time comes from the record's own ts field; tolerate 1 s of out-of-orderness and
      // mark a source idle after 5 s without data.
      .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness[DemoBean](Duration.ofSeconds(1))
        .withTimestampAssigner(new SerializableTimestampAssigner[DemoBean] {
          override def extractTimestamp(element: DemoBean, recordTimestamp: Long): Long = element.ts
        }).withIdleness(Duration.ofSeconds(5)))
      .uid("water-marks")
      .name("注册水位线")

    source
      .keyBy(k => k.id)
      .process(new DemoProcess())
      .uid("demo-process")
      .name("process 示例")
      .addSink(esSinkBuilder.build())
      .uid("es-sink")
      .name("数据写入es")

    env.execute("任务名")
  }

  /**
   * Kafka deserialization schema that parses a tab-separated record value into a DemoBean.
   *
   * Expected value layout: `id <TAB> data <TAB> value (Int) <TAB> ts (Long)`.
   */
  private class DemoKafka() extends KafkaDeserializationSchema[DemoBean] {
    /** No element marks the end of the stream, so consumption never stops on its own. */
    override def isEndOfStream(t: DemoBean): Boolean = false

    /**
     * Parses one Kafka record; only the record value is read.
     *
     * @param consumerRecord raw record as delivered by the consumer
     * @return the parsed bean, or `null` when the record carries no value (e.g. a tombstone);
     *         per the KafkaDeserializationSchema contract a null result is not emitted
     * @throws ArrayIndexOutOfBoundsException if the value has fewer than four fields
     * @throws NumberFormatException          if the third or fourth field is not numeric
     */
    override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): DemoBean = {
      val bytes = consumerRecord.value()
      if (bytes == null) {
        // null is the Java-side "skip this record" signal; it avoids a NullPointerException here
        null
      } else {
        // Decode explicitly as UTF-8 so the result does not depend on the JVM's default charset
        val fields = new String(bytes, java.nio.charset.StandardCharsets.UTF_8).split("\t")
        DemoBean(fields(0), fields(1), fields(2).toInt, fields(3).toLong)
      }
    }

    /** Type information Flink uses to serialize DemoBean elements produced by this schema. */
    override def getProducedType: TypeInformation[DemoBean] = TypeExtractor.getForClass(classOf[DemoBean])
  }

  /**
   * Keyed process function that emits, for every input record, the running total of `value`
   * over all records seen so far for that record's key.
   */
  private class DemoProcess extends KeyedProcessFunction[String, DemoBean, ResultBean] {
    // Per-key running total. Held as a boxed Integer so that "nothing stored yet" (null) can be
    // told apart from a stored 0 without comparing a primitive Int to null.
    private var hisSumState: ValueState[java.lang.Integer] = _

    /** Registers the keyed state handle; the state name "his-sum" is unchanged. */
    override def open(parameters: Configuration): Unit = {
      hisSumState = getRuntimeContext.getState(
        new ValueStateDescriptor[java.lang.Integer]("his-sum", classOf[java.lang.Integer]))
    }

    /**
     * Adds the record's value to the key's stored total, persists the new total and emits it.
     *
     * @param data incoming record
     * @param ctx  process-function context (not used)
     * @param out  collector receiving one ResultBean per input record
     */
    override def processElement(data: DemoBean, ctx: KeyedProcessFunction[String, DemoBean, ResultBean]#Context, out: Collector[ResultBean]): Unit = {
      // value() returns null for a key that has no state yet
      val previousTotal = Option(hisSumState.value()).fold(0)(_.intValue)
      val total = previousTotal + data.value
      // Store the accumulated total (not just the latest value) so the sum spans the whole history
      hisSumState.update(Int.box(total))
      // Carry the event timestamp through; the last field of ResultBean is ts, not the value
      out.collect(ResultBean(data.id, data.data, total, data.ts))
    }
  }

  /**
   * Configures bulk flushing, retry backoff and REST client timeouts on an Elasticsearch sink builder.
   *
   * The builder is modified in place.
   *
   * @param esSinkBuilder builder to configure
   * @param numMaxActions maximum number of buffered actions before a bulk flush
   * @tparam T element type of the sink
   */
  def setESConf[T](esSinkBuilder: ElasticsearchSink.Builder[T], numMaxActions: Int): Unit = {
    // Factory that applies connect/socket timeouts to the REST client. The callback is created
    // inside the factory method so the factory itself holds no reference to it.
    val timeoutFactory = new RestClientFactory {
      override def configureRestClientBuilder(restClientBuilder: RestClientBuilder): Unit = {
        restClientBuilder.setRequestConfigCallback(new RestClientBuilder.RequestConfigCallback() {
          override def customizeRequestConfig(requestConfigBuilder: RequestConfig.Builder): RequestConfig.Builder =
            requestConfigBuilder
              .setConnectTimeout(12000)
              .setSocketTimeout(90000)
        })
      }
    }

    // Flush thresholds: action count, buffered size in MB, and interval (10000)
    esSinkBuilder.setBulkFlushMaxActions(numMaxActions)
    esSinkBuilder.setBulkFlushMaxSizeMb(10)
    esSinkBuilder.setBulkFlushInterval(10000)

    // Retry failed bulk requests with backoff: delay 2, at most 3 retries
    esSinkBuilder.setBulkFlushBackoff(true)
    esSinkBuilder.setBulkFlushBackoffDelay(2)
    esSinkBuilder.setBulkFlushBackoffRetries(3)

    esSinkBuilder.setRestClientFactory(timeoutFactory)
  }

  /**
   * Input record parsed from a tab-separated Kafka message.
   *
   * @param id    first field; the stream is keyed by it
   * @param data  second field; copied into the result
   * @param value third field, parsed as Int
   * @param ts    fourth field, parsed as Long; used as the event-time timestamp
   *              (presumably epoch milliseconds -- TODO confirm against the producer)
   */
  private case class DemoBean(id: String, data: String, value: Int, ts: Long)

  /**
   * Output record written to Elasticsearch.
   *
   * @param id   used as the Elasticsearch document id
   * @param data written to the document's "data" field
   * @param sum  written to the document's "sum" field
   * @param ts   written to the document's "@timestamp" field
   */
  private case class ResultBean(id: String, data: String, sum: Int, ts: Long)
}
相关推荐
WTT00112 小时前
2024楚慧杯WP
大数据·运维·网络·安全·web安全·ctf
云云3216 小时前
怎么通过亚矩阵云手机实现营销?
大数据·服务器·安全·智能手机·矩阵
新加坡内哥谈技术6 小时前
苏黎世联邦理工学院与加州大学伯克利分校推出MaxInfoRL:平衡内在与外在探索的全新强化学习框架
大数据·人工智能·语言模型
Data-Miner7 小时前
经典案例PPT | 大型水果连锁集团新零售数字化建设方案
大数据·big data
lovelin+v175030409667 小时前
安全性升级:API接口在零信任架构下的安全防护策略
大数据·数据库·人工智能·爬虫·数据分析
道一云黑板报8 小时前
Flink集群批作业实践:七析BI批作业执行
大数据·分布式·数据分析·flink·kubernetes
qq_5470261798 小时前
Kafka 常见问题
kafka
darkdragonking8 小时前
OpenEuler 22.03 安装 flink-1.17.2 集群
flink·openeuler
core5128 小时前
flink sink kafka
flink·kafka·sink