flink自定义process,使用状态求历史总和(scala)

Elasticsearch 连接器的 Maven 依赖(添加到 IDEA 项目的 pom.xml 中;下面的代码还用到了 Kafka 连接器和 flink-streaming-scala,需另行引入对应依赖):

<dependency>

<groupId>org.apache.flink</groupId>

<artifactId>flink-connector-elasticsearch7_2.11</artifactId>

<version>1.11.1</version>

</dependency>

示例代码:
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
import org.apache.flink.streaming.connectors.elasticsearch7.{ElasticsearchSink, RestClientFactory}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.flink.util.Collector
import org.apache.http.HttpHost
import org.apache.http.client.config.RequestConfig
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.elasticsearch.action.DocWriteRequest
import org.elasticsearch.client.{Requests, RestClientBuilder}

import java.time.Duration
import java.util.Properties

/**
 * Example Flink job: reads tab-separated records from Kafka, keeps a per-key running
 * total of `value` in keyed state, and indexes each updated total into Elasticsearch.
 */
object Test {
  /**
   * Builds and runs the streaming pipeline:
   * Kafka source -> event-time watermarks -> keyBy(id) -> [[DemoProcess]] -> Elasticsearch sink.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Enable the settings below when the state has to be checkpointed.
    //env.setStateBackend(new RocksDBStateBackend(s"hdfs://${namenodeID}", true)) // HDFS as the state backend
    //env.enableCheckpointing(10 * 60 * 1000L)
    //env.getCheckpointConfig.setCheckpointTimeout(10 * 60 * 1000L)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) // event time
    val props = new Properties
    props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "host:6667") // some clusters use port 9092
    props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "groupId")
    // NOTE(review): `retries` looks like a producer-side Kafka setting; the consumer
    // presumably ignores it — confirm and drop it if so.
    props.setProperty("retries", "10")
    // The Kafka key is `retry.backoff.ms`; the previous `retries.backoff.ms` spelling was not a valid key.
    props.setProperty(ConsumerConfig.RETRY_BACKOFF_MS_CONFIG, "100")
    props.setProperty(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG, "60000")
    // If the cluster has authentication enabled, add the settings below.
    // props.setProperty("sasl.jaas.config","org.apache.kafka.common.security.plain.PlainLoginModule required username='' password='';")
    //props.setProperty("security.protocol", "SASL_PLAINTEXT");
    // props.setProperty("sasl.mechanism", "PLAIN")
    val httpHosts = new java.util.ArrayList[HttpHost]
    httpHosts.add(new HttpHost("esIPOne", 9200, "http"))
    httpHosts.add(new HttpHost("esIPTwo", 9200, "http"))
    httpHosts.add(new HttpHost("esIPThree", 9200, "http"))

    val esSinkBuilder = new ElasticsearchSink.Builder[ResultBean](httpHosts, new ElasticsearchSinkFunction[ResultBean] {
      // Turns one ResultBean into an index request. The document id is the record id,
      // so a later result for the same id replaces the earlier document.
      override def process(element: ResultBean, ctx: RuntimeContext, indexer: RequestIndexer): Unit = {
        val json = new java.util.HashMap[String, Any]
        json.put("@timestamp", element.ts)
        json.put("data", element.data)
        json.put("sum", element.sum)
        val rqst = Requests.indexRequest()
          .index("indexName")
          .id(element.id)
          .source(json)
          .opType(DocWriteRequest.OpType.INDEX)

        indexer.add(rqst)
      }
    })
    setESConf(esSinkBuilder, 5000)
    val myConsumer = new FlinkKafkaConsumer[DemoBean]("topicName", new DemoKafka(), props)
      .setStartFromEarliest() // where to start reading from

    val source = env
      .addSource(myConsumer)
      .uid("source-data")
      .name("数据源")
      // Timestamps come from DemoBean.ts; 1 s of out-of-orderness is tolerated and a
      // source that stays silent for 5 s is marked idle.
      .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness[DemoBean](Duration.ofSeconds(1))
        .withTimestampAssigner(new SerializableTimestampAssigner[DemoBean] {
          override def extractTimestamp(element: DemoBean, recordTimestamp: Long): Long = element.ts
        }).withIdleness(Duration.ofSeconds(5)))
      .uid("water-marks")
      .name("注册水位线")

    source
      .keyBy(k => k.id)
      .process(new DemoProcess())
      .uid("demo-process")
      .name("process 示例")
      .addSink(esSinkBuilder.build())
      .uid("es-sink")
      .name("数据写入es")

    env.execute("任务名")
  }

  /**
   * Decodes a Kafka record whose value is a UTF-8, tab-separated line with the fields
   * `id`, `data`, `value` (Int) and `ts` (Long), in that order.
   */
  private class DemoKafka() extends KafkaDeserializationSchema[DemoBean] {
    // The stream is unbounded: no element ever ends it.
    override def isEndOfStream(t: DemoBean): Boolean = false

    /**
     * @param consumerRecord raw Kafka record; only its value is read
     * @return the parsed [[DemoBean]]
     * @throws ArrayIndexOutOfBoundsException if the line has fewer than four fields
     * @throws NumberFormatException if `value` or `ts` is not numeric
     */
    override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): DemoBean = {
      // Decode with an explicit charset instead of the JVM's platform default.
      val line = new String(consumerRecord.value(), java.nio.charset.StandardCharsets.UTF_8)
      val fields = line.split("\t")
      DemoBean(fields(0), fields(1), fields(2).toInt, fields(3).toLong)
    }

    // Use the Scala API macro so the case class gets case-class type information
    // rather than the generic type TypeExtractor derives for non-POJO classes.
    override def getProducedType: TypeInformation[DemoBean] = createTypeInformation[DemoBean]
  }

  /**
   * Keeps a running total of `DemoBean.value` per key and emits one [[ResultBean]]
   * carrying the updated total for every input element.
   */
  private class DemoProcess extends KeyedProcessFunction[String, DemoBean, ResultBean] {
    // Boxed Integer so that "no total stored yet" (null) can be told apart from a stored 0.
    private var hisSumState: ValueState[java.lang.Integer] = _

    override def open(parameters: Configuration): Unit = {
      hisSumState = getRuntimeContext.getState(
        new ValueStateDescriptor[java.lang.Integer]("his-sum", classOf[java.lang.Integer]))
    }

    /**
     * @param data incoming element for the current key
     * @param ctx  process-function context (unused)
     * @param out  receives the result carrying the new running total
     */
    override def processElement(data: DemoBean, ctx: KeyedProcessFunction[String, DemoBean, ResultBean]#Context, out: Collector[ResultBean]): Unit = {
      // The state is null the first time a key is seen; start from 0 in that case.
      val previousTotal = Option(hisSumState.value()).fold(0)(_.intValue)
      val total = previousTotal + data.value
      // Store the accumulated total (not just the latest value) so the sum covers the whole history.
      hisSumState.update(total)
      // The last field is the event timestamp, so pass data.ts rather than data.value.
      out.collect(ResultBean(data.id, data.data, total, data.ts))
    }
  }

  /**
   * Applies bulk-flush, backoff and HTTP timeout settings to an Elasticsearch sink builder.
   *
   * @param esSinkBuilder builder to configure (mutated in place)
   * @param numMaxActions maximum number of buffered actions before a bulk flush
   * @tparam T element type of the sink
   */
  def setESConf[T](esSinkBuilder: ElasticsearchSink.Builder[T], numMaxActions: Int): Unit = {
    esSinkBuilder.setBulkFlushMaxActions(numMaxActions)
    esSinkBuilder.setBulkFlushMaxSizeMb(10)
    esSinkBuilder.setBulkFlushInterval(10000)
    esSinkBuilder.setBulkFlushBackoff(true)
    // NOTE(review): the backoff delay is presumably in milliseconds, which would make 2 very short — confirm.
    esSinkBuilder.setBulkFlushBackoffDelay(2)
    esSinkBuilder.setBulkFlushBackoffRetries(3)
    esSinkBuilder.setRestClientFactory(new RestClientFactory {
      override def configureRestClientBuilder(restClientBuilder: RestClientBuilder): Unit = {
        restClientBuilder.setRequestConfigCallback(new RestClientBuilder.RequestConfigCallback() {
          override def customizeRequestConfig(requestConfigBuilder: RequestConfig.Builder): RequestConfig.Builder = {
            requestConfigBuilder.setConnectTimeout(12000)
            requestConfigBuilder.setSocketTimeout(90000)
          }
        })
      }
    })
  }

  // Input record parsed from Kafka: key id, payload, numeric value and event timestamp.
  private case class DemoBean(id: String, data: String, value: Int, ts: Long)

  // Output record written to Elasticsearch: key id, payload, running total and event timestamp.
  private case class ResultBean(id: String, data: String, sum: Int, ts: Long)
}
相关推荐
Jackeyzhe5 小时前
Flink学习笔记:状态类型和应用
flink
九河云5 小时前
华为云 ECS 弹性伸缩技术:应对业务峰值的算力动态调度策略
大数据·服务器·人工智能·物联网·华为云
AI营销资讯站6 小时前
AI营销内容生产:哪些平台支持全球多语言内容同步生产?
大数据·人工智能
桃花键神7 小时前
openFuyao在AI推理与大数据场景中的加速方案:技术特性与实践探索
大数据·人工智能
天远数科9 小时前
前端全栈进阶:使用 Node.js Crypto 模块处理 AES 加密与天远API数据聚合
大数据·api
天远API9 小时前
后端进阶:使用 Go 处理天远API的 KV 数组结构与并发风控
大数据·api
千匠网络9 小时前
S2B供应链平台:优化资源配置,推动产业升级
大数据·人工智能·产品运营·供应链·s2b
WX-bisheyuange10 小时前
基于Spring Boot的智慧校园管理系统设计与实现
java·大数据·数据库·毕业设计
AI营销快线10 小时前
AI如何每日自动生成大量高质量营销素材?
大数据·人工智能
KKKlucifer10 小时前
从 “人工标注” 到 “AI 驱动”:数据分类分级技术的效率革命
大数据·人工智能·分类