flink自定义process,使用状态求历史总和(scala)

es idea maven 依赖

<dependency>

<groupId>org.apache.flink</groupId>

<artifactId>flink-connector-elasticsearch7_2.11</artifactId>

<version>1.11.1</version>

</dependency>

完整示例代码如下:
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
import org.apache.flink.streaming.connectors.elasticsearch7.{ElasticsearchSink, RestClientFactory}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.flink.util.Collector
import org.apache.http.HttpHost
import org.apache.http.client.config.RequestConfig
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.elasticsearch.action.DocWriteRequest
import org.elasticsearch.client.{Requests, RestClientBuilder}

import java.time.Duration
import java.util.Properties

/**
 * End-to-end Flink (1.11.x) example: consume tab-separated records from Kafka,
 * keep a per-key running ("historical") sum in keyed ValueState via a custom
 * KeyedProcessFunction, and sink the results to Elasticsearch 7.
 */
object Test {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Enable the settings below when durable state / fault tolerance is required.
    //env.setStateBackend(new RocksDBStateBackend(s"hdfs://${namenodeID}", true)) // HDFS-backed RocksDB state backend
    //env.enableCheckpointing(10 * 60 * 1000L)
    //env.getCheckpointConfig.setCheckpointTimeout(10 * 60 * 1000L)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) // use event time

    val props = new Properties
    props.setProperty("bootstrap.servers", "host:6667") // some clusters use port 9092
    props.setProperty("group.id", "groupId")
    props.setProperty("retries", "10")
    props.setProperty("retries.backoff.ms", "100")
    props.put(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG, "60000")
    // If the cluster is secured, add the settings below.
    // props.setProperty("sasl.jaas.config","org.apache.kafka.common.security.plain.PlainLoginModule required username='' password='';")
    //props.setProperty("security.protocol", "SASL_PLAINTEXT");
    // props.setProperty("sasl.mechanism", "PLAIN")

    val httpHosts = new java.util.ArrayList[HttpHost]
    httpHosts.add(new HttpHost("esIPOne", 9200, "http"))
    httpHosts.add(new HttpHost("esIPTwo", 9200, "http"))
    httpHosts.add(new HttpHost("esIPThree", 9200, "http"))

    // ES sink: index every ResultBean as one document, keyed by its id
    // (OpType.INDEX upserts, so reprocessing the same id overwrites the doc).
    val esSinkBuilder = new ElasticsearchSink.Builder[ResultBean](httpHosts, new ElasticsearchSinkFunction[ResultBean] {
      def process(element: ResultBean, ctx: RuntimeContext, indexer: RequestIndexer) {
        val json = new java.util.HashMap[String, Any]
        json.put("@timestamp", element.ts)
        json.put("data", element.data)
        json.put("sum", element.sum)
        val rqst = Requests.indexRequest()
          .index("indexName")
          .id(element.id)
          .source(json)
          .opType(DocWriteRequest.OpType.INDEX)

        indexer.add(rqst)
      }
    })
    setESConf(esSinkBuilder, 5000)

    val myConsumer = new FlinkKafkaConsumer[DemoBean]("topicName", new DemoKafka(), props)
      .setStartFromEarliest() // where in the topic to start reading

    val source = env
      .addSource(myConsumer)
      .uid("source-data")
      .name("数据源")
      // Bounded out-of-orderness of 1s; idle partitions are marked after 5s so
      // the watermark can still advance when a partition stops producing.
      .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness[DemoBean](Duration.ofSeconds(1))
        .withTimestampAssigner(new SerializableTimestampAssigner[DemoBean] {
          override def extractTimestamp(element: DemoBean, recordTimestamp: Long): Long = element.ts
        }).withIdleness(Duration.ofSeconds(5)))
      .uid("water-marks")
      .name("注册水位线")

    source
      .keyBy(k => k.id)
      .process(new DemoProcess())
      .uid("demo-process")
      .name("process 示例")
      .addSink(esSinkBuilder.build())
      .uid("es-sink")
      .name("数据写入es")

    env.execute("任务名")
  }

  /**
   * Kafka deserializer for tab-separated records: id \t data \t value \t ts.
   * NOTE(review): malformed records (fewer than 4 fields, non-numeric value/ts)
   * will throw here and fail the job — add validation if the topic is dirty.
   */
  private class DemoKafka() extends KafkaDeserializationSchema[DemoBean] {
    override def isEndOfStream(t: DemoBean): Boolean = false

    override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): DemoBean = {
      val value = new String(consumerRecord.value())
      val list = value.split("\t")
      DemoBean(list(0), list(1), list(2).toInt, list(3).toLong)
    }

    override def getProducedType: TypeInformation[DemoBean] = TypeExtractor.getForClass(classOf[DemoBean])
  }

  /**
   * Keeps a running (historical) sum of `value` per key in keyed ValueState
   * and emits the updated total for every incoming record.
   */
  private class DemoProcess extends KeyedProcessFunction[String, DemoBean, ResultBean] {
    // Boxed Integer so an empty state is observable as null. With a primitive
    // ValueState[Int] the check `value() == null` is always false (Int vs Null)
    // and Scala silently unboxes a null Integer to 0, hiding the intent.
    private var hisSumState: ValueState[java.lang.Integer] = _

    override def open(parameters: Configuration): Unit = {
      hisSumState = getRuntimeContext.getState(
        new ValueStateDescriptor("his-sum", classOf[java.lang.Integer]))
    }

    override def processElement(data: DemoBean, ctx: KeyedProcessFunction[String, DemoBean, ResultBean]#Context, out: Collector[ResultBean]): Unit = {
      val his = Option(hisSumState.value()).map(_.intValue()).getOrElse(0)
      val total = his + data.value
      // BUG FIX: persist the accumulated total, not just the latest value —
      // the original `update(now)` made "sum" cover only the last two records.
      hisSumState.update(total)
      // BUG FIX: forward the event timestamp data.ts; the original passed
      // data.value (Int widened to Long) into the ts field.
      out.collect(ResultBean(data.id, data.data, total, data.ts))
    }
  }

  /**
   * Applies common bulk/flush/backoff and HTTP-timeout settings to the ES sink.
   *
   * @param esSinkBuilder the sink builder to configure
   * @param numMaxActions flush the bulk buffer after this many actions
   */
  def setESConf[T](esSinkBuilder: ElasticsearchSink.Builder[T], numMaxActions: Int) {
    esSinkBuilder.setBulkFlushMaxActions(numMaxActions)
    esSinkBuilder.setBulkFlushMaxSizeMb(10)
    esSinkBuilder.setBulkFlushInterval(10000)
    esSinkBuilder.setBulkFlushBackoff(true)
    esSinkBuilder.setBulkFlushBackoffDelay(2)
    esSinkBuilder.setBulkFlushBackoffRetries(3)
    esSinkBuilder.setRestClientFactory(new RestClientFactory {
      override def configureRestClientBuilder(restClientBuilder: RestClientBuilder): Unit = {
        restClientBuilder.setRequestConfigCallback(new RestClientBuilder.RequestConfigCallback() {
          override def customizeRequestConfig(requestConfigBuilder: RequestConfig.Builder): RequestConfig.Builder = {
            requestConfigBuilder.setConnectTimeout(12000)
            requestConfigBuilder.setSocketTimeout(90000)
          }
        })
      }
    })
  }

  // Input record: id \t data \t value \t event-time millis
  private case class DemoBean(id: String, data: String, value: Int, ts: Long)

  // Output record: per-key running sum plus the originating event time
  private case class ResultBean(id: String, data: String, sum: Int, ts: Long)
}
相关推荐
fanstuck2 小时前
基于大模型的个性化推荐系统实现探索与应用
大数据·人工智能·语言模型·数据挖掘
IT学长编程3 小时前
计算机毕业设计 基于大数据技术的医疗数据分析与研究 Python 大数据毕业设计 Hadoop毕业设计选题【附源码+文档报告+安装调试】
大数据·hadoop·机器学习·数据分析·毕业设计·毕业论文·医疗数据分析
free3 小时前
基于librdkafa C++客户端生产者发送数据失败问题处理#2
c++·kafka
不吃饭的猪3 小时前
kafka启动小脚本
分布式·kafka
lwprain4 小时前
龙蜥8.10中spark各种集群及单机模式的搭建spark3.5.6(基于hadoop3.3.6集群)
大数据·ajax·spark
shallwe小威4 小时前
SpringBoot集成ElasticSearch
数据库·spring boot·elasticsearch
电商软件开发 小银5 小时前
本地生活服务平台创新模式观察:积分体系如何重塑消费生态?
大数据·人工智能·数字化转型·私域运营·消费者心理学
chenglin0165 小时前
TOGAF——ArchiMate
大数据
扬帆起航135 小时前
亚马逊新品推广破局指南:从手动试错到智能闭环的系统化路径
大数据·数据库·人工智能
Elastic 中国社区官方博客5 小时前
使用 LangExtract 和 Elasticsearch
大数据·人工智能·elasticsearch·搜索引擎·ai·信息可视化·全文检索