Spark学习之SaprkCore

FlinkCore

1、JavaAPI
1、创建一个Topic并写入数据

向Kafka写数据 如果topic不存在则会自动创建一个副本和分区数都是1的topic

java 复制代码
package com.shujia.kafka;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class Demo01KafkaProducer {
    public static void main(String[] args) {

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "master:9092,node1:9092,node2:9092");

        properties.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
         // 创建Kafka 生产者
        KafkaProducer<String, String> producer = new KafkaProducer<>(properties);

        // 向Kafka写数据 如果topic不存在则会自动创建一个副本和分区数都是1的topic
        producer.send(new ProducerRecord<String,String>("topic02","1500100001,施笑槐,22,女,文科六班"));
        producer.flush();

    }
}

执行了两次的结果:

2、从一个.txt 的文件中读取数据,并写入到Topic中
java 复制代码
package com.shujia.kafka;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;

public class Demo02KafkaStuProducer {
    // 将1000条学生数据写入Kafka
    public static void main(String[] args) throws IOException {
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "master:9092,node1:9092,node2:9092");

        properties.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        // 创建Kafka 生产者
        KafkaProducer<String, String> producer = new KafkaProducer<>(properties);

        // 读取文件
        BufferedReader br = new BufferedReader(new FileReader("kafka/data/stu/students.txt"));

        String line;
        while ((line = br.readLine()) != null) {
            producer.send(new ProducerRecord<>("students1000", line));
        }
        producer.flush();


    }
}

结果:

3、创建一个消费者,读取Topic中的数据
java 复制代码
package com.shujia.kafka;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.util.ArrayList;
import java.util.Properties;

public class Demo03KafkaConsumer {
    public static void main(String[] args) throws InterruptedException {

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "master:9092,node1:9092,node2:9092");
        properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");


        /*
         * 消费者组的偏移量设定规则:
         * earliest 相当于from-beginning 从头开始消费
         * latest 从最新的数据开始消费
         */
        properties.setProperty("auto.offset.reset", "earliest");
        // 设置消费者组id,每一个grp02只能读一次,第二次再读取时,则不会出现数据!(结果显示)
        properties.setProperty("group.id", "grp03");

        // 创建Kafka的消费者
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties);
        ArrayList<String> topic = new ArrayList<>();
        topic.add("students1000");
        // 指定消费的topic
        consumer.subscribe(topic);
        // 加一个死循环,使其为无界流,一直读取
        while (true){
            ConsumerRecords<String, String> records = consumer.poll(10000);
            for (ConsumerRecord<String, String> record : records) {
                System.out.println(record.headers());
                System.out.println(record.offset());
                System.out.println(record.timestamp());
                // 由于所读的Topic students1000中只有一个partition,所以结果中只显示有一个分区
//                System.out.println(record.partition());
                // 没有key,打印结果为null
                System.out.println(record.key());
                System.out.println(record.value());
            }
            // 防止执行太快,没有取完所有的数据;给它睡眠一会
            Thread.sleep(5000);
        }

    }
}
2、FlinkAPI
1、从Topic中取出数据,并做相应处理

统计每个班级的学生人数:

java 复制代码
package tfTest;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class Demo01KafkaSource {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        KafkaSource<String> kafkaSource = KafkaSource
                .<String>builder()
                .setBootstrapServers("master:9092,node1:9092,node2:9092")
                .setGroupId("group001")
                .setTopics("students1000")
                .setStartingOffsets(OffsetsInitializer.earliest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();

        DataStreamSource<String> kafkaDS = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource");

        kafkaDS.map(line -> Tuple2.of(line.split(",")[4],1), Types.TUPLE(Types.STRING,Types.INT))
                .keyBy(t2 -> t2.f0)
                .sum(1)
                .print();

        env.execute();

        /**
         * 结果为:(数据流为无界流,会一直接收数据并处理)
         * 13> (理科五班,17)
         * 13> (理科四班,17)
         * 13> (理科四班,18)
         * 13> (理科四班,19)
         * 13> (理科四班,20)
         * 13> (理科五班,18)
         * 13> (理科四班,21)
         * 13> (理科四班,22)
         * 13> (理科五班,19)
         * 13> (理科四班,23)
         */

    }
}
2、读取.json格式的数据,写入到Topic中

设置写入时的语义:

1、AT_LEAST_ONCE:保证数据至少被写入了一次,性能会更好,但是又可能会写入重复的数据

2、EXACTLY_ONCE:保证数据只会写入一次,不多不少,性能会有损耗

java 复制代码
package com.shujia.flink.kafka;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class Demo02KafkaSink {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> carDS = env.readTextFile("flink/data/cars_sample.json");

        KafkaSink<String> sink = KafkaSink.<String>builder()
                .setBootstrapServers("master:9092,node1:9092,node2:9092")
                .setRecordSerializer(
                        KafkaRecordSerializationSchema
                                .builder()
                                .setTopic("cars_json") // 不存在会自动创建
                                // 指定数据流的序列化方式
                                .setValueSerializationSchema(new SimpleStringSchema())
                                .build()
                )
                /**
                设置写入时的语义:
                1、AT_LEAST_ONCE:保证数据至少被写入了一次,性能会更好,但是又可能会写入重复的数据
                2、EXACTLY_ONCE:保证数据只会写入一次,不多不少,性能会有损耗
                 */
                .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
                .build();

        carDS.sinkTo(sink);

        env.execute();


    }
}
3、从写入json数据的Topic中读取数据,并求出各道路的平均车速

注:

TODO 将从Topic中读取的数据转换成自定义的Car对象,便于后续操作
SingleOutputStreamOperator carDS = carStrDS.map(carStr -> JSON.parseObject(carStr, Car.class));

第三个参数:源名称的设置在Flink中主要用于提高日志、监控、元数据管理和代码可读性 的目的

DataStreamSource carStrDS = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "cars");

reduce中聚合操作

java 复制代码
package com.shujia.flink.kafka;

import com.alibaba.fastjson.JSON;
import jdk.nashorn.internal.scripts.JO;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class Demo03CarsAvgSpeed {
    public static void main(String[] args) throws Exception {
        // 基于Kafka Cars数据实时统计每条道路的平均车速
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 构建Kafka Source
        KafkaSource<String> kafkaSource = KafkaSource
                .<String>builder()
                .setBootstrapServers("master:9092,node1:9092,node2:9092")
                .setGroupId("grp001")
                .setTopics("cars_json")
                .setStartingOffsets(OffsetsInitializer.earliest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();
        // {"car":"皖AQLXL2","city_code":"340100","county_code":"340111","card":117331031812010,"camera_id":"01012","orientation":"西","road_id":34406326,"time":1614731906,"speed":47.86}
        // 第三个参数:源名称的设置在Flink中主要用于提高日志、监控、元数据管理和代码可读性的目的
        DataStreamSource<String> carStrDS = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "cars");
        //TODO 将从Topic中读取的数据转换成自定义的Car对象,便于后续操作
        SingleOutputStreamOperator<Car> carDS = carStrDS.map(carStr -> JSON.parseObject(carStr, Car.class));

        carDS
                // 只取Car对象属性中的road_id、speed,加上"1",用于后续求平均值
                .map(car-> Tuple3.of(car.road_id,car.speed,1), Types.TUPLE(Types.LONG,Types.DOUBLE,Types.INT))
                .keyBy(t3->t3.f0,Types.LONG)
                // 对整体进行聚合:
                .reduce(new ReduceFunction<Tuple3<Long, Double, Integer>>() {
                    @Override
                    public Tuple3<Long, Double, Integer> reduce(Tuple3<Long, Double, Integer> value1, Tuple3<Long, Double, Integer> value2) throws Exception {
                        return Tuple3.of(value1.f0, value1.f1 + value2.f1, value1.f2 + value2.f2);
                    }
                })
                // 聚合后,求出各路段的平均车速
                .map(t3 -> Tuple2.of(t3.f0, t3.f1 / t3.f2),Types.TUPLE(Types.LONG,Types.DOUBLE))
                .print();

        env.execute();
    }
}

// 定义一个Car类型,使用注解的方式创建get、set、构造器方法; 前期处理:加入lombok依赖,下载lombok插件
@Getter
@Setter
@AllArgsConstructor
@NoArgsConstructor
class Car{
    String car;
    Integer city_code;
    Integer county_code;
    Long card;
    String camera_id;
    String orientation;
    Long road_id;
    Long time;
    Double speed;
}
3、checkpoint

**当任务停止后,可以在HDFS上缓存任务中的结果数据。**再次启动任务时,输入数据得出的结果会算上上次运行的结果(实现故障恢复的效果)

checkpoint保存了算子运行后的结果状态:

java 复制代码
package com.shujia.flink.state;

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class Demo02CheckPoint {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 开启CK,每 5000ms 开始一次 checkpoint
        env.enableCheckpointing(5000);

        // 高级选项:
        // 设置模式为精确一次 (这是默认值)
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        // 确认 checkpoints 之间的时间会进行 500 ms
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
        // Checkpoint 必须在一分钟内完成,否则就会被抛弃
        env.getCheckpointConfig().setCheckpointTimeout(60000);
        // 允许两个连续的 checkpoint 错误
        env.getCheckpointConfig().setTolerableCheckpointFailureNumber(2);
        // 同一时间只允许一个 checkpoint 进行
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        // 使用 externalized checkpoints,这样 checkpoint 在作业取消后仍就会被保留
        env.getCheckpointConfig().setExternalizedCheckpointCleanup(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        // 开启实验性的 unaligned checkpoints
        env.getCheckpointConfig().enableUnalignedCheckpoints();
        // 设置CK保存的路径,一般是HDFS的路径;端口号9000可以在HDFS UI上查看;地址中只会保存最新的checkpoint,之前的会进进行缓存
        env.getCheckpointConfig().setCheckpointStorage("hdfs://master:9000/flink/checkpoint");

        // Flink算子在计算时,实际上已经自带了状态,但是并没有主动进行CheckPoint
        env
            // 从一个开启的socket中获取数据
                .socketTextStream("master", 8888)
                .map(word -> Tuple2.of(word, 1), Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(t2 -> t2.f0).sum(1)
                .print();

        env.execute();
    }
}

再次启动时,需要指定checkpoint的存储路径:

java 复制代码
hdfs://master:9000/flink/checkpoint/d882f9b4d726d7462573a3bee8ab4fcb/chk-14
4、使用状态

一般在keyBy计算之后之后进行状态存储,将状态保存得开启checkpoint,可以在配置文件中开启(就不用总是使用代码进行开启了)

使用状态

不同类型的状态:

ValueState:单值状态,包含两个方法:update更新状态、value获取状态

ListState :状态为多值

MapState : 状态为KV

ReducingState :状态需要聚合,最终还是单值状态

AggregatingState:状态需要聚合,最终还是单值状态

java 复制代码
package com.shujia.flink.state;

import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

public class Demo03ValueState {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<String> ds01 = env.socketTextStream("master", 8888);

        // 在配置文件中开启了CK,则不需要通过env再设置了

        SingleOutputStreamOperator<Tuple2<String,Integer>> wordDS = ds01.flatMap((line, out) -> {
            for (String word : line.split(",")) {
                out.collect(Tuple2.of(word, 1));
            }
        }, Types.TUPLE(Types.STRING, Types.INT));

        KeyedStream<Tuple2<String, Integer>, String> keyedDS = wordDS.keyBy(t2 -> t2.f0, Types.STRING);

        // 基于分组之后的数据流同样可以调用process方法
        keyedDS
                .process(new KeyedProcessFunction<String, Tuple2<String, Integer>, String>() {
                    // 定义一个ValueState单值状态,包含两个方法:update更新状态、value获取状态
                    // Flink会给每一个keyBy的key单独维护一个状态
                    /**
                     * 一般在keyBy计算之后之后进行状态存储,将状态保存得开启checkpoint,可以在配置文件中开启(就不用总是使用代码进行开启了)
                     * 使用状态
                     * 不同类型的状态:
                     * ListState :状态为多值
                     * MapState : 状态为KV
                     * ReducingState :状态需要聚合,最终还是单值状态
                     * AggregatingState:状态需要聚合,最终还是单值状态
                     */
                    ValueState<Integer> valueState;

                    // 当KeyedProcessFunction构建时只会执行一次
                    @Override
                    public void open(Configuration parameters) throws Exception {
                        // 使用Flink Context来初始化状态
                        RuntimeContext context = getRuntimeContext();
                        ValueStateDescriptor<Integer> descriptor = new ValueStateDescriptor<>("count", Types.INT);
                        valueState = context.getState(descriptor);

                    }

                    // 每一条数据会执行一次
                    @Override
                    public void processElement(Tuple2<String, Integer> value, KeyedProcessFunction<String, Tuple2<String, Integer>, String>.Context ctx, Collector<String> out) throws Exception {
                        Integer cnt = valueState.value();
                        int count = 1;
                        // 如果是第一次处理某个单词,则返回null
                        if (cnt != null){
                            count = cnt + 1;
                        }
                        valueState.update(count);

                        out.collect(value.f0+","+count);
                    }
                }).print();

        env.execute();
    }
}

任务停止后,要想继续上一次的执行结果,再次启动时,需要指定checkpoint的存储路径:

再次执行时,checkpoint会从所指定的checkpoint开始,如下图:

案例:

对某个人的交易流水进行欺诈检测:如果有一笔交易小于一元,然后紧接着的一笔交易大于500,则判断有欺诈风险

java 复制代码
package com.shujia.flink.state;

import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

public class Demo04FraudCheck {
    public static void main(String[] args) throws Exception {
        // 对某个人的交易流水进行欺诈检测:如果有一笔交易小于一元,然后紧接着的一笔交易大于500,则判断有欺诈风险
        /*
         * 1,1000
         * 1,500
         * 1,200
         * 1,0.1
         * 1,1000
         * 1,0.1
         * 1,300
         */

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> transDS = env.socketTextStream("master", 8888);
        SingleOutputStreamOperator<MyTrans> myTransDS = transDS.map(line -> {
            String[] split = line.split(",");
            return new MyTrans(split[0], Double.parseDouble(split[1]));
        });

        myTransDS
                .keyBy(MyTrans::getId)// 按照每个人的户号进行分组
                .process(new KeyedProcessFunction<String, MyTrans, String>() {

                    // 创建状态:
                    ValueState<Boolean> flagState;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        RuntimeContext context = getRuntimeContext();
                        flagState = context.getState(new ValueStateDescriptor<Boolean>("flag", Types.BOOLEAN));
                    }

                    @Override
                    public void processElement(MyTrans value, KeyedProcessFunction<String, MyTrans, String>.Context ctx, Collector<String> out) throws Exception {
                        // 获取上一条纪录的状态,如果为true,则表示上一条记录是小于1的,则需要对当前记录进行是否大于500的判断
                        // 如果为false,则只需要判断当前记录中的金额是否小于1
                        Boolean flag = flagState.value();
                        if(flag==null){
                            // 默认值为false
                            flag = false;
                        }
                        // 获取每笔交易中的交易金额
                        Double trans = value.getTrans();
                        if (trans < 1) {
                            flagState.update(true);
                        }

                        // 如果上一次的flag为true,并且当前的trans>500,则会触发println执行
                        if (flag) {
                            if (trans > 500) {
                                System.out.println("存在交易风险");
                            }
                            // 大于500设置flagState为false
                            flagState.update(false);
                        }
                    }
                });
        // flink不需要action算子触发任务,由事件触发(数据流发生变化、所监控的文件发生变化时,会触发执行)
        env.execute();
    }
}

@Data
@AllArgsConstructor
class MyTrans {
    String id;
    Double trans;
}
5、Kafka的send操作及事务支持

要么都执行,要么都不执行

send操作,要么是提交事务后都执行,要么是都不执行

shell 复制代码
# 若通过事务的方式写Kafka,在读取时--isolation-level <String> :默认读取未提交的所以数据 read_uncommitted
# 若要读取提交了的数据,那么得使用 read_committed
kafka-console-consumer.sh --isolation-level read_committed --bootstrap-server master:9092,node1:9092,node3:9092 --from-beginning --topic trans_topic

kafka-console-consumer.sh --bootstrap-server master:9092,node1:9092,node3:9092 --from-beginning --topic trans_topic
java 复制代码
package com.shujia.kafka;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class Demo03KafkaTransaction {
    public static void main(String[] args) throws InterruptedException {
        // 通过事务的方式写Kafka
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "master:9092,node2:9092,node2:9092");

        properties.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // 若不加则会报错
        properties.setProperty("transactional.id", "trans01");
        // 创建Kafka 生产者
        KafkaProducer<String, String> producer = new KafkaProducer<>(properties);

        // 开启一个事务,要么都执行,要么都不执行
        producer.initTransactions();
        producer.beginTransaction();
        // 向Kafka写数据 如果topic不存在则会自动创建一个副本和分区数都是1的topic
        producer.send(new ProducerRecord<String,String>("trans_topic","1500100001,施笑槐,22,女,文科六班"));
        producer.send(new ProducerRecord<String,String>("trans_topic","1500100002,施笑槐,22,女,文科六班"));
        producer.send(new ProducerRecord<String,String>("trans_topic","1500100003,施笑槐,22,女,文科六班"));
        Thread.sleep(10000);
        producer.send(new ProducerRecord<String,String>("trans_topic","1500100004,施笑槐,22,女,文科六班"));
        producer.send(new ProducerRecord<String,String>("trans_topic","1500100005,施笑槐,22,女,文科六班"));
        producer.flush();
        // 提交事务之后才算写入完成
        producer.commitTransaction();

    }
}
6、ExactlyOnce

确保数据不重复也不丢失

案例一:

Flink作为消费端和处理端,从kafka中读取数据,将消费的偏移量和计算的结果通过checkpoint保存起来,以便故障的恢复。

如果需要提交到集群运行,记得在$FLINK_HOME/lib目录下添加flink-sql-connector-kafka-1.15.4.jar依赖

java 复制代码
package com.shujia.flink.state;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class Demo05ConsumeKafkaExactlyOnce {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 设置CK的时间间隔
        env.enableCheckpointing(15000);

        // 如果需要提交到集群运行,记得在$FLINK_HOME/lib目录下添加flink-sql-connector-kafka-1.15.4.jar依赖
        KafkaSource<String> kafkaSource = KafkaSource
                .<String>builder()
                .setBootstrapServers("master:9092,node1:9092,node2:9092")
                .setGroupId("grp001") // 第一次可以随便指定,如果需要恢复则必须和上一次同步
                .setTopics("words001") //TODO 读取的时候如果不存在会报错(读取Topic时,若其不存在,不会为其创建)
                // 如果是故障后从CK恢复,FLink会自动将其设置为committedOffsets,即从上一次失败的位置继续消费
                .setStartingOffsets(OffsetsInitializer.earliest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();
        // 从KafkaSource接收数据变成DS 无界流
        // Topic有几个分区,则KafkaSource有几个并行度去读取Kafka的数据
        DataStreamSource<String> kafkaDS = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource");

        // 统计班级人数
        kafkaDS
                .map(word -> Tuple2.of(word, 1), Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(t2 -> t2.f0)
                .sum(1)
                .print();

        env.execute();


    }
}
案例二:

Flink从kafka接收数据,处理后再将结果写入到kafka

这种情况下:Flink会将需要做的checkpoint操作(将消费偏移量和计算的结果保存到HDFS上)和将结果写入到Kafka这两个操作构成一个事务,来保证ExactlyOnce
注:执行时报错

org.apache.flink.kafka.shaded.org.apache.kafka.common.KafkaException:

Unexpected error in InitProducerIdResponse;

The transaction timeout is larger than the maximum value allowed by the broker

(as configured by transaction.max.timeout.ms).

原因:

transaction.max.timeout.ms : Kafka事务最大的超时时间,默认15分钟,即Broker允许的事务最大时间为15分钟 ,Flink的KafkaSink默认事务的超时时间为1小时。若不统一它们的时间,则会发生冲突。

transaction.timeout.ms :设置Kafka Sink的事务时间,只要小于15分钟即可

java 复制代码
package com.shujia.flink.state;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Properties;

public class Demo06SinkKafkaExactlyOnce {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 设置CK的时间间隔
        env.enableCheckpointing(15000);

        KafkaSource<String> kafkaSource = KafkaSource
                .<String>builder()
                .setBootstrapServers("master:9092,node1:9092,node2:9092")
                .setGroupId("grp001") // 第一次可以随便指定,如果需要恢复则必须和上一次同步
                .setTopics("words001") // 读取的时候如果不存在会报错
                // 如果是故障后从CK恢复,FLink会自动将其设置为committedOffsets,即从上一次失败的位置继续消费
                .setStartingOffsets(OffsetsInitializer.earliest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();
        /**
         * 从KafkaSource接收数据变成DS 无界流
         *   Topic有几个分区,则KafkaSource有几个并行度去读取Kafka的数据
         *   从kafka中读取数据,再对其进行处理,最后写入到kafka中的一个Topic中
         */
        DataStreamSource<String> kafkaDS = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource");

        Properties prop = new Properties();
        /*
         * org.apache.flink.kafka.shaded.org.apache.kafka.common.KafkaException:
         * Unexpected error in InitProducerIdResponse;
         * The transaction timeout is larger than the maximum value allowed by the broker
         * (as configured by transaction.max.timeout.ms).
         *
         * transaction.max.timeout.ms : Kafka事务最大的超时时间,默认15分钟,即Broker允许的事务最大时间为15分钟
         * Flink的Kafka Sink默认事务的超时时间为1小时。若不同意它们的时间,则会放生冲突。
         *
         * transaction.timeout.ms :设置Kafka Sink的事务时间,只要小于15分钟即可
         */
        prop.setProperty("transaction.timeout.ms", 15 * 1000 + "");

        KafkaSink<String> sink = KafkaSink
                .<String>builder()
                .setBootstrapServers("master:9092,node1:9092,node2:9092")
                .setKafkaProducerConfig(prop)
                .setRecordSerializer(
                        KafkaRecordSerializationSchema
                                .builder()
                                .setTopic("word_cnt01") // 写入数据时,Topic不存在会自动创建
                                .setValueSerializationSchema(new SimpleStringSchema())
                                .build()
                )
                /*
                设置写入时的语义:
                1、AT_LEAST_ONCE:保证数据至少被写入了一次,性能会更好,但是又可能会写入重复的数据
                2、EXACTLY_ONCE:保证数据只会写入一次,不多不少,性能会有损耗
                 */
                .setDeliverGuarantee(DeliveryGuarantee.EXACTLY_ONCE)
                .build();

        // 统计班级人数
        kafkaDS
                .map(word -> Tuple2.of(word, 1), Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(t2 -> t2.f0)
                .sum(1)
                // 将结果的二元组转换成String才能写入Kafka
                .map(t2 -> t2.f0 + "," + t2.f1)
                .sinkTo(sink);

        env.execute();


    }
}
相关推荐
A懿轩A33 分钟前
C/C++ 数据结构与算法【数组】 数组详细解析【日常学习,考研必备】带图+详细代码
c语言·数据结构·c++·学习·考研·算法·数组
Data跳动8 小时前
Spark内存都消耗在哪里了?
大数据·分布式·spark
南宫生8 小时前
力扣-图论-17【算法学习day.67】
java·学习·算法·leetcode·图论
sanguine__9 小时前
Web APIs学习 (操作DOM BOM)
学习
lucky_syq10 小时前
流式处理,为什么Flink比Spark Streaming好?
大数据·flink·spark
数据的世界0111 小时前
.NET开发人员学习书籍推荐
学习·.net
四口鲸鱼爱吃盐11 小时前
CVPR2024 | 通过集成渐近正态分布学习实现强可迁移对抗攻击
学习
OopspoO13 小时前
qcow2镜像大小压缩
学习·性能优化
A懿轩A14 小时前
C/C++ 数据结构与算法【栈和队列】 栈+队列详细解析【日常学习,考研必备】带图+详细代码
c语言·数据结构·c++·学习·考研·算法·栈和队列
居居飒14 小时前
Android学习(四)-Kotlin编程语言-for循环
android·学习·kotlin