Flink中常用物理分区(区别和用法)算子以及Sink(JDBC Connector、Kafka Connector以及自定义Sink)

文章目录


一、物理分区

1、自定义分区+重分区(解决数据倾斜)

重分区能够解决数据倾斜

java 复制代码
package com.bigdata.transformation;


import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;


/**
 * Custom partitioner used to produce a deliberately skewed distribution:
 * keys up to {@code THRESHOLD} are routed to partition index 0, all larger
 * keys to partition index 1.
 */
class MyPartitioner implements Partitioner<Long>{

    /** Largest key that is still routed to partition index 0. */
    private static final long THRESHOLD = 10000L;

    /**
     * Chooses the target partition for a key.
     *
     * @param key           the record key; must not be {@code null} (it is unboxed for the comparison)
     * @param numPartitions the number of downstream partitions
     * @return {@code 0} when {@code key <= 10000} or when fewer than two partitions
     *         are available, otherwise {@code 1}
     */
    @Override
    public int partition(Long key, int numPartitions) {
        // Partition indices are zero-based. With a single downstream partition
        // the index 1 would be out of range, so everything falls back to 0.
        if (key <= THRESHOLD || numPartitions < 2) {
            return 0;
        }
        return 1;
    }
}


/**
 * Demo job: applies a custom (skewed) partitioning to the sequence 1..15000,
 * prints how many records each subtask received, then rebalances the stream
 * and prints the per-subtask counts again for comparison.
 */
public class CustomPartition {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        env.setParallelism(2);

        DataStreamSource<Long> dataStreamSource = env.fromSequence(1, 15000);

        // Custom partitioning: each element is its own key and MyPartitioner picks the target partition.
        DataStream<Long> dataStream = dataStreamSource.partitionCustom(new MyPartitioner(), new KeySelector<Long, Long>() {
            @Override
            public Long getKey(Long value) throws Exception {
                return value;
            }
        });

        // Record count per subtask after the custom partitioning.
        printRecordCountPerSubtask(dataStream, "前:");

        // To inspect the custom-partitioned records themselves, add: dataStream.print();

        // Repartition the custom-partitioned stream with rebalance().
        DataStream<Long> rebalance = dataStream.rebalance();

        // Record count per subtask after rebalancing.
        printRecordCountPerSubtask(rebalance, "后:");

        env.execute("自定义分区+重分区(解决数据倾斜)");
    }

    /**
     * Attaches a small pipeline that tags every record with the index of the
     * subtask processing it, sums the tags per subtask index and prints the
     * running result with the given prefix.
     *
     * @param stream the stream whose distribution across subtasks is inspected
     * @param label  prefix passed to {@code print}
     */
    private static void printRecordCountPerSubtask(DataStream<Long> stream, String label) {
        stream.map(new RichMapFunction<Long, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> map(Long value) throws Exception {
                // Emit (subtaskIndex, 1) so the downstream sum counts records per subtask.
                int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
                return Tuple2.of(subtaskIndex, 1);
            }
        }).keyBy(new KeySelector<Tuple2<Integer, Integer>, Integer>() {
            @Override
            public Integer getKey(Tuple2<Integer, Integer> indexAndOne) throws Exception {
                return indexAndOne.f0;
            }
        }).sum(1).print(label);
    }
}

二、Sink

有print、writeAsText(以文本格式输出)、Connectors(连接器)

1、JDBC Connector(JDBC连接器)

导包:

xml 复制代码
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-jdbc_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.25</version>
</dependency>

代码演示:

java 复制代码
package com.bigdata.sink;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.connector.jdbc.JdbcStatementBuilder;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.api.common.RuntimeExecutionMode;

import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * Basic function: (not filled in by the author)
 * Program: FlinkDemo
 * Author: hang
 * Created: 2024-11-22 15:55:54
 **/

/**
 * Simple data holder for one student row with the fields id, name and age.
 *
 * The Lombok annotations supply the accessors and the no-argument and
 * all-argument constructors that the rest of this file relies on
 * (for example {@code new Student(1, "...", 18)} and {@code getId()}).
 * The all-argument constructor takes the fields in declaration order:
 * id, name, age.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
class Student {
    // Student identifier; bound to the first SQL parameter by the sinks in this file.
    private int id;
    // Student name; bound to the second SQL parameter.
    private String name;
    // Student age; bound to the third SQL parameter.
    private int age;
}

/**
 * Demo job: writes three in-memory {@code Student} records to the MySQL
 * table {@code student} through Flink's {@code JdbcSink}.
 */
public class JdbcConnector {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {

        // 1. env - set up the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        // 2. source - load the data
        DataStreamSource<Student> students = env.fromElements(
                new Student(1, "zhanngsan", 18),
                new Student(2, "lisi", 19),
                new Student(3, "wangwu", 20)
        );

        // 3. transformation - none in this demo

        // 4. sink - connection settings for the target database
        JdbcConnectionOptions connectionOptions = new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                .withUrl("jdbc:mysql://localhost:3306/mydb01")
                .withDriverName("com.mysql.cj.jdbc.Driver")
                .withUsername("root")
                .withPassword("123456")
                .build();

        // Binds one Student to the three placeholders of the insert statement.
        JdbcStatementBuilder<Student> bindStudent = (ps, student) -> {
            ps.setInt(1, student.getId());
            ps.setString(2, student.getName());
            ps.setInt(3, student.getAge());
        };

        // For streaming use, execution options can be passed as an extra argument
        // before the connection options to control batching, e.g.
        // JdbcExecutionOptions.builder().withBatchSize(2).build()
        students.addSink(JdbcSink.sink(
                "insert into student values (?,?,?)",
                bindStudent,
                connectionOptions
        ));

        // 5. execute - run the job
        env.execute();
    }
}

2、Kafka Connector(Kafka连接器)

需求:从Kafka的topic1中消费日志数据,并做实时ETL,将状态为success的数据写入到Kafka的topic2中

java 复制代码
package com.bigdata.sink;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;

import java.util.Properties;

/**
 * Basic function: reads log lines from Kafka topic "topic1", keeps only the
 * lines containing "success" and writes them to Kafka topic "topic2".
 * Program: FlinkDemo
 * Author: hang
 * Created: 2024-11-22 16:38:58
 **/
public class KafkaConnector {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {

        // 1. env - set up the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        // 2. source - Kafka client settings, shared by the consumer and the producer below
        Properties kafkaProps = new Properties();
        kafkaProps.setProperty("bootstrap.servers","node01:9092");
        kafkaProps.setProperty("group.id", "g1");

        DataStreamSource<String> logLines = env.addSource(
                new FlinkKafkaConsumer<>("topic1", new SimpleStringSchema(), kafkaProps));

        // 3. transformation - keep only records whose text contains "success"
        SingleOutputStreamOperator<String> successLines =
                logLines.filter(line -> line.contains("success"));

        // 4. sink - write the filtered records to the output topic
        successLines.addSink(
                new FlinkKafkaProducer<>("topic2", new SimpleStringSchema(), kafkaProps));

        // 5. execute - run the job
        env.execute();
    }

}

3、自定义Sink

模拟jdbcSink的实现
jdbcSink官方已经提供过了,此处仅仅是模拟它的实现,从而学习如何自定义sink

java 复制代码
package com.bigdata.sink;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

/**
 * Hand-written sink that inserts each {@code Student} into the MySQL table
 * {@code student}. It mimics what the official JDBC sink does and exists only
 * to show how a custom sink is written.
 */
class MyJdbcSink extends RichSinkFunction<Student>{
    // JDBC handles are created in open(); they are marked transient so they are
    // never part of the serialized function object.
    transient Connection conn = null;
    transient PreparedStatement statement = null;

    /**
     * Loads the MySQL driver, opens the connection and prepares the insert statement.
     *
     * @param parameters configuration passed in by the runtime; not used here
     * @throws Exception if the driver class is missing or the connection/statement cannot be created
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        // Explicit driver registration; optional with modern JDBC drivers.
        // The MySQL 8.x driver class name contains "cj".
        Class.forName("com.mysql.cj.jdbc.Driver");
        // Open the database connection.
        conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb01","root","123456");
        // Prepare the statement once; invoke() only rebinds its parameters.
        statement = conn.prepareStatement("insert into student values (?,?,?)");
    }

    /**
     * Releases the statement and the connection.
     *
     * Both handles are null-checked because open() may have failed before
     * assigning them, and the connection is closed in a finally block so it
     * is released even if closing the statement throws.
     *
     * @throws Exception if closing either resource fails
     */
    @Override
    public void close() throws Exception {
        try {
            if (statement != null) {
                statement.close();
            }
        } finally {
            if (conn != null) {
                conn.close();
            }
        }
    }

    /**
     * Inserts one student row.
     *
     * @param student the record to write
     * @param context sink context supplied by the runtime; not used here
     * @throws Exception if binding the parameters or executing the insert fails
     */
    @Override
    public void invoke(Student student, Context context) throws Exception {
        statement.setInt(1,student.getId());
        statement.setString(2,student.getName());
        statement.setInt(3,student.getAge());
        statement.executeUpdate();
    }
}

/**
 * Demo job: sends three in-memory {@code Student} records to the
 * hand-written {@code MyJdbcSink}.
 */
public class jdbcCustomSink {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {
        // 1. env - set up the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        // 2. source - load the data
        DataStreamSource<Student> students = env.fromElements(
                new Student(4, "zhaoliu", 18),
                new Student(5, "qianqi", 19),
                new Student(6, "wuba", 20)
        );

        // 3. transformation - none in this demo

        // 4. sink - attach the custom sink; the returned sink handle is not needed
        students.addSink(new MyJdbcSink());

        // 5. execute - run the job
        env.execute();
    }
}
相关推荐
商业模式源码开发3 小时前
实体门店低获客成本增长案例:3 人转介绍模型 + 消费返还机制落地分析
大数据·商业模式·私域流量
元拓数智4 小时前
智能分析落地卡壳?先补好「数据关系+语义治理」这层技术基建
大数据·分布式·ai·spark·数据关系·语义治理
TDengine (老段)5 小时前
TDengine Tag 设计哲学与 Schema 变更机制
大数据·数据库·物联网·时序数据库·iot·tdengine·涛思数据
sxgzzn6 小时前
新能源场站数智化转型:基于数字孪生与AI的智慧运维管理平台解析
大数据·运维·人工智能
清平乐的技术专栏7 小时前
【Flink学习】(二)Flink 本地环境搭建,运行第一个入门程序
大数据·flink
这是程序猿7 小时前
Spring Boot自动配置详解
java·大数据·前端
ws2019077 小时前
AUTO TECH China 2026广州汽车零部件展:从整机集成迈向核心部件的产业跃升
大数据·人工智能·科技·汽车
humors2218 小时前
从数据到决策:汽车使用成本的精细计算指南
大数据·程序人生
大大大大晴天8 小时前
Flink技术实践:RocksDB 状态后端技术解密
大数据·flink
189228048619 小时前
NY382固态MT29F32T08GSLBHL8-24QM:B
大数据·服务器·人工智能·科技·缓存