Flink CDC to MySQL: Batch Writing

1. Runtime Environment

(1) Flink: 1.17.2

(2) Scala: 2.12.20

(3) MySQL: 5.7.43 (binlog enabled; see the note below)
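
Flink CDC reads the MySQL binlog, so the server must have binary logging enabled in ROW format. A minimal sketch of the relevant my.cnf settings (the server-id value here is an arbitrary assumption; any id unique within the replication topology works):

ini

[mysqld]
server-id        = 1            # must be unique among servers in the replication topology
log-bin          = mysql-bin    # enables binary logging
binlog_format    = ROW          # Flink CDC / Debezium requires row-level change events
binlog_row_image = FULL         # capture full before/after row images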

2. Code Example

Approach: use a tumbling window to collect a batch of records and hand the whole batch to the sink. Binlog records arriving in the DataStream carry no key, so each record needs one fabricated for it. Two options: (1) generate a per-record key with UUID; (2) parse a timestamp field out of the record and use that as the key (sketched below).
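
For option (2), a minimal sketch of a key extractor that pulls ts_ms out of the Debezium JSON envelope (it assumes the standard Debezium envelope layout and uses com.alibaba.fastjson.JSONObject, the same JSON library as the sink class further below; error handling omitted):

java

// Option (2): key by the source timestamp parsed from the Debezium JSON envelope
dataStream
        .map(new MapFunction<String, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> map(String s) throws Exception {
                JSONObject record = JSONObject.parseObject(s);
                // source.ts_ms is the event timestamp (ms) recorded by the connector
                String tsKey = String.valueOf(record.getJSONObject("source").get("ts_ms"));
                return Tuple2.of(tsKey, s);
            }
        })
        .keyBy(x -> x.f0);

The complete job, using option (1), follows: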

java
package org.example;

import com.ververica.cdc.connectors.mysql.source.MySqlSource;
import com.ververica.cdc.connectors.mysql.table.StartupOptions;
import com.ververica.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.example.sink.MyJdbcSinkFunction;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.UUID;

public class FLCDCToMySql {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        Properties properties = new Properties();
        properties.setProperty("snapshot.locking.mode", "none");

        MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
                .hostname("localhost")
                .port(3306)
                .username("root")
                .password("root321")
                .databaseList("test")
                .tableList("test.t_class")
                .debeziumProperties(properties)
                .deserializer(new JsonDebeziumDeserializationSchema())
                .startupOptions(StartupOptions.earliest())
                .build();

        DataStreamSource<String> dataStream = env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "Mysql Source");
        // Collect records into 10-second batches and push each batch to the sink
        SingleOutputStreamOperator<List<String>> outputStream = dataStream.map(new MapFunction<String, Tuple2<String, String>>() {
                    @Override
                    public Tuple2<String, String> map(String s) throws Exception {
                        return Tuple2.of(UUID.randomUUID().toString(), s);
                    }
                }).keyBy(x -> x.f0).window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
                .process(new ProcessWindowFunction<Tuple2<String, String>, List<String>, String, TimeWindow>() {
                    @Override
                    public void process(String s, ProcessWindowFunction<Tuple2<String, String>, List<String>, String, TimeWindow>.Context context, Iterable<Tuple2<String, String>> iterable, Collector<List<String>> collector) throws Exception {
                        List<String> dataList = new ArrayList<>();
                        for (Tuple2<String, String> element : iterable) {
                            dataList.add(element.f1);
                        }
                        collector.collect(dataList);
                    }
                });
//        outputStream.print();
        outputStream.addSink(new MyJdbcSinkFunction());
        env.execute("flink cdc demo");

    }
}
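
One caveat with option (1): because every record gets a unique UUID, each keyed window holds exactly one element, so the "batches" handed to the sink are lists of size one. If the goal is genuinely one list per 10-second interval, a non-keyed windowAll is a simpler fit (it runs at parallelism 1, consistent with env.setParallelism(1) above). A minimal sketch, assuming the same dataStream:

java

// Requires: import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
SingleOutputStreamOperator<List<String>> outputStream = dataStream
        .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(10)))
        .process(new ProcessAllWindowFunction<String, List<String>, TimeWindow>() {
            @Override
            public void process(Context context, Iterable<String> elements, Collector<List<String>> out) {
                // Gather every record of the 10-second window into one list
                List<String> batch = new ArrayList<>();
                for (String element : elements) {
                    batch.add(element);
                }
                out.collect(batch);
            }
        });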

Batch-write approach: submit with executeBatch(), and add rewriteBatchedStatements=true to the JDBC URL. With that flag the MySQL driver rewrites the batched single-row INSERTs into one multi-row INSERT (INSERT INTO t_demo VALUES (...), (...), ...), which cuts network round trips substantially.

java
package org.example.sink;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;

public class MyJdbcSinkFunction extends RichSinkFunction<List<String>> {

    private final int batchSize = 1000;  // commit after this many buffered rows
    private Connection connection = null;
    private PreparedStatement statement = null;
    private String jdbcUrl = "jdbc:mysql://localhost:3306/test?useSSL=false&rewriteBatchedStatements=true";
    private String insertSql = String.format("insert into %s values (?,?,?,?,?,?)", "t_demo");

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        connection = DriverManager.getConnection(jdbcUrl, "root", "xxxxxx");
        connection.setAutoCommit(false);

        statement = connection.prepareStatement(insertSql);
    }

    @Override
    public void invoke(List<String> value, Context context) throws Exception {
        System.out.println("dataInfo: " + value);
        int flag = 0;  // number of rows buffered but not yet committed

        for (String data : value) {
            JSONObject object = JSONObject.parseObject(data);

            // Reset per record, so a "c" (insert) does not inherit the previous record's "before"
            String before = "";
            String after = "";

            String op = String.valueOf(object.get("op"));
            if ("c".equals(op)) {
                after = String.valueOf(object.get("after"));
            } else if ("d".equals(op)) {
                before = String.valueOf(object.get("before"));
            } else if ("u".equals(op)) {
                before = String.valueOf(object.get("before"));
                after = String.valueOf(object.get("after"));
            }

            JSONObject sourceObj = JSONObject.parseObject(object.get("source").toString());
            String dbname = String.valueOf(sourceObj.get("db"));
            String tablename = String.valueOf(sourceObj.get("table"));
            String ts_ms = String.valueOf(sourceObj.get("ts_ms"));

            statement.setString(1, before);
            statement.setString(2, after);
            statement.setString(3, op);
            statement.setString(4, dbname);
            statement.setString(5, tablename);
            statement.setString(6, ts_ms);
            statement.addBatch();

            flag = flag + 1;
            if (flag == batchSize) {  // full batch: flush and commit
                statement.executeBatch();
                connection.commit();
                statement.clearBatch();
                flag = 0;  // reset after committing the batch
            }
        }
        if (flag > 0) {  // commit the final partial batch
            statement.executeBatch();
            connection.commit();
            statement.clearBatch();
        }
    }

    @Override
    public void close() throws Exception {
        super.close();
        if (statement != null) {
            statement.close();
        }
        if (connection != null) {
            connection.close();
        }
    }
}
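
The sink writes six positional columns, so the target table t_demo must match that layout. A minimal sketch of a compatible DDL, created over plain JDBC (column names and types here are assumptions, not from the original post; `before` is a MySQL reserved word, hence the `_val` suffix; assumes the usual java.sql imports):

java

try (Connection conn = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/test?useSSL=false", "root", "xxxxxx");
     Statement st = conn.createStatement()) {
    // Six columns, positionally matching "insert into t_demo values (?,?,?,?,?,?)"
    st.execute("CREATE TABLE IF NOT EXISTS t_demo ("
            + "before_val TEXT, "
            + "after_val  TEXT, "
            + "op         VARCHAR(8), "
            + "dbname     VARCHAR(64), "
            + "tablename  VARCHAR(64), "
            + "ts_ms      VARCHAR(20))");
}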

3. Results in the Target Table
