详解 Flink Table API 和 Flink SQL 之时间特性

一、介绍

Table API 和 SQL 进行基于时间的操作（比如时间窗口）时需要定义相关的时间语义和时间数据来源的信息。因此会给表单独提供一个逻辑上的时间字段，专门用来在表处理程序中指示时间
时间属性（time attributes），其实就是每个表模式结构（schema）的一部分。它可以在创建表的 DDL 里直接定义为一个字段，也可以在 DataStream 转换成表时定义。一旦定义了时间属性，就可以作为一个普通字段引用，并且可以在基于时间的操作中使用
时间属性的数据类型为 TIMESTAMP，类似于常规时间戳，可以直接访问并且进行计算。
按照时间语义的不同，可以把时间属性的定义分成事件时间（event time）和处理时间（processing time）

二、处理时间定义

java 复制代码

/**
	处理时间既不需要提取时间戳，也不需要生成 watermark
*/
public class TestTableProcessingTime {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        
        /*
        	方式一：在 DataStream 转化时直接指定
        	注意：
        		1.使用 .proctime，定义处理时间字段
        		2.proctime 属性只能通过附加逻辑字段，来扩展物理 schema。因此，只能在 schema 定义的末尾定义
        */
        DataStream<String> inputStream = env.readTextFile("./sensor.txt");
        DataStream<SensorReading> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        });
        
        Table sensorTable = tableEnv.fromDataStream(dataStream, "id, timestamp as ts, temperature, pt.proctime");
        
        /*
        	方式二：在定义 Table Schema时指定
        */
        tableEnv.connect(new FileSystem().path("./sensor.txt")) 
            .withFormat(new Csv()) 
            .withSchema(
            	new Schema() 
            	.field("id", DataTypes.STRING()) 
            	.field("timestamp", DataTypes.BIGINT())
            	.field("temperature", DataTypes.DOUBLE())
            	.field("pt", DataTypes.TIMESTAMP(3))
				.proctime()
             )
             .createTemporaryTable("sensorTable");
        
        /*
        	方式三：在创建表的 DDL 中指定
        	注意：运行这个 DDL，必须使用 Blink Planner 版本依赖
        */
        String sinkDDL= "create table sensorTable (" +
                        " id varchar(20) not null, " +
                        " ts bigint, " +
            			" temperature double, " +
            			" pt AS PROCTIME() " +
                        ") with (" +
                        " 'connector.type' = 'filesystem', " +
                        " 'connector.path' = '/sensor.txt', " +
                        " 'format.type' = 'csv')";

        tableEnv.sqlUpdate(sinkDDL);
        
        env.execute();
    }
}

三、事件时间定义

java 复制代码

/**
	事件时间定义需要从事件数据中，提取时间戳，并设置用于推进事件时间的进展的 watermark
*/
public class TestTableEventTime {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        //开启事件时间语义
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        
        /*
        	方式一：在 DataStream 转化时直接指定
        	注意：
        		1.首先必须在转换的数据流中分配时间戳和 watermark
        		2.使用 .rowtime，定义事件时间字段
        		3.事件时间字段既可以作为新字段追加到 schema，也可以用现有字段替换
        */
        DataStream<String> inputStream = env.readTextFile("./sensor.txt");
        DataStream<SensorReading> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorReading(fields[0], new Long(fields[1]), new Double(fields[2]));
        })
            //提取事件时间戳和设置watermark
            .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<SensorReading>(Time.seconds(2)) {
            @Override
            public long extractTimestamp(SensorReading element) {
                return element.getTimestamp() * 1000L;
            }
        });
        
        //Table sensorTable = tableEnv.fromDataStream(dataStream, "id, timestamp.rowtime as ts, temperature"); //将现有字段替换为事件时间字段
        Table sensorTable = tableEnv.fromDataStream(dataStream, "id, timestamp as ts, temperature, rt.rowtime"); //将事件时间字段作为新字段追加
        
        /*
        	方式二：在定义 Table Schema时指定
        */
        tableEnv.connect(new FileSystem().path("./sensor.txt")) 
            .withFormat(new Csv()) 
            .withSchema(
            	new Schema() 
            	.field("id", DataTypes.STRING()) 
            	.field("timestamp", DataTypes.BIGINT())
            	.rowtime( //在事件时间戳字段后调用 rowtime 方法
                	new RowTime()
                    .timestampsFromField("timestamp")  // 从字段中提取事件时间戳
					.watermarksPeriodicBounded(1000)   // 设置watermark 延迟 1 秒
                )
            	.field("temperature", DataTypes.DOUBLE())
             )
             .createTemporaryTable("sensorTable");
        
        /*
        	方式三：在创建表的 DDL 中指定
        	注意：运行这个 DDL，必须使用 Blink Planner 版本依赖
        	说明：FROM_UNIXTIME 是系统内置的时间函数，用来将一个整数（秒数）转换成 "YYYY-MM-DD hh:mm:ss"格式（默认，也可以作为第二个 String 参数传入）的日期时间字符串（date time string）；然后再用 TO_TIMESTAMP 将其转换成 Timestamp
        */
        String sinkDDL = "create table dataTable (" +
                        " id varchar(20) not null, " + 
                        " ts bigint, " +
                        " temperature double, " +
                        " rt AS TO_TIMESTAMP( FROM_UNIXTIME(ts) ), " +
                        " watermark for rt as rt - interval '1' second" +
                        ") with (" +
                        " 'connector.type' = 'filesystem', " +
                        " 'connector.path' = '/sensor.txt', " +
                        " 'format.type' = 'csv')";

        tableEnv.sqlUpdate(sinkDDL);
        
        env.execute();
    }
}