Flink中自定义序列化器

Flink中有自己的序列化器和Kryo序列化器，当类型不满足Flink对类型定义的要求时，就会回退到使用Kryo序列化器，而通常使用Kryo序列化器比使用Flink的序列化器性能要低很多。

当然，Flink也支持在回退到Kryo的时候，针对自己的类型注册自定义的序列化器，文档见：https://nightlies.apache.org/flink/flink-docs-release-2.1/zh/docs/dev/datastream/fault-tolerance/serialization/third_party_serializers/

这里要求实现Kryo的Serializer类，类全路径为：com.esotericsoftware.kryo.Serializer，下面给出一个实现的例子：

java 复制代码

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.Serializer;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import org.apache.fory.Fory;
import org.apache.fory.ThreadSafeFory;
import org.apache.fory.config.Language;

/**
 * Kryo {@link Serializer} that hands the actual encoding over to Apache Fory.
 *
 * <p>Each value is written as a 4-byte length followed by the bytes produced by
 * {@link ThreadSafeFory#serialize(Object)}; reading reverses exactly that layout.
 *
 * @param <T> the type handled by this serializer
 */
public class ForySerializer<T> extends Serializer<T> {

	/**
	 * Shared Fory instance: Java language mode, reference tracking disabled,
	 * class registration not required.
	 */
	public static final ThreadSafeFory fory = Fory.builder()
			.withLanguage(Language.JAVA)
			.withRefTracking(false)
			.requireClassRegistration(false)
			.buildThreadSafeFory();

	/**
	 * Encodes {@code object} with Fory and writes it length-prefixed to {@code output}.
	 */
	@Override
	public void write(Kryo kryo, Output output, T object) {
		final byte[] payload = fory.serialize(object);
		output.writeInt(payload.length);
		output.writeBytes(payload);
	}

	/**
	 * Reads one length-prefixed payload from {@code input} and decodes it with Fory.
	 */
	@Override
	@SuppressWarnings("unchecked") // the payload was produced by write() for a value of type T
	public T read(Kryo kryo, Input input, Class<? extends T> type) {
		final int payloadSize = input.readInt();
		final Object decoded = fory.deserialize(input.readBytes(payloadSize));
		return (T) decoded;
	}

	/** Reports the handled values as mutable. */
	@Override
	public boolean isImmutable() {
		return false;
	}
}

虽然Flink提供了以上的方式来自定义序列化器，但是当遇到以下这种情况的时候，用上面提到的方式，却是不会生效的：

java 复制代码

/**
 * Sample record type used throughout the article: a status string plus a nested
 * key/value pair whose value is declared as plain {@link Object}.
 *
 * <p>Accessors and constructors are generated by Lombok.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class CustomDataType {

    /** Status label of the record. */
    private String status;

    /** Nested key/value payload. */
    private RawValue rawValue;

    /**
     * Key plus an untyped value.
     */
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class RawValue {

        /** Key of the entry. */
        private String key;

        /** Value of the entry; declared as {@code Object}, so its concrete type is not known statically. */
        private Object value;
    }
}

接着我们翻阅官方文档发现，可以通过定义TypeInfoFactory的方式，让无法使用Flink序列化器的类型走我们自定义的类型信息和序列化器：

https://nightlies.apache.org/flink/flink-docs-release-2.1/zh/docs/dev/datastream/fault-tolerance/serialization/types_serialization/#defining-type-information-using-a-factory

以下是TypeInfoFactory方式的具体实现：

java 复制代码

/**
 * {@link TypeInfoFactory} for {@link CustomDataType}; always hands out a
 * {@link CustomTypeInformation}.
 */
public class CustomTypeInfoFactory extends TypeInfoFactory<CustomDataType> {

    /**
     * Creates the type information for {@link CustomDataType}.
     *
     * @param t the type being analysed (not consulted)
     * @param genericParameters type information of generic parameters (not consulted)
     * @return a new {@link CustomTypeInformation}
     */
    @Override
    public TypeInformation<CustomDataType> createTypeInfo(
            Type t, Map<String, TypeInformation<?>> genericParameters) {
        final TypeInformation<CustomDataType> typeInfo = new CustomTypeInformation();
        return typeInfo;
    }
}

java 复制代码

/**
 * {@link TypeInformation} for {@link CustomDataType} that exposes the type as a single
 * opaque field and serializes it with {@link CustomTypeSerializer}.
 *
 * <p>All instances are interchangeable: {@link #equals(Object)} and {@link #hashCode()}
 * depend only on the class.
 */
public class CustomTypeInformation extends TypeInformation<CustomDataType> {

    private static final long serialVersionUID = 1L;

    /** @return {@code false}; this is not one of the basic types */
    @Override
    public boolean isBasicType() {
        return false;
    }

    /** @return {@code false}; this is not a tuple type */
    @Override
    public boolean isTupleType() {
        return false;
    }

    /** @return {@code 1}; the type is treated as one field without nesting */
    @Override
    public int getArity() {
        return 1;
    }

    /**
     * Returns the number of logical fields of this type.
     *
     * <p>The type is handled as one opaque value, so the answer is {@code 1}, matching
     * {@link #getArity()}. Counting {@code CustomDataType.class.getFields()} would yield
     * {@code 0}, because that method only lists public fields and the class declares
     * private ones, whereas this method is required to return at least 1.
     *
     * @return {@code 1}
     */
    @Override
    public int getTotalFields() {
        return 1;
    }

    /** @return {@code CustomDataType.class} */
    @Override
    public Class<CustomDataType> getTypeClass() {
        return CustomDataType.class;
    }

    /** @return {@code false}; the type is not offered as a key */
    @Override
    public boolean isKeyType() {
        return false;
    }

    /**
     * Creates the serializer for {@link CustomDataType}.
     *
     * @param config serializer configuration (not consulted)
     * @return a new {@link CustomTypeSerializer}
     */
    @Override
    public TypeSerializer<CustomDataType> createSerializer(SerializerConfig config) {
        return new CustomTypeSerializer();
    }

    @Override
    public String toString() {
        return "CustomTypeInformation<CustomDataType>";
    }

    /** Two instances are equal whenever both are {@code CustomTypeInformation}. */
    @Override
    public boolean equals(Object obj) {
        return obj instanceof CustomTypeInformation;
    }

    /** Constant per class, consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        return CustomTypeInformation.class.hashCode();
    }

    /** @return {@code true} if {@code obj} is a {@code CustomTypeInformation} */
    @Override
    public boolean canEqual(Object obj) {
        return obj instanceof CustomTypeInformation;
    }
}

这里是最核心的定义序列化器的地方，这里使用了Apache Fory来序列化数据：

java 复制代码

public class CustomTypeSerializer extends TypeSerializer<CustomDataType> {

    private static final long serialVersionUID = 1L;

    public static final ThreadSafeFory fory = Fory.builder()
            .withLanguage(Language.JAVA)
            .withRefTracking(false)
            .requireClassRegistration(false)
            .buildThreadSafeFory();

    public CustomTypeSerializer() {
        
    }

    @Override
    public boolean isImmutableType() {
        return false;
    }

    @Override
    public TypeSerializer<CustomDataType> duplicate() {
        return new CustomTypeSerializer();
    }

    @Override
    public CustomDataType createInstance() {
        return new CustomDataType();
    }

    @Override
    public CustomDataType copy(CustomDataType from) {
        CustomDataType.RawValue rawValue = new CustomDataType.RawValue();
        rawValue.setKey(from.getRawValue().getKey());
        rawValue.setValue(from.getRawValue().getValue());
        return new CustomDataType(from.getStatus(), rawValue);
    }

    @Override
    public CustomDataType copy(CustomDataType from, CustomDataType reuse) {
        reuse.setStatus(from.getStatus());
        reuse.setRawValue(from.getRawValue());
        return reuse;
    }

    @Override
    public int getLength() {
        return -1; // 可变长度
    }

    @Override
    public void serialize(CustomDataType record, DataOutputView target) throws IOException {
        // 序列化逻辑
        byte[] bytes = fory.serialize(record);
        target.writeInt(bytes.length);
        target.write(bytes);
    }

    @Override
    public CustomDataType deserialize(DataInputView source) throws IOException {
        // 反序列化逻辑
        int length = source.readInt();
        byte[] bytes = new byte[length];
        source.read(bytes);
        return (CustomDataType) fory.deserialize(bytes);
    }

    @Override
    public CustomDataType deserialize(CustomDataType reuse, DataInputView source) throws IOException {
        return reuse;
    }

    @Override
    public void copy(DataInputView source, DataOutputView target) throws IOException {
        serialize(deserialize(source), target);
    }

    @Override
    public boolean equals(Object obj) {
        return obj instanceof CustomTypeSerializer;
    }

    @Override
    public int hashCode() {
        return CustomTypeSerializer.class.hashCode();
    }

    @Override
    public TypeSerializerSnapshot<CustomDataType> snapshotConfiguration() {
        return new CustomTypeSerializerSnapshot();
    }

    // 序列化器快照（用于状态兼容性）
    public static final class CustomTypeSerializerSnapshot extends SimpleTypeSerializerSnapshot<CustomDataType> {
        public CustomTypeSerializerSnapshot() {
            super(CustomTypeSerializer::new);
        }
    }
}

最后，在我们的数据类型CustomDataType上加上注解：@TypeInfo(CustomTypeInfoFactory.class)。

接下来提供一个例子来跑通以上的自定义序列化器：

java 复制代码

/**
 * Local demo job: a source emits one string per second, a map step turns it into a
 * {@link CustomDataType}, and a sink prints the record.
 *
 * <p>Operator chaining is disabled and parallelism is set to 2 in {@link #main(String[])}.
 */
public class SerializerDemo {

    public static void main(String[] args) throws Exception {

        Configuration configuration = new Configuration();
        configuration.set(RestOptions.PORT, 8081);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(configuration);
        env.setParallelism(2);
        env.disableOperatorChaining();

        env.addSource(new RandomEventSource())
                .map(new ToCustomDataType())
                .addSink(new PrintingSink());

        env.execute();
    }

    /** Emits {@code "<random int below 100>@<current local date-time>"} once per second until cancelled. */
    private static class RandomEventSource implements SourceFunction<String> {

        private volatile boolean flag = true;

        private Random rand = new Random();

        @Override
        public void run(SourceContext<String> ctx) throws Exception {
            while (flag) {
                Thread.sleep(1000);
                String event = String.format("%s@%s", rand.nextInt(100), LocalDateTime.now());
                ctx.collect(event);
            }
        }

        @Override
        public void cancel() {
            flag = false;
        }
    }

    /** Splits the incoming string at {@code "@"} into key and value and wraps them in a record. */
    private static class ToCustomDataType implements MapFunction<String, CustomDataType> {

        @Override
        public CustomDataType map(String value) throws Exception {
            String[] parts = value.split("@");
            CustomDataType.RawValue payload = new CustomDataType.RawValue(parts[0], parts[1]);
            return new CustomDataType("alive", payload);
        }
    }

    /** Prints every record to standard output. */
    private static class PrintingSink implements SinkFunction<CustomDataType> {

        @Override
        public void invoke(CustomDataType value) throws Exception {
            System.out.println("结果为:" + value);
        }
    }

}

最后附上我debug代码时拷贝下来的调用栈，方便理解整个调用流程：

java 复制代码

createTypeInfo:13, CustomTypeInfoFactory (com.bonree.serializers)
createTypeInfoFromFactory:1385, TypeExtractor (org.apache.flink.api.java.typeutils)
createTypeInfoFromFactory:1353, TypeExtractor (org.apache.flink.api.java.typeutils)
getTypeInfoFactory:1730, TypeExtractor (org.apache.flink.api.java.typeutils)          // 在这里去读取Pojo类上定义的@TypeInfo注解获取对应的TypeInfoFactory的
getClosestFactory:1790, TypeExtractor (org.apache.flink.api.java.typeutils)
createTypeInfoFromFactory:1340, TypeExtractor (org.apache.flink.api.java.typeutils)
createTypeInfoWithTypeHierarchy:882, TypeExtractor (org.apache.flink.api.java.typeutils)
privateCreateTypeInfo:861, TypeExtractor (org.apache.flink.api.java.typeutils)
getUnaryOperatorReturnType:608, TypeExtractor (org.apache.flink.api.java.typeutils)
getMapReturnTypes:184, TypeExtractor (org.apache.flink.api.java.typeutils)
map:425, DataStream (org.apache.flink.streaming.api.datastream)
main:41, SerializerDemo (com.bonree.serializers)