边学边做,法力无边
MNN
阿里开源的推理引擎,能部署到设备端,执行各种模型任务。其定义了多种执行后端,CPU,opencl,vulkan 等等。
多的就不介绍了。仅看vulkan的实现来做参考。比如实现一个Grid Sample.
vulkan compute
vulkan有多种stage支持,其中compute stage可以做gpu的计算加速。为了不掉入一个巨大引擎的框架,花费太多时间梳理框架的代码,直接简化逻辑,写一段通用代码来初始化vulkan使用它。
arduino
// 务必包含这两个头文件, libdl 对于arm等移动端是必须的,否则link时候总是报错。
#include <dlfcn.h>
#include <vulkan/vulkan.h>
vulkan lib
接下来初始化vulkan 的库。对于libdl方式,需要用dlsym 逐个加载我们需要的vulkan api:先用一个宏包裹这些API的列表,然后再用dlopen、dlsym的方式加载vulkan。
scss
// X-macro list of every Vulkan entry point resolved via dlsym at startup;
// PFN(name) is (re)defined at each expansion site.
// BUGFIX: added the image-related entry points (vkCreateImage, vkBindImageMemory,
// vkCreateImageView, vkCreateSampler, vkGetImageMemoryRequirements,
// vkGetImageSubresourceLayout and their destroy counterparts) that the
// image code later in this file calls through OP(...) but which were missing here.
#define VK_FUNCTION_LIST \
PFN(vkEnumerateInstanceVersion) \
PFN(vkEnumerateInstanceLayerProperties) \
PFN(vkCreateInstance) \
PFN(vkEnumerateInstanceExtensionProperties) \
PFN(vkGetInstanceProcAddr) \
PFN(vkMapMemory) \
PFN(vkUnmapMemory) \
PFN(vkGetBufferMemoryRequirements) \
PFN(vkGetPhysicalDeviceMemoryProperties) \
PFN(vkAllocateMemory) \
PFN(vkAllocateCommandBuffers) \
PFN(vkBindBufferMemory) \
PFN(vkCmdBindPipeline) \
PFN(vkCmdDispatch) \
PFN(vkCmdWriteTimestamp) \
PFN(vkCmdBindDescriptorSets) \
PFN(vkCmdResetQueryPool) \
PFN(vkBeginCommandBuffer) \
PFN(vkEndCommandBuffer) \
PFN(vkQueueSubmit) \
PFN(vkQueueWaitIdle) \
PFN(vkCreateBuffer) \
PFN(vkCreateQueryPool) \
PFN(vkCreateDescriptorPool) \
PFN(vkAllocateDescriptorSets) \
PFN(vkUpdateDescriptorSets) \
PFN(vkCreateCommandPool) \
PFN(vkCreateComputePipelines) \
PFN(vkCreateDevice) \
PFN(vkGetDeviceQueue) \
PFN(vkCreateDescriptorSetLayout) \
PFN(vkCreatePipelineLayout) \
PFN(vkDestroyBuffer) \
PFN(vkDestroyQueryPool) \
PFN(vkDestroyDescriptorPool) \
PFN(vkDestroyPipeline) \
PFN(vkDestroyPipelineLayout) \
PFN(vkDestroyDescriptorSetLayout) \
PFN(vkDestroyDevice) \
PFN(vkDestroyInstance) \
PFN(vkGetQueryPoolResults) \
PFN(vkCreateShaderModule) \
PFN(vkDestroyShaderModule) \
PFN(vkDestroyCommandPool) \
PFN(vkFreeMemory) \
PFN(vkGetPhysicalDeviceQueueFamilyProperties) \
PFN(vkGetPhysicalDeviceProperties2) \
PFN(vkEnumeratePhysicalDevices) \
PFN(vkEnumerateDeviceExtensionProperties) \
PFN(vkResetCommandBuffer) \
PFN(vkFreeCommandBuffers) \
PFN(vkGetPhysicalDeviceFeatures) \
PFN(vkGetPhysicalDeviceFeatures2) \
PFN(vkBindBufferMemory2) \
PFN(vkCreateImage) \
PFN(vkDestroyImage) \
PFN(vkGetImageMemoryRequirements) \
PFN(vkGetImageSubresourceLayout) \
PFN(vkBindImageMemory) \
PFN(vkCreateImageView) \
PFN(vkDestroyImageView) \
PFN(vkCreateSampler) \
PFN(vkDestroySampler)
// Loads the Vulkan runtime with dlopen and resolves every entry point in
// VK_FUNCTION_LIST into typed function-pointer members via dlsym.
class VulkanLib {
private:
    // dlopen handle; nullptr when no Vulkan runtime could be found.
    // BUGFIX: was left uninitialized when neither __APPLE__ nor __linux__ is defined.
    void *lib = nullptr;
public:
    VulkanLib() {
#ifdef __APPLE__
        lib = dlopen("libvulkan.dylib", RTLD_LAZY | RTLD_LOCAL);
        if (!lib)
            lib = dlopen("libvulkan.1.dylib", RTLD_LAZY | RTLD_LOCAL);
        if (!lib)
            lib = dlopen("libMoltenVK.dylib", RTLD_NOW | RTLD_LOCAL);
        if (!lib && getenv("DYLD_FALLBACK_LIBRARY_PATH") == nullptr)
            lib = dlopen("/usr/local/lib/libvulkan.dylib", RTLD_NOW | RTLD_LOCAL);
#elif defined __linux__
        lib = dlopen("libvulkan.so.1", RTLD_LAZY | RTLD_LOCAL);
        if (!lib)
            lib = dlopen("libvulkan.so", RTLD_LAZY | RTLD_LOCAL);
#endif
        if (!lib) {
            std::cerr << "Failed to load vulkan library ," << dlerror() << std::endl;
            return ;
        }
        // Resolve every listed entry point into the members generated below.
#define PFN(name) name = reinterpret_cast<PFN_##name>(dlsym(lib, #name));
        VK_FUNCTION_LIST
#undef PFN
    }
    ~VulkanLib() {
        // BUGFIX: only close a handle we actually own; dlclose(nullptr) is
        // undefined per POSIX and the ctor can leave lib null.
        if (lib)
            dlclose(lib);
    }
    // Non-copyable: the class owns the dlopen handle.
    VulkanLib(const VulkanLib &) = delete;
    VulkanLib &operator=(const VulkanLib &) = delete;
    // Look up an arbitrary symbol from the loaded library by name.
    // BUGFIX: previously read from a std::map that was never filled, so every
    // call threw std::out_of_range; resolve through dlsym instead.
    // Returns nullptr when the library is not loaded or the symbol is absent.
    void *getSymbol(const char *name) {
        return lib ? dlsym(lib, name) : nullptr;
    }
    // One typed function-pointer member per entry in VK_FUNCTION_LIST,
    // null-initialized so a failed load leaves them safely nullptr.
#define PFN(name) PFN_##name name = nullptr;
    VK_FUNCTION_LIST
#undef PFN
};
上面的类实现了对vulkan库的简单初始化,可以定义一个全局变量来保证vulkan api全局可以访问的到。
arduino
// Global loader instance: constructed once at startup, so the resolved Vulkan
// entry points are reachable from anywhere. OP(x) expands to vklib.x, the
// dlsym-resolved function pointer member.
VulkanLib vklib;
#define OP(name) vklib.name
vulkan instance
接下来实现的是操作 vulkan instance, 没有把validation layer支持也加进来,仅看关键大概100行代码,注意VK_KHR_portability_enumeration支持,对于macos跑vulkan 1.4的时候是必须的。
ini
class VulkanInstance {
private:
VkInstance instance;
void checkInstanceExtension()
{
uint32_t pPropertyCount;
OP(vkEnumerateInstanceExtensionProperties)(nullptr, &pPropertyCount, nullptr);
ext_properties.resize(pPropertyCount);
OP(vkEnumerateInstanceExtensionProperties)(nullptr, &pPropertyCount, ext_properties.data());
}
bool checkInstanceExtensionFeature(const char *name)
{
for (auto ext : this->ext_properties) {
if (std::string(ext.extensionName).compare(name) == 0) {
return true;
}
}
return false;
}
VkInstance OpCreateInstance(std::vector<const char *> &enabledLayerNames) {
uint32_t version = getVulkanVersion();
VkApplicationInfo applicationInfo = {};
applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
applicationInfo.pApplicationName = "Vulkan Compute Shader Benchmark";
// SPV_KHR_vulkan_memory_model, use_vulkan_memory_model in spirv requires 1.2.0
applicationInfo.apiVersion = version; // use the libvulkan version directly
applicationInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
applicationInfo.pEngineName = "Vulkan bench";
applicationInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
VkInstanceCreateInfo instanceCreateInfo = {};
instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
instanceCreateInfo.pApplicationInfo = &applicationInfo;
instanceCreateInfo.enabledLayerCount = static_cast<uint32_t>(enabledLayerNames.size());
instanceCreateInfo.ppEnabledLayerNames = enabledLayerNames.data();
#if VK_KHR_portability_enumeration
if (checkInstanceExtensionFeature(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME)) {
enabledExtensionNames.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME);
instanceCreateInfo.flags = VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
}
#endif
if (enabledExtensionNames.size() > 0) {
instanceCreateInfo.enabledExtensionCount = static_cast<uint32_t>(enabledExtensionNames.size());
instanceCreateInfo.ppEnabledExtensionNames = enabledExtensionNames.data();
}
VkInstance instance = VK_NULL_HANDLE;
VkResult error = OP(vkCreateInstance)(&instanceCreateInfo, nullptr, &instance);
if (error != VK_SUCCESS) {
std::cout << "Fail to create instance " << error << std::endl;
if (error == VK_ERROR_LAYER_NOT_PRESENT) {
std::cout << "VK_ERROR_LAYER_NOT_PRESENT" << std::endl;
} else if (error == VK_ERROR_INCOMPATIBLE_DRIVER) {
std::cout << "VK_ERROR_INCOMPATIBLE_DRIVER" << std::endl;
}
return nullptr;
}
return instance;
}
std::vector<const char *> enabledExtensionNames;
std::vector<VkExtensionProperties> ext_properties;
public:
VulkanInstance() {
std::vector<const char *> enabledLayerNames;
checkInstanceExtension();
instance = OpCreateInstance(enabledLayerNames);
if (!instance) {
throw std::runtime_error("Failed to create Vulkan instance.");
}
}
~VulkanInstance() {
OP(vkDestroyInstance)(instance, nullptr);
}
std::vector<std::pair<VkPhysicalDevice, uint32_t>> getDeviceAndQeueue(void) {
std::vector<std::pair<VkPhysicalDevice, uint32_t>> ret;
uint32_t count;
VkResult error = OP(vkEnumeratePhysicalDevices)(instance, &count, nullptr);
if (error != VK_SUCCESS) {
return ret;
}
std::vector<VkPhysicalDevice> physicalDevices(count);
error = OP(vkEnumeratePhysicalDevices)(instance, &count, physicalDevices.data());
if (error != VK_SUCCESS) {
return ret;
}
std::cout << "Found " << count << " physical devices." << std::endl;
for (auto device : physicalDevices) {
OP(vkGetPhysicalDeviceQueueFamilyProperties)(device, &count, nullptr);
std::vector<VkQueueFamilyProperties> queueFamilyProperties(count);
OP(vkGetPhysicalDeviceQueueFamilyProperties)(device, &count,
queueFamilyProperties.data());
uint32_t index = 0;
for (auto &properties : queueFamilyProperties) {
if (properties.queueFlags & VK_QUEUE_COMPUTE_BIT) {
ret.push_back({device, index});
break;
}
index++;
}
}
return ret;
}
};
vulkan device
接下来是操作device,注意的是push各种device extension 支持,这里涉及不同设备支持,但其实不加也能跑,不过如果是validation layer 开启后,会产生很多告警。
ini
class ComputeDevice {
private:
std::vector<VkExtensionProperties> ext_properties;
VkPhysicalDeviceProperties deviceProperties;
void checkDeviceDataTypeFeatures(void)
{
VkPhysicalDeviceFeatures deviceFeatures = {};
OP(vkGetPhysicalDeviceFeatures)(physicalDevice, &deviceFeatures);
this->features |= (deviceFeatures.shaderInt64 ? FEATURE_INT64 : 0);
this->features |= (deviceFeatures.shaderFloat64 ? FEATURE_FP64 : 0);
this->features |= FEATURE_FP32 | FEATURE_INT32;
this->features |= (deviceFeatures.shaderInt16 ? FEATURE_INT16 : 0);
VkPhysicalDeviceShaderFloat16Int8Features float16Int8Features = {};
float16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR;
VkPhysicalDeviceFeatures2 features2 = {};
features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
features2.pNext = &float16Int8Features;
OP(vkGetPhysicalDeviceFeatures2)(physicalDevice, &features2);
this->features |= (float16Int8Features.shaderFloat16 ? FEATURE_FP16 : 0);
this->features |= (float16Int8Features.shaderInt8 ? FEATURE_INT8 : 0);
}
void checkDeviceExtension(void)
{
uint32_t extensionCount = 0;
OP(vkEnumerateDeviceExtensionProperties)(physicalDevice, NULL, &extensionCount, NULL);
this->ext_properties.resize(extensionCount);
OP(vkEnumerateDeviceExtensionProperties)(physicalDevice, NULL, &extensionCount, this->ext_properties.data());
// std::cout << "Device Extensions:" << std::endl;
// for (uint32_t i = 0; i < extensionCount; i++) {
// std::cout << ext_properties[i].extensionName << ": " << ext_properties[i].specVersion << std::endl;
// }
}
void getDeviceTimeLimits(void)
{
VkPhysicalDeviceSubgroupProperties subgroup_properties = {};
subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
subgroup_properties.pNext = nullptr;
VkPhysicalDeviceProperties2 properties2 = {};
properties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
properties2.pNext = &subgroup_properties;
OP(vkGetPhysicalDeviceProperties2)(physicalDevice, &properties2);
deviceProperties = properties2.properties;
this->timestampPeriod = deviceProperties.limits.timestampPeriod;
std::cout << "GPU " << deviceProperties.deviceName << std::endl;
}
VkResult createDevice(void)
{
std::vector<uintptr_t> enabledFeatures;
std::vector<const char *> enabledExtensions;
VkPhysicalDeviceFeatures features = {};
features.robustBufferAccess = VK_TRUE;
if (this->features & FEATURE_INT64)
features.shaderInt64 = VK_TRUE;
if (this->features & FEATURE_FP64)
features.shaderFloat64 = VK_TRUE;
if (this->features & FEATURE_INT16)
features.shaderInt16 = VK_TRUE;
VkPhysicalDeviceFloat16Int8FeaturesKHR float16Int8Features = {};
float16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
VkPhysicalDevice8BitStorageFeatures storage8bitFeatures = {};
storage8bitFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES;
storage8bitFeatures.uniformAndStorageBuffer8BitAccess = VK_TRUE;
storage8bitFeatures.storageBuffer8BitAccess = VK_TRUE;
#ifdef VK_KHR_16bit_storage
VkPhysicalDevice16BitStorageFeatures storage16bitFeatures = {};
storage16bitFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES;
storage16bitFeatures.uniformAndStorageBuffer16BitAccess = VK_TRUE;
storage16bitFeatures.storageBuffer16BitAccess = VK_TRUE;
storage16bitFeatures.storageInputOutput16 = VK_TRUE;
#endif
if (this->features & FEATURE_INT8) {
float16Int8Features.shaderInt8 = VK_TRUE;
if (checkDeviceExtensionFeature(VK_KHR_8BIT_STORAGE_EXTENSION_NAME)) {
enabledExtensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
enabledFeatures.push_back(reinterpret_cast<uintptr_t>(&storage8bitFeatures));
}
}
if (this->features & FEATURE_FP16) {
float16Int8Features.shaderFloat16 = VK_TRUE;
if (checkDeviceExtensionFeature(VK_KHR_16BIT_STORAGE_EXTENSION_NAME)) {
enabledExtensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
if (deviceProperties.vendorID != 4318) {
// tested on Nvidia A2000, it supports 16bit storage feature but did not need to enable it
// enable it will cause validation error VK_ERROR_FEATURE_NOT_PRESENT
enabledFeatures.push_back(reinterpret_cast<uintptr_t>(&storage16bitFeatures));
}
}
#ifdef VK_AMD_gpu_shader_half_float
if (deviceProperties.vendorID == 4098) {
// for AMD card, do we really need this ? over VK_KHR_shader_float16_int8
if (checkDeviceExtensionFeature(VK_AMD_GPU_SHADER_HALF_FLOAT_EXTENSION_NAME)) {
enabledExtensions.push_back(VK_AMD_GPU_SHADER_HALF_FLOAT_EXTENSION_NAME);
}
}
#endif
}
if (this->features & (FEATURE_INT8 |FEATURE_FP16)) {
if (checkDeviceExtensionFeature(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME)) {
enabledFeatures.push_back(reinterpret_cast<uintptr_t>(&float16Int8Features));
enabledExtensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
}
}
#ifdef VK_KHR_bind_memory2
if (checkDeviceExtensionFeature(VK_KHR_BIND_MEMORY_2_EXTENSION_NAME)) {
enabledExtensions.push_back(VK_KHR_BIND_MEMORY_2_EXTENSION_NAME);
}
#endif
struct GeneralFeature {
VkStructureType sType;
void* pNext;
};
void* pFirst = nullptr;
if (enabledFeatures.size() > 0) {
pFirst = reinterpret_cast<void *>(enabledFeatures[0]);
struct GeneralFeature* ptr = reinterpret_cast<struct GeneralFeature*>(pFirst);
for (size_t i = 1; i < enabledFeatures.size(); i++) {
struct GeneralFeature* feat = reinterpret_cast<struct GeneralFeature*>(enabledFeatures[i]);
ptr->pNext = feat;
ptr = feat;
}
}
VkDeviceQueueCreateInfo queueCreateInfo = {};
queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
queueCreateInfo.queueCount = 1;
queueCreateInfo.queueFamilyIndex = queueFamilyIndex;
float queuePriority = 1.0f; // specifies if this queue gets preference
queueCreateInfo.pQueuePriorities = &queuePriority;
VkDeviceCreateInfo deviceCreateInfo = {};
deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
deviceCreateInfo.queueCreateInfoCount = 1;
deviceCreateInfo.pQueueCreateInfos = &queueCreateInfo;
deviceCreateInfo.enabledLayerCount = 0;
deviceCreateInfo.ppEnabledLayerNames = nullptr;
deviceCreateInfo.pEnabledFeatures = &features;
deviceCreateInfo.enabledExtensionCount = static_cast<uint32_t>(enabledExtensions.size());
deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
deviceCreateInfo.pNext = pFirst;
return OP(vkCreateDevice)(this->physicalDevice, &deviceCreateInfo, nullptr, &this->device);
}
void getDeviceQueue(void)
{
OP(vkGetDeviceQueue)(device, queueFamilyIndex, 0, &this->queue);
}
public:
ComputeDevice(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex):
physicalDevice(physicalDevice), queueFamilyIndex(queueFamilyIndex) {
checkDeviceDataTypeFeatures();
checkDeviceExtension();
getDeviceTimeLimits();
VkResult err = createDevice();
if (err != VK_SUCCESS) {
std::map<int, std::string> errstrings;
errstrings[VK_ERROR_OUT_OF_HOST_MEMORY] = "VK_ERROR_OUT_OF_HOST_MEMORY";
errstrings[VK_ERROR_OUT_OF_DEVICE_MEMORY] = "VK_ERROR_OUT_OF_DEVICE_MEMORY";
errstrings[VK_ERROR_INITIALIZATION_FAILED] = "VK_ERROR_INITIALIZATION_FAILED";
errstrings[VK_ERROR_DEVICE_LOST] = "VK_ERROR_DEVICE_LOST";
errstrings[VK_ERROR_EXTENSION_NOT_PRESENT] = "VK_ERROR_EXTENSION_NOT_PRESENT";
errstrings[VK_ERROR_FEATURE_NOT_PRESENT] = "VK_ERROR_FEATURE_NOT_PRESENT";
errstrings[VK_ERROR_TOO_MANY_OBJECTS] = "VK_ERROR_TOO_MANY_OBJECTS";
throw std::runtime_error("Failed to create device " + errstrings[err]);
}
getDeviceQueue();
#if VK_KHR_shader_integer_dot_product
if (this->features & FEATURE_DOT)
check_shader_integer_dot_product_support();
#endif
};
~ComputeDevice() {
OP(vkDestroyDevice)(device, nullptr);
};
VkDevice device;
VkPhysicalDevice physicalDevice;
uint32_t queueFamilyIndex;
VkQueue queue;
float timestampPeriod;
uint32_t features;
bool checkDeviceExtensionFeature(const char *name)
{
for (auto ext : this->ext_properties) {
if (std::string(ext.extensionName).compare(name) == 0) {
return true;
}
}
return false;
}
std::string getDeviceName(void)
{
return std::string(deviceProperties.deviceName);
}
};
vulkan shader
设备已经创建,如何加载执行一个 shader呢。
shader就是glsl语言编写的gpu内核。利用shaderc来编译为spirv,然后xxd -n 指定数组名称,转化为一个h文件,include后,直接访问这个数组名称,就能直接load spirv的代码了。
接下来就是创建pipeline需要的各种资源,分别调用以下接口创建,代码略
vkCreateDescriptorSetLayout
vkCreatePipelineLayout
vkCreateShaderModule
vkCreateComputePipelines
vkBindBufferMemory
vkCreateDescriptorPool
vkAllocateDescriptorSets
vkUpdateDescriptorSets
vkCreateCommandPool
vulkan buffer
接下来就是最重要的两个object,一个是buffer,一个是image,对比的介绍也有一些,我看先用用看。 对于buffer来说,重要的是两个flag,一个是bufferflag,一个是memoryflag。
bufferflag主要是定义在shader中的用途,比如VK_BUFFER_USAGE_STORAGE_BUFFER_BIT表示通用存储。
memoryflag定义了内存本身是否可以被cpu读,cpu写等特性。
ini
class ComputeBuffer {
private:
int32_t findMemoryTypeFromProperties(uint32_t memoryTypeBits,
VkPhysicalDeviceMemoryProperties properties,
VkMemoryPropertyFlags requiredProperties)
{
for (uint32_t index = 0; index < properties.memoryTypeCount; ++index) {
if (((memoryTypeBits & (1 << index))) &&
((properties.memoryTypes[index].propertyFlags & requiredProperties) ==
requiredProperties)) {
return (int32_t)index;
}
}
return -1;
}
VkResult __OpCreateBuffer(int bufferflags, int memoryflags, int num_element, size_t element_size)
{
uint32_t queueFamilyIndex = computedevice->queueFamilyIndex;
VkDevice device = computedevice->device;
VkDeviceMemory memory;
VkBuffer buffer;
VkResult error;
// create the buffers which will hold the data to be consumed by shader
VkBufferCreateInfo bufferCreateInfo = {};
bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
bufferCreateInfo.size = element_size * num_element;
bufferCreateInfo.usage = bufferflags;
bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
bufferCreateInfo.queueFamilyIndexCount = 1;
bufferCreateInfo.pQueueFamilyIndices = &queueFamilyIndex;
error = OP(vkCreateBuffer)(device, &bufferCreateInfo, nullptr, &buffer);
if (error) {
std::cout << "failed to create buffer!" << std::endl;
return error;
}
this->buffer = buffer;
VkMemoryRequirements memoryRequirements;
OP(vkGetBufferMemoryRequirements)(device, buffer, &memoryRequirements);
VkPhysicalDeviceMemoryProperties memoryProperties;
OP(vkGetPhysicalDeviceMemoryProperties)(computedevice->physicalDevice, &memoryProperties);
auto memoryTypeIndex = findMemoryTypeFromProperties(
memoryRequirements.memoryTypeBits, memoryProperties,
memoryflags);
if (0 > memoryTypeIndex) {
std::cout << "failed to find compatible memory type" << std::endl;
return VK_ERROR_UNKNOWN;
}
VkMemoryAllocateInfo allocateInfo = {};
allocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocateInfo.allocationSize = memoryRequirements.size;
allocateInfo.memoryTypeIndex = memoryTypeIndex;
error = OP(vkAllocateMemory)(device, &allocateInfo, nullptr, &memory);
if (error) {
std::cout << "failed to allocate memory!" << std::endl;
return error;
}
this->memory = memory;
return VK_SUCCESS;
}
std::shared_ptr<ComputeDevice> computedevice;
VkBuffer buffer;
VkDeviceMemory memory;
public:
ComputeBuffer(std::shared_ptr<ComputeDevice> computedevice, int bufferflags, int memoryflags, int num_element, size_t element_size):
computedevice(computedevice) {
VkResult error = __OpCreateBuffer(bufferflags, memoryflags, num_element, element_size);
if (error) {
throw std::runtime_error("failed to create buffer1");
}
};
~ComputeBuffer() {
if (buffer)
OP(vkDestroyBuffer)(computedevice->device, buffer, nullptr);
if (memory)
OP(vkFreeMemory)(computedevice->device, memory, nullptr);
};
VkDeviceMemory getMemory() {
return memory;
};
VkBuffer getBuffer() {
return buffer;
};
void *getMemoryPtr(size_t size) {
void *ptr;
OP(vkMapMemory)(computedevice->device, memory, 0, size, 0, &ptr);
return ptr;
};
void unmapMemory() {
OP(vkUnmapMemory)(computedevice->device, memory);
}
};
vulkan image
类似vulkan buffer,不过buffer本身理解为连续内存,但是image是可以针对gpu内存做优化的,访问方式不一定是连续内存的。当然如果创建时候定义的是LINEAR layout则本质上也是buffer。
大部分操作类似vulkan buffer 省略,仅列一个创建部分。
vulkan在创建image对象的同时需要多两个对象,imageview,sampler。
sampler可以多个图片共用一个,在MNN中就是用全局的sampler。
ini
// Create a 2D image with backing memory, plus the image view and sampler that
// the compute pipeline binds. `usage` are VkImageUsageFlags, `memoryflags`
// VkMemoryPropertyFlags. On success this->image / this->memory /
// this->imageView / this->sampler are all populated.
VkResult __OpCreateImage(int usage, int memoryflags, int height, int width, VkFormat format)
{
    std::cout << "Creating image w: " << width << ", h:" << height << std::endl;
    VkResult error = VK_SUCCESS;
    VkDevice device = computedevice->device;
    uint32_t queueFamilyIndex = computedevice->queueFamilyIndex;
    VkImage image = VK_NULL_HANDLE;
    VkDeviceMemory memory = VK_NULL_HANDLE;
    VkImageCreateInfo imageCreateInfo = {};
    imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
    imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
    imageCreateInfo.format = format;
    imageCreateInfo.extent.width = width;
    imageCreateInfo.extent.height = height;
    imageCreateInfo.extent.depth = 1; // 2D image, it is always 1
    imageCreateInfo.mipLevels = 1;
    imageCreateInfo.arrayLayers = 1;
    imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
    // LINEAR tiling so the host can map the memory and fill texels directly
    // (row layout is then described by vkGetImageSubresourceLayout).
    imageCreateInfo.tiling = VK_IMAGE_TILING_LINEAR;
    imageCreateInfo.usage = usage;
    imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    imageCreateInfo.flags = VK_IMAGE_CREATE_EXTENDED_USAGE_BIT;
    imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
    imageCreateInfo.queueFamilyIndexCount = 1;
    imageCreateInfo.pQueueFamilyIndices = &queueFamilyIndex;
    error = OP(vkCreateImage)(device, &imageCreateInfo, nullptr, &image);
    if (error) {
        std::cout << "failed to create image!" << std::endl;
        return error;
    }
    this->image = image;
    VkMemoryRequirements memoryRequirements;
    OP(vkGetImageMemoryRequirements)(device, image, &memoryRequirements);
    std::cout << "image memory size " << memoryRequirements.size <<" "<< memoryRequirements.memoryTypeBits << std::endl;
    VkPhysicalDeviceMemoryProperties memoryProperties;
    OP(vkGetPhysicalDeviceMemoryProperties)(computedevice->physicalDevice, &memoryProperties);
    auto memoryTypeIndex = findMemoryTypeFromProperties(
        memoryRequirements.memoryTypeBits, memoryProperties,
        memoryflags);
    if (0 > memoryTypeIndex) {
        std::cout << "failed to find compatible memory type " << memoryRequirements.memoryTypeBits << " ,flags " << memoryflags << std::endl;
        return VK_ERROR_UNKNOWN;
    }
    VkMemoryAllocateInfo allocateInfo = {};
    allocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    allocateInfo.allocationSize = memoryRequirements.size;
    allocateInfo.memoryTypeIndex = memoryTypeIndex;
    error = OP(vkAllocateMemory)(device, &allocateInfo, nullptr, &memory);
    if (error) {
        std::cout << "failed to allocate memory for image !" << std::endl;
        return error;
    }
    this->memory = memory;
    error = OP(vkBindImageMemory)(device, this->image, this->memory, 0);
    if (error) {
        std::cerr << "failed to bind image memory!" << std::endl;
        return error;
    }
    // The shader accesses the image through a view of the whole subresource.
    VkImageView view;
    VkImageViewCreateInfo info = {};
    info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    info.image = image;
    info.viewType = VK_IMAGE_VIEW_TYPE_2D;
    info.format = format;
    info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    info.subresourceRange.baseMipLevel = 0;
    info.subresourceRange.levelCount = 1;
    info.subresourceRange.baseArrayLayer = 0;
    info.subresourceRange.layerCount = 1;
    error = OP(vkCreateImageView)(device, &info, nullptr, &view);
    if (error) {
        std::cout << "failed to create image view!" << std::endl;
        return error;
    }
    this->imageView = view;
    // Nearest-neighbour, clamp-to-edge sampler with no mipmapping.
    // (MNN shares one global sampler across images; here each image owns one.)
    VkFilter filter = VK_FILTER_NEAREST;
    VkSamplerAddressMode mode = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
    VkSamplerCreateInfo samplerInfo = {};
    VkSampler sampler;
    samplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
    samplerInfo.magFilter = filter;
    samplerInfo.minFilter = filter;
    samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
    samplerInfo.addressModeU = mode;
    samplerInfo.addressModeV = mode;
    samplerInfo.addressModeW = mode;
    samplerInfo.mipLodBias = 0.0f;
    samplerInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
    samplerInfo.anisotropyEnable = VK_FALSE;
    samplerInfo.maxAnisotropy = 1.0f;
    samplerInfo.compareEnable = VK_FALSE;
    samplerInfo.minLod = 0.0f;
    samplerInfo.maxLod = 0.0f;
    // BUGFIX: the sampler creation result was previously ignored.
    error = OP(vkCreateSampler)(device, &samplerInfo, nullptr, &sampler);
    if (error) {
        std::cout << "failed to create sampler!" << std::endl;
        return error;
    }
    this->sampler = sampler;
    return VK_SUCCESS;
}
对于image的操作,如前面所说,如果是一个VK_IMAGE_TILING_LINEAR的话,是可以和buffer一样,先vkMapMemory,在cpu侧直接操作,然后再unmap就可以。
但是如果不是线性,使用VK_IMAGE_TILING_OPTIMAL,那么这个内存是无法直接操作的。 vulkan提供了接口来实现buffer与image之间互相copy的操作。 vkCmdCopyBufferToImage,vkCmdCopyImageToBuffer。
通常在非arm gpu上可以用staging buffer来实现,比如google的uVkCompute中就是如此。 但是arm mali gpu使用的是unified memory,gpu没有专用内存,端侧设备gpu、cpu是共享内存的,不建议使用staging buffer。如果是image 这种情况,没有办法避免一次copy。
在MNN中,由于考虑算子是NCHW的格式输入,所以MNN实现了ncgwToImage的shader,本质上也是一次buffer到image的转换,只不过带上了自己的格式变化。少量注释如下,要理解这个shader,首先要先看下输出的image uOutput是如何创建的。
ini
#version 450 core
layout(std430) buffer;
// Output image (RGBA32F): one texel packs four consecutive channels (NC4HW4).
layout(set=0, binding=0) writeonly uniform image2D uOutput;
// Input tensor in plain NCHW float layout.
layout(set=0, binding=1) readonly buffer sourceBuffer{
float data[];
} uInBuffer;
layout(set=0, binding=2) uniform constBuffer{
ivec4 size; // w, h, c, n
ivec4 stride; // 1, w, w * h, w * h * c (element strides, not bytes)
} uConstant;
layout(set=0, binding=3) uniform offsetBuffer {
ivec4 offset; // Offset w, h, c, n : xstart, ystart, 0, 0
ivec4 size; // w, h, 0, w*h*c (w = total number of elements to process)
} uOffset;
layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
void main()
{
int posX = ivec3(gl_GlobalInvocationID).x;
if (posX < uOffset.size.w)
{
ivec2 pos;
// Convert the flat invocation index to a 2D position in the output image
pos.x = posX % uOffset.size.x;
pos.y = posX / uOffset.size.x;
ivec2 spos = pos + uOffset.offset.xy;
// Recover (n, h, c, w) of the NCHW source from the 2D image position:
// batches stack along y, channel blocks of 4 extend along x.
int n = spos.y / uConstant.size.y;
int h = spos.y % uConstant.size.y;
int c = spos.x / uConstant.size.x;
int w = spos.x % uConstant.size.x;
// n * uConstant.stride.w : batch offset (in elements)
// 4 * c * uConstant.stride.z : channel-block offset (4 channels per texel)
// h * uConstant.stride.y : row offset
// w * uConstant.stride.x : column offset
int basicOffset = 0
+ n * uConstant.stride.w
+ 4 * c * uConstant.stride.z
+ h * uConstant.stride.y
+ w * uConstant.stride.x;
vec4 color = vec4(0);
color.r = uInBuffer.data[basicOffset+0];
// One channel plane is stride.z (= w * h) elements apart.
int imgHW = uConstant.stride.z;
// Channels beyond size.z do not exist; their RGBA lanes stay zero.
if (4 * c + 1 < uConstant.size.z) {
color.g = uInBuffer.data[basicOffset+1*imgHW];
}
if (4 * c + 2 < uConstant.size.z) {
color.b = uInBuffer.data[basicOffset+2*imgHW];
}
if (4 * c + 3 < uConstant.size.z) {
color.a = uInBuffer.data[basicOffset+3*imgHW];
}
imageStore(uOutput, pos, color);
}
}
image创建也是要指定 width、height、depth。可以看到MNN对于2D图片,depth固定是1,每个texel能装多少数据实际上取决于format。
ini
// Excerpt of MNN's VkImageCreateInfo setup: a 2D image always uses depth = 1;
// what fits per texel is determined by `format` (channel data is packed into
// the RGBA components), and MNN uses OPTIMAL tiling here.
info.imageType = imageType;
info.extent.width = width;
info.extent.height = height;
info.extent.depth = 1;
info.mipLevels = 1;
info.arrayLayers = 1;
info.format = format;
info.tiling = VK_IMAGE_TILING_OPTIMAL;
其中最重要的逻辑是当channel 大于4的时候。因为MNN的2D的image都是用的RGBA32 float的格式,所以在一个image中,最多是可以包含4个channel。如果当channel大于4的时候,多出来的数据直接拓展到width数据上。
NCHW的数据格式,就是逐层展开,比如一个N=1 C=3 H=2 W=2 的数据,排布为
arduino
RR // 首先开展宽度
RR // 然后是高度
GG // 然后是Channel
GG
BB
用R G B A P 表示5个channel,举例子来做这个变化。
arduino
//NCHW , 1 5 8 2
// 这里HW重复四次, 填充RGBA的image没有问题,
// 第 5个 channel的数据, 放在到第一个channel的数据中,
// 但是记住,输入的数据依然是NCHW的数据,
// 新数据 width拓宽后, C变少了。这种方式可以放到了一个2D的vkimage中了。
// NCHW, 1, 4, 8, 4
// RR PP
// RR PP
// RR PP
// RR PP
// RR PP
// RR PP
// RR PP
// RR PP
// GG
// GG
// GG
// GG
// GG
// GG
// GG
// GG
// BB
// BB
// BB
// BB
// BB
// BB
// BB
// BB
// BB
// BB
// AA
// AA
// AA
// AA
// AA
// AA
// AA
// AA
grid sample 算子
为了简化我们的例子,我们直接使用VK_IMAGE_TILING_LINEAR的image格式。这样避免引入copy的逻辑,验证算子。
pytorch中的函数原型如下,后面三个参数我们选择bilinear、zeros、none这组默认方式,仅实现这一种情况的算子。
ini
// Prototype:
torch.nn.functional.grid_sample(input,
grid,
mode='bilinear',
padding_mode='zeros', align_corners=None)
算子逻辑直接用MNN的算子
ini
#version 450 core
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
layout(set=0, binding=0) writeonly uniform image2D uOutput;
layout(set=0, binding=1) uniform sampler2D uInput;
layout(set=0, binding=2) uniform sampler2D uGrid;
layout(set=0, binding=3) uniform gridSampleBuffer{
ivec4 outImgSize; // output image size (host fills outWidth, outHeight, 1, 0)
ivec2 inShape; // input image width and height
ivec2 outShape; // output image width and height
}uGridSampleParam;
layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;
// Fetch the input texel at (positionX, positionY) for channel-block c and
// batch n; out-of-range coordinates read as 0 (padding_mode = "zeros").
vec4 LoadSample(int positionX, int positionY, int width, int height, int c, int n) {
vec4 value;
if (positionX < 0 || positionX >= width || positionY < 0 || positionY >= height) {
value = vec4(0.0);
} else {
value = texelFetch(uInput, ivec2(c * width + positionX, n * height + positionY), 0);
}
return value;
}
void main()
{
ivec3 pos = ivec3(gl_GlobalInvocationID);
ivec3 outputImgSize = uGridSampleParam.outImgSize.xyz;
ivec2 inputShape = uGridSampleParam.inShape;
ivec2 outputShape = uGridSampleParam.outShape;
if(pos.x < outputImgSize.x && pos.y < outputImgSize.y)
{
// Decompose the flat output position into (n, h, c, w) via the output shape
int n = pos.y / outputShape.y;
int h = pos.y % outputShape.y;
int c = pos.x / outputShape.x;
int w = pos.x % outputShape.x;
// Fetch the normalized grid coordinates; grid values are packed
// four-per-texel along h (x in even texels, y in odd texels)
int h_C4 = h / 4;
int remain = h % 4;
float gridX = texelFetch(uGrid, ivec2(h_C4 * 2 + 0, n * outputShape.x + w), 0)[remain];
float gridY = texelFetch(uGrid, ivec2(h_C4 * 2 + 1, n * outputShape.x + w), 0)[remain];
// Un-normalize [-1, 1] into input pixel coordinates (align_corners = false)
float cordH = ((1 + gridY) * (inputShape.y) - 1) * 0.5f;
float cordW = ((1 + gridX) * (inputShape.x) - 1) * 0.5f;
// Bilinear interpolation between the four neighbouring texels
int w0_h = int(floor(cordH));
int w0_w = int(floor(cordW));
int w1_h = w0_h + 1;
int w1_w = w0_w + 1;
vec4 i00 = LoadSample(w0_w, w0_h, inputShape.x, inputShape.y, c, n);
vec4 i01 = LoadSample(w1_w, w0_h, inputShape.x, inputShape.y, c, n);
vec4 i10 = LoadSample(w0_w, w1_h, inputShape.x, inputShape.y, c, n);
vec4 i11 = LoadSample(w1_w, w1_h, inputShape.x, inputShape.y, c, n);
vec4 oneV = vec4(1.0);
vec4 f0 = vec4(float(w1_w) - cordW);
vec4 f1 = oneV - f0;
vec4 h0 = vec4(float(w1_h) - cordH);
vec4 h1 = oneV - h0;
vec4 i0 = i00 * f0 + i01 * f1;
vec4 i1 = i10 * f0 + i11 * f1;
vec4 value = i0 * h0 + i1 * h1;
imageStore(uOutput, pos.xy, value);
}
}
这个算子如果对于cpu上实现,可以看下面的逻辑,本质上一样的逻辑,也是从MNN中扒出来的。注释也解释了什么是双线性插值,本质就是用周围四个点估算目标位置取值。
ini
// Un-normalize a grid coordinate from [-1, 1] into input pixel space,
// matching torch.nn.functional.grid_sample:
//   align_corners = false (default): x = ((1 + x_norm) * range - 1) / 2
//     (+1 maps to [0, 2], * range to [0, 2*range], the -1 and /2 center
//      the sample on pixel centers, giving [-0.5, range - 0.5])
//   align_corners = true:            x = (1 + x_norm) / 2 * (range - 1)
// BUGFIX: alignCorners used to be ignored ("(void)alignCorners;"), so the
// align_corners = true path silently produced the false-path coordinates.
// The default (false) path is unchanged.
static float getPosition(float x_norm, int range, bool alignCorners) {
    if (alignCorners) {
        return (1.0f + x_norm) * 0.5f * (float)(range - 1);
    }
    return (((1.0f + x_norm) * (range) - 1.0f) * 0.5f);
}
// Read buffer[y][x] from a height x width single-channel plane.
// Coordinates outside the plane read as 0.0f (padding_mode = "zeros").
static float sample(int y, int x, const float *buffer, int height, int width) {
    const bool inside = (y >= 0) && (y < height) && (x >= 0) && (x < width);
    return inside ? buffer[y * width + x] : 0.0f;
}
// Bilinear interpolation at fractional coordinate (h, w) over one channel
// plane of size height x width: take the four surrounding integer positions
// (floor/ceil on each axis), sample them with zero padding, and blend with
// the fractional distances as weights.
static float interpolate(float h, float w, const float *buffer, int height, int width) {
    // mode == GridSampleMode_BILINEAR
    const int y0 = static_cast<int>(::floor(h));
    const int x0 = static_cast<int>(::floor(w));
    const int y1 = static_cast<int>(::ceil(h));
    const int x1 = static_cast<int>(::ceil(w));
    // Four neighbours (out-of-range neighbours contribute 0).
    const float topLeft     = sample(y0, x0, buffer, height, width);
    const float topRight    = sample(y0, x1, buffer, height, width);
    const float bottomLeft  = sample(y1, x0, buffer, height, width);
    const float bottomRight = sample(y1, x1, buffer, height, width);
    // Fractional weights along each axis, normalized to [0, 1].
    const float wx1 = w - x0;
    const float wx0 = 1.0f - wx1;
    const float wy1 = h - y0;
    const float wy0 = 1.0f - wy1;
    // Blend horizontally first, then vertically.
    const float top    = topLeft * wx0 + topRight * wx1;
    const float bottom = bottomLeft * wx0 + bottomRight * wx1;
    return top * wy0 + bottom * wy1;
}
// CPU reference for grid_sample (bilinear, zero padding), used to validate the
// GPU result. Input is NCHW (batch x depth x inHeight x inWidth); the grid is
// NHWC with 2 components (x, y) per output position and shares the output's
// H/W. Each channel plane is sampled independently with the same grid.
static void reference_grid_sample(const float *inputPtr, const float *gridPtr, std::vector<float> &output,
                                  int batch, int inHeight, int inWidth, int outHeight, int outWidth, int depth,
                                  bool alignCorners) {
    output.resize(batch * outHeight * outWidth * depth);
    float *outputPtr = output.data();
    const int inPlane = inHeight * inWidth;
    const int outPlane = outHeight * outWidth;
    for (int b = 0; b < batch; ++b) {
        const float *batchInput = inputPtr + b * inPlane * depth;
        const float *batchGrid = gridPtr + b * outPlane * 2;
        float *batchOutput = outputPtr + b * outPlane * depth;
        for (int c = 0; c < depth; ++c) {
            const float *channelInput = batchInput + c * inPlane;
            float *channelOutput = batchOutput + c * outPlane;
            for (int h = 0; h < outHeight; ++h) {
                const float *gridRow = batchGrid + h * outWidth * 2;
                float *outRow = channelOutput + h * outWidth;
                for (int w = 0; w < outWidth; ++w) {
                    // Un-normalize the grid entry into input coordinates...
                    const float x = getPosition(gridRow[2 * w + 0], inWidth, alignCorners);
                    const float y = getPosition(gridRow[2 * w + 1], inHeight, alignCorners);
                    // ...then interpolate the input plane at that position.
                    outRow[w] = interpolate(y, x, channelInput, inHeight, inWidth);
                }
            }
        }
    }
}
可以用CPU的实现作为参考,计算的结果来和GPU的结果对比,验证算子正确性。
剩下的是如何填充image。对于gridsample操作,我们有两个输入。
NCHW 的 input
NHWC 的 grid
直接map每个image的memory,直接操作填充。填充的时候要把C > 4 的场景按照上面的逻辑,转化到width这一层。
ini
// Fill a mapped VK_IMAGE_TILING_LINEAR, RGBA32F image from an NCHW float
// tensor, packing every four consecutive channels into one texel's RGBA lanes.
// Channel block c starts at column c * width; batch b starts b * height rows
// down (matching the shader's texelFetch at (c*width + x, n*height + y)).
// Rows are advanced by the driver-reported rowPitch, not by width.
void setImageData(float *data, std::vector<int> nchw)
{
    int batch = nchw[0];
    int depth = nchw[1];
    int height = nchw[2];
    int width = nchw[3];
    // Element strides of the NCHW source.
    int stride_w = 1;
    int stride_h = width;
    int stride_c = width * height;
    int stride_n = width * height * depth;
    int realdepth = UP_DIV(depth, 4); // number of 4-channel blocks
    void *ptr;
    // since format is VK_FORMAT_R32G32B32A32_SFLOAT
    ptr = getMemoryPtr(VK_WHOLE_SIZE);
    VkSubresourceLayout layout;
    VkImageSubresource subresource = {};
    subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    subresource.mipLevel = 0;
    subresource.arrayLayer = 0;
    OP(vkGetImageSubresourceLayout)(computedevice->device, image, &subresource, &layout);
    // take care about the pitch value for new row
    const uint32_t rowPitch = layout.rowPitch;
    std::cout << "row pitch " << rowPitch << std::endl;
    std::cout << "height " << height << ", width " << width << std::endl;
    for (int b = 0; b < batch; b++) {
        for (int c = 0; c < realdepth; c++) {
            // BUGFIX: the batch offset (b * height rows) was missing, so every
            // batch overwrote batch 0. The shader reads batch n at row
            // n * height, so each batch must start that many rows down.
            float *dst = reinterpret_cast<float *>(
                             reinterpret_cast<uint8_t *>(ptr) +
                             static_cast<size_t>(b) * height * rowPitch) +
                         c * width * 4;
            for (int h = 0; h < height; h++) {
                for (int w = 0; w < width; w++) {
                    int offset = b * stride_n + 4 * c * stride_c + h * stride_h + w * stride_w;
                    float r = data[offset];
                    // Missing channels (4*c + k >= depth) are zero-filled.
                    // BUGFIX: the blue value was named `b`, shadowing the batch
                    // loop variable; renamed to `bl`.
                    float g  = (4 * c + 1 < depth) ? data[stride_c + offset] : 0.0f;
                    float bl = (4 * c + 2 < depth) ? data[2 * stride_c + offset] : 0.0f;
                    float a  = (4 * c + 3 < depth) ? data[3 * stride_c + offset] : 0.0f;
                    // Write RGBA values to the Vulkan image memory
                    dst[w * 4 + 0] = r;
                    dst[w * 4 + 1] = g;
                    dst[w * 4 + 2] = bl;
                    dst[w * 4 + 3] = a;
                }
                // Move to the next row in the Vulkan image memory
                dst = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(dst) + rowPitch);
            }
        }
    }
    unmapMemory();
}
最后就是提交任务到gpu,然后运行拿回结果进行比较。
ini
// Host-side mirrors of the GLSL ivec4/ivec2 types used in the uniform block.
using ivec4 = int[4];
using ivec2 = int[2];
// Must match the layout of the shader's gridSampleBuffer block (binding = 3).
struct GpuGridSampleParam {
    ivec4 outImgSize; // output image extent
    ivec2 inShape;    // input width, height
    ivec2 outShape;   // output width, height
};
// Upload the two input images and the parameter UBO, then submit the
// prerecorded command buffer and block until the queue drains.
// NOTE(review): vkQueueSubmit's VkResult is ignored and no fence is used;
// vkQueueWaitIdle is the only synchronization here — fine for a one-shot
// benchmark, not for production use.
void OpSubmitWork()
{
float *inputPtr = originInputData.data();
float *gridPtr = originGridData.data();
this->images[1]->setImageData(inputPtr, {batch, depth, inHeight, inWidth});//NCHW
// The grid tensor is NHWC (N, H_out, W_out, 2); it is handed to setImageData
// as a pseudo-NCHW shape so the resulting image layout matches how the shader
// indexes uGrid (grid h-values packed four-per-texel).
this->images[2]->setImageData(gridPtr, {batch, outHeight, outWidth, 2});//NCHW, so take NHWC
void *aptr = buffers[0]->getMemoryPtr(sizeof(struct GpuGridSampleParam));
if (!aptr) {
std::cout << "failed to map memory!" << std::endl;
return;
}
// Fill the gridSampleBuffer uniform block (binding = 3 in the shader).
// NOTE(review): outImgSize = (outWidth, outHeight, 1, 0) bounds the dispatch;
// for depth > 4 or batch > 1 the shader's (c, n) decomposition would need a
// larger extent — confirm the intended test sizes.
struct GpuGridSampleParam *para = static_cast<struct GpuGridSampleParam *>(aptr);
para->outImgSize[0] = outWidth;
para->outImgSize[1] = outHeight;
para->outImgSize[2] = 1;
para->outImgSize[3] = 0;
para->inShape[0] = inWidth;
para->inShape[1] = inHeight;
para->outShape[0] = outWidth;
para->outShape[1] = outHeight;
std::cout << "output image size " << outWidth << " " << outHeight << std::endl;
std::cout << "input image size " << inWidth << " " << inHeight << std::endl;
buffers[0]->unmapMemory();
VkSubmitInfo submitInfo = {};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &this->commandBuffer;
OP(vkQueueSubmit)(computedevice->queue, 1, &submitInfo, nullptr);
OP(vkQueueWaitIdle)(computedevice->queue);
}
回顾小结
- 学习了vulkan如何用compute pipeline实现计算加速的编程。
- 阅读学习了uvkCompute和MNN的vulkan的实现
- 理解了grid sample的原理和实现
- 理解了MNN nchw nc4hw4这种格式,特别是nchwToImage变化中涉及的数据填充的搬移。并理解后实现了cpu侧直接的fill data的过程。