边学边做,法力无边
MNN
阿里开源的推理引擎,能部署到设备端,执行各种模型任务。其定义了多种执行后端,CPU,opencl,vulkan 等等。
多的就不介绍了。仅看vulkan的实现来做参考。比如实现一个Grid Sample.
vulkan compute
vulkan有多种stage支持,其中compute stage可以做gpu的计算加速。为了不掉入一个巨大引擎的框架,花费太多时间梳理框架的代码,直接简化逻辑,写一段通用代码来初始化vulkan使用它。
arduino
// 务必包含这两个头文件, libdl 对于arm等移动端是必须的,否则link时候总是报错。
#include <dlfcn.h>
#include <vulkan/vulkan.h>
vulkan lib
接下来初始化vulkan 的库。对于libdl方式,需要用dlsym 逐个加载我们需要的vulkan api:先用一个宏包裹这些API的列表,然后再用dlopen、dlsym的方式加载vulkan。
scss
// X-macro list of every Vulkan entry point resolved via dlsym at startup;
// PFN(name) is (re)defined at each expansion site.
// BUGFIX: added the image-related entry points (vkCreateImage, vkBindImageMemory,
// vkCreateImageView, vkCreateSampler, vkGetImageMemoryRequirements,
// vkGetImageSubresourceLayout and their destroy counterparts) that the
// image code later in this file calls through OP(...) but which were missing here.
#define VK_FUNCTION_LIST \
PFN(vkEnumerateInstanceVersion) \
PFN(vkEnumerateInstanceLayerProperties) \
PFN(vkCreateInstance) \
PFN(vkEnumerateInstanceExtensionProperties) \
PFN(vkGetInstanceProcAddr) \
PFN(vkMapMemory) \
PFN(vkUnmapMemory) \
PFN(vkGetBufferMemoryRequirements) \
PFN(vkGetPhysicalDeviceMemoryProperties) \
PFN(vkAllocateMemory) \
PFN(vkAllocateCommandBuffers) \
PFN(vkBindBufferMemory) \
PFN(vkCmdBindPipeline) \
PFN(vkCmdDispatch) \
PFN(vkCmdWriteTimestamp) \
PFN(vkCmdBindDescriptorSets) \
PFN(vkCmdResetQueryPool) \
PFN(vkBeginCommandBuffer) \
PFN(vkEndCommandBuffer) \
PFN(vkQueueSubmit) \
PFN(vkQueueWaitIdle) \
PFN(vkCreateBuffer) \
PFN(vkCreateQueryPool) \
PFN(vkCreateDescriptorPool) \
PFN(vkAllocateDescriptorSets) \
PFN(vkUpdateDescriptorSets) \
PFN(vkCreateCommandPool) \
PFN(vkCreateComputePipelines) \
PFN(vkCreateDevice) \
PFN(vkGetDeviceQueue) \
PFN(vkCreateDescriptorSetLayout) \
PFN(vkCreatePipelineLayout) \
PFN(vkDestroyBuffer) \
PFN(vkDestroyQueryPool) \
PFN(vkDestroyDescriptorPool) \
PFN(vkDestroyPipeline) \
PFN(vkDestroyPipelineLayout) \
PFN(vkDestroyDescriptorSetLayout) \
PFN(vkDestroyDevice) \
PFN(vkDestroyInstance) \
PFN(vkGetQueryPoolResults) \
PFN(vkCreateShaderModule) \
PFN(vkDestroyShaderModule) \
PFN(vkDestroyCommandPool) \
PFN(vkFreeMemory) \
PFN(vkGetPhysicalDeviceQueueFamilyProperties) \
PFN(vkGetPhysicalDeviceProperties2) \
PFN(vkEnumeratePhysicalDevices) \
PFN(vkEnumerateDeviceExtensionProperties) \
PFN(vkResetCommandBuffer) \
PFN(vkFreeCommandBuffers) \
PFN(vkGetPhysicalDeviceFeatures) \
PFN(vkGetPhysicalDeviceFeatures2) \
PFN(vkBindBufferMemory2) \
PFN(vkCreateImage) \
PFN(vkDestroyImage) \
PFN(vkGetImageMemoryRequirements) \
PFN(vkGetImageSubresourceLayout) \
PFN(vkBindImageMemory) \
PFN(vkCreateImageView) \
PFN(vkDestroyImageView) \
PFN(vkCreateSampler) \
PFN(vkDestroySampler)
// Loads the Vulkan runtime with dlopen and resolves every entry point in
// VK_FUNCTION_LIST into typed function-pointer members via dlsym.
class VulkanLib {
private:
    // dlopen handle; nullptr when no Vulkan runtime could be found.
    // BUGFIX: was left uninitialized when neither __APPLE__ nor __linux__ is defined.
    void *lib = nullptr;
public:
    VulkanLib() {
#ifdef __APPLE__
        lib = dlopen("libvulkan.dylib", RTLD_LAZY | RTLD_LOCAL);
        if (!lib)
            lib = dlopen("libvulkan.1.dylib", RTLD_LAZY | RTLD_LOCAL);
        if (!lib)
            lib = dlopen("libMoltenVK.dylib", RTLD_NOW | RTLD_LOCAL);
        if (!lib && getenv("DYLD_FALLBACK_LIBRARY_PATH") == nullptr)
            lib = dlopen("/usr/local/lib/libvulkan.dylib", RTLD_NOW | RTLD_LOCAL);
#elif defined __linux__
        lib = dlopen("libvulkan.so.1", RTLD_LAZY | RTLD_LOCAL);
        if (!lib)
            lib = dlopen("libvulkan.so", RTLD_LAZY | RTLD_LOCAL);
#endif
        if (!lib) {
            std::cerr << "Failed to load vulkan library ," << dlerror() << std::endl;
            return ;
        }
        // Resolve every listed entry point into the members generated below.
#define PFN(name) name = reinterpret_cast<PFN_##name>(dlsym(lib, #name));
        VK_FUNCTION_LIST
#undef PFN
    }
    ~VulkanLib() {
        // BUGFIX: only close a handle we actually own; dlclose(nullptr) is
        // undefined per POSIX and the ctor can leave lib null.
        if (lib)
            dlclose(lib);
    }
    // Non-copyable: the class owns the dlopen handle.
    VulkanLib(const VulkanLib &) = delete;
    VulkanLib &operator=(const VulkanLib &) = delete;
    // Look up an arbitrary symbol from the loaded library by name.
    // BUGFIX: previously read from a std::map that was never filled, so every
    // call threw std::out_of_range; resolve through dlsym instead.
    // Returns nullptr when the library is not loaded or the symbol is absent.
    void *getSymbol(const char *name) {
        return lib ? dlsym(lib, name) : nullptr;
    }
    // One typed function-pointer member per entry in VK_FUNCTION_LIST,
    // null-initialized so a failed load leaves them safely nullptr.
#define PFN(name) PFN_##name name = nullptr;
    VK_FUNCTION_LIST
#undef PFN
};
上面的类实现了对vulkan库的简单初始化,可以定义一个全局变量来保证vulkan api全局可以访问的到。
arduino
// Global loader instance: constructed once at startup, so the resolved Vulkan
// entry points are reachable from anywhere. OP(x) expands to vklib.x, the
// dlsym-resolved function pointer member.
VulkanLib vklib;
#define OP(name) vklib.name
vulkan instance
接下来实现的是操作 vulkan instance, 没有把validation layer支持也加进来,仅看关键大概100行代码,注意VK_KHR_portability_enumeration支持,对于macos跑vulkan 1.4的时候是必须的。
ini
class VulkanInstance {
private:
VkInstance instance;
void checkInstanceExtension()
{
uint32_t pPropertyCount;
OP(vkEnumerateInstanceExtensionProperties)(nullptr, &pPropertyCount, nullptr);
ext_properties.resize(pPropertyCount);
OP(vkEnumerateInstanceExtensionProperties)(nullptr, &pPropertyCount, ext_properties.data());
}
bool checkInstanceExtensionFeature(const char *name)
{
for (auto ext : this->ext_properties) {
if (std::string(ext.extensionName).compare(name) == 0) {
return true;
}
}
return false;
}
VkInstance OpCreateInstance(std::vector<const char *> &enabledLayerNames) {
uint32_t version = getVulkanVersion();
VkApplicationInfo applicationInfo = {};
applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
applicationInfo.pApplicationName = "Vulkan Compute Shader Benchmark";
// SPV_KHR_vulkan_memory_model, use_vulkan_memory_model in spirv requires 1.2.0
applicationInfo.apiVersion = version; // use the libvulkan version directly
applicationInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
applicationInfo.pEngineName = "Vulkan bench";
applicationInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
VkInstanceCreateInfo instanceCreateInfo = {};
instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
instanceCreateInfo.pApplicationInfo = &applicationInfo;
instanceCreateInfo.enabledLayerCount = static_cast<uint32_t>(enabledLayerNames.size());
instanceCreateInfo.ppEnabledLayerNames = enabledLayerNames.data();
#if VK_KHR_portability_enumeration
if (checkInstanceExtensionFeature(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME)) {
enabledExtensionNames.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME);
instanceCreateInfo.flags = VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
}
#endif
if (enabledExtensionNames.size() > 0) {
instanceCreateInfo.enabledExtensionCount = static_cast<uint32_t>(enabledExtensionNames.size());
instanceCreateInfo.ppEnabledExtensionNames = enabledExtensionNames.data();
}
VkInstance instance = VK_NULL_HANDLE;
VkResult error = OP(vkCreateInstance)(&instanceCreateInfo, nullptr, &instance);
if (error != VK_SUCCESS) {
std::cout << "Fail to create instance " << error << std::endl;
if (error == VK_ERROR_LAYER_NOT_PRESENT) {
std::cout << "VK_ERROR_LAYER_NOT_PRESENT" << std::endl;
} else if (error == VK_ERROR_INCOMPATIBLE_DRIVER) {
std::cout << "VK_ERROR_INCOMPATIBLE_DRIVER" << std::endl;
}
return nullptr;
}
return instance;
}
std::vector<const char *> enabledExtensionNames;
std::vector<VkExtensionProperties> ext_properties;
public:
VulkanInstance() {
std::vector<const char *> enabledLayerNames;
checkInstanceExtension();
instance = OpCreateInstance(enabledLayerNames);
if (!instance) {
throw std::runtime_error("Failed to create Vulkan instance.");
}
}
~VulkanInstance() {
OP(vkDestroyInstance)(instance, nullptr);
}
std::vector<std::pair<VkPhysicalDevice, uint32_t>> getDeviceAndQeueue(void) {
std::vector<std::pair<VkPhysicalDevice, uint32_t>> ret;
uint32_t count;
VkResult error = OP(vkEnumeratePhysicalDevices)(instance, &count, nullptr);
if (error != VK_SUCCESS) {
return ret;
}
std::vector<VkPhysicalDevice> physicalDevices(count);
error = OP(vkEnumeratePhysicalDevices)(instance, &count, physicalDevices.data());
if (error != VK_SUCCESS) {
return ret;
}
std::cout << "Found " << count << " physical devices." << std::endl;
for (auto device : physicalDevices) {
OP(vkGetPhysicalDeviceQueueFamilyProperties)(device, &count, nullptr);
std::vector<VkQueueFamilyProperties> queueFamilyProperties(count);
OP(vkGetPhysicalDeviceQueueFamilyProperties)(device, &count,
queueFamilyProperties.data());
uint32_t index = 0;
for (auto &properties : queueFamilyProperties) {
if (properties.queueFlags & VK_QUEUE_COMPUTE_BIT) {
ret.push_back({device, index});
break;
}
index++;
}
}
return ret;
}
};
vulkan device
接下来是操作device,注意的是push各种device extension 支持,这里涉及不同设备支持,但其实不加也能跑,不过如果是validation layer 开启后,会产生很多告警。
ini
class ComputeDevice {
private:
std::vector<VkExtensionProperties> ext_properties;
VkPhysicalDeviceProperties deviceProperties;
void checkDeviceDataTypeFeatures(void)
{
VkPhysicalDeviceFeatures deviceFeatures = {};
OP(vkGetPhysicalDeviceFeatures)(physicalDevice, &deviceFeatures);
this->features |= (deviceFeatures.shaderInt64 ? FEATURE_INT64 : 0);
this->features |= (deviceFeatures.shaderFloat64 ? FEATURE_FP64 : 0);
this->features |= FEATURE_FP32 | FEATURE_INT32;
this->features |= (deviceFeatures.shaderInt16 ? FEATURE_INT16 : 0);
VkPhysicalDeviceShaderFloat16Int8Features float16Int8Features = {};
float16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR;
VkPhysicalDeviceFeatures2 features2 = {};
features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
features2.pNext = &float16Int8Features;
OP(vkGetPhysicalDeviceFeatures2)(physicalDevice, &features2);
this->features |= (float16Int8Features.shaderFloat16 ? FEATURE_FP16 : 0);
this->features |= (float16Int8Features.shaderInt8 ? FEATURE_INT8 : 0);
}
void checkDeviceExtension(void)
{
uint32_t extensionCount = 0;
OP(vkEnumerateDeviceExtensionProperties)(physicalDevice, NULL, &extensionCount, NULL);
this->ext_properties.resize(extensionCount);
OP(vkEnumerateDeviceExtensionProperties)(physicalDevice, NULL, &extensionCount, this->ext_properties.data());
// std::cout << "Device Extensions:" << std::endl;
// for (uint32_t i = 0; i < extensionCount; i++) {
// std::cout << ext_properties[i].extensionName << ": " << ext_properties[i].specVersion << std::endl;
// }
}
void getDeviceTimeLimits(void)
{
VkPhysicalDeviceSubgroupProperties subgroup_properties = {};
subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
subgroup_properties.pNext = nullptr;
VkPhysicalDeviceProperties2 properties2 = {};
properties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
properties2.pNext = &subgroup_properties;
OP(vkGetPhysicalDeviceProperties2)(physicalDevice, &properties2);
deviceProperties = properties2.properties;
this->timestampPeriod = deviceProperties.limits.timestampPeriod;
std::cout << "GPU " << deviceProperties.deviceName << std::endl;
}
VkResult createDevice(void)
{
std::vector<uintptr_t> enabledFeatures;
std::vector<const char *> enabledExtensions;
VkPhysicalDeviceFeatures features = {};
features.robustBufferAccess = VK_TRUE;
if (this->features & FEATURE_INT64)
features.shaderInt64 = VK_TRUE;
if (this->features & FEATURE_FP64)
features.shaderFloat64 = VK_TRUE;
if (this->features & FEATURE_INT16)
features.shaderInt16 = VK_TRUE;
VkPhysicalDeviceFloat16Int8FeaturesKHR float16Int8Features = {};
float16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
VkPhysicalDevice8BitStorageFeatures storage8bitFeatures = {};
storage8bitFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES;
storage8bitFeatures.uniformAndStorageBuffer8BitAccess = VK_TRUE;
storage8bitFeatures.storageBuffer8BitAccess = VK_TRUE;
#ifdef VK_KHR_16bit_storage
VkPhysicalDevice16BitStorageFeatures storage16bitFeatures = {};
storage16bitFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES;
storage16bitFeatures.uniformAndStorageBuffer16BitAccess = VK_TRUE;
storage16bitFeatures.storageBuffer16BitAccess = VK_TRUE;
storage16bitFeatures.storageInputOutput16 = VK_TRUE;
#endif
if (this->features & FEATURE_INT8) {
float16Int8Features.shaderInt8 = VK_TRUE;
if (checkDeviceExtensionFeature(VK_KHR_8BIT_STORAGE_EXTENSION_NAME)) {
enabledExtensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
enabledFeatures.push_back(reinterpret_cast<uintptr_t>(&storage8bitFeatures));
}
}
if (this->features & FEATURE_FP16) {
float16Int8Features.shaderFloat16 = VK_TRUE;
if (checkDeviceExtensionFeature(VK_KHR_16BIT_STORAGE_EXTENSION_NAME)) {
enabledExtensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
if (deviceProperties.vendorID != 4318) {
// tested on Nvidia A2000, it supports 16bit storage feature but did not need to enable it
// enable it will cause validation error VK_ERROR_FEATURE_NOT_PRESENT
enabledFeatures.push_back(reinterpret_cast<uintptr_t>(&storage16bitFeatures));
}
}
#ifdef VK_AMD_gpu_shader_half_float
if (deviceProperties.vendorID == 4098) {
// for AMD card, do we really need this ? over VK_KHR_shader_float16_int8
if (checkDeviceExtensionFeature(VK_AMD_GPU_SHADER_HALF_FLOAT_EXTENSION_NAME)) {
enabledExtensions.push_back(VK_AMD_GPU_SHADER_HALF_FLOAT_EXTENSION_NAME);
}
}
#endif
}
if (this->features & (FEATURE_INT8 |FEATURE_FP16)) {
if (checkDeviceExtensionFeature(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME)) {
enabledFeatures.push_back(reinterpret_cast<uintptr_t>(&float16Int8Features));
enabledExtensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
}
}
#ifdef VK_KHR_bind_memory2
if (checkDeviceExtensionFeature(VK_KHR_BIND_MEMORY_2_EXTENSION_NAME)) {
enabledExtensions.push_back(VK_KHR_BIND_MEMORY_2_EXTENSION_NAME);
}
#endif
struct GeneralFeature {
VkStructureType sType;
void* pNext;
};
void* pFirst = nullptr;
if (enabledFeatures.size() > 0) {
pFirst = reinterpret_cast<void *>(enabledFeatures[0]);
struct GeneralFeature* ptr = reinterpret_cast<struct GeneralFeature*>(pFirst);
for (size_t i = 1; i < enabledFeatures.size(); i++) {
struct GeneralFeature* feat = reinterpret_cast<struct GeneralFeature*>(enabledFeatures[i]);
ptr->pNext = feat;
ptr = feat;
}
}
VkDeviceQueueCreateInfo queueCreateInfo = {};
queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
queueCreateInfo.queueCount = 1;
queueCreateInfo.queueFamilyIndex = queueFamilyIndex;
float queuePriority = 1.0f; // specifies if this queue gets preference
queueCreateInfo.pQueuePriorities = &queuePriority;
VkDeviceCreateInfo deviceCreateInfo = {};
deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
deviceCreateInfo.queueCreateInfoCount = 1;
deviceCreateInfo.pQueueCreateInfos = &queueCreateInfo;
deviceCreateInfo.enabledLayerCount = 0;
deviceCreateInfo.ppEnabledLayerNames = nullptr;
deviceCreateInfo.pEnabledFeatures = &features;
deviceCreateInfo.enabledExtensionCount = static_cast<uint32_t>(enabledExtensions.size());
deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
deviceCreateInfo.pNext = pFirst;
return OP(vkCreateDevice)(this->physicalDevice, &deviceCreateInfo, nullptr, &this->device);
}
void getDeviceQueue(void)
{
OP(vkGetDeviceQueue)(device, queueFamilyIndex, 0, &this->queue);
}
public:
ComputeDevice(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex):
physicalDevice(physicalDevice), queueFamilyIndex(queueFamilyIndex) {
checkDeviceDataTypeFeatures();
checkDeviceExtension();
getDeviceTimeLimits();
VkResult err = createDevice();
if (err != VK_SUCCESS) {
std::map<int, std::string> errstrings;
errstrings[VK_ERROR_OUT_OF_HOST_MEMORY] = "VK_ERROR_OUT_OF_HOST_MEMORY";
errstrings[VK_ERROR_OUT_OF_DEVICE_MEMORY] = "VK_ERROR_OUT_OF_DEVICE_MEMORY";
errstrings[VK_ERROR_INITIALIZATION_FAILED] = "VK_ERROR_INITIALIZATION_FAILED";
errstrings[VK_ERROR_DEVICE_LOST] = "VK_ERROR_DEVICE_LOST";
errstrings[VK_ERROR_EXTENSION_NOT_PRESENT] = "VK_ERROR_EXTENSION_NOT_PRESENT";
errstrings[VK_ERROR_FEATURE_NOT_PRESENT] = "VK_ERROR_FEATURE_NOT_PRESENT";
errstrings[VK_ERROR_TOO_MANY_OBJECTS] = "VK_ERROR_TOO_MANY_OBJECTS";
throw std::runtime_error("Failed to create device " + errstrings[err]);
}
getDeviceQueue();
#if VK_KHR_shader_integer_dot_product
if (this->features & FEATURE_DOT)
check_shader_integer_dot_product_support();
#endif
};
~ComputeDevice() {
OP(vkDestroyDevice)(device, nullptr);
};
VkDevice device;
VkPhysicalDevice physicalDevice;
uint32_t queueFamilyIndex;
VkQueue queue;
float timestampPeriod;
uint32_t features;
bool checkDeviceExtensionFeature(const char *name)
{
for (auto ext : this->ext_properties) {
if (std::string(ext.extensionName).compare(name) == 0) {
return true;
}
}
return false;
}
std::string getDeviceName(void)
{
return std::string(deviceProperties.deviceName);
}
};
vulkan shader
设备已经创建,如何加载执行一个 shader呢。
shader就是glsl语言编写的gpu内核。利用shaderc来编译为spirv,然后xxd -n 指定数组名称,转化为一个h文件,include后,直接访问这个数组名称,就能直接load spirv的代码了。
接下来就是创建pipeline需要的各种资源,分别调用以下接口创建,代码略
vkCreateDescriptorSetLayout
vkCreatePipelineLayout
vkCreateShaderModule
vkCreateComputePipelines
vkBindBufferMemory
vkCreateDescriptorPool
vkAllocateDescriptorSets
vkUpdateDescriptorSets
vkCreateCommandPool
vulkan buffer
接下来就是最重要的两个object,一个是buffer,一个是image,对比的介绍也有一些,我看先用用看。 对于buffer来说,重要的是两个flag,一个是bufferflag,一个是memoryflag。
bufferflag主要是定义在shader中的用途,比如VK_BUFFER_USAGE_STORAGE_BUFFER_BIT表示通用存储。
memoryflag定义了内存本身是否可以被cpu读,cpu写等特性。
ini
class ComputeBuffer {
private:
int32_t findMemoryTypeFromProperties(uint32_t memoryTypeBits,
VkPhysicalDeviceMemoryProperties properties,
VkMemoryPropertyFlags requiredProperties)
{
for (uint32_t index = 0; index < properties.memoryTypeCount; ++index) {
if (((memoryTypeBits & (1 << index))) &&
((properties.memoryTypes[index].propertyFlags & requiredProperties) ==
requiredProperties)) {
return (int32_t)index;
}
}
return -1;
}
VkResult __OpCreateBuffer(int bufferflags, int memoryflags, int num_element, size_t element_size)
{
uint32_t queueFamilyIndex = computedevice->queueFamilyIndex;
VkDevice device = computedevice->device;
VkDeviceMemory memory;
VkBuffer buffer;
VkResult error;
// create the buffers which will hold the data to be consumed by shader
VkBufferCreateInfo bufferCreateInfo = {};
bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
bufferCreateInfo.size = element_size * num_element;
bufferCreateInfo.usage = bufferflags;
bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
bufferCreateInfo.queueFamilyIndexCount = 1;
bufferCreateInfo.pQueueFamilyIndices = &queueFamilyIndex;
error = OP(vkCreateBuffer)(device, &bufferCreateInfo, nullptr, &buffer);
if (error) {
std::cout << "failed to create buffer!" << std::endl;
return error;
}
this->buffer = buffer;
VkMemoryRequirements memoryRequirements;
OP(vkGetBufferMemoryRequirements)(device, buffer, &memoryRequirements);
VkPhysicalDeviceMemoryProperties memoryProperties;
OP(vkGetPhysicalDeviceMemoryProperties)(computedevice->physicalDevice, &memoryProperties);
auto memoryTypeIndex = findMemoryTypeFromProperties(
memoryRequirements.memoryTypeBits, memoryProperties,
memoryflags);
if (0 > memoryTypeIndex) {
std::cout << "failed to find compatible memory type" << std::endl;
return VK_ERROR_UNKNOWN;
}
VkMemoryAllocateInfo allocateInfo = {};
allocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocateInfo.allocationSize = memoryRequirements.size;
allocateInfo.memoryTypeIndex = memoryTypeIndex;
error = OP(vkAllocateMemory)(device, &allocateInfo, nullptr, &memory);
if (error) {
std::cout << "failed to allocate memory!" << std::endl;
return error;
}
this->memory = memory;
return VK_SUCCESS;
}
std::shared_ptr<ComputeDevice> computedevice;
VkBuffer buffer;
VkDeviceMemory memory;
public:
ComputeBuffer(std::shared_ptr<ComputeDevice> computedevice, int bufferflags, int memoryflags, int num_element, size_t element_size):
computedevice(computedevice) {
VkResult error = __OpCreateBuffer(bufferflags, memoryflags, num_element, element_size);
if (error) {
throw std::runtime_error("failed to create buffer1");
}
};
~ComputeBuffer() {
if (buffer)
OP(vkDestroyBuffer)(computedevice->device, buffer, nullptr);
if (memory)
OP(vkFreeMemory)(computedevice->device, memory, nullptr);
};
VkDeviceMemory getMemory() {
return memory;
};
VkBuffer getBuffer() {
return buffer;
};
void *getMemoryPtr(size_t size) {
void *ptr;
OP(vkMapMemory)(computedevice->device, memory, 0, size, 0, &ptr);
return ptr;
};
void unmapMemory() {
OP(vkUnmapMemory)(computedevice->device, memory);
}
};
vulkan image
类似vulkan buffer,不过buffer本身理解为连续内存,但是image是可以针对gpu内存做优化的,访问方式不一定是连续内存的。当然如果创建时候定义的是LINEAR layout则本质上也是buffer。
大部分操作类似vulkan buffer 省略,仅列一个创建部分。
vulkan在创建image对象的同时需要多两个对象,imageview,sampler。
sampler可以多个图片共用一个,在MNN中就是用全局的sampler。
ini
// Create a 2D image with backing memory, plus the image view and sampler that
// the compute pipeline binds. `usage` are VkImageUsageFlags, `memoryflags`
// VkMemoryPropertyFlags. On success this->image / this->memory /
// this->imageView / this->sampler are all populated.
VkResult __OpCreateImage(int usage, int memoryflags, int height, int width, VkFormat format)
{
    std::cout << "Creating image w: " << width << ", h:" << height << std::endl;
    VkResult error = VK_SUCCESS;
    VkDevice device = computedevice->device;
    uint32_t queueFamilyIndex = computedevice->queueFamilyIndex;
    VkImage image = VK_NULL_HANDLE;
    VkDeviceMemory memory = VK_NULL_HANDLE;
    VkImageCreateInfo imageCreateInfo = {};
    imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
    imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
    imageCreateInfo.format = format;
    imageCreateInfo.extent.width = width;
    imageCreateInfo.extent.height = height;
    imageCreateInfo.extent.depth = 1; // 2D image, it is always 1
    imageCreateInfo.mipLevels = 1;
    imageCreateInfo.arrayLayers = 1;
    imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
    // LINEAR tiling so the host can map the memory and fill texels directly
    // (row layout is then described by vkGetImageSubresourceLayout).
    imageCreateInfo.tiling = VK_IMAGE_TILING_LINEAR;
    imageCreateInfo.usage = usage;
    imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    imageCreateInfo.flags = VK_IMAGE_CREATE_EXTENDED_USAGE_BIT;
    imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
    imageCreateInfo.queueFamilyIndexCount = 1;
    imageCreateInfo.pQueueFamilyIndices = &queueFamilyIndex;
    error = OP(vkCreateImage)(device, &imageCreateInfo, nullptr, &image);
    if (error) {
        std::cout << "failed to create image!" << std::endl;
        return error;
    }
    this->image = image;
    VkMemoryRequirements memoryRequirements;
    OP(vkGetImageMemoryRequirements)(device, image, &memoryRequirements);
    std::cout << "image memory size " << memoryRequirements.size <<" "<< memoryRequirements.memoryTypeBits << std::endl;
    VkPhysicalDeviceMemoryProperties memoryProperties;
    OP(vkGetPhysicalDeviceMemoryProperties)(computedevice->physicalDevice, &memoryProperties);
    auto memoryTypeIndex = findMemoryTypeFromProperties(
        memoryRequirements.memoryTypeBits, memoryProperties,
        memoryflags);
    if (0 > memoryTypeIndex) {
        std::cout << "failed to find compatible memory type " << memoryRequirements.memoryTypeBits << " ,flags " << memoryflags << std::endl;
        return VK_ERROR_UNKNOWN;
    }
    VkMemoryAllocateInfo allocateInfo = {};
    allocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    allocateInfo.allocationSize = memoryRequirements.size;
    allocateInfo.memoryTypeIndex = memoryTypeIndex;
    error = OP(vkAllocateMemory)(device, &allocateInfo, nullptr, &memory);
    if (error) {
        std::cout << "failed to allocate memory for image !" << std::endl;
        return error;
    }
    this->memory = memory;
    error = OP(vkBindImageMemory)(device, this->image, this->memory, 0);
    if (error) {
        std::cerr << "failed to bind image memory!" << std::endl;
        return error;
    }
    // The shader accesses the image through a view of the whole subresource.
    VkImageView view;
    VkImageViewCreateInfo info = {};
    info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    info.image = image;
    info.viewType = VK_IMAGE_VIEW_TYPE_2D;
    info.format = format;
    info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    info.subresourceRange.baseMipLevel = 0;
    info.subresourceRange.levelCount = 1;
    info.subresourceRange.baseArrayLayer = 0;
    info.subresourceRange.layerCount = 1;
    error = OP(vkCreateImageView)(device, &info, nullptr, &view);
    if (error) {
        std::cout << "failed to create image view!" << std::endl;
        return error;
    }
    this->imageView = view;
    // Nearest-neighbour, clamp-to-edge sampler with no mipmapping.
    // (MNN shares one global sampler across images; here each image owns one.)
    VkFilter filter = VK_FILTER_NEAREST;
    VkSamplerAddressMode mode = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
    VkSamplerCreateInfo samplerInfo = {};
    VkSampler sampler;
    samplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
    samplerInfo.magFilter = filter;
    samplerInfo.minFilter = filter;
    samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
    samplerInfo.addressModeU = mode;
    samplerInfo.addressModeV = mode;
    samplerInfo.addressModeW = mode;
    samplerInfo.mipLodBias = 0.0f;
    samplerInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
    samplerInfo.anisotropyEnable = VK_FALSE;
    samplerInfo.maxAnisotropy = 1.0f;
    samplerInfo.compareEnable = VK_FALSE;
    samplerInfo.minLod = 0.0f;
    samplerInfo.maxLod = 0.0f;
    // BUGFIX: the sampler creation result was previously ignored.
    error = OP(vkCreateSampler)(device, &samplerInfo, nullptr, &sampler);
    if (error) {
        std::cout << "failed to create sampler!" << std::endl;
        return error;
    }
    this->sampler = sampler;
    return VK_SUCCESS;
}
对于image的操作,如前面所说,如果是一个VK_IMAGE_TILING_LINEAR的话,是可以和buffer一样,先vkMapMemory,在cpu侧直接操作,然后再unmap就可以。
但是如果不是线性,使用VK_IMAGE_TILING_OPTIMAL,那么这个内存是无法直接操作的。 vulkan提供了接口来实现buffer与image之间互相copy的操作。 vkCmdCopyBufferToImage,vkCmdCopyImageToBuffer。
通常在非arm gpu上可以用staging buffer来实现,比如google的uVkCompute中就是如此。 但是arm mali gpu使用的是unified memory,gpu没有专用内存,端侧设备gpu、cpu是共享内存的,不建议使用staging buffer。如果是image 这种情况,没有办法避免一次copy。
在MNN中,由于考虑算子是NCHW的格式输入,所以MNN实现了ncgwToImage的shader,本质上也是一次buffer到image的转换,只不过带上了自己的格式变化。少量注释如下,要理解这个shader,首先要先看下输出的image uOutput是如何创建的。
ini
#version 450 core
layout(std430) buffer;
// Output image (RGBA32F): one texel packs four consecutive channels (NC4HW4).
layout(set=0, binding=0) writeonly uniform image2D uOutput;
// Input tensor in plain NCHW float layout.
layout(set=0, binding=1) readonly buffer sourceBuffer{
float data[];
} uInBuffer;
layout(set=0, binding=2) uniform constBuffer{
ivec4 size; // w, h, c, n
ivec4 stride; // 1, w, w * h, w * h * c (element strides, not bytes)
} uConstant;
layout(set=0, binding=3) uniform offsetBuffer {
ivec4 offset; // Offset w, h, c, n : xstart, ystart, 0, 0
ivec4 size; // w, h, 0, w*h*c (w = total number of elements to process)
} uOffset;
layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
void main()
{
int posX = ivec3(gl_GlobalInvocationID).x;
if (posX < uOffset.size.w)
{
ivec2 pos;
// Convert the flat invocation index to a 2D position in the output image
pos.x = posX % uOffset.size.x;
pos.y = posX / uOffset.size.x;
ivec2 spos = pos + uOffset.offset.xy;
// Recover (n, h, c, w) of the NCHW source from the 2D image position:
// batches stack along y, channel blocks of 4 extend along x.
int n = spos.y / uConstant.size.y;
int h = spos.y % uConstant.size.y;
int c = spos.x / uConstant.size.x;
int w = spos.x % uConstant.size.x;
// n * uConstant.stride.w : batch offset (in elements)
// 4 * c * uConstant.stride.z : channel-block offset (4 channels per texel)
// h * uConstant.stride.y : row offset
// w * uConstant.stride.x : column offset
int basicOffset = 0
+ n * uConstant.stride.w
+ 4 * c * uConstant.stride.z
+ h * uConstant.stride.y
+ w * uConstant.stride.x;
vec4 color = vec4(0);
color.r = uInBuffer.data[basicOffset+0];
// One channel plane is stride.z (= w * h) elements apart.
int imgHW = uConstant.stride.z;
// Channels beyond size.z do not exist; their RGBA lanes stay zero.
if (4 * c + 1 < uConstant.size.z) {
color.g = uInBuffer.data[basicOffset+1*imgHW];
}
if (4 * c + 2 < uConstant.size.z) {
color.b = uInBuffer.data[basicOffset+2*imgHW];
}
if (4 * c + 3 < uConstant.size.z) {
color.a = uInBuffer.data[basicOffset+3*imgHW];
}
imageStore(uOutput, pos, color);
}
}
image创建也是要指定 width、height、depth。可以看到MNN对于2D图片,depth固定是1,每个texel能装多少数据实际上取决于format。
ini
// Excerpt of MNN's VkImageCreateInfo setup: a 2D image always uses depth = 1;
// what fits per texel is determined by `format` (channel data is packed into
// the RGBA components), and MNN uses OPTIMAL tiling here.
info.imageType = imageType;
info.extent.width = width;
info.extent.height = height;
info.extent.depth = 1;
info.mipLevels = 1;
info.arrayLayers = 1;
info.format = format;
info.tiling = VK_IMAGE_TILING_OPTIMAL;
其中最重要的逻辑是当channel 大于4的时候。因为MNN的2D的image都是用的RGBA32 float的格式,所以在一个image中,最多是可以包含4个channel。如果当channel大于4的时候,多出来的数据直接拓展到width数据上。
NCHW的数据格式,就是逐层展开,比如一个N=1 C=3 H=2 W=2 的数据,排布为
arduino
RR // 首先开展宽度
RR // 然后是高度
GG // 然后是Channel
GG
BB
用R G B A P 表示5个channel,举例子来做这个变化。
arduino
//NCHW , 1 5 8 2
// 这里HW重复四次, 填充RGBA的image没有问题,
// 第 5个 channel的数据, 放在到第一个channel的数据中,
// 但是记住,输入的数据依然是NCHW的数据,
// 新数据 width拓宽后, C变少了。这种方式可以放到了一个2D的vkimage中了。
// NCHW, 1, 4, 8, 4
// RR PP
// RR PP
// RR PP
// RR PP
// RR PP
// RR PP
// RR PP
// RR PP
// GG
// GG
// GG
// GG
// GG
// GG
// GG
// GG
// BB
// BB
// BB
// BB
// BB
// BB
// BB
// BB
// BB
// BB
// AA
// AA
// AA
// AA
// AA
// AA
// AA
// AA
grid sample 算子
为了简化我们的例子,我们直接使用VK_IMAGE_TILING_LINEAR的image格式。这样避免引入copy的逻辑,验证算子。
pytorch中的函数原型如下,后面三个参数我们选择bilinear、zeros、none这组默认方式,仅实现这一种情况的算子。
ini
// Prototype:
torch.nn.functional.grid_sample(input,
grid,
mode='bilinear',
padding_mode='zeros', align_corners=None)
算子逻辑直接用MNN的算子
ini
#version 450 core
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
layout(set=0, binding=0) writeonly uniform image2D uOutput;
layout(set=0, binding=1) uniform sampler2D uInput;
layout(set=0, binding=2) uniform sampler2D uGrid;
layout(set=0, binding=3) uniform gridSampleBuffer{
ivec4 outImgSize; // output image size (host fills outWidth, outHeight, 1, 0)
ivec2 inShape; // input image width and height
ivec2 outShape; // output image width and height
}uGridSampleParam;
layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;
// Fetch the input texel at (positionX, positionY) for channel-block c and
// batch n; out-of-range coordinates read as 0 (padding_mode = "zeros").
vec4 LoadSample(int positionX, int positionY, int width, int height, int c, int n) {
vec4 value;
if (positionX < 0 || positionX >= width || positionY < 0 || positionY >= height) {
value = vec4(0.0);
} else {
value = texelFetch(uInput, ivec2(c * width + positionX, n * height + positionY), 0);
}
return value;
}
void main()
{
ivec3 pos = ivec3(gl_GlobalInvocationID);
ivec3 outputImgSize = uGridSampleParam.outImgSize.xyz;
ivec2 inputShape = uGridSampleParam.inShape;
ivec2 outputShape = uGridSampleParam.outShape;
if(pos.x < outputImgSize.x && pos.y < outputImgSize.y)
{
// Decompose the flat output position into (n, h, c, w) via the output shape
int n = pos.y / outputShape.y;
int h = pos.y % outputShape.y;
int c = pos.x / outputShape.x;
int w = pos.x % outputShape.x;
// Fetch the normalized grid coordinates; grid values are packed
// four-per-texel along h (x in even texels, y in odd texels)
int h_C4 = h / 4;
int remain = h % 4;
float gridX = texelFetch(uGrid, ivec2(h_C4 * 2 + 0, n * outputShape.x + w), 0)[remain];
float gridY = texelFetch(uGrid, ivec2(h_C4 * 2 + 1, n * outputShape.x + w), 0)[remain];
// Un-normalize [-1, 1] into input pixel coordinates (align_corners = false)
float cordH = ((1 + gridY) * (inputShape.y) - 1) * 0.5f;
float cordW = ((1 + gridX) * (inputShape.x) - 1) * 0.5f;
// Bilinear interpolation between the four neighbouring texels
int w0_h = int(floor(cordH));
int w0_w = int(floor(cordW));
int w1_h = w0_h + 1;
int w1_w = w0_w + 1;
vec4 i00 = LoadSample(w0_w, w0_h, inputShape.x, inputShape.y, c, n);
vec4 i01 = LoadSample(w1_w, w0_h, inputShape.x, inputShape.y, c, n);
vec4 i10 = LoadSample(w0_w, w1_h, inputShape.x, inputShape.y, c, n);
vec4 i11 = LoadSample(w1_w, w1_h, inputShape.x, inputShape.y, c, n);
vec4 oneV = vec4(1.0);
vec4 f0 = vec4(float(w1_w) - cordW);
vec4 f1 = oneV - f0;
vec4 h0 = vec4(float(w1_h) - cordH);
vec4 h1 = oneV - h0;
vec4 i0 = i00 * f0 + i01 * f1;
vec4 i1 = i10 * f0 + i11 * f1;
vec4 value = i0 * h0 + i1 * h1;
imageStore(uOutput, pos.xy, value);
}
}
这个算子如果对于cpu上实现,可以看下面的逻辑,本质上一样的逻辑,也是从MNN中扒出来的。注释也解释了什么是双线性插值,本质就是用周围四个点估算目标位置取值。
ini
// Un-normalize a grid coordinate from [-1, 1] into input pixel space,
// matching torch.nn.functional.grid_sample:
//   align_corners = false (default): x = ((1 + x_norm) * range - 1) / 2
//     (+1 maps to [0, 2], * range to [0, 2*range], the -1 and /2 center
//      the sample on pixel centers, giving [-0.5, range - 0.5])
//   align_corners = true:            x = (1 + x_norm) / 2 * (range - 1)
// BUGFIX: alignCorners used to be ignored ("(void)alignCorners;"), so the
// align_corners = true path silently produced the false-path coordinates.
// The default (false) path is unchanged.
static float getPosition(float x_norm, int range, bool alignCorners) {
    if (alignCorners) {
        return (1.0f + x_norm) * 0.5f * (float)(range - 1);
    }
    return (((1.0f + x_norm) * (range) - 1.0f) * 0.5f);
}
// Read buffer[y][x] from a height x width single-channel plane.
// Coordinates outside the plane read as 0.0f (padding_mode = "zeros").
static float sample(int y, int x, const float *buffer, int height, int width) {
    const bool inside = (y >= 0) && (y < height) && (x >= 0) && (x < width);
    return inside ? buffer[y * width + x] : 0.0f;
}
// Bilinear interpolation at fractional coordinate (h, w) over one channel
// plane of size height x width: take the four surrounding integer positions
// (floor/ceil on each axis), sample them with zero padding, and blend with
// the fractional distances as weights.
static float interpolate(float h, float w, const float *buffer, int height, int width) {
    // mode == GridSampleMode_BILINEAR
    const int y0 = static_cast<int>(::floor(h));
    const int x0 = static_cast<int>(::floor(w));
    const int y1 = static_cast<int>(::ceil(h));
    const int x1 = static_cast<int>(::ceil(w));
    // Four neighbours (out-of-range neighbours contribute 0).
    const float topLeft     = sample(y0, x0, buffer, height, width);
    const float topRight    = sample(y0, x1, buffer, height, width);
    const float bottomLeft  = sample(y1, x0, buffer, height, width);
    const float bottomRight = sample(y1, x1, buffer, height, width);
    // Fractional weights along each axis, normalized to [0, 1].
    const float wx1 = w - x0;
    const float wx0 = 1.0f - wx1;
    const float wy1 = h - y0;
    const float wy0 = 1.0f - wy1;
    // Blend horizontally first, then vertically.
    const float top    = topLeft * wx0 + topRight * wx1;
    const float bottom = bottomLeft * wx0 + bottomRight * wx1;
    return top * wy0 + bottom * wy1;
}
// CPU reference for grid_sample (bilinear, zero padding), used to validate the
// GPU result. Input is NCHW (batch x depth x inHeight x inWidth); the grid is
// NHWC with 2 components (x, y) per output position and shares the output's
// H/W. Each channel plane is sampled independently with the same grid.
static void reference_grid_sample(const float *inputPtr, const float *gridPtr, std::vector<float> &output,
                                  int batch, int inHeight, int inWidth, int outHeight, int outWidth, int depth,
                                  bool alignCorners) {
    output.resize(batch * outHeight * outWidth * depth);
    float *outputPtr = output.data();
    const int inPlane = inHeight * inWidth;
    const int outPlane = outHeight * outWidth;
    for (int b = 0; b < batch; ++b) {
        const float *batchInput = inputPtr + b * inPlane * depth;
        const float *batchGrid = gridPtr + b * outPlane * 2;
        float *batchOutput = outputPtr + b * outPlane * depth;
        for (int c = 0; c < depth; ++c) {
            const float *channelInput = batchInput + c * inPlane;
            float *channelOutput = batchOutput + c * outPlane;
            for (int h = 0; h < outHeight; ++h) {
                const float *gridRow = batchGrid + h * outWidth * 2;
                float *outRow = channelOutput + h * outWidth;
                for (int w = 0; w < outWidth; ++w) {
                    // Un-normalize the grid entry into input coordinates...
                    const float x = getPosition(gridRow[2 * w + 0], inWidth, alignCorners);
                    const float y = getPosition(gridRow[2 * w + 1], inHeight, alignCorners);
                    // ...then interpolate the input plane at that position.
                    outRow[w] = interpolate(y, x, channelInput, inHeight, inWidth);
                }
            }
        }
    }
}
可以用CPU的实现作为参考,计算的结果来和GPU的结果对比,验证算子正确性。
剩下的是如何填充image。对于gridsample操作,我们有两个输入。
NCHW 的 input
NHWC 的 grid
直接map每个image的memory,直接操作填充。填充的时候要把C > 4 的场景按照上面的逻辑,转化到width这一层。
ini
// Fill a mapped VK_IMAGE_TILING_LINEAR, RGBA32F image from an NCHW float
// tensor, packing every four consecutive channels into one texel's RGBA lanes.
// Channel block c starts at column c * width; batch b starts b * height rows
// down (matching the shader's texelFetch at (c*width + x, n*height + y)).
// Rows are advanced by the driver-reported rowPitch, not by width.
void setImageData(float *data, std::vector<int> nchw)
{
    int batch = nchw[0];
    int depth = nchw[1];
    int height = nchw[2];
    int width = nchw[3];
    // Element strides of the NCHW source.
    int stride_w = 1;
    int stride_h = width;
    int stride_c = width * height;
    int stride_n = width * height * depth;
    int realdepth = UP_DIV(depth, 4); // number of 4-channel blocks
    void *ptr;
    // since format is VK_FORMAT_R32G32B32A32_SFLOAT
    ptr = getMemoryPtr(VK_WHOLE_SIZE);
    VkSubresourceLayout layout;
    VkImageSubresource subresource = {};
    subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    subresource.mipLevel = 0;
    subresource.arrayLayer = 0;
    OP(vkGetImageSubresourceLayout)(computedevice->device, image, &subresource, &layout);
    // take care about the pitch value for new row
    const uint32_t rowPitch = layout.rowPitch;
    std::cout << "row pitch " << rowPitch << std::endl;
    std::cout << "height " << height << ", width " << width << std::endl;
    for (int b = 0; b < batch; b++) {
        for (int c = 0; c < realdepth; c++) {
            // BUGFIX: the batch offset (b * height rows) was missing, so every
            // batch overwrote batch 0. The shader reads batch n at row
            // n * height, so each batch must start that many rows down.
            float *dst = reinterpret_cast<float *>(
                             reinterpret_cast<uint8_t *>(ptr) +
                             static_cast<size_t>(b) * height * rowPitch) +
                         c * width * 4;
            for (int h = 0; h < height; h++) {
                for (int w = 0; w < width; w++) {
                    int offset = b * stride_n + 4 * c * stride_c + h * stride_h + w * stride_w;
                    float r = data[offset];
                    // Missing channels (4*c + k >= depth) are zero-filled.
                    // BUGFIX: the blue value was named `b`, shadowing the batch
                    // loop variable; renamed to `bl`.
                    float g  = (4 * c + 1 < depth) ? data[stride_c + offset] : 0.0f;
                    float bl = (4 * c + 2 < depth) ? data[2 * stride_c + offset] : 0.0f;
                    float a  = (4 * c + 3 < depth) ? data[3 * stride_c + offset] : 0.0f;
                    // Write RGBA values to the Vulkan image memory
                    dst[w * 4 + 0] = r;
                    dst[w * 4 + 1] = g;
                    dst[w * 4 + 2] = bl;
                    dst[w * 4 + 3] = a;
                }
                // Move to the next row in the Vulkan image memory
                dst = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(dst) + rowPitch);
            }
        }
    }
    unmapMemory();
}
最后就是提交任务到gpu,然后运行拿回结果进行比较。
ini
// Host-side mirrors of the GLSL ivec4/ivec2 types used in the uniform block.
using ivec4 = int[4];
using ivec2 = int[2];
// Must match the layout of the shader's gridSampleBuffer block (binding = 3).
struct GpuGridSampleParam {
    ivec4 outImgSize; // output image extent
    ivec2 inShape;    // input width, height
    ivec2 outShape;   // output width, height
};
// Upload the two input images and the parameter UBO, then submit the
// prerecorded command buffer and block until the queue drains.
// NOTE(review): vkQueueSubmit's VkResult is ignored and no fence is used;
// vkQueueWaitIdle is the only synchronization here — fine for a one-shot
// benchmark, not for production use.
void OpSubmitWork()
{
float *inputPtr = originInputData.data();
float *gridPtr = originGridData.data();
this->images[1]->setImageData(inputPtr, {batch, depth, inHeight, inWidth});//NCHW
// The grid tensor is NHWC (N, H_out, W_out, 2); it is handed to setImageData
// as a pseudo-NCHW shape so the resulting image layout matches how the shader
// indexes uGrid (grid h-values packed four-per-texel).
this->images[2]->setImageData(gridPtr, {batch, outHeight, outWidth, 2});//NCHW, so take NHWC
void *aptr = buffers[0]->getMemoryPtr(sizeof(struct GpuGridSampleParam));
if (!aptr) {
std::cout << "failed to map memory!" << std::endl;
return;
}
// Fill the gridSampleBuffer uniform block (binding = 3 in the shader).
// NOTE(review): outImgSize = (outWidth, outHeight, 1, 0) bounds the dispatch;
// for depth > 4 or batch > 1 the shader's (c, n) decomposition would need a
// larger extent — confirm the intended test sizes.
struct GpuGridSampleParam *para = static_cast<struct GpuGridSampleParam *>(aptr);
para->outImgSize[0] = outWidth;
para->outImgSize[1] = outHeight;
para->outImgSize[2] = 1;
para->outImgSize[3] = 0;
para->inShape[0] = inWidth;
para->inShape[1] = inHeight;
para->outShape[0] = outWidth;
para->outShape[1] = outHeight;
std::cout << "output image size " << outWidth << " " << outHeight << std::endl;
std::cout << "input image size " << inWidth << " " << inHeight << std::endl;
buffers[0]->unmapMemory();
VkSubmitInfo submitInfo = {};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &this->commandBuffer;
OP(vkQueueSubmit)(computedevice->queue, 1, &submitInfo, nullptr);
OP(vkQueueWaitIdle)(computedevice->queue);
}
回顾小结
- 学习了vulkan如何用compute pipeline实现计算加速的编程。
- 阅读学习了uvkCompute和MNN的vulkan的实现
- 理解了grid sample的原理和实现
- 理解了MNN nchw nc4hw4这种格式,特别是nchwToImage变化中涉及的数据填充的搬移。并理解后实现了cpu侧直接的fill data的过程。