Introduction
In the previous article, CUDA Operators in LightGBM (Part 1), we introduced the structure of the LightGBM source code and two basic CUDA operators: parallel reduction and bitonic sort. These two basic operators are used throughout LightGBM's boosting process, in histogram computation, gain calculation, split finding, and other steps. In this article we continue with another class of CUDA operators used in boosting: the histogram operators.
Histogram Operators
The LightGBM histogram operators are declared and implemented in src/treelearner/cuda/cuda_histogram_constructor.h and cuda_histogram_constructor.cu, respectively. They fall into two groups: histogram construction and histogram arithmetic. The construction operators have separate implementations for different cases, such as different feature types (dense or sparse) and different memory (global or shared). Below we take one of these cases as an example to walk through histogram construction.
Histogram Construction
The CUDAConstructHistogramDenseKernel_GlobalMemory() operator shows how to construct histograms for dense features using global memory. First, let us clarify several dimension concepts:
n_data: the number of data points (samples);
n_column: the number of columns (features);
n_partition: the number of partitions obtained by partitioning the features;
n_column_per_partition: the maximum number of features assigned to one partition; if the division is not exact, the last partition receives fewer than n_column_per_partition features;
n_bin: the number of bins a single feature is divided into when its histogram is constructed;
n_data_hat: the number of samples assigned to the leaf node's sample set. A toy sketch of the partitioning is shown below.
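To make these dimensions concrete, here is a minimal sketch with toy numbers (hypothetical, not taken from LightGBM) of how features are split into partitions:
C++
#include <cstdio>

// Toy partitioning: 10 features, 3 partitions.
int main() {
  const int n_column = 10;
  const int n_partition = 3;
  // Maximum number of features per partition (ceiling division).
  const int n_column_per_partition = (n_column + n_partition - 1) / n_partition;  // 4
  for (int p = 0; p < n_partition; ++p) {
    const int start = p * n_column_per_partition;
    const int end = (start + n_column_per_partition < n_column) ? (start + n_column_per_partition) : n_column;
    // The last partition gets fewer features when the division is not exact: 4, 4, 2.
    printf("partition %d: columns [%d, %d)\n", p, start, end);
  }
  return 0;
}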
C++
// cuda_histogram_constructor.cu
template <typename BIN_TYPE>
__global__ void CUDAConstructHistogramDenseKernel_GlobalMemory(
  const CUDALeafSplitsStruct* smaller_leaf_splits,  // struct wrapping the leaf's split information
  const score_t* cuda_gradients,  // n_data
  const score_t* cuda_hessians,  // n_data
  const BIN_TYPE* data,  // n_data * n_column, stores the bin indices
  const uint32_t* column_hist_offsets,  // n_column
  const uint32_t* column_hist_offsets_full,  // n_partition + 1
  const int* feature_partition_column_index_offsets,  // n_partition + 1
  const data_size_t num_data,  // n_data
  float* global_hist_buffer)  // gridDim.y * n_partition * n_column_per_partition * n_bin * 2
{
  const int dim_y = static_cast<int>(gridDim.y * blockDim.y);
  const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf;  // number of data points routed to this leaf, n_data_hat <= n_data
  const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y;  // number of data points each thread processes
  const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf;  // data indices of the leaf's sample set
  const unsigned int num_threads_per_block = blockDim.x * blockDim.y;
  // the columns are split into partitions, each holding n_column_per_partition = n_column / n_partition features
  // a single block handles a single partition, so n_partition = gridDim.x
  const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x];
  const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1];
  // start of this block's slice of data
  const BIN_TYPE* data_ptr = data + static_cast<size_t>(partition_column_start) * num_data;
  // number of columns in the partition this block handles
  const int num_columns_in_partition = partition_column_end - partition_column_start;
  // offset of this block's partition within global_hist_buffer;
  // along x, each block covers a span of length n_column_per_partition * n_bin
  const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x];
  const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1];
  // two slots per bin, one for grad and one for hess
  const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1;
  const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x;
  // n_column * n_bin
  const int num_total_bin = column_hist_offsets_full[gridDim.x];
  // global_hist_buffer uses a scattered two-dimensional layout; shared_hist points at the current partition's start
  float* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2;
  // the loop covers the case num_items_in_partition > number of threads in the block, where a thread handles several addresses
  for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) {
    shared_hist[i] = 0.0f;  // initialize global_hist_buffer to 0
  }
  __syncthreads();
  const unsigned int threadIdx_y = threadIdx.y;
  const unsigned int blockIdx_y = blockIdx.y;
  // start index of this block's slice of the data
  const data_size_t block_start = (static_cast<size_t>(blockIdx_y) * blockDim.y) * num_data_per_thread;
  // start address of this block's data indices
  const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start;
  // size of this block's slice, robust to non-exact division
  data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast<data_size_t>(blockDim.y)));
  // handle both block_num_data >= blockDim.y and non-exact division
  const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y;
  const data_size_t remainder = block_num_data % blockDim.y;
  const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast<data_size_t>(threadIdx_y >= remainder);
  data_size_t inner_data_index = static_cast<data_size_t>(threadIdx_y);
  const int column_index = static_cast<int>(threadIdx.x) + partition_column_start;
  if (threadIdx.x < static_cast<unsigned int>(num_columns_in_partition)) {
    float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1);  // start of the current column within the current partition
    for (data_size_t i = 0; i < num_iteration_this; ++i) {  // the loop covers the case block_num_data > blockDim.y
      const data_size_t data_index = data_indices_ref_this_block[inner_data_index];
      // fetch grad and hess
      const score_t grad = cuda_gradients[data_index];
      const score_t hess = cuda_hessians[data_index];
      // fetch the bin index of the data point this thread handles
      const uint32_t bin = static_cast<uint32_t>(data_ptr[static_cast<size_t>(data_index) * num_columns_in_partition + threadIdx.x]);
      const uint32_t pos = bin << 1;
      float* pos_ptr = shared_hist_ptr + pos;  // start of the current bin under the current partition and column
      atomicAdd_block(pos_ptr, grad);  // accumulate grad into this bin
      atomicAdd_block(pos_ptr + 1, hess);  // accumulate hess into this bin
      inner_data_index += blockDim.y;
    }
  }
  // update the histogram in smaller_leaf_splits
  __syncthreads();
  hist_t* feature_histogram_ptr = smaller_leaf_splits->hist_in_leaf + (partition_hist_start << 1);
  for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) {
    atomicAdd_system(feature_histogram_ptr + i, shared_hist[i]);
  }
}
Parameters passed into the kernel:
cuda_gradients, cuda_hessians: hold the gradients and hessians (second-order derivatives), respectively; size = n_data;
data: holds the bin index for each data-feature pair; size = n_data * n_column;
column_hist_offsets: holds the histogram address offset of each feature; size = n_column; adjacent offsets differ by n_bin;
column_hist_offsets_full: holds the histogram address offset of each partition; size = n_partition + 1; adjacent offsets differ by n_column_per_partition * n_bin;
feature_partition_column_index_offsets: holds the column index offset of each partition; size = n_partition + 1; adjacent offsets differ by n_column_per_partition. A toy illustration of these three offset arrays follows below.
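As a hedged illustration (toy values; it assumes every feature has the same n_bin, whereas in LightGBM bin counts can vary per feature), the offset arrays for n_column = 4, n_partition = 2, n_bin = 8 might look like this:
C++
#include <cstdint>

// Toy offset arrays for n_column = 4, n_partition = 2, n_bin = 8
// (hypothetical; assumes a uniform bin count per feature).
const uint32_t column_hist_offsets[] = {0, 8, 16, 24};          // one entry per feature, stride n_bin
const uint32_t column_hist_offsets_full[] = {0, 16, 32};        // partition boundaries, stride n_column_per_partition * n_bin
const int feature_partition_column_index_offsets[] = {0, 2, 4}; // partition p covers columns [offset[p], offset[p+1])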
The parameters above are pure inputs. Two further parameters, smaller_leaf_splits and global_hist_buffer, receive the outputs. smaller_leaf_splits is a struct that wraps the leaf node's split information:
C++
// cuda_leaf_splits.hpp
struct CUDALeafSplitsStruct {
 public:
  int leaf_index;
  double sum_of_gradients;
  double sum_of_hessians;
  data_size_t num_data_in_leaf;
  double gain;
  double leaf_value;
  const data_size_t* data_indices_in_leaf;
  hist_t* hist_in_leaf;
};
The field data_indices_in_leaf holds the indices of the leaf node's sample set and is used in the kernel to locate each data point's bin index. The field hist_in_leaf is updated at the end of the kernel. The parameter global_hist_buffer receives output values and stores the histogram data; global_hist_buffer uses a scattered, staggered layout:
Figure 1: data layout in global_hist_buffer
The upper half of Figure 1 shows how the blocks corresponding to the partitions are laid out in the grid, and the lower half shows the data layout inside the partition handled by a single block: suppose each partition holds 2 columns and each column is split into 2 bins; then each bin occupies 2 storage slots, one for a unit of gradients and one for a unit of hessians. The address arithmetic this layout implies is sketched below.
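As a minimal sketch of that arithmetic (HistBufferIndex is a hypothetical helper that mirrors the kernel's indexing): each blockIdx.y "row" owns a full copy of all histograms (num_total_bin bins, 2 floats per bin), and each partition occupies a contiguous slice inside that row.
C++
#include <cstddef>
#include <cstdint>

// Hedged sketch of the scatter layout's address arithmetic.
inline size_t HistBufferIndex(int block_idx_y,                // row index, i.e. blockIdx.y
                              int num_total_bin,              // column_hist_offsets_full[gridDim.x]
                              uint32_t partition_hist_start,  // column_hist_offsets_full[blockIdx.x]
                              uint32_t bin_in_partition,      // bin position relative to the partition start
                              bool is_hessian) {              // grad at slot 0, hess at slot 1
  const size_t row_offset = static_cast<size_t>(block_idx_y) * num_total_bin;
  return (row_offset + partition_hist_start + bin_in_partition) * 2 + (is_hessian ? 1 : 0);
}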
Now that the parameter dimensions and data layout are clear, let us walk through the histogram construction in the kernel in detail. The kernel uses a two-dimensional grid: the x dimension corresponds to features and the y dimension to data, and along x each block is responsible for one feature partition. The total thread extent in each dimension (gridDim * blockDim) is usually smaller than n_column and n_data_hat, so some threads have to cover multiple items: the two loops in the kernel, the one that zero-initializes the partition's histogram slots and the main accumulation loop, handle the cases where num_items_in_partition exceeds the block's thread count and where n_data_hat > gridDim.y * blockDim.y. Likewise, n_column and n_data_hat are generally not exact multiples of blockDim and gridDim, which the code around num_iteration_total, remainder, and num_iteration_this accounts for. An illustrative launch sketch appears after the step list below.
The kernel executes in three steps:
- initialize the region of global_hist_buffer that will hold the data to 0;
- using the data indices, fetch each data point's grad and hess from cuda_gradients and cuda_hessians and accumulate them onto the corresponding bin address in global_hist_buffer;
- update the histogram field of smaller_leaf_splits.
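As promised, here is a hedged host-side sketch of how the kernel might be launched (LaunchConstructHistDense is a hypothetical helper; the grid and block sizes are illustrative, not LightGBM's tuned constants):
C++
#include <cstdint>

// Hypothetical launch wrapper: all pointers are assumed to be device buffers
// already filled by the caller. One block per feature partition along x; the y
// dimension tiles the leaf's data rows. Note that blockDim.x must be at least
// n_column_per_partition, since each threadIdx.x handles exactly one column.
void LaunchConstructHistDense(const CUDALeafSplitsStruct* smaller_leaf_splits,
                              const score_t* cuda_gradients,
                              const score_t* cuda_hessians,
                              const uint8_t* data,
                              const uint32_t* column_hist_offsets,
                              const uint32_t* column_hist_offsets_full,
                              const int* feature_partition_column_index_offsets,
                              data_size_t num_data,
                              float* global_hist_buffer,
                              int num_partitions) {
  dim3 grid_dim(num_partitions, 8);  // gridDim.x = n_partition, gridDim.y = 8 (illustrative)
  dim3 block_dim(32, 16);            // threadIdx.x -> columns, threadIdx.y -> data rows
  CUDAConstructHistogramDenseKernel_GlobalMemory<uint8_t><<<grid_dim, block_dim>>>(
      smaller_leaf_splits, cuda_gradients, cuda_hessians, data,
      column_hist_offsets, column_hist_offsets_full,
      feature_partition_column_index_offsets, num_data, global_hist_buffer);
}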
The definition of shared_hist locates the start, within global_hist_buffer, of the partition this block is responsible for; the loop right after it initializes the block's slice of hist data to 0. The kernel then defines several indices (block_start, block_num_data, num_iteration_this, and so on), handling the cases where the division is not exact.
Next, shared_hist_ptr locates the start of the current column's grad and hess data within the current partition of global_hist_buffer. The inner loop is the main body of histogram construction: it first uses data_indices_in_leaf from smaller_leaf_splits to find the indices of the leaf's sample set within data, then fetches the grad and hess values from cuda_gradients and cuda_hessians at those indices. pos_ptr locates the start, within global_hist_buffer, of the current bin under the current partition and column, and the two atomicAdd_block calls accumulate the fetched grad and hess values onto the corresponding hist bin.
Finally, the closing loop of the kernel adds the accumulated values into the hist_in_leaf field of smaller_leaf_splits, which completes the histogram construction. A sequential reference of the whole computation is sketched below.
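To double-check the semantics, here is a hedged sequential reference of what the kernel computes for one leaf (plain C++, hypothetical and unoptimized; for clarity it assumes a simple row-major bin matrix, whereas the kernel groups columns by partition):
C++
#include <cstdint>
#include <vector>

// Sequential reference: for every sample in the leaf, add its grad and hess to
// the two slots of the bin its value falls into, for every feature.
std::vector<double> BuildHistReference(const std::vector<int>& data_indices_in_leaf,
                                       const std::vector<float>& gradients,  // size n_data
                                       const std::vector<float>& hessians,   // size n_data
                                       const std::vector<uint8_t>& bins,     // n_data * n_column, row-major
                                       int n_column, int n_bin) {
  std::vector<double> hist(static_cast<size_t>(n_column) * n_bin * 2, 0.0);
  for (const int idx : data_indices_in_leaf) {
    for (int col = 0; col < n_column; ++col) {
      const uint8_t bin = bins[static_cast<size_t>(idx) * n_column + col];
      double* slot = &hist[(static_cast<size_t>(col) * n_bin + bin) * 2];
      slot[0] += gradients[idx];  // grad accumulated per bin
      slot[1] += hessians[idx];   // hess accumulated per bin
    }
  }
  return hist;
}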
Histogram Arithmetic
There are two histogram arithmetic operators: histogram subtraction (SubtractHistogramKernel) and histogram "fixing" (FixHistogramKernel). The subtraction operator is very simple: the grad and hess of the cuda_larger_leaf_splits histogram are each reduced by the grad and hess of the cuda_smaller_leaf_splits histogram:
C++
// cuda_histogram_constructor.cu
__global__ void SubtractHistogramKernel(
  const int num_total_bin,
  const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
  const CUDALeafSplitsStruct* cuda_larger_leaf_splits) {
  const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x;
  const int cuda_larger_leaf_index = cuda_larger_leaf_splits->leaf_index;
  if (cuda_larger_leaf_index >= 0) {
    const hist_t* smaller_leaf_hist = cuda_smaller_leaf_splits->hist_in_leaf;
    hist_t* larger_leaf_hist = cuda_larger_leaf_splits->hist_in_leaf;
    if (global_thread_index < 2 * num_total_bin) {
      // subtract smaller_leaf_hist's grad and hess from larger_leaf_hist's grad and hess
      larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index];
    }
  }
}
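This is LightGBM's histogram subtraction trick: a parent's histogram equals the sum of its two children's histograms, so after constructing the smaller child's histogram directly, the larger child's can be obtained by subtracting it from the parent's histogram (which larger_leaf_hist initially holds) instead of running a second construction pass. A hedged launch sketch with one thread per grad/hess slot (LaunchSubtractHistogram is a hypothetical helper; the block size is illustrative):
C++
// Hypothetical launch wrapper: 2 * num_total_bin slots (grad + hess per bin),
// one thread per slot, rounded up to whole blocks.
void LaunchSubtractHistogram(int num_total_bin,
                             const CUDALeafSplitsStruct* cuda_smaller_leaf_splits,
                             const CUDALeafSplitsStruct* cuda_larger_leaf_splits) {
  const int threads_per_block = 256;  // illustrative block size
  const int num_slots = 2 * num_total_bin;
  const int num_blocks = (num_slots + threads_per_block - 1) / threads_per_block;
  SubtractHistogramKernel<<<num_blocks, threads_per_block>>>(
      num_total_bin, cuda_smaller_leaf_splits, cuda_larger_leaf_splits);
}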
The histogram "fixing" operator is shown below. Its job is to find the features of cuda_smaller_leaf_splits that need fixing and, for each one, to set the grad and hess of the feature's most frequent bin to the leaf-wide total of cuda_smaller_leaf_splits minus the feature's local sum:
C++
__global__ void FixHistogramKernel(
  const uint32_t* cuda_feature_num_bins,
  const uint32_t* cuda_feature_hist_offsets,
  const uint32_t* cuda_feature_most_freq_bins,
  const int* cuda_need_fix_histogram_features,
  const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned,
  const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) {
  // shared memory used below to reduce-sum the grad and hess values over this feature's bins
  __shared__ hist_t shared_mem_buffer[32];
  const unsigned int blockIdx_x = blockIdx.x;
  // index of the feature that needs fixing
  const int feature_index = cuda_need_fix_histogram_features[blockIdx_x];
  // the feature's bin count, aligned for the reduction
  const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x];
  // offset of this feature's histogram
  const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index];
  // the most frequent bin of this feature
  const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index];
  const double leaf_sum_gradients = cuda_smaller_leaf_splits->sum_of_gradients;
  const double leaf_sum_hessians = cuda_smaller_leaf_splits->sum_of_hessians;
  // start of this feature's histogram data
  hist_t* feature_hist = cuda_smaller_leaf_splits->hist_in_leaf + feature_hist_offset * 2;
  const unsigned int threadIdx_x = threadIdx.x;
  // number of bins of this feature
  const uint32_t num_bin = cuda_feature_num_bins[feature_index];
  const uint32_t hist_pos = threadIdx_x << 1;
  const hist_t bin_gradient = (threadIdx_x < num_bin && threadIdx_x != most_freq_bin) ? feature_hist[hist_pos] : 0.0f;
  const hist_t bin_hessian = (threadIdx_x < num_bin && threadIdx_x != most_freq_bin) ? feature_hist[hist_pos + 1] : 0.0f;
  // reduce to obtain the sums of grad and hess over this feature's bins
  const hist_t sum_gradient = ShuffleReduceSum<hist_t>(bin_gradient, shared_mem_buffer, num_bin_aligned);
  const hist_t sum_hessian = ShuffleReduceSum<hist_t>(bin_hessian, shared_mem_buffer, num_bin_aligned);
  // set the most frequent bin's grad and hess to the leaf total minus the feature's local sum
  if (threadIdx_x == 0) {
    feature_hist[most_freq_bin << 1] = leaf_sum_gradients - sum_gradient;
    feature_hist[(most_freq_bin << 1) + 1] = leaf_sum_hessians - sum_hessian;
  }
}
The parameter cuda_need_fix_histogram_features stores the features that need fixing. The kernel uses a one-dimensional grid; each block handles one feature to be fixed, and each thread handles one bin of that feature. The definition of feature_hist locates the start of the block's feature within hist_in_leaf; the two ShuffleReduceSum calls then compute the feature's local grad and hess sums over the first num_bin_aligned bins; and the final two assignments write the leaf-wide total minus the local sum into the feature's most_freq_bin. A sequential sketch of this fix follows below.
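As a hedged sequential sketch of the same fix (FixFeatureHistReference is a hypothetical helper, for intuition only): in LightGBM the most frequent bin of such a feature is typically not accumulated during construction, so its entry is reconstructed as the leaf-wide total minus the sum over the remaining bins.
C++
#include <cstdint>

// Sequential sketch of FixHistogramKernel's logic for a single feature.
// hist points at the feature's num_bin bins, 2 doubles each (grad, hess).
void FixFeatureHistReference(double* hist, uint32_t num_bin, uint32_t most_freq_bin,
                             double leaf_sum_gradients, double leaf_sum_hessians) {
  double local_grad = 0.0, local_hess = 0.0;
  for (uint32_t bin = 0; bin < num_bin; ++bin) {
    if (bin == most_freq_bin) continue;  // skip the slot being reconstructed
    local_grad += hist[bin * 2];
    local_hess += hist[bin * 2 + 1];
  }
  // leaf-wide total minus the feature's local sum over the other bins
  hist[most_freq_bin * 2] = leaf_sum_gradients - local_grad;
  hist[most_freq_bin * 2 + 1] = leaf_sum_hessians - local_hess;
}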
Summary
In this article we introduced the histogram-related CUDA operators in LightGBM, covering the two groups: histogram construction and histogram arithmetic. Histogram construction is somewhat more involved to understand; one first needs to grasp the data layout, in particular the scattered layout of global_hist_buffer. In later articles we will continue with the other task-specific operators of the boosting process.