High-Performance Computing in Practice: Far Ahead! How Much Does a Streaming Store Gain in Matrix Transpose?

Overview

This post is a supplementary benchmark to the previous article on reproducing matrix transpose. Let's see what streaming stores are really capable of.

Performance test

cpp
#include <benchmark/benchmark.h>
#include <emmintrin.h> // SSE2 intrinsics; _mm_sfence comes in via xmmintrin.h
#include <cstdint>
#include <cstring>
#include <memory>

// FORCE_INLINE is not defined in the original snippet; a typical definition:
#if defined(_MSC_VER)
#define FORCE_INLINE __forceinline
#else
#define FORCE_INLINE inline __attribute__((always_inline))
#endif
FORCE_INLINE void transpose_8x8_store_contiguous(const uint8_t* src0, const uint8_t* src1, const uint8_t* src2, const uint8_t* src3,
                                                 const uint8_t* src4, const uint8_t* src5, const uint8_t* src6, const uint8_t* src7,
                                                 uint8_t* pDst) {
    __m128i r0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src0));
    __m128i r1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src1));
    __m128i r2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src2));
    __m128i r3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src3));
    __m128i r4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src4));
    __m128i r5 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src5));
    __m128i r6 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src6));
    __m128i r7 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src7));
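    // Three rounds of interleaving (8-bit, 16-bit, then 32-bit unpacks)
    // transpose the 8x8 block: afterwards c0..c3 hold source columns 0-1,
    // 2-3, 4-5 and 6-7, two 8-byte columns per register.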
    __m128i t0 = _mm_unpacklo_epi8(r0, r1); __m128i t1 = _mm_unpacklo_epi8(r2, r3);
    __m128i t2 = _mm_unpacklo_epi8(r4, r5); __m128i t3 = _mm_unpacklo_epi8(r6, r7);
    __m128i t4 = _mm_unpacklo_epi16(t0, t1); __m128i t5 = _mm_unpacklo_epi16(t2, t3);
    __m128i t6 = _mm_unpackhi_epi16(t0, t1); __m128i t7 = _mm_unpackhi_epi16(t2, t3);
    __m128i c0 = _mm_unpacklo_epi32(t4, t5); __m128i c1 = _mm_unpackhi_epi32(t4, t5);
    __m128i c2 = _mm_unpacklo_epi32(t6, t7); __m128i c3 = _mm_unpackhi_epi32(t6, t7);
    _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 0), c0);
    _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 16), c1);
    _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 32), c2);
    _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 48), c3);
}

template <bool UseStream>
FORCE_INLINE void
transpose_64x64_tile_impl(const uint8_t* pSrc, unsigned int srcStep, uint8_t* pDst, unsigned int dstStep) {
    alignas(64) uint8_t tmp[64 * 64];
    uint8_t* tmpPtr = tmp;
    size_t srcStep8 = (size_t)srcStep * 8;
    const uint8_t* s0 = pSrc;
    for (int y = 0; y < 64; y += 8) {
        // First pass: transpose every 8x8 block of this 8-row strip and store it
        // contiguously into tmp (64 bytes per block, 512 bytes per strip), so
        // that all 64 blocks fill the whole 4096-byte buffer.
        for (int x = 0; x < 64; x += 8) {
            transpose_8x8_store_contiguous(s0+x, s0+srcStep+x, s0+srcStep*2+x, s0+srcStep*3+x,
                                           s0+srcStep*4+x, s0+srcStep*5+x, s0+srcStep*6+x, s0+srcStep*7+x, tmpPtr);
            tmpPtr += 64;
        }
        s0 += srcStep8;
    }
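    // Second pass: each destination row (a source column) is assembled from
    // eight 8-byte pieces that sit 512 bytes apart in tmp (one per 8-row
    // strip), then written out as one full 64-byte row.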
    for (int colBlock = 0; colBlock < 8; ++colBlock) {
        const uint8_t* bBase = tmp + colBlock * 64;
        for (int r = 0; r < 8; ++r) {
            int laneOffset = r * 8;
            __m128i b0 = _mm_loadl_epi64((const __m128i*)(bBase + 0 * 512 + laneOffset));
            __m128i b1 = _mm_loadl_epi64((const __m128i*)(bBase + 1 * 512 + laneOffset));
            __m128i b2 = _mm_loadl_epi64((const __m128i*)(bBase + 2 * 512 + laneOffset));
            __m128i b3 = _mm_loadl_epi64((const __m128i*)(bBase + 3 * 512 + laneOffset));
            __m128i b4 = _mm_loadl_epi64((const __m128i*)(bBase + 4 * 512 + laneOffset));
            __m128i b5 = _mm_loadl_epi64((const __m128i*)(bBase + 5 * 512 + laneOffset));
            __m128i b6 = _mm_loadl_epi64((const __m128i*)(bBase + 6 * 512 + laneOffset));
            __m128i b7 = _mm_loadl_epi64((const __m128i*)(bBase + 7 * 512 + laneOffset));
            __m128i v0 = _mm_unpacklo_epi64(b0, b1);
            __m128i v1 = _mm_unpacklo_epi64(b2, b3);
            __m128i v2 = _mm_unpacklo_epi64(b4, b5);
            __m128i v3 = _mm_unpacklo_epi64(b6, b7);
            uint8_t* dstRowPtr = pDst + (colBlock * 8 + r) * dstStep;
            if (UseStream) {
                _mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 0), v0);
                _mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 16), v1);
                _mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 32), v2);
                _mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 48), v3);
            } else {
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 0), v0);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 16), v1);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 32), v2);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 48), v3);
            }
        }
    }
}
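
// The two passes must agree on the tmp layout, which is easy to get wrong, so
// a self-test against the naive definition dst[x][y] == src[y][x] is worth
// keeping around. A minimal sketch, not part of the original benchmark;
// check_tile_64x64 is a hypothetical helper.
static bool check_tile_64x64() {
    alignas(64) static uint8_t src[64 * 64], dst[64 * 64];
    uint32_t seed = 1; // tiny LCG so every strip gets distinct data
    for (int i = 0; i < 64 * 64; ++i) {
        seed = seed * 1664525u + 1013904223u;
        src[i] = (uint8_t)(seed >> 24);
    }
    transpose_64x64_tile_impl<true>(src, 64, dst, 64);
    _mm_sfence(); // drain the write-combining buffers before reading dst back
    for (int y = 0; y < 64; ++y)
        for (int x = 0; x < 64; ++x)
            if (dst[x * 64 + y] != src[y * 64 + x]) return false;
    return true;
}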

// Direct 8x8 transpose
FORCE_INLINE void
transpose_8x8_u8_to_strided(const uint8_t* pSrc, unsigned int srcStep, uint8_t* pDst, unsigned int dstStep) {
    __m128i r0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 0 * srcStep));
    __m128i r1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 1 * srcStep));
    __m128i r2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 2 * srcStep));
    __m128i r3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 3 * srcStep));
    __m128i r4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 4 * srcStep));
    __m128i r5 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 5 * srcStep));
    __m128i r6 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 6 * srcStep));
    __m128i r7 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSrc + 7 * srcStep));

    __m128i t0 = _mm_unpacklo_epi8(r0, r1); __m128i t1 = _mm_unpacklo_epi8(r2, r3);
    __m128i t2 = _mm_unpacklo_epi8(r4, r5); __m128i t3 = _mm_unpacklo_epi8(r6, r7);
    __m128i t4 = _mm_unpacklo_epi16(t0, t1); __m128i t5 = _mm_unpacklo_epi16(t2, t3);
    __m128i t6 = _mm_unpackhi_epi16(t0, t1); __m128i t7 = _mm_unpackhi_epi16(t2, t3);
    __m128i c0 = _mm_unpacklo_epi32(t4, t5); __m128i c1 = _mm_unpackhi_epi32(t4, t5);
    __m128i c2 = _mm_unpacklo_epi32(t6, t7); __m128i c3 = _mm_unpackhi_epi32(t6, t7);

    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 0 * dstStep), c0);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 1 * dstStep), _mm_srli_si128(c0, 8));
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 2 * dstStep), c1);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 3 * dstStep), _mm_srli_si128(c1, 8));
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 4 * dstStep), c2);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 5 * dstStep), _mm_srli_si128(c2, 8));
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 6 * dstStep), c3);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 7 * dstStep), _mm_srli_si128(c3, 8));
}
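
// Note the store pattern above: eight 8-byte stores landing in eight different
// cache lines per block. Each partial-line write costs a read-for-ownership on
// the destination line, which is exactly the traffic the buffered 64x64
// variant avoids by assembling full 64-byte rows before storing.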

// If the memory is 64-byte aligned this runs faster
// void aligned_free_wrapper(void* ptr) { _aligned_free(ptr); }
//
// using AlignedUniquePtr = std::unique_ptr<uint8_t[], void(*)(void*)>;
//
// AlignedUniquePtr make_aligned_buffer(size_t size, size_t alignment) {
//     size_t remainder = size % alignment;
//     size_t alloc_size = (remainder == 0) ? size : (size + alignment - remainder);
//
//     void* ptr = nullptr;
//
//     ptr = _aligned_malloc(alloc_size, alignment);
//
//     return AlignedUniquePtr(static_cast<uint8_t*>(ptr), aligned_free_wrapper);
// }
//
// class TransposeFixture : public benchmark::Fixture {
// public:
//     AlignedUniquePtr src_owner{nullptr, aligned_free_wrapper};
//     AlignedUniquePtr dst_owner{nullptr, aligned_free_wrapper};
//
//     uint8_t* src = nullptr;
//     uint8_t* dst = nullptr;
//
//     const int width = 4096;
//     const int height = 4096;
//     size_t step;
//
//     void SetUp(const benchmark::State& state) override {
//         step = width;
//         size_t total_bytes = step * height;
//         size_t alignment = 64;
//
//         src_owner = make_aligned_buffer(total_bytes, alignment);
//         dst_owner = make_aligned_buffer(total_bytes, alignment);
//
//         if (!src_owner || !dst_owner) {
//             const_cast<benchmark::State&>(state).SkipWithError("Memory allocation failed!");
//             return;
//         }
//
//         src = src_owner.get();
//         dst = dst_owner.get();
//
//         std::memset(src, 128, total_bytes);
//         std::memset(dst, 0, total_bytes);
//     }
//
//     void TearDown(const benchmark::State& state) override {
//     }
// };
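//
// _aligned_malloc/_aligned_free are MSVC-specific. On other toolchains a
// portable C++17 variant could look like this sketch (an assumption, not from
// the original post; std::aligned_alloc wants the size rounded up to a
// multiple of the alignment, and its memory is released with std::free):
//
// AlignedUniquePtr make_aligned_buffer(size_t size, size_t alignment) {
//     size_t alloc_size = (size + alignment - 1) / alignment * alignment;
//     void* ptr = std::aligned_alloc(alignment, alloc_size);
//     return AlignedUniquePtr(static_cast<uint8_t*>(ptr), std::free);
// }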

class TransposeFixture : public benchmark::Fixture {
public:
    // Changed to standard unique_ptr array
    std::unique_ptr<uint8_t[]> src_owner;
    std::unique_ptr<uint8_t[]> dst_owner;

    uint8_t* src = nullptr;
    uint8_t* dst = nullptr;

    const int width = 4096;
    const int height = 4096;
    size_t step;

    void SetUp(const benchmark::State& state) override {
        step = width;
        size_t total_bytes = step * height;

        // Removed alignment logic, using standard new[]
        try {
            src_owner = std::make_unique<uint8_t[]>(total_bytes);
            dst_owner = std::make_unique<uint8_t[]>(total_bytes);
        } catch (const std::bad_alloc&) {
            const_cast<benchmark::State&>(state).SkipWithError("Memory allocation failed!");
            return;
        }

        src = src_owner.get();
        dst = dst_owner.get();

        std::memset(src, 128, total_bytes);
        std::memset(dst, 0, total_bytes);
    }

    void TearDown(const benchmark::State& state) override {
    }
};
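
// Why plain new[] is still safe for the streaming stores: operator new[]
// returns memory aligned to __STDCPP_DEFAULT_NEW_ALIGNMENT__ (16 bytes on
// mainstream x86-64 toolchains), which satisfies _mm_stream_si128, and all
// tile offsets are multiples of 64. The 64-byte alignment in the commented-out
// fixture above is only a cache-line nicety.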

// A dummy benchmark that spins the CPU up to its high-frequency state before the real measurements start
void CPU_WarmUp(benchmark::State& state) {
    for (auto _ : state) {
        // Burn CPU cycles with some simple floating-point math
        volatile double x = 1.0;
        for (int i = 0; i < 1000; ++i) {
            x = x * 1.0001 + 0.001;
        }
        benchmark::DoNotOptimize(x);
    }
}
// Force the warm-up to run for at least 1 second and register it first
BENCHMARK(CPU_WarmUp)->MinTime(1.0);

BENCHMARK_F(TransposeFixture, Std_Memcpy)(benchmark::State& state) {
    size_t size = size_t(width) * height;
    for (auto _ : state) {
        std::memcpy(dst, src, size);
        benchmark::DoNotOptimize(dst);
    }
    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(width) * int64_t(height) * 2);
}

BENCHMARK_F(TransposeFixture, Buffered_64x64_Stream)(benchmark::State& state) {
    for (auto _ : state) {
        // Walk the matrix in 64x64 tiles
        for (int y = 0; y < height; y += 64) {
            for (int x = 0; x < width; x += 64) {
                const uint8_t* sTile = src + y * step + x;
                uint8_t* dTile = dst + x * step + y;

                transpose_64x64_tile_impl<true>(sTile, step, dTile, step);
            }
        }
        _mm_sfence(); // make the non-temporal stores globally visible before the next iteration
    }
    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(width) * int64_t(height) * 2);
}

BENCHMARK_F(TransposeFixture, Direct_8x8_StoreU)(benchmark::State& state) {
    for (auto _ : state) {
        for (int y = 0; y < height; y += 8) {
            const uint8_t* src_row_ptr = src + y * step;

            for (int x = 0; x < width; x += 8) {
                uint8_t* dst_block_ptr = dst + x * step + y;

                transpose_8x8_u8_to_strided(src_row_ptr + x, step, dst_block_ptr, step);
            }
        }
    }
    state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(width) * int64_t(height) * 2);
}

BENCHMARK_MAIN();
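
// Build note (an assumption, not from the original post): with the headers at
// the top this compiles as a single translation unit against Google Benchmark,
// e.g.  g++ -O2 -std=c++17 transpose_bench.cpp -lbenchmark -lpthread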

Test results

TransposeFixture/Std_Memcpy                602332 ns       593750 ns         1000 bytes_per_second=52.6316Gi/s
TransposeFixture/Buffered_64x64_Stream     922109 ns       920348 ns          747 bytes_per_second=33.9545Gi/s
TransposeFixture/Direct_8x8_StoreU       12036794 ns     12187500 ns           50 bytes_per_second=2.5641Gi/s

On a 4Kx4K buffer, where the copy is fully sequential, fully aligned, and needs no shuffling whatsoever, the standard library's memcpy is a good upper-bound reference for what the memory subsystem can sustain.

Clearly, streaming stores really are far ahead in this scenario: the buffered 64x64 kernel runs roughly 13x faster than the direct 8x8 version and lands within a factor of two of the memcpy ceiling. The caveat is that they must be used correctly: write whole 64-byte cache lines sequentially (exactly what the tmp-buffer pass arranges) and don't touch the destination again right away.