cpp
bool sieve_sort_core(uint32_t* a, size_t n, uint32_t* result, int depth) {
return (n <= _256)
? sieve_sort_256(a, n, result)
: sieve_sort_omp(a, n, result, depth)
;
}
bool sieve_sort(uint32_t* a, size_t n, int depth = 32)
{
//max(n)==256P (2^60)
if (a == nullptr || n> _256P)
return false;
else if (n == 0)
return true;
else if (n == 1) {
return true;
}
else if (n == 2) {
uint32_t a0 = a[0], a1 = a[1];
a[0] = std::min(a0, a1);
a[1] = std::max(a0, a1);
return true;
}
else {
uint32_t* result = new uint32_t[n];
if (result != nullptr) {
done = sieve_sort_core(a, n, result, depth);
delete[] result;
}
}
return done;
}
上面贴出的,是调用排序内核的"驱动"部分。主要看的是内存分配这一句,需要一个辅助的result,因为无法实现原地排序(其实也是可以的,就是对内存使用控制的要求非常严格,处理起来非常复杂)。这里要注意的是输入数据总量不能超过256P(2的60次方)。虽然按照4个字节一个整数,最高可以做到1024P(2的62次方),但是考虑到还有计算Stride分区的问题,就不把所有的数都算尽了。而且,一般来说一台机器也没有那么大的内存。
归并算法是本算法的另一个重头戏,它要对16路已排序的数据进行归并,也就是16路数据就像16个流,每次都从每个流的最开始取1个数据,选择最小的,放入排序结果。这个算法目前需要两倍的内存(也许可以优化,暂时还没想好),并且可能需要调换源和目的的位置,简化起见,我们暂时用内存复制来处理。代码如下。
cpp
bool sieve_collect(size_t n, size_t loops, size_t stride, size_t reminder, __mmask16 mask,
bool flip,
uint32_t* source, uint32_t* destination) {
if (n == 0 || loops == 0 || loops > 16 || mask == 0 || source == nullptr || destination == nullptr)
return false;
if (flip) {
std::swap(source, destination);
}
const size_t large_stride_threshold = _16M; //(1ULL << 24); //(1ULL << 12))
const size_t extreme_large_stride_threshold = _16P; //(1ULL << 56);
if (stride <= large_stride_threshold) {
__m512i idx = zero;
__m512i top = zero;
uint32_t p = 0;
for (int i = 0; i < loops; i++) {
idx.m512i_u32[i] = p;
top.m512i_u32[i] = p + (uint32_t)((i == loops - 1 && reminder > 0) ? reminder : stride);
p += (uint32_t)stride;
}
int pc = 0, i = 0;
uint32_t _min = 0;
__mmask16 _mask_min = 0;
while (mask != 0 && i < n) {
__m512i values = _mm512_mask_i32gather_epi32(zero, mask, idx, source, sizeof(uint32_t));
if (!sieve_get_min(mask, values, _min, _mask_min)) break;
idx = _mm512_mask_add_epi32(idx, _mask_min, idx, ones);
mask &= (~_mm512_mask_cmpeq_epu32_mask(_mask_min, idx, top));
pc = __popcnt16(_mask_min);
for (int j = 0; j < pc; j++)
destination[i++] = _min;
}
}
else if(stride <= extreme_large_stride_threshold){
__m512i _idx_low_ = zero;
__m512i _idx_high = zero;
__m512i top_low_ = zero;
__m512i top_high = zero;
size_t loops_low_ = loops >= 8 ? 8 : loops;
size_t loops_high = loops >= 8 ? (loops - loops_low_) : 0;
size_t p = 0;
for (int i = 0; i < loops_low_; i++) {
_idx_low_.m512i_u64[i] = p;
top_low_.m512i_u64[i] = p + ((loops_high == 0 && i == loops - 1 && reminder > 0) ? reminder : stride);
p += stride;
}
for (int i = 0; i < loops_high; i++) {
_idx_high.m512i_u64[i] = p;
top_high.m512i_u64[i] = p + ((i == loops - 1 && reminder > 0) ? reminder : stride);
p += stride;
}
int pc = 0, i = 0;
uint32_t _min = 0;
__mmask8 _mask_low_ = 0;
__mmask8 _mask_high = 0;
__mmask8 _mask_min_low_ = 0;
__mmask8 _mask_min_high = 0;
__mmask16 _mask_min = 0;
while (mask != 0 && i < n) {
_mask_low_ = (__mmask8)(mask & 0xff);
_mask_high = (__mmask8)(mask >> 8);
__m256i values_low_ = _mm512_mask_i64gather_epi32(
_zero, _mask_low_, _idx_low_, source, sizeof(uint32_t));
__m256i values_high = _mm512_mask_i64gather_epi32(
_zero, _mask_high, _idx_high, source, sizeof(uint32_t));
__m512i values = _mm512_inserti64x4(
_mm512_castsi256_si512(values_low_), values_high, 1);
if (!sieve_get_min(mask, values, _min, _mask_min)) break;
_mask_min_low_ = (__mmask8)(_mask_min & 0xff);
_mask_min_high = (__mmask8)(_mask_min >> 8);
_idx_low_ = _mm512_mask_add_epi64(_idx_low_, _mask_min_low_, _idx_low_, ones64);
_idx_high = _mm512_mask_add_epi64(_idx_high, _mask_min_high, _idx_high, ones64);
_mask_low_ &= _mm512_mask_cmpeq_epu64_mask(_mask_min_low_, _idx_low_, top_low_);
_mask_high &= _mm512_mask_cmpeq_epu64_mask(_mask_min_high, _idx_high, top_high);
mask &= ~((((__mmask16)_mask_high) << 8) | (__mmask16)_mask_low_);
pc = __popcnt16(_mask_min);
for (int j = 0; j < pc; j++)
destination[i++] = _min;
}
}
else {
return false;
}
if (flip) {
memcpy(source, destination, sizeof(uint32_t) * n);
}
return true;
}
其中根据stride的大小,分为两种情况,stride的256倍小于32位的时候,使用8个平行的32位索引,stride的256倍小于64位的时候,使用2组个4个64位索引。
对于那些源和目的翻转的情况,暂时是把目的重新复制给源。事实上应该有更好的处理方式。
测试结果:
cpp
i=0,t=1
==================================
samples:0
repeats:1
omp: 16 threads
sieve sort speed:infK/s
std sort speed: infK/s
t1(seive):0 s
t2(std::):0 s
ratio:-nan(ind)%
==================================
samples:1
repeats:1
omp: 16 threads
sieve sort speed:10000K/s
std sort speed: 10000K/s
t1(seive):1e-07 s
t2(std::):1e-07 s
ratio:100%
==================================
samples:2
repeats:1
omp: 16 threads
sieve sort speed:2000K/s
std sort speed: 5000K/s
t1(seive):5e-07 s
t2(std::):2e-07 s
ratio:40%
==================================
samples:2
repeats:1
omp: 16 threads
sieve sort speed:3333.33K/s
std sort speed: 10000K/s
t1(seive):3e-07 s
t2(std::):1e-07 s
ratio:33.3333%
==================================
samples:3
repeats:1
omp: 16 threads
sieve sort speed:1666.67K/s
std sort speed: 10000K/s
t1(seive):6e-07 s
t2(std::):1e-07 s
ratio:16.6667%
==================================
samples:4
repeats:1
omp: 16 threads
sieve sort speed:2000K/s
std sort speed: 10000K/s
t1(seive):5e-07 s
t2(std::):1e-07 s
ratio:20%
==================================
samples:5
repeats:1
omp: 16 threads
sieve sort speed:1428.57K/s
std sort speed: 10000K/s
t1(seive):7e-07 s
t2(std::):1e-07 s
ratio:14.2857%
==================================
samples:6
repeats:1
omp: 16 threads
sieve sort speed:1428.57K/s
std sort speed: 3333.33K/s
t1(seive):7e-07 s
t2(std::):3e-07 s
ratio:42.8571%
==================================
samples:7
repeats:1
omp: 16 threads
sieve sort speed:2500K/s
std sort speed: 10000K/s
t1(seive):4e-07 s
t2(std::):1e-07 s
ratio:25%
==================================
samples:8
repeats:1
omp: 16 threads
sieve sort speed:2000K/s
std sort speed: 5000K/s
t1(seive):5e-07 s
t2(std::):2e-07 s
ratio:40%
==================================
samples:9
repeats:1
omp: 16 threads
sieve sort speed:1666.67K/s
std sort speed: 5000K/s
t1(seive):6e-07 s
t2(std::):2e-07 s
ratio:33.3333%
==================================
samples:10
repeats:1
omp: 16 threads
sieve sort speed:1250K/s
std sort speed: 5000K/s
t1(seive):8e-07 s
t2(std::):2e-07 s
ratio:25%
==================================
samples:11
repeats:1
omp: 16 threads
sieve sort speed:1111.11K/s
std sort speed: 3333.33K/s
t1(seive):9e-07 s
t2(std::):3e-07 s
ratio:33.3333%
==================================
samples:12
repeats:1
omp: 16 threads
sieve sort speed:2000K/s
std sort speed: 3333.33K/s
t1(seive):5e-07 s
t2(std::):3e-07 s
ratio:60%
==================================
samples:13
repeats:1
omp: 16 threads
sieve sort speed:1111.11K/s
std sort speed: 3333.33K/s
t1(seive):9e-07 s
t2(std::):3e-07 s
ratio:33.3333%
==================================
samples:14
repeats:1
omp: 16 threads
sieve sort speed:1250K/s
std sort speed: 3333.33K/s
t1(seive):8e-07 s
t2(std::):3e-07 s
ratio:37.5%
==================================
samples:15
repeats:1
omp: 16 threads
sieve sort speed:1000K/s
std sort speed: 3333.33K/s
t1(seive):1e-06 s
t2(std::):3e-07 s
ratio:30%
i=4,t=16
==================================
samples:15
repeats:1
omp: 16 threads
sieve sort speed:1428.57K/s
std sort speed: 5000K/s
t1(seive):7e-07 s
t2(std::):2e-07 s
ratio:28.5714%
==================================
samples:16
repeats:1
omp: 16 threads
sieve sort speed:1666.67K/s
std sort speed: 5000K/s
t1(seive):6e-07 s
t2(std::):2e-07 s
ratio:33.3333%
==================================
samples:17
repeats:1
omp: 16 threads
sieve sort speed:384.615K/s
std sort speed: 5000K/s
t1(seive):2.6e-06 s
t2(std::):2e-07 s
ratio:7.69231%
i=8,t=256
==================================
samples:255
repeats:1
omp: 16 threads
sieve sort speed:32.0513K/s
std sort speed: 136.986K/s
t1(seive):3.12e-05 s
t2(std::):7.3e-06 s
ratio:23.3974%
==================================
samples:256
repeats:1
omp: 16 threads
sieve sort speed:35.2113K/s
std sort speed: 138.889K/s
t1(seive):2.84e-05 s
t2(std::):7.2e-06 s
ratio:25.3521%
==================================
samples:257
repeats:1
omp: 16 threads
sieve sort speed:22.8311K/s
std sort speed: 138.889K/s
t1(seive):4.38e-05 s
t2(std::):7.2e-06 s
ratio:16.4384%
i=12,t=4096
==================================
samples:4095
repeats:1
omp: 16 threads
sieve sort speed:7.77001K/s
std sort speed: 5.6338K/s
t1(seive):0.0001287 s
t2(std::):0.0001775 s
ratio:137.918%
==================================
samples:4096
repeats:1
omp: 16 threads
sieve sort speed:10.0301K/s
std sort speed: 6.68896K/s
t1(seive):9.97e-05 s
t2(std::):0.0001495 s
ratio:149.95%
==================================
samples:4097
repeats:1
omp: 16 threads
sieve sort speed:2.48694K/s
std sort speed: 6.28931K/s
t1(seive):0.0004021 s
t2(std::):0.000159 s
ratio:39.5424%
i=16,t=65536
==================================
samples:65535
repeats:1
omp: 16 threads
sieve sort speed:0.568473K/s
std sort speed: 0.286189K/s
t1(seive):0.0017591 s
t2(std::):0.0034942 s
ratio:198.636%
==================================
samples:65536
repeats:1
omp: 16 threads
sieve sort speed:0.750751K/s
std sort speed: 0.282646K/s
t1(seive):0.001332 s
t2(std::):0.003538 s
ratio:265.616%
==================================
samples:65537
repeats:1
omp: 16 threads
sieve sort speed:0.136305K/s
std sort speed: 0.289001K/s
t1(seive):0.0073365 s
t2(std::):0.0034602 s
ratio:47.1642%
i=20,t=1048576
==================================
samples:1048575
repeats:1
omp: 16 threads
sieve sort speed:0.0322528K/s
std sort speed: 0.0137963K/s
t1(seive):0.0310051 s
t2(std::):0.0724831 s
ratio:233.778%
==================================
samples:1048576
repeats:1
omp: 16 threads
sieve sort speed:0.0406035K/s
std sort speed: 0.0141201K/s
t1(seive):0.0246284 s
t2(std::):0.0708211 s
ratio:287.559%
==================================
samples:1048577
repeats:1
omp: 16 threads
sieve sort speed:0.00715301K/s
std sort speed: 0.0164194K/s
t1(seive):0.139801 s
t2(std::):0.0609034 s
ratio:43.5643%
i=24,t=16777216
==================================
samples:16777215
repeats:1
omp: 16 threads
sieve sort speed:0.00244063K/s
std sort speed: 0.000860936K/s
t1(seive):0.40973 s
t2(std::):1.16153 s
ratio:283.486%
==================================
samples:16777216
repeats:1
omp: 16 threads
sieve sort speed:0.00245652K/s
std sort speed: 0.000856864K/s
t1(seive):0.40708 s
t2(std::):1.16705 s
ratio:286.687%
==================================
samples:16777217
repeats:1
omp: 16 threads
sieve sort speed:0.000410236K/s
std sort speed: 0.000853255K/s
t1(seive):2.43762 s
t2(std::):1.17198 s
ratio:48.0789%
i=28,t=268435456
==================================
samples:268435455
repeats:1
omp: 16 threads
sieve sort speed:0.000151989K/s
std sort speed: 4.52433e-05K/s
t1(seive):6.57942 s
t2(std::):22.1027 s
ratio:335.938%
==================================
samples:268435456
repeats:1
omp: 16 threads
sieve sort speed:0.000148853K/s
std sort speed: 4.49265e-05K/s
t1(seive):6.71804 s
t2(std::):22.2586 s
ratio:331.325%
==================================
samples:268435457
repeats:1
omp: 16 threads
sieve sort speed:2.1965e-05K/s
std sort speed: 4.32099e-05K/s
t1(seive):45.527 s
t2(std::):23.1428 s
ratio:50.8332%
i=32,t=4294967296
==================================
samples:4294967295
repeats:1
omp: 16 threads
sieve sort speed:3.8673e-06K/s
std sort speed: 2.12853e-06K/s
t1(seive):258.579 s
t2(std::):469.807 s
ratio:181.688%
测试结果可以发现,开启多线程的情况下,数量越大,效果越好,可以达到std::sort的速度的两倍到3倍。但是有小的余量的时候效果不佳,可能是因为反复折腾复制数据的原因。
这个部分应该再仔细考虑一下。