NUMA: Non-Uniform Memory Access
Data locality: on a NUMA system a memory page is physically allocated in the NUMA domain of the thread that first touches (writes) it, so arrays should be initialized by the same threads that will later compute on them.
Figure 12-6: with and without first touch
cpp
// Step 1.a: initialization by the initial thread only
for (j = 0; j < VectorSize; j++) {
   a[j] = 1.0; b[j] = 2.0; c[j] = 0.0;
}

// Step 1.b: initialization by all threads (first touch)
omp_set_dynamic(0);
#pragma omp parallel for schedule(static)
for (j = 0; j < VectorSize; j++) {
   a[j] = 1.0; b[j] = 2.0; c[j] = 0.0;
}

// Step 2: compute
#pragma omp parallel for schedule(static)
for (j = 0; j < VectorSize; j++) {
   a[j] = b[j] + d * c[j];
}
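Because the first-touch initialization in Step 1.b and the compute loop in Step 2 use the same schedule(static), each thread first touches exactly the elements it will later work on, so those pages end up in that thread's NUMA domain.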
Controlling thread affinity:
place: the set of hardware resources to which an OpenMP thread may be bound
Defining places explicitly: export OMP_PLACES="{0,1,2,3},{4,5,6,7}"
The same list in interval notation: export OMP_PLACES="{0:4},{4:4}" (start:length)
Abstract place names: export OMP_PLACES=threads (bind OpenMP threads at the granularity of hardware threads), cores, or sockets
Controlling processor binding: OMP_PROC_BIND accepts true, false, master, close, or spread.
close: the threads of the team are placed near the master thread; they are assigned to consecutive places in round-robin order, starting with the place to the right of the master thread's place.
spread: the threads are distributed as evenly as possible across the places.
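The placement produced by a given OMP_PLACES/OMP_PROC_BIND combination can also be inspected from inside the program with the OpenMP place API. A minimal sketch, assuming an OpenMP 4.5 or later runtime:
cpp
#include <omp.h>
#include <stdio.h>

int main(void)
{
   // Report how the current OMP_PLACES / OMP_PROC_BIND setting
   // bound each thread of a parallel region to a place.
   printf("number of places: %d\n", omp_get_num_places());
   #pragma omp parallel
   {
      #pragma omp critical
      printf("thread %d runs on place %d\n",
             omp_get_thread_num(), omp_get_place_num());
   }
   return 0;
}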
Nested parallel constructs:
Nested parallel constructs can be used to influence how threads are distributed on a NUMA system:
export OMP_NESTED=true (deprecated since OpenMP 5.0)
export OMP_MAX_ACTIVE_LEVELS=3
cpp
#include <omp.h>
#include <stdio.h>

void report_num_threads(int level)
{
   #pragma omp single
   {
      printf("Level %d: number of threads in the team: %d\n",
             level, omp_get_num_threads());
   }
}

int main()
{
   omp_set_dynamic(0);
   #pragma omp parallel num_threads(2)
   {
      report_num_threads(1);
      #pragma omp parallel num_threads(2)
      {
         report_num_threads(2);
         #pragma omp parallel num_threads(2)
         {
            report_num_threads(3);
         }
      }
   }
   return 0;
}
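Assuming nested parallelism is enabled (for example with OMP_MAX_ACTIVE_LEVELS=3; otherwise the inner regions run with teams of one thread), every team here has two threads, so the program prints the Level 1 line once, the Level 2 line twice, and the Level 3 line four times, each reporting a team of 2; the interleaving of the lines may vary from run to run.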
The OMP_NUM_THREADS, OMP_PLACES, and OMP_PROC_BIND environment variables are extended to support nesting:
export OMP_PLACES=sockets,threads
export OMP_NUM_THREADS=2,4
export OMP_PROC_BIND=spread,close
When the program starts there is one initial thread, running on the first hardware thread of core 0. When the first parallel region is encountered, the first values of OMP_NUM_THREADS and OMP_PROC_BIND are used (2, spread): two threads are created, one on each socket. Note that a thread may run on any core and any hardware thread of the place it is bound to; here the place is a socket, so each thread may run on any core of its own socket. Once a parallel region has been created, the internal control variables move on to the next values in the lists: 4 for the number of threads and close for the processor binding. So when each thread encounters the nested parallel region, it creates four threads on the same core.
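To verify how such a nested configuration actually places threads, the place query shown earlier can be combined with omp_get_level(). A minimal sketch; the outer 2 and inner 4 team sizes simply mirror OMP_NUM_THREADS=2,4, and nested parallelism must be enabled (e.g., OMP_MAX_ACTIVE_LEVELS=2 or higher):
cpp
#include <omp.h>
#include <stdio.h>

int main(void)
{
   // OMP_PLACES and OMP_PROC_BIND are taken from the environment;
   // the team sizes here mirror OMP_NUM_THREADS=2,4.
   #pragma omp parallel num_threads(2)
   {
      #pragma omp parallel num_threads(4)
      {
         #pragma omp critical
         printf("level %d, thread %d: place %d of %d\n",
                omp_get_level(), omp_get_thread_num(),
                omp_get_place_num(), omp_get_num_places());
      }
   }
   return 0;
}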
Checking thread affinity:
OMP_DISPLAY_AFFINITY=true makes the runtime report the affinity of each thread.
OMP_AFFINITY_FORMAT controls the format of that report; the fields used below are %L (nesting level), %n (thread number within the team), and %A (the thread's affinity, i.e., the OS processors it may run on).
Figure 12-12:
bash
$ icc -qopenmp -DNTIMES=20 -DSTREAM_ARRAY_SIZE=64000000 -c stream.c
$ icc -qopenmp -o stream stream.o
$ export OMP_DISPLAY_AFFINITY=true
$ export OMP_AFFINITY_FORMAT="Thrd Lev=%3L, thrd_num=%5n, thrd_aff=%15A"
$ export OMP_PLACES=threads
$ export OMP_NUM_THREADS=8
$ export OMP_PROC_BIND=spread
$ ./stream | sort -k3
<stream results omitted ...>
Thrd Lev=1 , thrd_num=0 , thrd_aff=0
Thrd Lev=1 , thrd_num=1 , thrd_aff=8
Thrd Lev=1 , thrd_num=2 , thrd_aff=16
Thrd Lev=1 , thrd_num=3 , thrd_aff=24
Thrd Lev=1 , thrd_num=4 , thrd_aff=1
Thrd Lev=1 , thrd_num=5 , thrd_aff=9
Thrd Lev=1 , thrd_num=6 , thrd_aff=17
Thrd Lev=1 , thrd_num=7 , thrd_aff=25
$ export OMP_PROC_BIND=close
$ ./stream | sort -k3
<stream results omitted ...>
Thrd Lev=1 , thrd_num=0 , thrd_aff=0
Thrd Lev=1 , thrd_num=1 , thrd_aff=32
Thrd Lev=1 , thrd_num=2 , thrd_aff=2
Thrd Lev=1 , thrd_num=3 , thrd_aff=34
Thrd Lev=1 , thrd_num=4 , thrd_aff=4
Thrd Lev=1 , thrd_num=5 , thrd_aff=36
Thrd Lev=1 , thrd_num=6 , thrd_aff=6
Thrd Lev=1 , thrd_num=7 , thrd_aff=38
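The thrd_aff values are OS logical CPU numbers: with spread the eight threads end up spaced far apart across the machine, while with close they are packed onto neighboring hardware threads, exactly as the two binding policies prescribe.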
Thread affinity and data locality:
A general recommendation is to have at least one process (e.g., one MPI rank or OS process) per NUMA domain, and to use OpenMP threads for the parallelism inside a NUMA domain, keeping the data they need within that domain. This limits the impact of any mistakes in getting the first-touch initialization right across NUMA domain boundaries. Another recommendation is to place threads far apart (spread) to exploit the aggregate memory bandwidth, and then fork the innermost work close together (close) inside nested parallel regions to maximize cache locality; an example configuration is sketched below.
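For instance, on a hypothetical two-socket node with 16 cores per socket, that advice might translate into settings along these lines:
export OMP_PLACES=cores
export OMP_NUM_THREADS=2,16
export OMP_PROC_BIND=spread,close
so the two outer threads land on different sockets and each forks 16 close-packed inner threads.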
SIMD: the omp simd construct tells the compiler that the loop can be vectorized, so that several consecutive iterations are executed together in the lanes of a vector register.
Figure 12-17: vectorizing the pi program with OpenMP:
cpp
#include <omp.h>
#include <stdio.h>

static long num_steps = 100000;
float step;

int main()
{
   int i;
   float x, pi, sum = 0.0;

   step = 1.0f / (double) num_steps;
   #pragma omp simd private(x) reduction(+:sum)
   for (i = 0; i < num_steps; i++) {
      x = (i + 0.5f) * step;
      sum += 4.0f / (1.0f + x * x);
   }
   pi = step * sum;
   printf("pi=%lf\n", pi);
}
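Note that on the simd construct private(x) and reduction(+:sum) apply across the SIMD lanes: each lane works with its own copy of x, and the per-lane partial sums are combined into sum when the loop finishes.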
Figure 12-18: multithreading and vectorizing the pi program with OpenMP:
cpp
#include <stdio.h>
#include <omp.h>

static long num_steps = 100000000;
double step;

int main()
{
   int i;
   double x, pi, sum = 0.0;

   step = 1.0 / (double) num_steps;
   #pragma omp parallel for simd private(x) reduction(+:sum)
   for (i = 0; i < num_steps; i++) {
      x = (i + 0.5) * step;        // double literals: i exceeds float precision for 10^8 steps
      sum += 4.0 / (1.0 + x * x);
   }
   pi = step * sum;
   printf("pi=%f\n", pi);
}
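The combined parallel for simd construct applies both levels of parallelism at once: the iterations are first divided among the threads of the team, and each thread then vectorizes its own chunk; the reduction likewise combines the per-lane and per-thread partial sums.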
The host-device model:
The target directive and its associated structured block define a target region that is offloaded to a device for execution. The target construct also causes data to be moved to the device, and when the target region finishes executing, that data is copied back from the device to the host.
Figure 12-19:
cpp
#include <omp.h>
#include <stdio.h>
#define N 1024

int main()
{
   float a[N], b[N], c[N];
   int i;

   // initialize a, b, and c (code not shown)

   #pragma omp target
   #pragma omp teams distribute parallel for simd
   for (i = 0; i < N; i++)
      c[i] += a[i] * b[i];
}
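Because no map clauses are given here, the statically sized arrays a, b, and c referenced in the target region are implicitly mapped tofrom; the map clause, covered next, makes such transfers explicit and lets you control their direction.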
For a GPU, the target directive is followed by the directive below and its associated loop:
#pragma omp teams distribute parallel for simd
teams launches a league of teams, distribute spreads the loop iterations across those teams, parallel for divides each team's share among its threads, and simd vectorizes the innermost iterations.
map clause: controls how data moves between the host and the device data environments. The map-types used below are to (copy host data to the device when the region starts), from (copy device data back to the host when the region ends), and tofrom (both).
Figure 12-21:
cpp
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#define N 1024

int main()
{
   float *a, *b, *c, *d;
   int i;

   a = (float *) malloc(N * sizeof(float));
   b = (float *) malloc(N * sizeof(float));
   c = (float *) malloc(N * sizeof(float));
   d = (float *) malloc(N * sizeof(float));

   // initialize a, b, c, and d (code not shown)

   #pragma omp target map(to:a[0:N],b[0:N]) map(tofrom:c[0:N])
   #pragma omp teams distribute parallel for simd
   for (i = 0; i < N; i++)
      c[i] += a[i] * b[i];

   #pragma omp target map(to:a[0:N],c[0:N]) map(tofrom:d[0:N])
   #pragma omp teams distribute parallel for simd
   for (i = 0; i < N; i++)
      d[i] += a[i] + c[i];
}
Figure 12-22: in Figure 12-21, a is transferred to the device twice and c is copied back and forth even though the rest of the program only needs d. Enclosing both target regions in a target data region keeps the arrays resident on the device across them:
cpp
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#define N 1024

int main()
{
   float *a, *b, *c, *d;
   int i;

   a = (float *) malloc(N * sizeof(float));
   b = (float *) malloc(N * sizeof(float));
   c = (float *) malloc(N * sizeof(float));
   d = (float *) malloc(N * sizeof(float));

   // initialize a, b, c, and d (code not shown)

   #pragma omp target data map(to:a[0:N],b[0:N],c[0:N]) map(tofrom:d[0:N])
   {
      #pragma omp target
      #pragma omp teams distribute parallel for simd
      for (i = 0; i < N; i++)
         c[i] += a[i] * b[i];

      #pragma omp target
      #pragma omp teams distribute parallel for simd
      for (i = 0; i < N; i++)
         d[i] += a[i] + c[i];
   }
   // continue in the program but only using d (not c)
}
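Inside the target data region the arrays stay resident on the device, so the second target region reuses the c values produced by the first one without any round trip through the host; and since c is mapped to rather than tofrom, its device copy is never transferred back, which is fine because the rest of the program only uses d.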