MPI Code for Ghost Data Exchange in 3D Domain Decomposition with Multi-GPUs
Here's an MPI + CUDA example that demonstrates ghost (halo) data exchange for a 3D domain decomposition across multiple GPUs. The implementation assumes a CUDA-aware MPI library, so device pointers can be passed directly to MPI calls for efficient GPU-to-GPU communication.
```c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>
// Define the dimensions of the global domain
#define NX_GLOBAL 256
#define NY_GLOBAL 256
#define NZ_GLOBAL 256
// Define the ghost layer width
#define GHOST_WIDTH 2
// Macro for checking CUDA errors
// Macro for checking CUDA errors; aborts the whole MPI job so no rank is left hanging
#define CUDA_CHECK(cmd) do { \
    cudaError_t error = (cmd); \
    if (error != cudaSuccess) { \
        fprintf(stderr, "<%s>:%i [CUDA] Error: %s\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); \
    } \
} while (0)
// Structure to hold domain decomposition information
typedef struct {
int rank; // MPI rank
int size; // Number of MPI processes
int dims[3]; // Process grid dimensions
int coords[3]; // Process coordinates in grid
MPI_Comm cart_comm; // Cartesian communicator
// Local domain sizes (excluding ghost layers)
int nx, ny, nz;
// Starting indices in global domain
int start_x, start_y, start_z;
// Device pointers for data and ghost buffers
double *d_data; // Main data array on device
double *d_send_buffers[6]; // Send buffers for each face
double *d_recv_buffers[6]; // Receive buffers for each face
} DomainInfo;
// Initialize domain decomposition
void init_domain_decomposition(DomainInfo *domain, int argc, char **argv) {
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &domain->rank);
MPI_Comm_size(MPI_COMM_WORLD, &domain->size);
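// Multi-GPU note (assumption, not in the original): bind each rank to one GPU.
// A simple round-robin on the global rank is used here; a node-local rank
// (e.g. from MPI_Comm_split_type with MPI_COMM_TYPE_SHARED) is more robust in practice.
int device_count = 0;
CUDA_CHECK(cudaGetDeviceCount(&device_count));
if (device_count > 0) {
    CUDA_CHECK(cudaSetDevice(domain->rank % device_count));
}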
// Create a 3D Cartesian topology
int periods[3] = {1, 1, 1}; // Periodic boundaries
domain->dims[0] = domain->dims[1] = domain->dims[2] = 0;
MPI_Dims_create(domain->size, 3, domain->dims);
MPI_Cart_create(MPI_COMM_WORLD, 3, domain->dims, periods, 1, &domain->cart_comm);
// Reordering is enabled above, so re-query the rank within the Cartesian communicator
MPI_Comm_rank(domain->cart_comm, &domain->rank);
MPI_Cart_coords(domain->cart_comm, domain->rank, 3, domain->coords);
// Calculate local domain sizes (excluding ghost layers)
domain->nx = NX_GLOBAL / domain->dims[0];
domain->ny = NY_GLOBAL / domain->dims[1];
domain->nz = NZ_GLOBAL / domain->dims[2];
// Calculate starting indices in global domain
domain->start_x = domain->coords[0] * domain->nx;
domain->start_y = domain->coords[1] * domain->ny;
domain->start_z = domain->coords[2] * domain->nz;
// Adjust for non-divisible cases
if (domain->coords[0] == domain->dims[0]-1) domain->nx = NX_GLOBAL - domain->start_x;
if (domain->coords[1] == domain->dims[1]-1) domain->ny = NY_GLOBAL - domain->start_y;
if (domain->coords[2] == domain->dims[2]-1) domain->nz = NZ_GLOBAL - domain->start_z;
// Allocate device memory for main data array (including ghost layers)
size_t data_size = (domain->nx + 2*GHOST_WIDTH) *
(domain->ny + 2*GHOST_WIDTH) *
(domain->nz + 2*GHOST_WIDTH) * sizeof(double);
CUDA_CHECK(cudaMalloc(&domain->d_data, data_size));
CUDA_CHECK(cudaMemset(domain->d_data, 0, data_size));
// Allocate send and receive buffers for each face
for (int i = 0; i < 6; i++) {
size_t buffer_size = 0;
if (i < 2) { // X faces
buffer_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
} else if (i < 4) { // Y faces
buffer_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
} else { // Z faces
buffer_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH * sizeof(double);
}
CUDA_CHECK(cudaMalloc(&domain->d_send_buffers[i], buffer_size));
CUDA_CHECK(cudaMalloc(&domain->d_recv_buffers[i], buffer_size));
}
}
// Pack data into send buffers
__global__ void pack_x_face(double *data, double *buffer, int nx, int ny, int nz, int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (j < ny_tot && k < nz_tot) {
        // Copy all ghost_width interior planes adjacent to the face into the buffer
        for (int g = 0; g < ghost_width; g++) {
            // Face 0 (low X) sends the first interior planes, face 1 (high X) the last ones
            int i = (face == 0) ? (ghost_width + g) : (nx + g);
            int src_idx = i * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = g * ny_tot * nz_tot + j * nz_tot + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}
// The Y- and Z-face pack kernels follow the same pattern; a sketch of the Y-face version is given below, and the Z-face kernel is analogous.
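// A minimal sketch of the Y-face pack kernel, assuming the same row-major layout
// (idx = i*ny_tot*nz_tot + j*nz_tot + k); face 2 is the low-Y face, face 3 the high-Y face.
__global__ void pack_y_face(double *data, double *buffer, int nx, int ny, int nz, int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int nx_tot = nx + 2*ghost_width;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (i < nx_tot && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            // Low face sends the first interior j-planes, high face the last ones
            int j = (face == 2) ? (ghost_width + g) : (ny + g);
            int buf_idx = g * nx_tot * nz_tot + i * nz_tot + k;
            buffer[buf_idx] = data[i * ny_tot * nz_tot + j * nz_tot + k];
        }
    }
}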
// Unpack data from receive buffers
__global__ void unpack_x_face(double *data, double *buffer, int nx, int ny, int nz, int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (j < ny_tot && k < nz_tot) {
        // Fill all ghost_width ghost planes of the face from the buffer
        for (int g = 0; g < ghost_width; g++) {
            // Face 0 fills planes 0..ghost_width-1, face 1 fills planes nx+ghost_width..nx+2*ghost_width-1
            int i = (face == 0) ? g : (nx + ghost_width + g);
            int dest_idx = i * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = g * ny_tot * nz_tot + j * nz_tot + k;
            data[dest_idx] = buffer[buf_idx];
        }
    }
}
// The matching unpack kernels for Y and Z follow the same pattern; a Y-face sketch is given below, and the Z-face kernel is analogous.
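// A matching sketch of the Y-face unpack kernel (same assumed layout and buffer ordering as pack_y_face).
__global__ void unpack_y_face(double *data, double *buffer, int nx, int ny, int nz, int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int nx_tot = nx + 2*ghost_width;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (i < nx_tot && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            // Low face fills j = 0..ghost_width-1, high face fills j = ny+ghost_width..ny+2*ghost_width-1
            int j = (face == 2) ? g : (ny + ghost_width + g);
            int buf_idx = g * nx_tot * nz_tot + i * nz_tot + k;
            data[i * ny_tot * nz_tot + j * nz_tot + k] = buffer[buf_idx];
        }
    }
}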
// Exchange ghost data with neighbors
void exchange_ghost_data(DomainInfo *domain) {
MPI_Request requests[12]; // 6 sends + 6 receives
int request_count = 0;
MPI_Status statuses[12];
// Determine neighbor ranks
int neighbors[6];
MPI_Cart_shift(domain->cart_comm, 0, 1, &neighbors[0], &neighbors[1]); // X direction
MPI_Cart_shift(domain->cart_comm, 1, 1, &neighbors[2], &neighbors[3]); // Y direction
MPI_Cart_shift(domain->cart_comm, 2, 1, &neighbors[4], &neighbors[5]); // Z direction
// Post receives first
for (int face = 0; face < 6; face++) {
if (neighbors[face] != MPI_PROC_NULL) {
size_t recv_size = 0;
if (face < 2) { // X faces
recv_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
} else if (face < 4) { // Y faces
recv_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
} else { // Z faces
recv_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
}
// The neighbor tags its send with its own face index, which is the opposite
// face from our side, so receive with tag (face ^ 1) to match
MPI_Irecv(domain->d_recv_buffers[face], recv_size, MPI_DOUBLE,
neighbors[face], face ^ 1, domain->cart_comm, &requests[request_count++]);
}
}
// Pack and send data
for (int face = 0; face < 6; face++) {
if (neighbors[face] != MPI_PROC_NULL) {
// Launch appropriate pack kernel based on face
dim3 block(16, 16);
dim3 grid;
if (face < 2) { // X faces
grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
pack_x_face<<<grid, block>>>(domain->d_data, domain->d_send_buffers[face],
domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
}
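else if (face < 4) { // Y faces, using the pack_y_face sketch above
    grid.x = (domain->nx + 2*GHOST_WIDTH + block.x - 1) / block.x;
    grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
    pack_y_face<<<grid, block>>>(domain->d_data, domain->d_send_buffers[face],
                                 domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
}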
// Z faces (faces 4 and 5) would launch an analogous pack_z_face kernel here...
CUDA_CHECK(cudaDeviceSynchronize());
size_t send_size = 0;
if (face < 2) { // X faces
send_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
} else if (face < 4) { // Y faces
send_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
} else { // Z faces
send_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
}
MPI_Isend(domain->d_send_buffers[face], send_size, MPI_DOUBLE,
neighbors[face], face, domain->cart_comm, &requests[request_count++]);
}
}
// Wait for all communications to complete
MPI_Waitall(request_count, requests, statuses);
// Unpack received data
for (int face = 0; face < 6; face++) {
if (neighbors[face] != MPI_PROC_NULL) {
// Launch appropriate unpack kernel based on face
dim3 block(16, 16);
dim3 grid;
if (face < 2) { // X faces
grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
unpack_x_face<<<grid, block>>>(domain->d_data, domain->d_recv_buffers[face],
domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
}
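else if (face < 4) { // Y faces, using the unpack_y_face sketch above
    grid.x = (domain->nx + 2*GHOST_WIDTH + block.x - 1) / block.x;
    grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
    unpack_y_face<<<grid, block>>>(domain->d_data, domain->d_recv_buffers[face],
                                   domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
}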
// Z faces (faces 4 and 5) would launch an analogous unpack_z_face kernel here...
CUDA_CHECK(cudaDeviceSynchronize());
}
}
}
// Clean up resources
void cleanup(DomainInfo *domain) {
CUDA_CHECK(cudaFree(domain->d_data));
for (int i = 0; i < 6; i++) {
CUDA_CHECK(cudaFree(domain->d_send_buffers[i]));
CUDA_CHECK(cudaFree(domain->d_recv_buffers[i]));
}
MPI_Finalize();
}
int main(int argc, char **argv) {
DomainInfo domain;
init_domain_decomposition(&domain, argc, argv);
// Main simulation loop would go here
for (int step = 0; step < 10; step++) {
// Perform computation on local domain
// Exchange ghost data
exchange_ghost_data(&domain);
// Continue computation
}
cleanup(&domain);
return 0;
}
```
Key Features of This Implementation:
- 3D Domain Decomposition:
  - Uses an MPI Cartesian topology for logical process arrangement
  - Handles both divisible and non-divisible domain sizes
- GPU-Aware MPI:
  - Passes GPU pointers directly to MPI calls (requires CUDA-aware MPI)
  - Avoids host staging for better performance
- Efficient Communication:
  - Non-blocking sends and receives for overlap opportunities
  - Separate buffers for each face to prevent contention
- Kernel-Based Packing/Unpacking:
  - CUDA kernels for efficient data movement between the main array and the face buffers
  - Parallel packing/unpacking operations
- Flexible Ghost Layer Width:
  - Configurable ghost layer size via the GHOST_WIDTH macro
Requirements:
- MPI implementation with CUDA-aware support (OpenMPI, MVAPICH2, etc.); a compile-/run-time probe is sketched after this list
- CUDA toolkit
- Compilation with `nvcc` and the MPI compiler wrappers
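Whether the MPI library in use is actually CUDA-aware can be probed from code. A minimal sketch, assuming Open MPI (other implementations may not provide the `mpi-ext.h` extension header or `MPIX_Query_cuda_support()`):

```c
#include <mpi.h>
#ifdef OPEN_MPI
#include <mpi-ext.h>   /* Open MPI extension header; defines MPIX_CUDA_AWARE_SUPPORT */
#endif

/* Returns 1 if the MPI library reports CUDA-aware support, 0 if it reports none,
   and -1 if it cannot tell us (e.g., not Open MPI). */
static int cuda_aware_mpi_support(void) {
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    return MPIX_Query_cuda_support();  /* run-time answer */
#elif defined(MPIX_CUDA_AWARE_SUPPORT)
    return 0;                          /* library built without CUDA support */
#else
    return -1;                         /* unknown */
#endif
}
```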
Usage Notes:
- Complete the pack/unpack kernels and launches for the Z faces (the Y-face versions are sketched above; Z is analogous)
- The code assumes periodic boundaries (adjust the `periods` array if needed)
- For optimal performance, tune the block/grid dimensions in the kernels
- Add error checking for MPI operations in production code (a minimal macro is sketched below)
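For that last point, one possible starting point is a macro in the same spirit as `CUDA_CHECK` above; the name `MPI_CHECK` is introduced here for illustration and is not part of the MPI standard:

```c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

/* Minimal MPI error-checking macro, analogous to CUDA_CHECK in the main code. */
#define MPI_CHECK(cmd) do {                                                    \
    int mpi_err = (cmd);                                                       \
    if (mpi_err != MPI_SUCCESS) {                                              \
        char msg[MPI_MAX_ERROR_STRING];                                        \
        int len = 0;                                                           \
        MPI_Error_string(mpi_err, msg, &len);                                  \
        fprintf(stderr, "<%s>:%d [MPI] Error: %s\n", __FILE__, __LINE__, msg); \
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);                               \
    }                                                                          \
} while (0)
```

Usage would look like `MPI_CHECK(MPI_Waitall(request_count, requests, statuses));`. Note that the default MPI error handler is `MPI_ERRORS_ARE_FATAL`, so you would also call `MPI_Comm_set_errhandler(comm, MPI_ERRORS_RETURN)` for error codes to reach the macro at all.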
This implementation provides a solid foundation for multi-GPU simulations with 3D domain decomposition and ghost data exchange.