MPI Code for Ghost Data Exchange in 3D Domain Decomposition with Multi-GPUs
Here's an MPI + CUDA example that demonstrates ghost (halo) data exchange for a 3D domain decomposition across multiple GPUs. The implementation assumes a CUDA-aware MPI library, so device pointers can be passed directly to MPI calls for efficient GPU-to-GPU communication.
```c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>
// Define the dimensions of the global domain
#define NX_GLOBAL 256
#define NY_GLOBAL 256
#define NZ_GLOBAL 256
// Define the ghost layer width
#define GHOST_WIDTH 2
// Macro for checking CUDA errors
// Macro for checking CUDA errors; aborts the whole MPI job so no rank is left hanging
#define CUDA_CHECK(cmd) do { \
    cudaError_t error = (cmd); \
    if (error != cudaSuccess) { \
        fprintf(stderr, "<%s>:%i [CUDA] Error: %s\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); \
    } \
} while (0)
// Structure to hold domain decomposition information
typedef struct {
int rank; // MPI rank
int size; // Number of MPI processes
int dims[3]; // Process grid dimensions
int coords[3]; // Process coordinates in grid
MPI_Comm cart_comm; // Cartesian communicator
// Local domain sizes (excluding ghost layers)
int nx, ny, nz;
// Starting indices in global domain
int start_x, start_y, start_z;
// Device pointers for data and ghost buffers
double *d_data; // Main data array on device
double *d_send_buffers[6]; // Send buffers for each face
double *d_recv_buffers[6]; // Receive buffers for each face
} DomainInfo;
// Initialize domain decomposition
void init_domain_decomposition(DomainInfo *domain, int argc, char **argv) {
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &domain->rank);
MPI_Comm_size(MPI_COMM_WORLD, &domain->size);
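// Multi-GPU note (assumption, not in the original): bind each rank to one GPU.
// A simple round-robin on the global rank is used here; a node-local rank
// (e.g. from MPI_Comm_split_type with MPI_COMM_TYPE_SHARED) is more robust in practice.
int device_count = 0;
CUDA_CHECK(cudaGetDeviceCount(&device_count));
if (device_count > 0) {
    CUDA_CHECK(cudaSetDevice(domain->rank % device_count));
}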
// Create a 3D Cartesian topology
int periods[3] = {1, 1, 1}; // Periodic boundaries
domain->dims[0] = domain->dims[1] = domain->dims[2] = 0;
MPI_Dims_create(domain->size, 3, domain->dims);
MPI_Cart_create(MPI_COMM_WORLD, 3, domain->dims, periods, 1, &domain->cart_comm);
// Reordering is enabled above, so re-query the rank within the Cartesian communicator
MPI_Comm_rank(domain->cart_comm, &domain->rank);
MPI_Cart_coords(domain->cart_comm, domain->rank, 3, domain->coords);
// Calculate local domain sizes (excluding ghost layers)
domain->nx = NX_GLOBAL / domain->dims[0];
domain->ny = NY_GLOBAL / domain->dims[1];
domain->nz = NZ_GLOBAL / domain->dims[2];
// Calculate starting indices in global domain
domain->start_x = domain->coords[0] * domain->nx;
domain->start_y = domain->coords[1] * domain->ny;
domain->start_z = domain->coords[2] * domain->nz;
// Adjust for non-divisible cases
if (domain->coords[0] == domain->dims[0]-1) domain->nx = NX_GLOBAL - domain->start_x;
if (domain->coords[1] == domain->dims[1]-1) domain->ny = NY_GLOBAL - domain->start_y;
if (domain->coords[2] == domain->dims[2]-1) domain->nz = NZ_GLOBAL - domain->start_z;
// Allocate device memory for main data array (including ghost layers)
size_t data_size = (domain->nx + 2*GHOST_WIDTH) *
(domain->ny + 2*GHOST_WIDTH) *
(domain->nz + 2*GHOST_WIDTH) * sizeof(double);
CUDA_CHECK(cudaMalloc(&domain->d_data, data_size));
CUDA_CHECK(cudaMemset(domain->d_data, 0, data_size));
// Allocate send and receive buffers for each face
for (int i = 0; i < 6; i++) {
size_t buffer_size = 0;
if (i < 2) { // X faces
buffer_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
} else if (i < 4) { // Y faces
buffer_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
} else { // Z faces
buffer_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH * sizeof(double);
}
CUDA_CHECK(cudaMalloc(&domain->d_send_buffers[i], buffer_size));
CUDA_CHECK(cudaMalloc(&domain->d_recv_buffers[i], buffer_size));
}
}
// Pack data into send buffers
__global__ void pack_x_face(double *data, double *buffer, int nx, int ny, int nz, int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (j < ny_tot && k < nz_tot) {
        // Copy all ghost_width interior planes adjacent to the face into the buffer
        for (int g = 0; g < ghost_width; g++) {
            // Face 0 (low X) sends the first interior planes, face 1 (high X) the last ones
            int i = (face == 0) ? (ghost_width + g) : (nx + g);
            int src_idx = i * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = g * ny_tot * nz_tot + j * nz_tot + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}
// The Y- and Z-face pack kernels follow the same pattern; a sketch of the Y-face version is given below, and the Z-face kernel is analogous.
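// A minimal sketch of the Y-face pack kernel, assuming the same row-major layout
// (idx = i*ny_tot*nz_tot + j*nz_tot + k); face 2 is the low-Y face, face 3 the high-Y face.
__global__ void pack_y_face(double *data, double *buffer, int nx, int ny, int nz, int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int nx_tot = nx + 2*ghost_width;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (i < nx_tot && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            // Low face sends the first interior j-planes, high face the last ones
            int j = (face == 2) ? (ghost_width + g) : (ny + g);
            int buf_idx = g * nx_tot * nz_tot + i * nz_tot + k;
            buffer[buf_idx] = data[i * ny_tot * nz_tot + j * nz_tot + k];
        }
    }
}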
// Unpack data from receive buffers
__global__ void unpack_x_face(double *data, double *buffer, int nx, int ny, int nz, int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (j < ny_tot && k < nz_tot) {
        // Fill all ghost_width ghost planes of the face from the buffer
        for (int g = 0; g < ghost_width; g++) {
            // Face 0 fills planes 0..ghost_width-1, face 1 fills planes nx+ghost_width..nx+2*ghost_width-1
            int i = (face == 0) ? g : (nx + ghost_width + g);
            int dest_idx = i * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = g * ny_tot * nz_tot + j * nz_tot + k;
            data[dest_idx] = buffer[buf_idx];
        }
    }
}
// The matching unpack kernels for Y and Z follow the same pattern; a Y-face sketch is given below, and the Z-face kernel is analogous.
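// A matching sketch of the Y-face unpack kernel (same assumed layout and buffer ordering as pack_y_face).
__global__ void unpack_y_face(double *data, double *buffer, int nx, int ny, int nz, int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int nx_tot = nx + 2*ghost_width;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (i < nx_tot && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            // Low face fills j = 0..ghost_width-1, high face fills j = ny+ghost_width..ny+2*ghost_width-1
            int j = (face == 2) ? g : (ny + ghost_width + g);
            int buf_idx = g * nx_tot * nz_tot + i * nz_tot + k;
            data[i * ny_tot * nz_tot + j * nz_tot + k] = buffer[buf_idx];
        }
    }
}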
// Exchange ghost data with neighbors
void exchange_ghost_data(DomainInfo *domain) {
MPI_Request requests[12]; // 6 sends + 6 receives
int request_count = 0;
MPI_Status statuses[12];
// Determine neighbor ranks
int neighbors[6];
MPI_Cart_shift(domain->cart_comm, 0, 1, &neighbors[0], &neighbors[1]); // X direction
MPI_Cart_shift(domain->cart_comm, 1, 1, &neighbors[2], &neighbors[3]); // Y direction
MPI_Cart_shift(domain->cart_comm, 2, 1, &neighbors[4], &neighbors[5]); // Z direction
// Post receives first
for (int face = 0; face < 6; face++) {
if (neighbors[face] != MPI_PROC_NULL) {
size_t recv_size = 0;
if (face < 2) { // X faces
recv_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
} else if (face < 4) { // Y faces
recv_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
} else { // Z faces
recv_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
}
// The neighbor tags its send with its own face index, which is the opposite
// face from our side, so receive with tag (face ^ 1) to match
MPI_Irecv(domain->d_recv_buffers[face], recv_size, MPI_DOUBLE,
neighbors[face], face ^ 1, domain->cart_comm, &requests[request_count++]);
}
}
// Pack and send data
for (int face = 0; face < 6; face++) {
if (neighbors[face] != MPI_PROC_NULL) {
// Launch appropriate pack kernel based on face
dim3 block(16, 16);
dim3 grid;
if (face < 2) { // X faces
grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
pack_x_face<<<grid, block>>>(domain->d_data, domain->d_send_buffers[face],
domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
}
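else if (face < 4) { // Y faces, using the pack_y_face sketch above
    grid.x = (domain->nx + 2*GHOST_WIDTH + block.x - 1) / block.x;
    grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
    pack_y_face<<<grid, block>>>(domain->d_data, domain->d_send_buffers[face],
                                 domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
}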
// Z faces (faces 4 and 5) would launch an analogous pack_z_face kernel here...
CUDA_CHECK(cudaDeviceSynchronize());
size_t send_size = 0;
if (face < 2) { // X faces
send_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
} else if (face < 4) { // Y faces
send_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
} else { // Z faces
send_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
}
MPI_Isend(domain->d_send_buffers[face], send_size, MPI_DOUBLE,
neighbors[face], face, domain->cart_comm, &requests[request_count++]);
}
}
// Wait for all communications to complete
MPI_Waitall(request_count, requests, statuses);
// Unpack received data
for (int face = 0; face < 6; face++) {
if (neighbors[face] != MPI_PROC_NULL) {
// Launch appropriate unpack kernel based on face
dim3 block(16, 16);
dim3 grid;
if (face < 2) { // X faces
grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
unpack_x_face<<<grid, block>>>(domain->d_data, domain->d_recv_buffers[face],
domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
}
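else if (face < 4) { // Y faces, using the unpack_y_face sketch above
    grid.x = (domain->nx + 2*GHOST_WIDTH + block.x - 1) / block.x;
    grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
    unpack_y_face<<<grid, block>>>(domain->d_data, domain->d_recv_buffers[face],
                                   domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
}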
// Z faces (faces 4 and 5) would launch an analogous unpack_z_face kernel here...
CUDA_CHECK(cudaDeviceSynchronize());
}
}
}
// Clean up resources
void cleanup(DomainInfo *domain) {
CUDA_CHECK(cudaFree(domain->d_data));
for (int i = 0; i < 6; i++) {
CUDA_CHECK(cudaFree(domain->d_send_buffers[i]));
CUDA_CHECK(cudaFree(domain->d_recv_buffers[i]));
}
MPI_Finalize();
}
int main(int argc, char **argv) {
DomainInfo domain;
init_domain_decomposition(&domain, argc, argv);
// Main simulation loop would go here
for (int step = 0; step < 10; step++) {
// Perform computation on local domain
// Exchange ghost data
exchange_ghost_data(&domain);
// Continue computation
}
cleanup(&domain);
return 0;
}
```
Key Features of This Implementation:
- 3D Domain Decomposition:
  - Uses an MPI Cartesian topology for logical process arrangement
  - Handles both divisible and non-divisible domain sizes
- GPU-Aware MPI:
  - Passes GPU pointers directly to MPI calls (requires CUDA-aware MPI)
  - Avoids host staging for better performance
- Efficient Communication:
  - Non-blocking sends and receives for overlap opportunities
  - Separate buffers for each face to prevent contention
- Kernel-Based Packing/Unpacking:
  - CUDA kernels for efficient data movement between the main array and the face buffers
  - Parallel packing/unpacking operations
- Flexible Ghost Layer Width:
  - Configurable ghost layer size via the GHOST_WIDTH macro
Requirements:
- MPI implementation with CUDA-aware support (OpenMPI, MVAPICH2, etc.); a compile-/run-time probe is sketched after this list
- CUDA toolkit
- Compilation with `nvcc` and the MPI compiler wrappers
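Whether the MPI library in use is actually CUDA-aware can be probed from code. A minimal sketch, assuming Open MPI (other implementations may not provide the `mpi-ext.h` extension header or `MPIX_Query_cuda_support()`):

```c
#include <mpi.h>
#ifdef OPEN_MPI
#include <mpi-ext.h>   /* Open MPI extension header; defines MPIX_CUDA_AWARE_SUPPORT */
#endif

/* Returns 1 if the MPI library reports CUDA-aware support, 0 if it reports none,
   and -1 if it cannot tell us (e.g., not Open MPI). */
static int cuda_aware_mpi_support(void) {
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    return MPIX_Query_cuda_support();  /* run-time answer */
#elif defined(MPIX_CUDA_AWARE_SUPPORT)
    return 0;                          /* library built without CUDA support */
#else
    return -1;                         /* unknown */
#endif
}
```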
Usage Notes:
- Complete the pack/unpack kernels and launches for the Z faces (the Y-face versions are sketched above; Z is analogous)
- The code assumes periodic boundaries (adjust the `periods` array if needed)
- For optimal performance, tune the block/grid dimensions in the kernels
- Add error checking for MPI operations in production code (a minimal macro is sketched below)
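For that last point, one possible starting point is a macro in the same spirit as `CUDA_CHECK` above; the name `MPI_CHECK` is introduced here for illustration and is not part of the MPI standard:

```c
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

/* Minimal MPI error-checking macro, analogous to CUDA_CHECK in the main code. */
#define MPI_CHECK(cmd) do {                                                    \
    int mpi_err = (cmd);                                                       \
    if (mpi_err != MPI_SUCCESS) {                                              \
        char msg[MPI_MAX_ERROR_STRING];                                        \
        int len = 0;                                                           \
        MPI_Error_string(mpi_err, msg, &len);                                  \
        fprintf(stderr, "<%s>:%d [MPI] Error: %s\n", __FILE__, __LINE__, msg); \
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);                               \
    }                                                                          \
} while (0)
```

Usage would look like `MPI_CHECK(MPI_Waitall(request_count, requests, statuses));`. Note that the default MPI error handler is `MPI_ERRORS_ARE_FATAL`, so you would also call `MPI_Comm_set_errhandler(comm, MPI_ERRORS_RETURN)` for error codes to reach the macro at all.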
This implementation provides a solid foundation for multi-GPU simulations with 3D domain decomposition and ghost data exchange.