
// #include <iostream>
// #include <mpi.h>
// #include <cuda_runtime.h>

// int main(int argc, char** argv) {
//     MPI_Init(&argc, &argv);

//     int world_size;
//     MPI_Comm_size(MPI_COMM_WORLD, &world_size);

//     int world_rank;
//     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

//     int deviceCount;
//     cudaGetDeviceCount(&deviceCount);

//     if (deviceCount < world_size) {
//         std::cerr << "This code requires at least " << world_size << " CUDA devices available." << std::endl;
//         MPI_Finalize();
//         return 1;
//     }

//     int N = 10;
//     float *hostData = new float[N];

//     // Initialize data on the host of the root process (process 0)
//     if (world_rank == 0) {
//         for (int i = 0; i < N; ++i) {
//             hostData[i] = static_cast<float>(i);
//         }
//     }

//     float *deviceData;
//     cudaSetDevice(world_rank);
//     cudaMalloc((void**)&deviceData, N * sizeof(float));

//     // Copy data from host to device on all processes
//     cudaMemcpy(deviceData, hostData, N * sizeof(float), cudaMemcpyHostToDevice);

//     // Broadcast data from process 0 to all other processes
//     MPI_Bcast(deviceData, N, MPI_FLOAT, 0, MPI_COMM_WORLD);

//     // Copy data from device to host on all processes
//     cudaMemcpy(hostData, deviceData, N * sizeof(float), cudaMemcpyDeviceToHost);

//     // Print the data on each process
//     for (int i = 0; i < N; ++i) {
//         std::cout << "Process " << world_rank << " - hostData[" << i << "] = " << hostData[i] << std::endl;
//     }

//     // Cleanup
//     delete[] hostData;
//     cudaFree(deviceData);

//     MPI_Finalize();

//     return 0;
// }


// #include <iostream>
// #include <mpi.h>
// #include <cuda_runtime.h>

// int main(int argc, char** argv) {
//     MPI_Init(&argc, &argv);

//     int world_size;
//     MPI_Comm_size(MPI_COMM_WORLD, &world_size);

//     int world_rank;
//     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

//     int deviceCount;
//     cudaGetDeviceCount(&deviceCount);

//     if (deviceCount < world_size) {
//         std::cerr << "This code requires at least " << world_size << " CUDA devices available." << std::endl;
//         MPI_Finalize();
//         return 1;
//     }

//     int N = 10;
//     float *hostData = new float[N];

//     // Initialize data on the host of the root process (process 0)
//     if (world_rank == 0) {
//         for (int i = 0; i < N; ++i) {
//             hostData[i] = static_cast<float>(i);
//         }
//     }

//     float *deviceData;
//     cudaSetDevice(world_rank);
//     cudaMalloc((void**)&deviceData, N * sizeof(float));

//     // Copy data from host to device on all processes
//     cudaMemcpy(deviceData, hostData, N * sizeof(float), cudaMemcpyHostToDevice);

//     // Broadcast data from process 0 to all other processes using MPI_Bcast
//     MPI_Bcast(deviceData, N, MPI_FLOAT, 0, MPI_COMM_WORLD);

//     // Allocate the host buffer that receives the data gathered by MPI_Allgather
//     float *recvData = new float[N * world_size];

//     // Gather data from all processes using MPI_Allgather
//     MPI_Allgather(deviceData, N, MPI_FLOAT, recvData, N, MPI_FLOAT, MPI_COMM_WORLD);

//     // Print the received data on each process
//     for (int i = 0; i < world_size; ++i) {
//         if (world_rank == i) {
//             for (int j = 0; j < N; ++j) {
//                 std::cout << "Process " << world_rank << " - recvData[" << j << "] = " << recvData[i * N + j] << std::endl;
//             }
//         }
//         MPI_Barrier(MPI_COMM_WORLD); // Synchronize output
//     }

//     // Cleanup
//     delete[] hostData;
//     delete[] recvData;
//     cudaFree(deviceData);

//     MPI_Finalize();

//     return 0;
// }


#include <iostream>
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    if (deviceCount < world_size) {
        std::cerr << "This code requires at least " << world_size << " CUDA devices available." << std::endl;
        MPI_Finalize();
        return 1;
    }

    int N = 10;
    float *hostData = new float[N];
    float *recvBuffer = nullptr;

    // Initialize data on the host of the root process (process 0)
    for (int i = 0; i < N; ++i) {
        hostData[i] = static_cast<float>(world_rank);
    }

    float *deviceData;
    cudaSetDevice(world_rank);
    cudaMalloc((void**)&deviceData, N * sizeof(float));

    // Copy data from host to device
    cudaMemcpy(deviceData, hostData, N * sizeof(float), cudaMemcpyHostToDevice);
    // Starting communication between GPUs
    if(world_rank == 0) std::cout << "Process " << world_rank << " is starting the communication between GPUs." << std::endl;
    // Communicate data between GPUs using MPI (exchange with adjacent processes)
    int destRank = (world_rank + 1) % 4;
    int sourceRank = (world_rank + 3) % 4;
    MPI_Request sendRequest, recvRequest;
    MPI_Status status;
    MPI_Isend(deviceData, N, MPI_FLOAT, destRank, 0, MPI_COMM_WORLD, &sendRequest);
    MPI_Irecv(deviceData, N, MPI_FLOAT, sourceRank, 0, MPI_COMM_WORLD, &recvRequest);
    // Wait for the communication to complete
    MPI_Wait(&sendRequest, &status);
    MPI_Wait(&recvRequest, &status);
    // Finished communication between GPUs
    if(world_rank == 0) std::cout << "Process " << world_rank << " has finished the communication between GPUs." << std::endl;
    // Copy data from device to host
    cudaMemcpy(hostData, deviceData, N * sizeof(float), cudaMemcpyDeviceToHost);

    // Print the data on each process
    if(world_rank == 0) {
        std::cout << "Results after communication between GPUs." << std::endl;
        for (int i = 0; i < N; ++i) {
            std::cout << "Process " << world_rank << " - hostData[" << i << "] = " << hostData[i] << std::endl;
        }
    }

    MPI_Barrier(MPI_COMM_WORLD); // Synchronize output

    // Copy data from host to device on all processes
    cudaMemcpy(deviceData, hostData, N * sizeof(float), cudaMemcpyHostToDevice);

    // Starting broadcast
    if(world_rank == 2) std::cout << "Process " << world_rank << " is starting the broadcast." << std::endl;
    // Broadcast data from process 2 to all other processes using MPI_Bcast
    MPI_Bcast(deviceData, N, MPI_FLOAT, 2, MPI_COMM_WORLD);
    // Copy data from device to host on all processes
    cudaMemcpy(hostData, deviceData, N * sizeof(float), cudaMemcpyDeviceToHost);
    // Print the data on each process
    if(world_rank == 0) {
        for (int i = 0; i < N; ++i) {
            std::cout << "Process " << world_rank << " After broadcast - hostData[" << i << "] = " << hostData[i] << std::endl;
        }
    }
    MPI_Barrier(MPI_COMM_WORLD); // Synchronize output

    //Starting gather
    if(world_rank == 0) std::cout << "Process " << world_rank << " is starting the MPI_Allreduce." << std::endl;

    // // Reduce data from all processes to process 0 using MPI_Reduce
    // recvBuffer = new float[N];
    // MPI_Reduce(deviceData, recvBuffer, N, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

    // // Broadcast the reduced data from process 0 to all other processes using MPI_Bcast
    // MPI_Bcast(recvBuffer, N, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // Perform an MPI_Allreduce operation to sum the values across all processes
    MPI_Allreduce(MPI_IN_PLACE, deviceData, N, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

    // Copy data from device to host on all processes
    cudaMemcpy(hostData, deviceData, N * sizeof(float), cudaMemcpyDeviceToHost);

    if(world_rank == 1) {
        // Print the received data on each process
        for (int i = 0; i < N; ++i) {
            std::cout << "MPI_Allreduce Process " << world_rank << " - hostData[" << i << "] = " << hostData[i] << std::endl;
        }
    }

    // Cleanup
    delete[] hostData;
    cudaFree(deviceData);

    MPI_Finalize();

    return 0;
}

