#!/bin/bash
# Slurm resource request: one node running 4 tasks, with 4 GPUs, 10G of
# memory and a 15-minute time limit. These directives must stay ahead of the
# first executable command of the script.
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --time=00:15:00
#SBATCH --gres=gpu:4
#SBATCH --mem=10G

# Compiler environment
# ------------------------------
# Unload every currently loaded module, then load GCC 9.3.0, so that the
# toolchain selected below starts from a known module state.
module purge
module load gcc/9.3.0

# 1) WORKS 22.1
# module load nvidia/nvhpc/22.1-cuda-11.5-majslurm
# ulimit -s 10240
# export OMPI_MCA_mtl=^mxm
# export OMPI_MCA_pml=^yalla
# export OMPI_MCA_mpi_cuda_support=1

# 2) DOES NOT WORK 11.5-ompi405
# module load nvidia/nvhpc/22.1-cuda-11.5-ompi405-majslurm
# export PATH=/usr/local/nvidia_hpc_sdk/MAJSLURM/Linux_x86_64/22.1/comm_libs/hpcx/hpcx-2.10.beta/ompi/bin:${PATH}
# export LD_LIBRARY_PATH=/usr/local/nvidia_hpc_sdk/MAJSLURM/Linux_x86_64/22.1/comm_libs/hpcx/hpcx-2.10.beta/ompi/lib:${LD_LIBRARY_PATH}
# export UCX_MEMTYPE_CACHE=n

# 3) DOES NOT WORK 22.1 ompi405
# module load nvidia/nvhpc/22.1-cuda-11.5-ompi405-majslurm
# export UCX_MEMTYPE_CACHE=n
# export UCX_TLS=knem,dc

# 4) WORKS pgi 4.0.4-UCX
# module load openmpi/pgi/4.0.4-UCX-cuda
# export UCX_MEMTYPE_CACHE=n

# 5) WORKS 22.7
# module load nvidia/nvhpc/22.7-cuda-11.7-majslurm
# export PATH=/usr/local/nvidia_hpc_sdk/MAJSLURM//Linux_x86_64/22.7/comm_libs/hpcx/hpcx-2.11/ucx/bin:${PATH}
# export LD_LIBRARY_PATH=/usr/local/nvidia_hpc_sdk/MAJSLURM//Linux_x86_64/22.7/comm_libs/hpcx/hpcx-2.11/ompi/lib:${LD_LIBRARY_PATH}
# export UCX_MEMTYPE_CACHE=n

# New versions 

# 6) WORKS gnu 4.1.4.2  (the configuration currently in use)
# Loads CUDA 11.7 and the openmpi/gnu/4.1.4.2 module.
module load cuda/11.7
module load openmpi/gnu/4.1.4.2
# Other 4.1.4.2 Open MPI module variants, kept commented out for reference:
# module load openmpi/gnu/ilp64/4.1.4.2
# module load openmpi/4.1.4.2
# module load openmpi/intel/4.1.4.2
# module load openmpi/intel/ilp64/4.1.4.2


# Same UCX setting as used by configurations 2-5 above.
# NOTE(review): presumably turns off UCX's memory-type cache, which is the
# usual recommendation for CUDA-aware MPI -- confirm against the UCX docs.
export UCX_MEMTYPE_CACHE=n

# Rebuild the test from scratch, run it under MPI, then report job statistics.
#
# The exit status of the build (or of mpirun) becomes the script's exit
# status, so a broken build or a failed run is not recorded by Slurm as a
# successful job. jobinfo is still invoked in every case.
run_status=0

# A failing "clean" is tolerated as before; a genuinely broken Makefile will
# surface in the build step right below.
make -f Makefile_openmpi clean

# Uncomment to request CUDA support explicitly (must be set before mpirun).
#export OMPI_MCA_mpi_cuda_support=1

if make -f Makefile_openmpi; then
  echo "MPIRUN"
  # Show which shared libraries the freshly built binary resolves to.
  ldd bcast_allreduce_test
  # Use the task count granted by Slurm; fall back to 4 (the value requested
  # in the #SBATCH header) when the script is run outside of a Slurm job.
  mpirun -np "${SLURM_NTASKS:-4}" ./bcast_allreduce_test || run_status=$?
else
  run_status=$?
  # Without a binary there is nothing to inspect or run.
  echo "ERROR: build failed (exit ${run_status}); skipping mpirun" >&2
fi

# NOTE(review): jobinfo looks like a site-provided accounting tool -- confirm.
jobinfo "${SLURM_JOBID}"

exit "${run_status}"



