Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt =================================================================== --- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -82,6 +82,11 @@ set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) endforeach() + # Override default MAX_SM in src/target_impl.h if requested + if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM) + set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}") + endif() + # Activate RTL message dumps if requested by the user. set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL "Activate NVPTX device RTL debug messages.") @@ -96,7 +101,7 @@ list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory} -I${devicertl_nvptx_directory}/src) cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} - OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) + OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG} ${MAX_SM_DEFINITION}) # Install device RTL under the lib destination folder. install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") @@ -159,7 +164,7 @@ get_filename_component(outfile ${src} NAME) add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION} -c ${infile} -o ${outfile}-sm_${sm}.bc DEPENDS ${infile} IMPLICIT_DEPENDS CXX ${infile} Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h =================================================================== --- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -47,16 +47,27 @@ // Maximum number of omp state objects per SM allocated statically in global // memory. 
-#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 600 #define OMP_STATE_COUNT 32 +#else +#define OMP_STATE_COUNT 16 +#endif + +#if !defined(MAX_SM) +#if __CUDA_ARCH__ >= 900 +#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option +#elif __CUDA_ARCH__ >= 800 +// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs +// GA102 design has a maximum of 84 SMs +#define MAX_SM 108 +#elif __CUDA_ARCH__ >= 700 #define MAX_SM 84 #elif __CUDA_ARCH__ >= 600 -#define OMP_STATE_COUNT 32 #define MAX_SM 56 #else -#define OMP_STATE_COUNT 16 #define MAX_SM 16 #endif +#endif #define OMP_ACTIVE_PARALLEL_LEVEL 128