diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1112,11 +1112,12 @@
 // warps participate in parallel work.
 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                      bool Mode) {
-  auto *GVMode =
-      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
-                               llvm::GlobalValue::WeakAnyLinkage,
-                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
-                               Twine(Name, "_exec_mode"));
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
+                                              : OMP_TGT_EXEC_MODE_GENERIC),
+      Twine(Name, "_exec_mode"));
   CGM.addCompilerUsedGlobal(GVMode);
 }

diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
@@ -16,9 +16,9 @@
 #define HEADER

 // Check that the execution mode of all 3 target regions on the gpu is set to SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 2

 template<typename tx>
 tx ftemplate(int n) {
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -12,9 +12,9 @@
 // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]

 // Check that the execution mode of all 3 target regions is set to Spmd Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 2

 template<typename tx>
 tx ftemplate(int n) {
diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -16,10 +16,10 @@
 #define HEADER

 // Check that the execution mode of all 2 target regions on the gpu is set to NonSPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 2

 #define N 1000
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
@@ -16,10 +16,10 @@
 #define HEADER

 // Check that the execution mode of all 2 target regions on the gpu is set to NonSPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l53}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 2
+// CHECK-DAG: {{@__omp_offloading_.+l53}}_exec_mode = weak constant i8 2

 #define N 1000
 #define M 10
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -128,6 +128,14 @@
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
 };

+enum OMPTgtExecModeFlags : int8_t {
+  OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
+  OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
+  OMP_TGT_EXEC_MODE_GENERIC_SPMD =
+      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ OMP_TGT_EXEC_MODE_GENERIC_SPMD)
+};
+
 } // end namespace omp

 } // end namespace llvm
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -3282,15 +3282,18 @@
     GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
         (Kernel->getName() + "_exec_mode").str());
     assert(ExecMode && "Kernel without exec mode?");
-    assert(ExecMode->getInitializer() &&
-           ExecMode->getInitializer()->isOneValue() &&
-           "Initially non-SPMD kernel has SPMD exec mode!");
+    assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");

     // Set the global exec mode flag to indicate SPMD-Generic mode.
-    constexpr int SPMDGeneric = 2;
-    if (!ExecMode->getInitializer()->isZeroValue())
-      ExecMode->setInitializer(
-          ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric));
+    assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
+           "ExecMode is not an integer!");
+    const int8_t ExecModeVal =
+        cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
+    assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
+           "Initially non-SPMD kernel has SPMD exec mode!");
+    ExecMode->setInitializer(
+        ConstantInt::get(ExecMode->getInitializer()->getType(),
+                         ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));

     // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
     const int InitIsSPMDArgNo = 1;
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -11,7 +11,7 @@
 ; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
 ; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
-; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
 ;.
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
--- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
@@ -13,7 +13,7 @@
 ;.
 ; CHECK: @[[IS_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
-; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[NON_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
 ; CHECK: @[[WILL_NOT_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -91,10 +91,10 @@
 ;.
 ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
 ; AMDGPU: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
 ; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
@@ -102,10 +102,10 @@
 ;.
 ; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
-; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
 ; NVPTX: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
 ; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
diff --git a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll
--- a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll
@@ -23,7 +23,7 @@
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_fd02_404433c2_main_l5_exec_mode], section "llvm.metadata"
 ;.
 define weak void @__omp_offloading_fd02_404433c2_main_l5(double* nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 {
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
@@ -45,7 +45,7 @@
 ;.
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
-; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata"
 ;.
 ; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
@@ -22,7 +22,10 @@
 # Define the suffix for the runtime messaging dumps.
 add_definitions(-DTARGET_NAME=CUDA)

-include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
+include_directories(
+  ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
+  ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+)

 set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF)
 option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA})
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -28,6 +28,8 @@
 #include "MemoryManager.h"

+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+
 // Utility for retrieving and printing CUDA error string.
 #ifdef OMPTARGET_DEBUG
 #define CUDA_ERR_STRING(err) \
@@ -71,28 +73,17 @@
   std::vector<__tgt_offload_entry> Entries;
 };

-enum ExecutionModeType {
-  SPMD,         // constructors, destructors,
-                // combined constructs (`teams distribute parallel for [simd]`)
-  GENERIC,      // everything else
-  SPMD_GENERIC, // Generic kernel with SPMD execution
-  NONE
-};
-
 /// Use a single entity to encode a kernel and a set of flags.
 struct KernelTy {
   CUfunction Func;

   // execution mode of kernel
-  // 0 - SPMD mode (without master warp)
-  // 1 - Generic mode (with master warp)
-  // 2 - SPMD mode execution with Generic mode semantics.
-  int8_t ExecutionMode;
+  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

   /// Maximal number of threads per block for this kernel.
   int MaxThreadsPerBlock = 0;

-  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
+  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
       : Func(_Func), ExecutionMode(_ExecutionMode) {}
 };
@@ -867,7 +858,7 @@
          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

       // default value GENERIC (in case symbol is missing from cubin file)
-      int8_t ExecModeVal = ExecutionModeType::GENERIC;
+      llvm::omp::OMPTgtExecModeFlags ExecModeVal;
       std::string ExecModeNameStr(E->name);
       ExecModeNameStr += "_exec_mode";
       const char *ExecModeName = ExecModeNameStr.c_str();
@@ -876,9 +867,9 @@
       size_t CUSize;
       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
       if (Err == CUDA_SUCCESS) {
-        if (CUSize != sizeof(int8_t)) {
+        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
-             ExecModeName, CUSize, sizeof(int8_t));
+             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
           return nullptr;
         }

@@ -890,12 +881,6 @@
           CUDA_ERR_STRING(Err);
           return nullptr;
         }
-
-        if (ExecModeVal < 0 || ExecModeVal > 2) {
-          DP("Error wrong exec_mode value specified in cubin file: %d\n",
-             ExecModeVal);
-          return nullptr;
-        }
       } else {
         DP("Loading global exec_mode '%s' - symbol missing, using default "
            "value GENERIC (1)\n",
@@ -1098,12 +1083,19 @@
     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

+    const bool IsSPMDGenericMode =
+        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+    const bool IsSPMDMode =
+        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
+    const bool IsGenericMode =
+        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
+
     int CudaThreadsPerBlock;
     if (ThreadLimit > 0) {
       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
       CudaThreadsPerBlock = ThreadLimit;
       // Add master warp if necessary
-      if (KernelInfo->ExecutionMode == GENERIC) {
+      if (IsGenericMode) {
         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
       }
@@ -1136,13 +1128,21 @@
     unsigned int CudaBlocksPerGrid;
     if (TeamNum <= 0) {
       if (LoopTripCount > 0 && EnvNumTeams < 0) {
-        if (KernelInfo->ExecutionMode == SPMD) {
+        if (IsSPMDGenericMode) {
+          // If we reach this point, then we are executing a kernel that was
+          // transformed from Generic-mode to SPMD-mode. This kernel has
+          // SPMD-mode execution, but needs its blocks to be scheduled
+          // differently because the current loop trip count only applies to the
+          // `teams distribute` region and will create var too few blocks using
+          // the regular SPMD-mode method.
+          CudaBlocksPerGrid = LoopTripCount;
+        } else if (IsSPMDMode) {
           // We have a combined construct, i.e. `target teams distribute
           // parallel for [simd]`. We launch so many teams so that each thread
           // will execute one iteration of the loop. round up to the nearest
           // integer
           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
-        } else if (KernelInfo->ExecutionMode == GENERIC) {
+        } else if (IsGenericMode) {
           // If we reach this point, then we have a non-combined construct, i.e.
           // `teams distribute` with a nested `parallel for` and each team is
           // assigned one iteration of the `distribute` loop. E.g.:
@@ -1156,16 +1156,9 @@
           // Threads within a team will execute the iterations of the `parallel`
           // loop.
           CudaBlocksPerGrid = LoopTripCount;
-        } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
-          // If we reach this point, then we are executing a kernel that was
-          // transformed from Generic-mode to SPMD-mode. This kernel has
-          // SPMD-mode execution, but needs its blocks to be scheduled
-          // differently because the current loop trip count only applies to the
-          // `teams distribute` region and will create var too few blocks using
-          // the regular SPMD-mode method.
-          CudaBlocksPerGrid = LoopTripCount;
         } else {
-          REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
+          REPORT("Unknown execution mode: %d\n",
+                 static_cast<int8_t>(KernelInfo->ExecutionMode));
           return OFFLOAD_FAIL;
         }
         DP("Using %d teams due to loop trip count %" PRIu32
@@ -1185,16 +1178,12 @@
     }

     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s "
-         "mode\n",
+         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
          (getOffloadEntry(DeviceId, TgtEntryPtr))
              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
              : "(null)",
          CudaBlocksPerGrid, CudaThreadsPerBlock,
-         (KernelInfo->ExecutionMode != SPMD
-              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
-                                                      : "SPMD-Generic")
-              : "SPMD"));
+         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

     CUstream Stream = getStream(DeviceId, AsyncInfo);
     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
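
Note (illustrative, not part of the patch): the sketch below shows how the OMPTgtExecModeFlags bit values introduced in OMPConstants.h compose, and why kernels that OpenMPOpt converts to SPMD now carry the value 3 (GENERIC | SPMD) in the updated tests. The enum values mirror the patch; the describe() helper and main() driver are hypothetical and only stand in for the plugin-side checks.

// Standalone C++ sketch, assuming only the enum values added by this patch.
#include <cstdint>
#include <cstdio>

enum OMPTgtExecModeFlags : int8_t {
  OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, // "_exec_mode = weak constant i8 1"
  OMP_TGT_EXEC_MODE_SPMD = 1 << 1,    // "_exec_mode = weak constant i8 2"
  OMP_TGT_EXEC_MODE_GENERIC_SPMD =
      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD, // i8 3
};

// Hypothetical helper mirroring the three-way distinction the CUDA plugin
// makes (IsGenericMode / IsSPMDMode / IsSPMDGenericMode) before launching.
static const char *describe(OMPTgtExecModeFlags Mode) {
  if (Mode == OMP_TGT_EXEC_MODE_GENERIC_SPMD)
    return "SPMD-Generic";
  if (Mode == OMP_TGT_EXEC_MODE_SPMD)
    return "SPMD";
  if (Mode == OMP_TGT_EXEC_MODE_GENERIC)
    return "Generic";
  return "unknown";
}

int main() {
  // OpenMPOpt starts from a Generic kernel (i8 1) and or's in GENERIC_SPMD,
  // which is why the spmdization tests above now expect i8 3.
  auto Mode = OMP_TGT_EXEC_MODE_GENERIC;
  Mode = static_cast<OMPTgtExecModeFlags>(Mode | OMP_TGT_EXEC_MODE_GENERIC_SPMD);
  std::printf("%d -> %s\n", static_cast<int>(Mode), describe(Mode));
  return 0;
}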