diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2886,8 +2886,12 @@ assert(ExecMode->getInitializer() && ExecMode->getInitializer()->isOneValue() && "Initially non-SPMD kernel has SPMD exec mode!"); - ExecMode->setInitializer( - ConstantInt::get(ExecMode->getInitializer()->getType(), 0)); + + // Set the global exec mode flag to indicate SPMD-Generic mode. + constexpr int SPMDGeneric = 2; + if (!ExecMode->getInitializer()->isZeroValue()) + ExecMode->setInitializer( + ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric)); // Next rewrite the init and deinit calls to indicate we use SPMD-mode now. const int InitIsSPMDArgNo = 1; diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll --- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll +++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll @@ -13,7 +13,7 @@ ;. ; CHECK: @[[IS_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0 -; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0 +; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 ; CHECK: @[[NON_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; CHECK: @[[WILL_NOT_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8 diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -32,7 +32,7 @@ ;. 
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0 +; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata" ;. define weak void @__omp_offloading_2c_38c77_sequential_loop_l4() #0 { diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -127,9 +127,10 @@ }; enum ExecutionModeType { - SPMD, // constructors, destructors, - // combined constructs (`teams distribute parallel for [simd]`) - GENERIC, // everything else + SPMD, // constructors, destructors, + // combined constructs (`teams distribute parallel for [simd]`) + GENERIC, // everything else + SPMD_GENERIC, // Generic kernel with SPMD execution NONE }; @@ -240,6 +241,7 @@ // execution mode of kernel // 0 - SPMD mode (without master warp) // 1 - Generic mode (with master warp) + // 2 - SPMD mode execution with Generic mode semantics. 
int8_t ExecutionMode; int16_t ConstWGSize; int32_t device_id; @@ -1730,7 +1732,7 @@ DP("After loading global for %s ExecMode = %d\n", ExecModeName, ExecModeVal); - if (ExecModeVal < 0 || ExecModeVal > 1) { + if (ExecModeVal < 0 || ExecModeVal > 2) { DP("Error wrong exec_mode value specified in HSA code object file: " "%d\n", ExecModeVal); @@ -1965,7 +1967,11 @@ if (ExecutionMode == SPMD) { // round up to the nearest integer num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; - } else { + } else if (ExecutionMode == GENERIC) { + num_groups = loop_tripcount; + } else if (ExecutionMode == SPMD_GENERIC) { + // This is a generic kernel that was transformed to use SPMD-mode + // execution but uses Generic-mode semantics for scheduling. num_groups = loop_tripcount; } DP("Using %d teams due to loop trip count %" PRIu64 " and number of " diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -70,9 +70,10 @@ }; enum ExecutionModeType { - SPMD, // constructors, destructors, - // combined constructs (`teams distribute parallel for [simd]`) - GENERIC, // everything else + SPMD, // constructors, destructors, + // combined constructs (`teams distribute parallel for [simd]`) + GENERIC, // everything else + SPMD_GENERIC, // Generic kernel with SPMD execution NONE }; @@ -83,6 +84,7 @@ // execution mode of kernel // 0 - SPMD mode (without master warp) // 1 - Generic mode (with master warp) + // 2 - SPMD mode execution with Generic mode semantics. int8_t ExecutionMode; /// Maximal number of threads per block for this kernel. @@ -796,7 +798,7 @@ return nullptr; } - if (ExecModeVal < 0 || ExecModeVal > 1) { + if (ExecModeVal < 0 || ExecModeVal > 2) { DP("Error wrong exec_mode value specified in cubin file: %d\n", ExecModeVal); return nullptr; @@ -1045,7 +1047,7 @@ // will execute one iteration of the loop. 
round up to the nearest // integer CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; - } else { + } else if (KernelInfo->ExecutionMode == GENERIC) { // If we reach this point, then we have a non-combined construct, i.e. // `teams distribute` with a nested `parallel for` and each team is // assigned one iteration of the `distribute` loop. E.g.: @@ -1059,6 +1061,14 @@ // Threads within a team will execute the iterations of the `parallel` // loop. CudaBlocksPerGrid = LoopTripCount; + } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) { + // If we reach this point, then we are executing a kernel that was + // transformed from Generic-mode to SPMD-mode. This kernel has + // SPMD-mode execution, but needs its blocks to be scheduled + // differently because the current loop trip count only applies to the + // `teams distribute` region and will create far too few blocks using + // the regular SPMD-mode method. + CudaBlocksPerGrid = LoopTripCount; } DP("Using %d teams due to loop trip count %" PRIu32 " and number of threads per block %d\n", @@ -1083,7 +1093,9 @@ ? getOffloadEntry(DeviceId, TgtEntryPtr)->name : "(null)", CudaBlocksPerGrid, CudaThreadsPerBlock, - (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic"); + (KernelInfo->ExecutionMode != SPMD + ? (KernelInfo->ExecutionMode == GENERIC ? "Generic" : "SPMD-Generic") + : "SPMD")); CUstream Stream = getStream(DeviceId, AsyncInfo); Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,