diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -205,6 +205,9 @@ /* kmp_task_t */ VoidPtr, Int32, /* kmp_task_affinity_info_t */ VoidPtr) +__OMP_RTL(__kmpc_get_hardware_num_blocks, false, Int32, ) +__OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, ) + __OMP_RTL(omp_get_thread_num, false, Int32, ) __OMP_RTL(omp_get_num_threads, false, Int32, ) __OMP_RTL(omp_get_max_threads, false, Int32, ) @@ -596,6 +599,9 @@ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), ReadOnlyPtrAttrs, AttributeSet(), ReadOnlyPtrAttrs)) +__OMP_RTL_ATTRS(__kmpc_get_hardware_num_blocks, GetterAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, AttributeSet(), ParamAttrs()) + __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, AttributeSet(), ParamAttrs()) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1763,6 +1763,10 @@ return Changed == ChangeStatus::CHANGED; } + /// Populate the Attributor with abstract attribute opportunities in the + /// function. + void registerFoldRuntimeCall(RuntimeFunction RF); + /// Populate the Attributor with abstract attribute opportunities in the /// function. void registerAAs(bool IsModulePass); @@ -3358,6 +3362,8 @@ case OMPRTL___kmpc_is_spmd_exec_mode: case OMPRTL___kmpc_for_static_fini: case OMPRTL___kmpc_global_thread_num: + case OMPRTL___kmpc_get_hardware_num_threads_in_block: + case OMPRTL___kmpc_get_hardware_num_blocks: case OMPRTL___kmpc_single: case OMPRTL___kmpc_end_single: case OMPRTL___kmpc_master: @@ -3511,11 +3517,16 @@ ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - switch (RFKind) { case OMPRTL___kmpc_is_spmd_exec_mode: Changed = Changed | foldIsSPMDExecMode(A); break; + case OMPRTL___kmpc_get_hardware_num_threads_in_block: + Changed = Changed | foldHardwareNumThreads(A); + break; + case OMPRTL___kmpc_get_hardware_num_blocks: + Changed = Changed | foldHardwareNumTeams(A); + break; default: llvm_unreachable("Unhandled OpenMP runtime function!"); } @@ -3546,6 +3557,7 @@ auto &CallerKernelInfoAA = A.getAAFor( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); + // What actual kernels reach this function for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { auto &AA = A.getAAFor(*this, IRPosition::function(*K), DepClassTy::REQUIRED); @@ -3599,6 +3611,78 @@ : ChangeStatus::CHANGED; } + /// Fold __kmpc_get_hardware_num_blocks into a constant if possible. + /// The value is an attribute in the kernel + ChangeStatus foldHardwareNumTeams(Attributor &A) { + // Specialize only if all the calls agree with the number of teams + int32_t CurrentNumTeams = -1; + BooleanState StateBefore = getState(); + + auto &CallerKernelInfoAA = A.getAAFor( + *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); + + // What actual kernels reach this function + for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { + if (K->hasFnAttribute("NumTeams")) { + int32_t NumT = + std::stoi(K->getFnAttribute("NumTeams").getValueAsString().str()); + if (CurrentNumTeams != -1 && CurrentNumTeams != NumT) { + SimplifiedValue = nullptr; + return indicatePessimisticFixpoint(); + } else { + CurrentNumTeams = NumT; + } + } else { + // TODO: No attribute, then default? + } + } + + if (CurrentNumTeams != -1) { + auto &Ctx = getAnchorValue().getContext(); + SimplifiedValue = + ConstantInt::get(Type::getInt8Ty(Ctx), CurrentNumTeams); + } + + return getState() == StateBefore ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; + } + + /// Fold __kmpc_get_hardware_num_threads_in_block into a constant if possible. + /// The value is an attribute in the kernel + ChangeStatus foldHardwareNumThreads(Attributor &A) { + // Specialize only if all the calls agree with the number of threads + int32_t CurrentNumThreads = -1; + BooleanState StateBefore = getState(); + + auto &CallerKernelInfoAA = A.getAAFor( + *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); + + // What actual kernels reach this function + for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { + if (K->hasFnAttribute("NumThreads")) { + int32_t NumT = + std::stoi(K->getFnAttribute("NumThreads").getValueAsString().str()); + if (CurrentNumThreads != -1 && CurrentNumThreads != NumT) { + SimplifiedValue = nullptr; + return indicatePessimisticFixpoint(); + } else { + CurrentNumThreads = NumT; + } + } else { + // TODO: No attribute, then default? + } + } + + if (CurrentNumThreads != -1) { + auto &Ctx = getAnchorValue().getContext(); + SimplifiedValue = + ConstantInt::get(Type::getInt8Ty(Ctx), CurrentNumThreads); + } + + return getState() == StateBefore ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; + } + /// An optional value the associated value is assumed to fold to. That is, we /// assume the associated value (which is a call) can be replaced by this /// simplified value. @@ -3610,6 +3694,21 @@ } // namespace +/// Register folding callsite +void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) { + auto &RFI = OMPInfoCache.RFIs[RF]; + RFI.foreachUse(SCC, [&](Use &U, Function &) { + CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI); + if (!CI) + return false; + A.getOrCreateAAFor( + IRPosition::callsite_function(*CI), /* QueryingAA */ nullptr, + DepClassTy::NONE, /* ForceUpdate */ false, + /* UpdateAfterInit */ false); + return false; + }); +} + void OpenMPOpt::registerAAs(bool IsModulePass) { if (SCC.empty()) @@ -3619,23 +3718,16 @@ // update. This will make sure we register all value simplification // callbacks before any other AA has the chance to create an AAValueSimplify // or similar. - for (Function *Kernel : OMPInfoCache.Kernels) + for (Function *Kernel : OMPInfoCache.Kernels) { A.getOrCreateAAFor( IRPosition::function(*Kernel), /* QueryingAA */ nullptr, DepClassTy::NONE, /* ForceUpdate */ false, /* UpdateAfterInit */ false); + } - auto &IsSPMDRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_is_spmd_exec_mode]; - IsSPMDRFI.foreachUse(SCC, [&](Use &U, Function &) { - CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsSPMDRFI); - if (!CI) - return false; - A.getOrCreateAAFor( - IRPosition::callsite_function(*CI), /* QueryingAA */ nullptr, - DepClassTy::NONE, /* ForceUpdate */ false, - /* UpdateAfterInit */ false); - return false; - }); + registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode); + registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block); + registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks); } // Create CallSite AA for all Getters. diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s +target triple = "nvptx64" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@G = external global i32 +;. +; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8 +;. +define weak void @kernel0() #0 { +; CHECK-LABEL: define {{[^@]+}}@kernel0() #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: call void @helper0() +; CHECK-NEXT: call void @helper1() +; CHECK-NEXT: call void @helper2() +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: ret void +; + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) + call void @helper0() + call void @helper1() + call void @helper2() + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) + ret void +} + +define weak void @kernel1() #0 { +; CHECK-LABEL: define {{[^@]+}}@kernel1() #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: call void @helper1() +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: ret void +; + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) + call void @helper1() + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) + ret void +} + +define weak void @kernel2() #0 { +; CHECK-LABEL: define {{[^@]+}}@kernel2() #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) +; CHECK-NEXT: call void @helper0() +; CHECK-NEXT: call void @helper1() +; CHECK-NEXT: call void @helper2() +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) +; CHECK-NEXT: ret void +; + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) + call void @helper0() + call void @helper1() + call void @helper2() + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) + ret void +} + +define internal void @helper0() { +; CHECK-LABEL: define {{[^@]+}}@helper0() { +; CHECK-NEXT: store i32 666, i32* @G, align 1 +; CHECK-NEXT: ret void +; + %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block() + store i32 %threadLimit, i32* @G + ret void +} + +define internal void @helper1() { +; CHECK-LABEL: define {{[^@]+}}@helper1() { +; CHECK-NEXT: br label [[F:%.*]] +; CHECK: t: +; CHECK-NEXT: unreachable +; CHECK: f: +; CHECK-NEXT: ret void +; + %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block() + %c = icmp eq i32 %threadLimit, 666 + br i1 %c, label %t, label %f +t: + call void @helper0() + ret void +f: + ret void +} + +define internal void @helper2() { +; CHECK-LABEL: define {{[^@]+}}@helper2() { +; CHECK-NEXT: store i32 666, i8* @G +; CHECK-NEXT: ret void +; + %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block() + store i32 %threadLimit, i32* @G + ret void +} + +declare i32 @__kmpc_get_hardware_num_threads_in_block() +declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1 +declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1 + + +!llvm.module.flags = !{!0, !1} +!nvvm.annotations = !{!2, !3, !4} + +attributes #0 = { "NumThreads"="666" "NumTeams"="777"} + +!0 = !{i32 7, !"openmp", i32 50} +!1 = !{i32 7, !"openmp-device", i32 50} +!2 = !{void ()* @kernel0, !"kernel", i32 1} +!3 = !{void ()* @kernel1, !"kernel", i32 1} +!4 = !{void ()* @kernel2, !"kernel", i32 1} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META2:![0-9]+]] = !{void ()* @kernel0, !"kernel", i32 1} +; CHECK: [[META3:![0-9]+]] = !{void ()* @kernel1, !"kernel", i32 1} +; CHECK: [[META4:![0-9]+]] = !{void ()* @kernel2, !"kernel", i32 1} +;. diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -122,12 +122,12 @@ } } // namespace -EXTERN int GetNumberOfBlocksInKernel() { +EXTERN int __kmpc_get_hardware_num_blocks() { return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); } -EXTERN int GetNumberOfThreadsInBlock() { +EXTERN int __kmpc_get_hardware_num_threads_in_block() { return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); @@ -140,7 +140,7 @@ } EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() { - return GetNumberOfThreadsInBlock(); + return __kmpc_get_hardware_num_threads_in_block(); } // Atomics diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -61,7 +61,7 @@ EXTERN int omp_get_thread_limit(void) { if (__kmpc_is_spmd_exec_mode()) - return GetNumberOfThreadsInBlock(); + return __kmpc_get_hardware_num_threads_in_block(); int rc = threadLimit; PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); return rc; @@ -196,7 +196,7 @@ EXTERN int omp_get_team_size(int level) { if (__kmpc_is_spmd_exec_mode()) - return level == 1 ? GetNumberOfThreadsInBlock() : 1; + return level == 1 ? __kmpc_get_hardware_num_threads_in_block() : 1; int rc = -1; unsigned parLevel = parallelLevel[GetWarpId()]; // If level is 0 or all parallel regions are not active - return 1. diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -94,7 +94,7 @@ if (GetLaneId() == 0) { parallelLevel[GetWarpId()] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + 1 + (__kmpc_get_hardware_num_threads_in_block() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); } __kmpc_data_sharing_init_stack(); diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -197,7 +197,7 @@ __kmpc_is_spmd_exec_mode() ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); - uint32_t NumTeams = GetNumberOfBlocksInKernel(); + uint32_t NumTeams = __kmpc_get_hardware_num_blocks(); static unsigned SHARED(Bound); static unsigned SHARED(ChunkTeamCount); diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -53,7 +53,7 @@ // // Called in Generic Execution Mode only. int GetMasterThreadID() { - return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); + return (__kmpc_get_hardware_num_threads_in_block() - 1) & ~(WARPSIZE - 1); } // The last warp is reserved for the master; other warps are workers. @@ -109,7 +109,7 @@ if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { rc = 1; } else if (isSPMDExecutionMode) { - rc = GetNumberOfThreadsInBlock(); + rc = __kmpc_get_hardware_num_threads_in_block(); } else { rc = threadsInTeam; } @@ -127,7 +127,7 @@ int GetNumberOfOmpTeams() { // omp_num_teams - return GetNumberOfBlocksInKernel(); // assume 1 block per team + return __kmpc_get_hardware_num_blocks(); // assume 1 block per team } //////////////////////////////////////////////////////////////////////////////// @@ -169,7 +169,7 @@ int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { if (!isSPMDExecutionMode) return GetNumberOfWorkersInTeam(); - return GetNumberOfThreadsInBlock(); + return __kmpc_get_hardware_num_threads_in_block(); } int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -50,8 +50,8 @@ int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team -int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -int GetNumberOfOmpTeams(); // omp_num_teams +NOINLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads +NOINLINE int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -94,10 +94,10 @@ // Calls to the NVPTX layer (assuming 1D layout) EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); } -EXTERN int GetNumberOfBlocksInKernel() { +EXTERN int __kmpc_get_hardware_num_blocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } -EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } +EXTERN int __kmpc_get_hardware_num_threads_in_block() { return __nvvm_read_ptx_sreg_ntid_x(); } EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } EXTERN unsigned GetWarpSize() { return WARPSIZE; } EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h --- a/openmp/libomptarget/deviceRTLs/target_interface.h +++ b/openmp/libomptarget/deviceRTLs/target_interface.h @@ -18,8 +18,8 @@ // Calls to the NVPTX layer (assuming 1D layout) EXTERN int GetThreadIdInBlock(); EXTERN int GetBlockIdInKernel(); -EXTERN int GetNumberOfBlocksInKernel(); -EXTERN int GetNumberOfThreadsInBlock(); +EXTERN NOINLINE int __kmpc_get_hardware_num_blocks(); +EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block(); EXTERN unsigned GetWarpId(); EXTERN unsigned GetWarpSize(); EXTERN unsigned GetLaneId();