diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -206,6 +206,9 @@ /* kmp_task_t */ VoidPtr, Int32, /* kmp_task_affinity_info_t */ VoidPtr) +__OMP_RTL(__kmpc_get_hardware_num_blocks, false, Int32, ) +__OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, ) + __OMP_RTL(omp_get_thread_num, false, Int32, ) __OMP_RTL(omp_get_num_threads, false, Int32, ) __OMP_RTL(omp_get_max_threads, false, Int32, ) @@ -601,6 +604,9 @@ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), ReadOnlyPtrAttrs, AttributeSet(), ReadOnlyPtrAttrs)) +__OMP_RTL_ATTRS(__kmpc_get_hardware_num_blocks, GetterAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, AttributeSet(), ParamAttrs()) + __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, AttributeSet(), ParamAttrs()) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1774,6 +1774,10 @@ return Changed == ChangeStatus::CHANGED; } + /// Register an abstract attribute at each regular call to the runtime + /// function \p RF in the SCC so that the call can be folded. + void registerFoldRuntimeCall(RuntimeFunction RF); + /// Populate the Attributor with abstract attribute opportunities in the /// function. 
void registerAAs(bool IsModulePass); @@ -3358,6 +3362,8 @@ case OMPRTL___kmpc_is_spmd_exec_mode: case OMPRTL___kmpc_for_static_fini: case OMPRTL___kmpc_global_thread_num: + case OMPRTL___kmpc_get_hardware_num_threads_in_block: + case OMPRTL___kmpc_get_hardware_num_blocks: case OMPRTL___kmpc_single: case OMPRTL___kmpc_end_single: case OMPRTL___kmpc_master: @@ -3515,7 +3521,6 @@ ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - switch (RFKind) { case OMPRTL___kmpc_is_spmd_exec_mode: Changed |= foldIsSPMDExecMode(A); @@ -3523,6 +3528,12 @@ case OMPRTL___kmpc_is_generic_main_thread_id: Changed |= foldIsGenericMainThread(A); break; + case OMPRTL___kmpc_get_hardware_num_threads_in_block: + Changed = Changed | foldHardwareNumThreads(A); + break; + case OMPRTL___kmpc_get_hardware_num_blocks: + Changed = Changed | foldHardwareNumTeams(A); + break; default: llvm_unreachable("Unhandled OpenMP runtime function!"); } @@ -3637,6 +3648,73 @@ return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; + + } + + /// Fold __kmpc_get_hardware_num_blocks into a constant if possible. 
+  /// The value is taken from the "omp_target_num_teams" kernel attribute.
+  ChangeStatus foldHardwareNumTeams(Attributor &A) {
+    // Specialize only if all reaching kernels agree on the number of teams.
+    int32_t CurrentNumTeams = -1;
+    Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
+
+    auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
+
+    // Iterate over the kernels that reach this function.
+    for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
+      if (!K->hasFnAttribute("omp_target_num_teams"))
+        return indicatePessimisticFixpoint();
+      int32_t NumT = std::stoi(
+          K->getFnAttribute("omp_target_num_teams").getValueAsString().str());
+      if (CurrentNumTeams != -1 && CurrentNumTeams != NumT)
+        return indicatePessimisticFixpoint();
+      CurrentNumTeams = NumT;
+    }
+
+    if (CurrentNumTeams != -1) {
+      auto &Ctx = getAnchorValue().getContext();
+      // The runtime function returns i32, so the constant must be i32 too.
+      SimplifiedValue =
+          ConstantInt::get(Type::getInt32Ty(Ctx), CurrentNumTeams);
+      return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
+                                                      : ChangeStatus::CHANGED;
+    }
+    return indicatePessimisticFixpoint();
+  }
+
+  /// Fold __kmpc_get_hardware_num_threads_in_block into a constant if possible.
+  /// The value is taken from the "omp_target_thread_limit" kernel attribute.
+  ChangeStatus foldHardwareNumThreads(Attributor &A) {
+    // Specialize only if all reaching kernels agree on the number of threads.
+    // -1 marks that no kernel has been inspected yet.
+    int32_t CurrentNumThreads = -1;
+    Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
+
+    auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
+
+    // Iterate over the kernels that reach this function.
+    for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
+      if (!K->hasFnAttribute("omp_target_thread_limit"))
+        return indicatePessimisticFixpoint();
+      int32_t NumT = std::stoi(K->getFnAttribute("omp_target_thread_limit")
+                                   .getValueAsString()
+                                   .str());
+      if (CurrentNumThreads != -1 && CurrentNumThreads != NumT)
+        return indicatePessimisticFixpoint();
+      CurrentNumThreads = NumT;
+    }
+
+    if (CurrentNumThreads != -1) {
+      auto &Ctx = getAnchorValue().getContext();
+      // The runtime function returns i32, so the constant must be i32 too.
+      SimplifiedValue =
+          ConstantInt::get(Type::getInt32Ty(Ctx), CurrentNumThreads);
+      return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
+                                                      : ChangeStatus::CHANGED;
+    }
+    return indicatePessimisticFixpoint();
   }
 
   /// An optional value the associated value is assumed to fold to.
That is, we @@ -3650,6 +3728,22 @@ } // namespace +/// Register folding callsite +void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) { + auto &RFI = OMPInfoCache.RFIs[RF]; + RFI.foreachUse(SCC, [&](Use &U, Function &F) { + CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI); + //errs() << IRPosition::callsite_function(*CI).getPositionKind() << " -- " << RF; + if (!CI) + return false; + A.getOrCreateAAFor( + IRPosition::callsite_function(*CI), /* QueryingAA */ nullptr, + DepClassTy::NONE, /* ForceUpdate */ false, + /* UpdateAfterInit */ false); + return false; + }); +} + void OpenMPOpt::registerAAs(bool IsModulePass) { if (SCC.empty()) @@ -3665,30 +3759,10 @@ DepClassTy::NONE, /* ForceUpdate */ false, /* UpdateAfterInit */ false); - auto &IsMainRFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_is_generic_main_thread_id]; - IsMainRFI.foreachUse(SCC, [&](Use &U, Function &F) { - CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsMainRFI); - if (!CI) - return false; - A.getOrCreateAAFor( - IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr, - DepClassTy::NONE, /* ForceUpdate */ false, - /* UpdateAfterInit */ false); - return false; - }); - - auto &IsSPMDRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_is_spmd_exec_mode]; - IsSPMDRFI.foreachUse(SCC, [&](Use &U, Function &) { - CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsSPMDRFI); - if (!CI) - return false; - A.getOrCreateAAFor( - IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr, - DepClassTy::NONE, /* ForceUpdate */ false, - /* UpdateAfterInit */ false); - return false; - }); + registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id); + registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode); + registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block); + registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks); } // Create CallSite AA for all Getters. 
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@G = external global i32
+;.
+; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
+;.
+define weak void @kernel0() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel0() #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0()
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+  call void @helper0()
+  call void @helper1()
+  call void @helper2()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+  ret void
+}
+
+define weak void @kernel1() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel1() #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+  call void @helper1()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+  ret void
+}
+
+define weak void @kernel2() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel2() #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[I:%.*]] =
call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0()
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+  call void @helper0()
+  call void @helper1()
+  call void @helper2()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+  ret void
+}
+
+define internal void @helper0() {
+; CHECK-LABEL: define {{[^@]+}}@helper0() {
+; CHECK-NEXT:    store i32 666, i32* @G, align 1
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  store i32 %threadLimit, i32* @G
+  ret void
+}
+
+define internal void @helper1() {
+; CHECK-LABEL: define {{[^@]+}}@helper1() {
+; CHECK-NEXT:    br label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    unreachable
+; CHECK:       f:
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  %c = icmp eq i32 %threadLimit, 666
+  br i1 %c, label %t, label %f
+t:
+  call void @helper0()
+  ret void
+f:
+  ret void
+}
+
+define internal void @helper2() {
+; CHECK-LABEL: define {{[^@]+}}@helper2() {
+; CHECK-NEXT:    store i32 666, i32* @G
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  store i32 %threadLimit, i32* @G
+  ret void
+}
+
+declare i32 @__kmpc_get_hardware_num_threads_in_block()
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1
+declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1
+
+
+!llvm.module.flags = !{!0, !1}
+!nvvm.annotations = !{!2, !3, !4}
+
+attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"}
+
+!0 = !{i32 7, !"openmp", i32 50}
+!1 = !{i32 7, !"openmp-device", i32 50}
+!2 = !{void ()* @kernel0, !"kernel", i32 1}
+!3 = !{void
()* @kernel1, !"kernel", i32 1} +!4 = !{void ()* @kernel2, !"kernel", i32 1} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META2:![0-9]+]] = !{void ()* @kernel0, !"kernel", i32 1} +; CHECK: [[META3:![0-9]+]] = !{void ()* @kernel1, !"kernel", i32 1} +; CHECK: [[META4:![0-9]+]] = !{void ()* @kernel2, !"kernel", i32 1} +;. diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h --- a/openmp/libomptarget/deviceRTLs/target_interface.h +++ b/openmp/libomptarget/deviceRTLs/target_interface.h @@ -18,8 +18,8 @@ // Calls to the NVPTX layer (assuming 1D layout) EXTERN int __kmpc_get_hardware_thread_id_in_block(); EXTERN int GetBlockIdInKernel(); -EXTERN int __kmpc_get_hardware_num_blocks(); -EXTERN int __kmpc_get_hardware_num_threads_in_block(); +EXTERN NOINLINE int __kmpc_get_hardware_num_blocks(); +EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block(); EXTERN unsigned GetWarpId(); EXTERN unsigned GetWarpSize(); EXTERN unsigned GetLaneId();