diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -206,6 +206,9 @@
           /* kmp_task_t */ VoidPtr, Int32,
           /* kmp_task_affinity_info_t */ VoidPtr)
 
+__OMP_RTL(__kmpc_get_hardware_num_blocks, false, Int32, )
+__OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
+
 __OMP_RTL(omp_get_thread_num, false, Int32, )
 __OMP_RTL(omp_get_num_threads, false, Int32, )
 __OMP_RTL(omp_get_max_threads, false, Int32, )
@@ -601,6 +604,9 @@
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), ReadOnlyPtrAttrs,
                            AttributeSet(), ReadOnlyPtrAttrs))
 
+__OMP_RTL_ATTRS(__kmpc_get_hardware_num_blocks, GetterAttrs, AttributeSet(), ParamAttrs())
+__OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, AttributeSet(), ParamAttrs())
+
 __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, AttributeSet(), ParamAttrs())
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1833,6 +1833,9 @@
     return Changed == ChangeStatus::CHANGED;
   }
 
+  /// Register folding call sites for all regular calls to \p RF in the SCC.
+  void registerFoldRuntimeCall(RuntimeFunction RF);
+
   /// Populate the Attributor with abstract attribute opportunities in the
   /// function.
   void registerAAs(bool IsModulePass);
@@ -3506,6 +3509,8 @@
       case OMPRTL___kmpc_is_spmd_exec_mode:
       case OMPRTL___kmpc_for_static_fini:
       case OMPRTL___kmpc_global_thread_num:
+      case OMPRTL___kmpc_get_hardware_num_threads_in_block:
+      case OMPRTL___kmpc_get_hardware_num_blocks:
       case OMPRTL___kmpc_single:
       case OMPRTL___kmpc_end_single:
       case OMPRTL___kmpc_master:
@@ -3710,7 +3715,6 @@
 
   ChangeStatus updateImpl(Attributor &A) override {
     ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
     switch (RFKind) {
     case OMPRTL___kmpc_is_spmd_exec_mode:
       Changed |= foldIsSPMDExecMode(A);
@@ -3721,6 +3725,12 @@
     case OMPRTL___kmpc_parallel_level:
       Changed |= foldParallelLevel(A);
       break;
+    case OMPRTL___kmpc_get_hardware_num_threads_in_block:
+      Changed |= foldKernelFnAttribute(A, "omp_target_thread_limit");
+      break;
+    case OMPRTL___kmpc_get_hardware_num_blocks:
+      Changed |= foldKernelFnAttribute(A, "omp_target_num_teams");
+      break;
     default:
       llvm_unreachable("Unhandled OpenMP runtime function!");
     }
@@ -3892,7 +3902,46 @@
              "Expected only non-SPMD kernels!");
       SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
     }
 
+    return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
+                                                    : ChangeStatus::CHANGED;
+  }
+
+  /// Fold the call to the i32 constant stored in the string function
+  /// attribute \p Attr when every kernel reaching this call carries that
+  /// attribute with the same non-negative value; otherwise give up.
+  ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
+    // Specialize only if all the calls agree with the attribute constant value
+    int32_t CurrentAttrValue = -1;
+    Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
+
+    auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
+    if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
+      return indicatePessimisticFixpoint();
+
+    // Iterate over the kernels that reach this function
+    for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
+      // The attribute must be present and parse as a non-negative decimal
+      // integer. StringRef::getAsInteger returns true on failure and, unlike
+      // std::stoi, never throws (LLVM is built without exceptions).
+      int32_t NextAttrVal = -1;
+      if (!K->hasFnAttribute(Attr) ||
+          K->getFnAttribute(Attr).getValueAsString().getAsInteger(
+              10, NextAttrVal) ||
+          NextAttrVal < 0)
+        return indicatePessimisticFixpoint();
+
+      if (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal)
+        return indicatePessimisticFixpoint();
+      CurrentAttrValue = NextAttrVal;
+    }
+
+    if (CurrentAttrValue != -1) {
+      auto &Ctx = getAnchorValue().getContext();
+      SimplifiedValue =
+          ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
+    }
     return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
                                                     : ChangeStatus::CHANGED;
   }
@@ -3908,6 +3957,21 @@
 
 } // namespace
 
+/// Register an AAFoldRuntimeCall for each regular call to \p RF in the SCC.
+void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
+  auto &RFI = OMPInfoCache.RFIs[RF];
+  RFI.foreachUse(SCC, [&](Use &U, Function &F) {
+    CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
+    if (!CI)
+      return false;
+    A.getOrCreateAAFor<AAFoldRuntimeCall>(
+        IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
+        DepClassTy::NONE, /* ForceUpdate */ false,
+        /* UpdateAfterInit */ false);
+    return false;
+  });
+}
+
 void OpenMPOpt::registerAAs(bool IsModulePass) {
   if (SCC.empty())
 
@@ -3923,43 +3987,12 @@
           DepClassTy::NONE, /* ForceUpdate */ false,
           /* UpdateAfterInit */ false);
 
-    auto &IsMainRFI =
-        OMPInfoCache.RFIs[OMPRTL___kmpc_is_generic_main_thread_id];
-    IsMainRFI.foreachUse(SCC, [&](Use &U, Function &F) {
-      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsMainRFI);
-      if (!CI)
-        return false;
-      A.getOrCreateAAFor<AAFoldRuntimeCall>(
-          IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
-          DepClassTy::NONE, /* ForceUpdate */ false,
-          /* UpdateAfterInit */ false);
-      return false;
-    });
 
-    auto &IsSPMDRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_is_spmd_exec_mode];
-    IsSPMDRFI.foreachUse(SCC, [&](Use &U, Function &) {
-      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsSPMDRFI);
-      if (!CI)
-        return false;
-      A.getOrCreateAAFor<AAFoldRuntimeCall>(
-          IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
-          DepClassTy::NONE, /* ForceUpdate */ false,
-          /* UpdateAfterInit */ false);
-      return false;
-    });
-
-    auto &ParallelLevelRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_level];
-    ParallelLevelRFI.foreachUse(SCC, [&](Use &U, Function &) {
-      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &ParallelLevelRFI);
-      if (!CI)
-        return false;
-      A.getOrCreateAAFor<AAFoldRuntimeCall>(
-          IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
-          DepClassTy::NONE, /* ForceUpdate */ false,
-          /* UpdateAfterInit */ false);
-
-      return false;
-    });
+    registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id);
+    registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
+    registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
+    registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
+    registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
   }
 
   // Create CallSite AA for all Getters.
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@kernel0_exec_mode = weak constant i8 1
+
+@G = external global i32
+;.
+; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
+;.
+define weak void @kernel0() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel0()
+; CHECK: #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0()
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+  call void @helper0()
+  call void @helper1()
+  call void @helper2()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+  ret void
+}
+
+@kernel1_exec_mode = weak constant i8 1
+
+define weak void @kernel1() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel1()
+; CHECK: #[[ATTR0]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+  call void @helper1()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+  ret void
+}
+
+@kernel2_exec_mode = weak constant i8 1
+
+define weak void @kernel2() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel2()
+; CHECK: #[[ATTR0]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0()
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+  call void @helper0()
+  call void @helper1()
+  call void @helper2()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+  ret void
+}
+
+define internal void @helper0() {
+; CHECK-LABEL: define {{[^@]+}}@helper0() {{#[0-9]+}} {
+; CHECK-NEXT:    store i32 666, i32* @G, align 4
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block() ; the assertions above expect this value to become the constant 666
+  store i32 %threadLimit, i32* @G
+  ret void
+}
+
+define internal void @helper1() {
+; CHECK-LABEL: define {{[^@]+}}@helper1() {{#[0-9]+}} {
+; CHECK-NEXT:    br label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    unreachable
+; CHECK:       f:
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  %c = icmp eq i32 %threadLimit, 666 ; 666 is the "omp_target_thread_limit" value in attributes #0
+  br i1 %c, label %f, label %t ; the assertions above expect an unconditional branch to f
+t:
+  call void @helper0()
+  ret void
+f:
+  ret void
+}
+
+define internal void @helper2() {
+; CHECK-LABEL: define {{[^@]+}}@helper2() {{#[0-9]+}} {
+; CHECK-NEXT:    store i32 666, i32* @G
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block() ; same pattern as helper0; only called from kernel0 and kernel2
+  store i32 %threadLimit, i32* @G
+  ret void
+}
+
+declare i32 @__kmpc_get_hardware_num_threads_in_block()
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1
+declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1
+
+
+!llvm.module.flags = !{!0, !1}
+!nvvm.annotations = !{!2, !3, !4}
+
+attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"} ; attached to kernel0, kernel1 and kernel2
+
+!0 = !{i32 7, !"openmp", i32 50}
+!1 = !{i32 7, !"openmp-device", i32 50}
+!2 = !{void ()* @kernel0, !"kernel", i32 1}
+!3 = !{void ()* @kernel1, !"kernel", i32 1}
+!4 = !{void ()* @kernel2, !"kernel", i32 1}
+;.
+; CHECK: attributes #[[ATTR0]] = { "omp_target_num_teams"="777" "omp_target_thread_limit"="666" }
+;
+; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META2:![0-9]+]] = !{void ()* @kernel0, !"kernel", i32 1}
+; CHECK: [[META3:![0-9]+]] = !{void ()* @kernel1, !"kernel", i32 1}
+; CHECK: [[META4:![0-9]+]] = !{void ()* @kernel2, !"kernel", i32 1}
+;.
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -18,8 +18,8 @@
 // Calls to the NVPTX layer (assuming 1D layout)
 EXTERN int __kmpc_get_hardware_thread_id_in_block();
 EXTERN int GetBlockIdInKernel();
-EXTERN int __kmpc_get_hardware_num_blocks();
-EXTERN int __kmpc_get_hardware_num_threads_in_block();
+EXTERN NOINLINE int __kmpc_get_hardware_num_blocks(); // NOTE(review): NOINLINE presumably keeps these calls visible to OpenMPOpt's folding - confirm
+EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
 EXTERN unsigned GetWarpId();
 EXTERN unsigned GetWarpSize();
 EXTERN unsigned GetLaneId();