diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -497,6 +497,16 @@
   /// one we abort as the kernel is malformed.
   CallBase *KernelDeinitCB = nullptr;
 
+  /// A map from a function to its constant return value. If the value is
+  /// nullptr, the function cannot be folded.
+  SmallDenseMap<CallBase *, Constant *> FoldableFunctions;
+
+  /// Flag to indicate if the associated function is a kernel entry.
+  bool IsKernelEntry = false;
+
+  /// State to track what kernel entries can reach the associated function.
+  BooleanStateWithPtrSetVector<Kernel, false> ReachingKernelEntries;
+
   /// Abstract State interface
   ///{
 
@@ -517,6 +527,7 @@
     IsAtFixpoint = true;
     SPMDCompatibilityTracker.indicatePessimisticFixpoint();
     ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
+    ReachingKernelEntries.indicatePessimisticFixpoint();
     return ChangeStatus::CHANGED;
   }
 
@@ -537,6 +548,11 @@
       return false;
     if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
       return false;
+    if (ReachingKernelEntries != RHS.ReachingKernelEntries)
+      return false;
+    if (FoldableFunctions != RHS.FoldableFunctions)
+      return false;
+
     return true;
   }
 
@@ -566,6 +582,7 @@
     SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
     ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
     ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
+    ReachingKernelEntries ^= KIS.ReachingKernelEntries;
     return *this;
   }
 
@@ -2725,6 +2742,10 @@
     if (!OMPInfoCache.Kernels.count(Fn))
       return;
 
+    // Add itself to the reaching kernel and set IsKernelEntry.
+    ReachingKernelEntries.insert(Fn);
+    IsKernelEntry = true;
+
     OMPInformationCache::RuntimeFunctionInfo &InitRFI =
         OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
     OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
@@ -2826,21 +2847,36 @@
   /// Modify the IR based on the KernelInfoState as the fixpoint iteration is
   /// finished now.
   ChangeStatus manifest(Attributor &A) override {
+    ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+    // Fold all valid foldable functions
+    for (std::pair<CallBase *, Constant *> &P : FoldableFunctions) {
+      if (P.second == nullptr)
+        continue;
+
+      A.changeValueAfterManifest(*P.first, *P.second);
+      A.deleteAfterManifest(*P.first);
+
+      Change = ChangeStatus::CHANGED;
+    }
+
     // If we are not looking at a kernel with __kmpc_target_init and
     // __kmpc_target_deinit call we cannot actually manifest the information.
     if (!KernelInitCB || !KernelDeinitCB)
-      return ChangeStatus::UNCHANGED;
+      return Change;
 
     // Known SPMD-mode kernels need no manifest changes.
     if (SPMDCompatibilityTracker.isKnown())
-      return ChangeStatus::UNCHANGED;
+      return Change;
 
     // If we can we change the execution mode to SPMD-mode otherwise we build a
     // custom state machine.
     if (!changeToSPMDMode(A))
-      buildCustomStateMachine(A);
+      Change = Change | buildCustomStateMachine(A);
+    else
+      Change = ChangeStatus::CHANGED;
 
-    return ChangeStatus::CHANGED;
+    return Change;
   }
 
   bool changeToSPMDMode(Attributor &A) {
@@ -3203,6 +3239,13 @@
     if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
       SPMDCompatibilityTracker.indicatePessimisticFixpoint();
 
+    if (!IsKernelEntry)
+      updateReachingKernelEntries(A);
+
+    // Update info regarding execution mode.
+    if (ReachingKernelEntries.isAssumed())
+      updateSPMDFolding(A);
+
     // Callback to check a call instruction.
     auto CheckCallInst = [&](Instruction &I) {
       auto &CB = cast<CallBase>(I);
@@ -3210,6 +3253,19 @@
           *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
       if (CBAA.getState().isValidState())
         getState() ^= CBAA.getState();
+
+      Function *Callee = CB.getCalledFunction();
+      if (Callee) {
+        // We need to propagate information to the callee, but since the
+        // construction of AA always starts with kernel entries, we have to
+        // create AAKernelInfoFunction for all called functions. However, here
+        // the caller doesn't depend on the callee.
+        // TODO: We might want to change the dependence here later if we need
+        // information from callee to caller.
+        A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Callee), this,
+                                         DepClassTy::NONE);
+      }
+
       return true;
     };
 
@@ -3219,6 +3275,81 @@
     return StateBefore == getState() ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;
   }
+
+private:
+  /// Update info regarding reaching kernels.
+  void updateReachingKernelEntries(Attributor &A) {
+    auto PredCallSite = [&](AbstractCallSite ACS) {
+      Function *Caller = ACS.getInstruction()->getFunction();
+
+      assert(Caller && "Caller is nullptr");
+
+      auto &CAA =
+          A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));
+      if (CAA.isValidState()) {
+        ReachingKernelEntries ^= CAA.ReachingKernelEntries;
+        return true;
+      }
+
+      // We lost track of the caller of the associated function, any kernel
+      // could reach now.
+      ReachingKernelEntries.indicatePessimisticFixpoint();
+
+      return true;
+    };
+
+    bool AllCallSitesKnown;
+    if (!A.checkForAllCallSites(PredCallSite, *this,
+                                true /* RequireAllCallSites */,
+                                AllCallSitesKnown))
+      ReachingKernelEntries.indicatePessimisticFixpoint();
+  }
+
+  /// Update information regarding folding SPMD mode function calls.
+  void updateSPMDFolding(Attributor &A) {
+    unsigned Count = 0;
+
+    for (Kernel K : ReachingKernelEntries) {
+      auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
+                                          DepClassTy::REQUIRED);
+
+      if (!AA.isValidState()) {
+        ReachingKernelEntries.indicatePessimisticFixpoint();
+        break;
+      }
+
+      if (AA.SPMDCompatibilityTracker.isAssumed())
+        ++Count;
+    }
+
+    // Assume reaching kernels are in a mixture of SPMD and non-SPMD mode.
+    // Update all function calls to __kmpc_is_spmd_exec_mode to nullptr.
+    Constant *C = nullptr;
+
+    if (ReachingKernelEntries.isAssumed()) {
+      auto &Ctx = getAnchorValue().getContext();
+
+      if (Count == 0) {
+        // All reaching kernels are in non-SPMD mode. Update all function
+        // calls to __kmpc_is_spmd_exec_mode to 0.
+        C = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
+      } else if (Count == ReachingKernelEntries.size()) {
+        // All reaching kernels are in SPMD mode. Update all function calls to
+        // __kmpc_is_spmd_exec_mode to 1.
+        C = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
+      }
+    }
+
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    OMPInformationCache::RuntimeFunctionInfo &IsSPMDExecModeRFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_is_spmd_exec_mode];
+
+    for (std::pair<CallBase *, Constant *> &P : FoldableFunctions) {
+      CallBase *CB = P.first;
+      if (CB->getCalledFunction() == IsSPMDExecModeRFI.Declaration)
+        P.second = C;
+    }
+  }
 };
 
 /// The call site kernel info abstract attribute, basically, what can we say
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_foldable.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_foldable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_foldable.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@__omp_offloading_fd00_22cf4bf_foo_l4_exec_mode = weak constant i8 0
+@__omp_offloading_fd00_22cf4bf_foo_l9_exec_mode = weak constant i8 0
+@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_fd00_22cf4bf_foo_l4_exec_mode, i8* @__omp_offloading_fd00_22cf4bf_foo_l9_exec_mode], section "llvm.metadata"
+@execution_param = internal local_unnamed_addr addrspace(3) global i32 undef, align 4
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+;.
+; CHECK: @[[__OMP_OFFLOADING_FD00_22CF4BF_FOO_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0 +; CHECK: @[[__OMP_OFFLOADING_FD00_22CF4BF_FOO_L9_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0 +; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x i8*] [i8* @__omp_offloading_fd00_22cf4bf_foo_l4_exec_mode, i8* @__omp_offloading_fd00_22cf4bf_foo_l9_exec_mode], section "llvm.metadata" +; CHECK: @[[EXECUTION_PARAM:[a-zA-Z0-9_$"\\.-]+]] = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 +;. +define weak void @__omp_offloading_fd00_22cf4bf_foo_l4() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd00_22cf4bf_foo_l4 +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) +; CHECK-NEXT: call void @__omp_outlined__(i32* null, i32* null) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: ret void +; +entry: + %0 = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) + call void @__omp_outlined__(i32* null, i32* null) #4 + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) + ret void +} + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* null, i32 0, i32 91, i32* null, i32* null, i32* null, i32* null, i32 1, i32 0) +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__1 to i8*), i8* null, i8** null, i64 2) +; CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* null, i32 0) +; CHECK-NEXT: ret void +; +entry: + call void @__kmpc_for_static_init_4(%struct.ident_t* null, i32 0, i32 91, i32* null, i32* null, i32* null, i32* null, i32 1, i32 0) + call void @__kmpc_parallel_51(%struct.ident_t* null, i32 0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__1 to i8*), i8* null, i8** null, i64 2) + call void @__kmpc_for_static_fini(%struct.ident_t* null, i32 0) + ret void +} + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* null, i32 0, i32 33, i32* null, i32* null, i32* null, i32* null, i32 1, i32 1) +; CHECK-NEXT: call void bitcast (void (...)* @bar to void ()*)() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* null, i32 0) +; CHECK-NEXT: ret void +; +entry: + call void @__kmpc_for_static_init_4(%struct.ident_t* null, i32 0, i32 33, i32* null, i32* null, i32* null, i32* null, i32 1, i32 1) + call void bitcast (void (...)* @bar to void ()*)() #5 + call void @__kmpc_for_static_fini(%struct.ident_t* null, i32 0) + ret void +} + +; Function Attrs: convergent +declare void @bar(...) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define weak void @__omp_offloading_fd00_22cf4bf_foo_l9() #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd00_22cf4bf_foo_l9 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) +; CHECK-NEXT: call void @__omp_outlined__2(i32* null, i32* null) #[[ATTR4]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: ret void +; +entry: + %0 = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) + call void @__omp_outlined__2(i32* null, i32* null) #4 + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) + ret void +} + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_outlined__2(i32* noalias 
%.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* null, i32 0, i32 91, i32* null, i32* null, i32* null, i32* null, i32 1, i32 0) +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__3 to i8*), i8* null, i8** null, i64 2) +; CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* null, i32 0) +; CHECK-NEXT: ret void +; +entry: + call void @__kmpc_for_static_init_4(%struct.ident_t* null, i32 0, i32 91, i32* null, i32* null, i32* null, i32* null, i32 1, i32 0) + call void @__kmpc_parallel_51(%struct.ident_t* null, i32 0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__3 to i8*), i8* null, i8** null, i64 2) + call void @__kmpc_for_static_fini(%struct.ident_t* null, i32 0) + ret void +} + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub.) 
#0 { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3 +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* null, i32 0, i32 33, i32* null, i32* null, i32* null, i32* null, i32 1, i32 1) +; CHECK-NEXT: call void bitcast (void (...)* @bar to void ()*)() #[[ATTR5]] +; CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* null, i32 0) +; CHECK-NEXT: ret void +; +entry: + call void @__kmpc_for_static_init_4(%struct.ident_t* null, i32 0, i32 33, i32* null, i32* null, i32* null, i32* null, i32 1, i32 1) + call void bitcast (void (...)* @bar to void ()*)() #5 + call void @__kmpc_for_static_fini(%struct.ident_t* null, i32 0) + ret void +} + +; Function Attrs: convergent nounwind mustprogress +define internal void @__kmpc_for_static_init_4(%struct.ident_t* %loc, i32 %global_tid, i32 %schedtype, i32* nocapture writeonly %plastiter, i32* nocapture %plower, i32* nocapture %pupper, i32* nocapture %pstride, i32 %incr, i32 %chunk) #2 { +; CHECK-LABEL: define {{[^@]+}}@__kmpc_for_static_init_4 +; CHECK-SAME: (%struct.ident_t* [[LOC:%.*]], i32 [[GLOBAL_TID:%.*]], i32 [[SCHEDTYPE:%.*]], i32* nocapture writeonly [[PLASTITER:%.*]], i32* nocapture [[PLOWER:%.*]], i32* nocapture [[PUPPER:%.*]], i32* nocapture [[PSTRIDE:%.*]], i32 [[INCR:%.*]], i32 [[CHUNK:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL_I:%.*]] = call i32 @_Z21GetNumberOfOmpThreadsb(i1 zeroext true) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[CALL_I]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CALL_I_1:%.*]] = call i32 @_Z21GetNumberOfOmpThreadsb(i1 zeroext false) #[[ATTR6]] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + %call = call zeroext i1 
@_Z13checkSPMDModeP5ident(%struct.ident_t* %loc) #6 + %call.i = call i32 @_Z21GetNumberOfOmpThreadsb(i1 zeroext %call) #6 + %cmp = icmp ne i32 %call.i, 0 + br i1 %cmp, label %then, label %end + +then: + %call.i.1 = call i32 @_Z21GetNumberOfOmpThreadsb(i1 zeroext false) #6 + br label %end + +end: + ret void +} + +declare i32 @_Z21GetNumberOfOmpThreadsb(i1 zeroext) + +; Function Attrs: convergent nounwind mustprogress +define internal zeroext i1 @_Z13checkSPMDModeP5ident(%struct.ident_t* readonly %loc) local_unnamed_addr #2 { +entry: + %call10 = call signext i8 @__kmpc_is_spmd_exec_mode() #6 + %tobool11 = icmp ne i8 %call10, 0 + ret i1 %tobool11 +} + +; Function Attrs: convergent nofree norecurse nosync nounwind readonly willreturn mustprogress +define internal signext i8 @__kmpc_is_spmd_exec_mode() local_unnamed_addr #3 { +entry: + %0 = load i32, i32* addrspacecast (i32 addrspace(3)* @execution_param to i32*), align 4, !tbaa !12 + %1 = trunc i32 %0 to i8 + %2 = and i8 %1, 1 + %3 = xor i8 %2, 1 + ret i8 %3 +} + +; Function Attrs: convergent +declare void @__kmpc_for_static_fini(%struct.ident_t* nocapture, i32) #1 + +; Function Attrs: convergent +declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1 + +; Function Attrs: convergent +declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1 + +; Function Attrs: convergent nounwind mustprogress +define internal i32 @__kmpc_global_thread_num(%struct.ident_t* nocapture readnone %loc) #2 { +; CHECK-LABEL: define {{[^@]+}}@__kmpc_global_thread_num +; CHECK-SAME: (%struct.ident_t* nocapture readnone [[LOC:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 0 +; +entry: + %call = call i32 @_Z14GetOmpThreadIdv() #6 + ret i32 %call +} + +; Function Attrs: convergent nounwind mustprogress +define internal i32 @_Z14GetOmpThreadIdv() local_unnamed_addr #2 { +entry: + %call4 = call signext i8 @__kmpc_is_spmd_exec_mode() #6 + %tobool5.not = icmp 
eq i8 %call4, 0 + br i1 %tobool5.not, label %if.else7, label %cleanup + +if.else7: ; preds = %entry + br label %cleanup + +cleanup: ; preds = %if.else7, %entry + %retval.0 = phi i32 [ 0, %entry ], [ 1, %if.else7 ] + ret i32 %retval.0 +} + +; Function Attrs: convergent nounwind mustprogress +define internal void @__kmpc_parallel_51(%struct.ident_t* %ident, i32 %global_tid, i32 %if_expr, i32 %num_threads, i32 %proc_bind, i8* %fn, i8* %wrapper_fn, i8** %args, i64 %nargs) #2 { +; CHECK-LABEL: define {{[^@]+}}@__kmpc_parallel_51 +; CHECK-SAME: (%struct.ident_t* [[IDENT:%.*]], i32 [[GLOBAL_TID:%.*]], i32 [[IF_EXPR:%.*]], i32 [[NUM_THREADS:%.*]], i32 [[PROC_BIND:%.*]], i8* [[FN:%.*]], i8* [[WRAPPER_FN:%.*]], i8** [[ARGS:%.*]], i64 [[NARGS:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[IF_THEN6:%.*]] +; CHECK: if.then6: +; CHECK-NEXT: br label [[CLEANUP:%.*]] +; CHECK: cleanup: +; CHECK-NEXT: ret void +; +entry: + %call1 = call signext i8 @__kmpc_is_spmd_exec_mode() #6 + %conv2 = sext i8 %call1 to i32 + %cmp = icmp sgt i32 %conv2, 0 + br i1 %cmp, label %if.then, label %cleanup + +if.then: ; preds = %entry + %call4 = call signext i8 @__kmpc_is_spmd_exec_mode() #6 + %tobool5.not = icmp eq i8 %call4, 0 + br i1 %tobool5.not, label %cleanup, label %if.then6 + +if.then6: ; preds = %if.then + br label %cleanup + +cleanup: ; preds = %if.then6, %if.then, %entry + ret void +} + +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx72,+sm_75" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx72,+sm_75" } +attributes #2 = { convergent nounwind mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx61,+sm_75" } +attributes #3 = { convergent nofree norecurse nosync nounwind readonly willreturn mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx61,+sm_75" } +attributes #4 = { nounwind } +attributes #5 = { convergent } +attributes #6 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5} +!omp_offload.info = !{!6, !7} +!nvvm.annotations = !{!8, !9} +!llvm.ident = !{!10, !11, !10, !10, !10, !10, !10, !10, !10, !10, !10, !10, !10, !10, !10, !10} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 2]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"openmp", i32 50} +!3 = !{i32 7, !"openmp-device", i32 50} +!4 = !{i32 7, !"PIC Level", i32 2} +!5 = !{i32 7, !"frame-pointer", i32 2} +!6 = !{i32 0, i32 64768, i32 36500671, !"foo", i32 9, i32 1} +!7 = !{i32 0, i32 64768, i32 36500671, !"foo", i32 4, i32 0} +!8 = !{void ()* @__omp_offloading_fd00_22cf4bf_foo_l4, !"kernel", i32 1} +!9 = !{void ()* @__omp_offloading_fd00_22cf4bf_foo_l9, !"kernel", i32 1} +!10 = !{!"clang version 13.0.0"} +!11 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!12 = !{!13, !13, i64 0} +!13 = !{!"int", !14, i64 0} +!14 = !{!"omnipotent char", !15, i64 0} +!15 = !{!"Simple C++ TBAA"} +;. 
+; CHECK: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx72,+sm_75" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx72,+sm_75" } +; CHECK: attributes #[[ATTR2]] = { convergent nounwind mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx61,+sm_75" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx72,+sm_75" } +; CHECK: attributes #[[ATTR4]] = { nounwind } +; CHECK: attributes #[[ATTR5]] = { convergent } +; CHECK: attributes #[[ATTR6]] = { convergent nounwind } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 2]} +; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 7, !"PIC Level", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META6:![0-9]+]] = !{i32 0, i32 64768, i32 36500671, !"foo", i32 9, i32 1} +; CHECK: [[META7:![0-9]+]] = !{i32 0, i32 64768, i32 36500671, !"foo", i32 4, i32 0} +; CHECK: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd00_22cf4bf_foo_l4, !"kernel", i32 1} +; CHECK: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd00_22cf4bf_foo_l9, !"kernel", i32 1} +; CHECK: [[META10:![0-9]+]] = !{!"clang version 13.0.0"} +; CHECK: [[META11:![0-9]+]] = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +;. 
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_unfoldable.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_unfoldable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_unfoldable.ll @@ -0,0 +1,430 @@ +; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s + +;; void bar(void); +;; +;; #pragma omp target teams distribute parallel for +;; for (int i = 0; i < 128; ++i) { +;; bar(); +;; } +;; +;; #pragma omp target +;; { +;; bar(); +;; #pragma omp parallel for +;; for (int i = 0; i < 128; ++i) { +;; bar(); +;; } +;; } + +target triple = "nvptx64" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 3, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 3, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@4 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 3, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@__omp_offloading_fd00_22cf4bf_foo_l4_exec_mode = weak constant i8 0 +@5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@__omp_offloading_fd00_22cf4bf_foo_l9_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_fd00_22cf4bf_foo_l4_exec_mode, i8* @__omp_offloading_fd00_22cf4bf_foo_l9_exec_mode], section "llvm.metadata" + +define weak void @__omp_offloading_fd00_22cf4bf_foo_l4() #0 { +entry: + %.zero.addr = alloca i32, align 4 + 
%.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 true, i1 false, i1 false) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @4) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #5 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 true, i1 false) + ret void + +worker.exit: ; preds = %entry + ret void +} + +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + %captured_vars_addrs = alloca [2 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 127, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @2, i32 %1, i32 91, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 %nvptx_num_threads) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 127 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 
+ br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 127, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %cond.end7, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %cmp1 = icmp slt i32 %5, 128 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %6 = load i32, i32* %.omp.comb.lb, align 4 + %7 = zext i32 %6 to i64 + %8 = load i32, i32* %.omp.comb.ub, align 4 + %9 = zext i32 %8 to i64 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %captured_vars_addrs, i64 0, i64 0 + %11 = inttoptr i64 %7 to i8* + store i8* %11, i8** %10, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %captured_vars_addrs, i64 0, i64 1 + %13 = inttoptr i64 %9 to i8* + store i8* %13, i8** %12, align 8 + %14 = bitcast [2 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @4, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__1 to i8*), i8* null, i8** %14, i64 2) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %15 = load i32, i32* %.omp.iv, align 4 + %16 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %15, %16 + store i32 %add, i32* %.omp.iv, align 4 + %17 = load i32, i32* %.omp.comb.lb, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add2 = add nsw i32 %17, %18 + store i32 %add2, i32* %.omp.comb.lb, align 4 + %19 = load i32, i32* %.omp.comb.ub, align 4 + %20 = load i32, i32* %.omp.stride, align 4 + %add3 = add nsw i32 %19, %20 + store i32 %add3, i32* %.omp.comb.ub, align 4 + %21 = load i32, i32* %.omp.comb.ub, align 4 + %cmp4 = icmp sgt i32 %21, 127 + br i1 %cmp4, label %cond.true5, label %cond.false6 + +cond.true5: ; preds = %omp.inner.for.inc + br label %cond.end7 + 
+; NOTE(review): IR below feeds the OpenMPOpt SPMD-folding CHECK lines further down; the
+; intentionally-constant returns (e.g. `ret i1 0`, `ret i32 0`) are the fold targets — do not "fix" them.
+cond.false6: ; preds = %omp.inner.for.inc + %22 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end7 + +cond.end7: ; preds = %cond.false6, %cond.true5 + %cond8 = phi i32 [ 127, %cond.true5 ], [ %22, %cond.false6 ] + store i32 %cond8, i32* %.omp.comb.ub, align 4 + %23 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %23, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @2, i32 %1) + ret void +} + +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 + +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 127, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* 
@3, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + call void bitcast (void (...)* @bar to void ()*)() #7 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %8, %9 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @2, i32 %3) + ret void +} + +declare void @bar(...) 
#2 + +define weak void @__omp_offloading_fd00_22cf4bf_foo_l9() #0 { +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + call void bitcast (void (...)* @bar to void ()*)() #7 + %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %2, i64 0) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 127, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @5, i32 %1, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %2 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %2, 
127 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %3 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 127, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %7 = load i32, i32* %.omp.iv, align 4 + %8 = load i32, i32* %.omp.ub, align 4 + %cmp2 = icmp sle i32 %7, %8 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %9 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %9, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + call void bitcast (void (...)* @bar to void ()*)() #7 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %10 = load i32, i32* %.omp.iv, align 4 + %add3 = add nsw i32 %10, 1 + store i32 %add3, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %11 = load i32, i32* %.omp.lb, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %11, %12 + store i32 %add4, i32* %.omp.lb, align 4 + %13 = load i32, i32* %.omp.ub, align 4 + %14 = load i32, i32* %.omp.stride, align 4 + %add5 = add nsw i32 %13, %14 + store i32 %add5, i32* %.omp.ub, align 4 + br label %omp.dispatch.cond + +omp.dispatch.end: ; 
preds = %cond.end + call void @__kmpc_for_static_fini(%struct.ident_t* @5, i32 %1) + ret void +} + +define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #3 { +entry: + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32 %1, i32* %.addr1, align 4 + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) #5 + ret void +} + +define internal void @__kmpc_for_static_init_4(%struct.ident_t* %loc, i32 %global_tid, i32 %schedtype, i32* nocapture writeonly %plastiter, i32* nocapture %plower, i32* nocapture %pupper, i32* nocapture %pstride, i32 %incr, i32 %chunk) #4 { +entry: + %call = call zeroext i1 @_Z13checkSPMDModeP5ident(%struct.ident_t* %loc) #6 + ret void +} + +; CHECK-LABEL: define internal zeroext i1 @_Z13checkSPMDModeP5ident +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEMP0:%.+]] = call signext i8 @__kmpc_is_spmd_exec_mode() +; CHECK-NEXT: [[TEMP1:%.+]] = icmp ne i8 [[TEMP0]], 0 + +define internal zeroext i1 @_Z13checkSPMDModeP5ident(%struct.ident_t* readonly %loc) local_unnamed_addr #4 { +entry: + %call10 = call signext i8 @__kmpc_is_spmd_exec_mode() #6 + %tobool11 = icmp ne i8 %call10, 0 + ret i1 0 +} + +; CHECK-LABEL: declare signext i8 @__kmpc_is_spmd_exec_mode() +declare signext i8 @__kmpc_is_spmd_exec_mode() + +declare void @__kmpc_for_static_fini(%struct.ident_t* nocapture %loc, i32 %global_tid) + +declare i32 @__kmpc_target_init(%struct.ident_t* %Ident, i1 zeroext %IsSPMD, i1 zeroext %UseGenericStateMachine, i1 zeroext %RequiresFullRuntime) + +declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone %Ident, i1 zeroext %IsSPMD, i1 zeroext %RequiresFullRuntime) + +define internal i32 @__kmpc_global_thread_num(%struct.ident_t* nocapture readnone %loc) #4 { +entry: + %call = call i32 @_Z14GetOmpThreadIdv() #6 + ret i32 %call +} + +; CHECK-LABEL: define internal i32 @_Z14GetOmpThreadIdv() +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call signext i8 
@__kmpc_is_spmd_exec_mode() +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 + +define internal i32 @_Z14GetOmpThreadIdv() local_unnamed_addr #4 { +entry: + %call4 = call signext i8 @__kmpc_is_spmd_exec_mode() #6 + %tobool5.not = icmp eq i8 %call4, 0 + ret i32 0 +} + +declare signext i8 @__kmpc_is_generic_main_thread(i32 %Tid) + +; CHECK-LABEL: define internal void @__kmpc_parallel_51 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEMP0:%.+]] = call signext i8 @__kmpc_is_spmd_exec_mode() +; CHECK-NEXT: [[TEMP1:%.+]] = sext i8 [[TEMP0]] to i32 +; CHECK-NEXT: [[TEMP2:%.+]] = call signext i8 @__kmpc_is_spmd_exec_mode() +; CHECK-NEXT: [[TEMP3:%.+]] = icmp eq i8 [[TEMP2]], 0 + +define internal void @__kmpc_parallel_51(%struct.ident_t* %ident, i32 %global_tid, i32 %if_expr, i32 %num_threads, i32 %proc_bind, i8* %fn, i8* %wrapper_fn, i8** %args, i64 %nargs) #4 { +entry: + %call1 = call signext i8 @__kmpc_is_spmd_exec_mode() #6 + %conv2 = sext i8 %call1 to i32 + %call4 = call signext i8 @__kmpc_is_spmd_exec_mode() #6 + %tobool5.not = icmp eq i8 %call4, 0 + ret void +} + +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx72,+sm_75" } +attributes #1 = { nounwind readnone } +attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx72,+sm_75" } +attributes #3 = { convergent noinline norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx72,+sm_75" } +attributes #4 = { convergent nounwind mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx61,+sm_75" } +attributes #5 = { nounwind } 
+; NOTE(review): remaining attribute sets (#6 convergent nounwind, #7 convergent) and module metadata;
+; !8/!9 mark the two @__omp_offloading_* functions as kernel entries via !nvvm.annotations.
+attributes #6 = { convergent nounwind } +attributes #7 = { convergent } + + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5} +!omp_offload.info = !{!6, !7} +!nvvm.annotations = !{!8, !9} +!llvm.ident = !{!10, !11} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 2]} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 7, !"openmp", i32 50} +!3 = !{i32 7, !"openmp-device", i32 50} +!4 = !{i32 7, !"PIC Level", i32 2} +!5 = !{i32 7, !"frame-pointer", i32 2} +!6 = !{i32 0, i32 64768, i32 36500671, !"foo", i32 9, i32 1} +!7 = !{i32 0, i32 64768, i32 36500671, !"foo", i32 4, i32 0} +!8 = !{void ()* @__omp_offloading_fd00_22cf4bf_foo_l4, !"kernel", i32 1} +!9 = !{void ()* @__omp_offloading_fd00_22cf4bf_foo_l9, !"kernel", i32 1} +!10 = !{!"clang version 13.0.0"} +!11 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}