diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1888,7 +1888,8 @@
   /// called by external functions when there is an internalized version in the
   /// module.
   static bool internalizeFunctions(SmallPtrSetImpl<Function *> &FnSet,
-                                   DenseMap<Function *, Function *> &FnMap);
+                                   DenseMap<Function *, Function *> &FnMap,
+                                   StringRef Suffix = ".internalized");
 
   /// Return the data layout associated with the anchor scope.
   const DataLayout &getDataLayout() const { return InfoCache.DL; }
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -1926,8 +1926,7 @@
 }
 
 bool Attributor::isInternalizable(Function &F) {
-  if (F.isDeclaration() || F.hasLocalLinkage() ||
-      GlobalValue::isInterposableLinkage(F.getLinkage()))
+  if (F.isDeclaration() || GlobalValue::isInterposableLinkage(F.getLinkage()))
     return false;
   return true;
 }
@@ -1935,7 +1934,7 @@
 Function *Attributor::internalizeFunction(Function &F, bool Force) {
   if (!AllowDeepWrapper && !Force)
     return nullptr;
-  if (!isInternalizable(F))
+  if (F.hasLocalLinkage() || !isInternalizable(F))
     return nullptr;
 
   SmallPtrSet<Function *, 2> FnSet = {&F};
@@ -1946,21 +1945,20 @@
 }
 
 bool Attributor::internalizeFunctions(SmallPtrSetImpl<Function *> &FnSet,
-                                      DenseMap<Function *, Function *> &FnMap) {
+                                      DenseMap<Function *, Function *> &FnMap,
+                                      StringRef Suffix) {
   for (Function *F : FnSet)
     if (!Attributor::isInternalizable(*F))
       return false;
 
-  FnMap.clear();
   // Generate the internalized version of each function.
   for (Function *F : FnSet) {
     Module &M = *F->getParent();
     FunctionType *FnTy = F->getFunctionType();
 
     // Create a copy of the current function
-    Function *Copied =
-        Function::Create(FnTy, F->getLinkage(), F->getAddressSpace(),
-                         F->getName() + ".internalized");
+    Function *Copied = Function::Create(
+        FnTy, F->getLinkage(), F->getAddressSpace(), F->getName() + Suffix);
     ValueToValueMapTy VMap;
     auto *NewFArgIt = Copied->arg_begin();
     for (auto &Arg : F->args()) {
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -544,6 +544,9 @@
   /// caller is __kmpc_parallel_51.
   BooleanStateWithSetVector<uint8_t> ParallelLevels;
 
+  /// Set of calls we need to rewrite to enable SPMDzation with simple guards.
+  SmallPtrSet<CallBase *, 4> CallsToVersionForSPMDzation;
+
   /// Abstract State interface
   ///{
 
@@ -586,6 +589,8 @@
       return false;
     if (ReachingKernelEntries != RHS.ReachingKernelEntries)
       return false;
+    if (CallsToVersionForSPMDzation != RHS.CallsToVersionForSPMDzation)
+      return false;
     return true;
   }
 
@@ -615,6 +620,8 @@
     SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
     ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
     ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
+    CallsToVersionForSPMDzation.insert(KIS.CallsToVersionForSPMDzation.begin(),
+                                       KIS.CallsToVersionForSPMDzation.end());
     return *this;
   }
 
@@ -3027,6 +3034,37 @@
       return false;
     }
 
+    // Before we create guarded regions below we need to perform versioning of
+    // functions potentially called from different parallel levels into an
+    // SPMDized and a parallel version. Only the former needs guards; adding
+    // guards (as defined below) to the latter will deadlock at runtime. We
+    // therefore create ".parallel" versions of these functions and call
+    // them from within parallel regions instead of the original (to which we
+    // will add guards).
+    if (!CallsToVersionForSPMDzation.empty()) {
+      DenseMap<Function *, Function *> VersionedFnsMap;
+      SmallPtrSet<Function *, 4> VersionedFns;
+      for (CallBase *CB : CallsToVersionForSPMDzation) {
+        LLVM_DEBUG(dbgs() << TAG << "Call to version for SPMDzation " << *CB
+                          << "\n");
+        assert(CB->getCalledFunction() && "Versioning without known callee?");
+        VersionedFns.insert(CB->getCalledFunction());
+      }
+      for (Function &F : *(*CallsToVersionForSPMDzation.begin())->getModule()) {
+        auto *FAA = A.lookupAAFor<AAKernelInfo>(IRPosition::function(F),
+                                                nullptr, DepClassTy::OPTIONAL);
+        // If we haven't created an AAKernelInfo for a function, or it has an
+        // invalid parallel level (= it can be reached from a parallel region),
+        // we need to replace the calls inside it with calls to the ".parallel"
+        // versions we create below. Other functions should retain the calls to
+        // the default non-".parallel" version.
+        if (FAA && FAA->ParallelLevels.isValidState())
+          VersionedFnsMap[&F] = &F;
+      }
+      Attributor::internalizeFunctions(VersionedFns, VersionedFnsMap,
+                                       ".parallel");
+    }
+
     auto CreateGuardedRegion = [&](Instruction *RegionStartI,
                                    Instruction *RegionEndI) {
       LoopInfo *LI = nullptr;
@@ -3572,9 +3610,6 @@
     if (!IsKernelEntry) {
       updateReachingKernelEntries(A);
       updateParallelLevels(A);
-
-      if (!ParallelLevels.isValidState())
-        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
     }
 
     // Callback to check a call instruction.
@@ -3585,6 +3620,29 @@
           *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
       getState() ^= CBAA.getState();
       AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
+
+      // Check if this call is SPMD-compatible and if it requires versioning
+      // (into an SPMD and a non-SPMD variant; the latter will [potentially]
+      // contain guards for side effects).
+      Function *Callee = CB.getCalledFunction();
+      if (Callee && Callee->getName().startswith("__nv")) {
+        // TODO: Improve this and allow amdgpu builtins.
+      } else if (!CBAA.SPMDCompatibilityTracker.isAssumed()) {
+        SPMDCompatibilityTracker.insert(&I);
+        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+      } else if (!CBAA.ParallelLevels.isValidState()) {
+        // The call site itself is SPMD compatible but the callee is executed
+        // from different parallel levels. This means we might require
+        // versioning, so we check if that is possible and record it if so.
+        // If not, SPMDzation failed.
+        bool CanVersion = Callee && Attributor::isInternalizable(*Callee);
+        if (CanVersion) {
+          CallsToVersionForSPMDzation.insert(&CB);
+        } else {
+          SPMDCompatibilityTracker.insert(&I);
+          SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+        }
+      }
       return true;
     };
 
@@ -3692,7 +3750,9 @@
 
     // Helper to lookup an assumption string.
     auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) {
-      return Fn && hasAssumption(*Fn, AssumptionStr);
+      // TODO: Replace the name matching and allow AMDGPU builtins.
+      return Fn && (hasAssumption(*Fn, AssumptionStr) ||
+                    Fn->getName().startswith("__nv"));
     };
 
     // Check for SPMD-mode assumptions.
@@ -4223,7 +4283,6 @@
                          DepClassTy::NONE, /* ForceUpdate */ false,
                          /* UpdateAfterInit */ false);
 
-    registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id);
     registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
     registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
 
@@ -4432,11 +4491,11 @@
   if (isOpenMPDevice(M)) {
     SmallPtrSet<Function *, 16> InternalizeFns;
     for (Function &F : M)
-      if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
-          !DisableInternalization) {
+      if (!F.hasLocalLinkage() && !F.isDeclaration() && !Kernels.contains(&F) &&
+          IsCalled(F) && !DisableInternalization) {
         if (Attributor::isInternalizable(F)) {
           InternalizeFns.insert(&F);
-        } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
+        } else if (!F.hasFnAttribute(Attribute::Cold)) {
           EmitRemark(F);
         }
       }
diff --git a/llvm/test/Transforms/OpenMP/spmdization_versioning.ll b/llvm/test/Transforms/OpenMP/spmdization_versioning.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/spmdization_versioning.ll
@@ -0,0 +1,506 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+; int G = 0;
+; __attribute__((noinline))
+; void version_required1() {
+; #pragma omp atomic
+;   G += 1;
+; }
+; __attribute__((noinline))
+; static void version_required2() {
+; #pragma omp atomic
+;   G += 3;
+; }
+; __attribute__((noinline))
+; static void version_required3() {
+;   version_required2();
+; #pragma omp atomic
+;   G += 5;
+; }
+; __attribute__((noinline))
+; void no_version_required1() {
+; #pragma omp atomic
+;   G += 7;
+; }
+; __attribute__((noinline))
+; static void no_version_required2() {
+;   version_required3();
+; #pragma omp atomic
+;   G += 13;
+; }
+; __attribute__((noinline))
+; void no_version_required3() {
+;   version_required3();
+;   no_version_required1();
+; #pragma omp atomic
+;   G += 11;
+; }
+; void spmdizable_with_versions() {
+; #pragma omp target teams distribute
+;   for (int i = 0; i < 100; ++i) {
+; #pragma omp parallel
+;     {
+;       __nv_foo();
+;       version_required1();
+;       no_version_required3();
+;     }
+;     __nv_foo();
+;     version_required1();
+;     no_version_required2();
+;   }
+; }
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+@__omp_offloading_2a_faa34f_spmdizable_with_versions_l39_exec_mode = weak constant i8 1
+@G = external global i32, align 4
+@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2a_faa34f_spmdizable_with_versions_l39_exec_mode], section "llvm.metadata"
+
+; Function Attrs: alwaysinline convergent norecurse nounwind
+define weak void @__omp_offloading_2a_faa34f_spmdizable_with_versions_l39() local_unnamed_addr #0 {
+entry:
+  %captured_vars_addrs.i = alloca [0 x i8*], align 8
+  %.omp.lb.i = alloca i32, align 4
+  %.omp.ub.i = alloca i32, align 4
+  %.omp.stride.i = alloca i32, align 4
+  %.omp.is_last.i = alloca i32, align 4
+  %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) #4
+  %exec_user_code = icmp eq i32 %0, -1
+  br i1 %exec_user_code, label %user_code.entry, label %common.ret
+
+common.ret: ; preds = %entry, %__omp_outlined__.exit
+  ret void
+
+user_code.entry: ; preds = %entry
+  %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %2 = bitcast [0 x i8*]* %captured_vars_addrs.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2)
+  %3 = bitcast i32* %.omp.lb.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4, !noalias !8
+  store i32 0, i32* %.omp.lb.i, align 4, !tbaa !11, !noalias !8
+  %4 = bitcast i32* %.omp.ub.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %4) #4, !noalias !8
+  store i32 99, i32* %.omp.ub.i, align 4, !tbaa !11, !noalias !8
+  %5 = bitcast i32* %.omp.stride.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %5) #4, !noalias !8
+  store i32 1, i32* %.omp.stride.i, align 4, !tbaa !11, !noalias !8
+  %6 = bitcast i32* %.omp.is_last.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %6) #4, !noalias !8
+  store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !11, !noalias !8
+  call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @2, i32 %1, i32 92, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.lb.i, i32* nonnull %.omp.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #4, !noalias !8
+  %7 = load i32, i32* %.omp.ub.i, align 4, !tbaa !11, !noalias !8
+  %8 = icmp slt i32 %7, 99
+  %cond.i = select i1 %8, i32 %7, i32 99
+  store i32 %cond.i, i32* %.omp.ub.i, align 4, !tbaa !11, !noalias !8
+  %9 = load i32, i32* %.omp.lb.i, align 4, !tbaa !11, !noalias !8
+  %10 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i, i64 0, i64 0
+  %cmp1.not1.i = icmp sgt i32 %9, %cond.i
+  br i1 %cmp1.not1.i, label %__omp_outlined__.exit, label %omp.inner.for.body.i
+
+omp.inner.for.body.i: ; preds = %user_code.entry, %omp.inner.for.body.i
+  %.omp.iv.02.i = phi i32 [ %add2.i, %omp.inner.for.body.i ], [ %9, %user_code.entry ]
+  call void @__kmpc_parallel_51(%struct.ident_t* nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** nonnull %10, i64 0) #4, !noalias !8
+  call void @__nv_foo()
+  call void @version_required1() #4, !noalias !8
+  call fastcc void @no_version_required2() #7, !noalias !8
+  %add2.i = add nsw i32 %.omp.iv.02.i, 1
+  %11 = load i32, i32* %.omp.ub.i, align 4, !tbaa !11, !noalias !8
+  %cmp1.not.not.i = icmp slt i32 %.omp.iv.02.i, %11
+  br i1 %cmp1.not.not.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit
+
+__omp_outlined__.exit: ; preds = %omp.inner.for.body.i, %user_code.entry
+  call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @2, i32 %1) #4, !noalias !8
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %6) #4, !noalias !8
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %5) #4, !noalias !8
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %4) #4, !noalias !8
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4, !noalias !8
+  call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2)
+  call void @__kmpc_target_deinit(%struct.ident_t* nonnull @1, i1 false, i1 true) #4
+  br label %common.ret
+}
+
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) local_unnamed_addr
+
+; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
+
+declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr
+
+; Function Attrs: alwaysinline convergent mustprogress nofree norecurse nounwind willreturn
+define internal void @__omp_outlined__1(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid.) #2 {
+entry:
+  call void @__nv_foo()
+  call void @version_required1()
+  call void @no_version_required3() #8
+  ret void
+}
+
+; Function Attrs: convergent norecurse nounwind
+define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 {
+entry:
+  %global_args = alloca i8**, align 8
+  call void @__kmpc_get_shared_variables(i8*** nonnull %global_args) #4
+  call void @__nv_foo()
+  call void @version_required1() #4
+  call void @no_version_required3() #7
+  ret void
+}
+
+declare void @__kmpc_get_shared_variables(i8***) local_unnamed_addr
+
+declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) local_unnamed_addr
+
+; Function Attrs: nounwind
+declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr #4
+
+; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #4
+
+declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) local_unnamed_addr
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind willreturn
+define hidden void @version_required1() local_unnamed_addr #5 {
+entry:
+  %0 = atomicrmw add i32* @G, i32 1 monotonic, align 4
+  ret void
+}
+
+; Function Attrs: convergent mustprogress nofree noinline norecurse nounwind willreturn
+define hidden void @no_version_required3() local_unnamed_addr #6 {
+entry:
+  call fastcc void @version_required3() #8
+  call void @no_version_required1()
+  %0 = atomicrmw add i32* @G, i32 11 monotonic, align 4
+  ret void
+}
+
+; Function Attrs: convergent mustprogress nofree noinline norecurse nounwind willreturn
+define internal fastcc void @version_required3() unnamed_addr #6 {
+entry:
+  call fastcc void @version_required2()
+  %0 = atomicrmw add i32* @G, i32 5 monotonic, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind willreturn
+define internal fastcc void @version_required2() unnamed_addr #5 {
+entry:
+  %0 = atomicrmw add i32* @G, i32 3 monotonic, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree noinline norecurse nounwind willreturn
+define hidden void @no_version_required1() local_unnamed_addr #5 {
+entry:
+  %0 = atomicrmw add i32* @G, i32 7 monotonic, align 4
+  ret void
+}
+
+; Function Attrs: convergent mustprogress nofree noinline norecurse nounwind willreturn
+define internal fastcc void @no_version_required2() unnamed_addr #6 {
+entry:
+  call fastcc void @version_required3() #8
+  %0 = atomicrmw add i32* @G, i32 13 monotonic, align 4
+  ret void
+}
+
+declare void @__nv_foo()
+
+attributes #0 = { alwaysinline convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn }
+attributes #2 = { alwaysinline convergent mustprogress nofree norecurse nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #3 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #4 = { nounwind }
+attributes #5 = { mustprogress nofree noinline norecurse nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #6 = { convergent mustprogress nofree noinline norecurse nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #7 = { convergent nounwind }
+attributes #8 = { convergent }
+
+!omp_offload.info = !{!0}
+!nvvm.annotations = !{!1}
+!llvm.module.flags = !{!2, !3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = !{i32 0, i32 42, i32 16425807, !"spmdizable_with_versions", i32 39, i32 0}
+!1 = !{void ()* @__omp_offloading_2a_faa34f_spmdizable_with_versions_l39, !"kernel", i32 1}
+!2 = !{i32 1, !"wchar_size", i32 4}
+!3 = !{i32 7, !"openmp", i32 50}
+!4 = !{i32 7, !"openmp-device", i32 50}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!6 = !{i32 7, !"frame-pointer", i32 2}
+!7 = !{!"clang version 14.0.0"}
+!8 = !{!9}
+!9 = distinct !{!9, !10, !"__omp_outlined__: %.global_tid."}
+!10 = distinct !{!10, !"__omp_outlined__"}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+
+;.
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
+; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
+; CHECK: @[[__OMP_OFFLOADING_2A_FAA34F_SPMDIZABLE_WITH_VERSIONS_L39_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32, align 4
+; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2a_faa34f_spmdizable_with_versions_l39_exec_mode], section "llvm.metadata"
+;.
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2a_faa34f_spmdizable_with_versions_l39
+; CHECK-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [0 x i8*], align 8
+; CHECK-NEXT: [[DOTOMP_LB_I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[DOTOMP_UB_I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[DOTOMP_STRIDE_I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[DOTOMP_IS_LAST_I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i1 true, i1 false, i1 false) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
+; CHECK: common.ret:
+; CHECK-NEXT: ret void
+; CHECK: user_code.entry:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @[[GLOB1]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS_I]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DOTOMP_LB_I]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP3]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB_I]], align 4, !tbaa [[TBAA11:![0-9]+]], !noalias !8
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[DOTOMP_UB_I]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP4]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: store i32 99, i32* [[DOTOMP_UB_I]], align 4, !tbaa [[TBAA11]], !noalias !8
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DOTOMP_STRIDE_I]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP5]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE_I]], align 4, !tbaa [[TBAA11]], !noalias !8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[DOTOMP_IS_LAST_I]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP6]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST_I]], align 4, !tbaa [[TBAA11]], !noalias !8
+; CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @[[GLOB2]], i32 [[TMP1]], i32 92, i32* nonnull [[DOTOMP_IS_LAST_I]], i32* nonnull [[DOTOMP_LB_I]], i32* nonnull [[DOTOMP_UB_I]], i32* nonnull [[DOTOMP_STRIDE_I]], i32 1, i32 1) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_UB_I]], align 4, !tbaa [[TBAA11]], !noalias !8
+; CHECK-NEXT: [[TMP8:%.*]] = icmp slt i32 [[TMP7]], 99
+; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TMP8]], i32 [[TMP7]], i32 99
+; CHECK-NEXT: store i32 [[COND_I]], i32* [[DOTOMP_UB_I]], align 4, !tbaa [[TBAA11]], !noalias !8
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_LB_I]], align 4, !tbaa [[TBAA11]], !noalias !8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* [[CAPTURED_VARS_ADDRS_I]], i64 0, i64 0
+; CHECK-NEXT: [[CMP1_NOT1_I:%.*]] = icmp sgt i32 [[TMP9]], [[COND_I]]
+; CHECK-NEXT: br i1 [[CMP1_NOT1_I]], label [[__OMP_OUTLINED___EXIT:%.*]], label [[OMP_INNER_FOR_BODY_I:%.*]]
+; CHECK: omp.inner.for.body.i:
+; CHECK-NEXT: [[DOTOMP_IV_02_I:%.*]] = phi i32 [ [[ADD2_I:%.*]], [[OMP_INNER_FOR_BODY_I]] ], [ [[TMP9]], [[USER_CODE_ENTRY]] ]
+; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** nonnull [[TMP10]], i64 0) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: call void @__nv_foo()
+; CHECK-NEXT: call void @version_required1.internalized() #[[ATTR2]], !noalias !8
+; CHECK-NEXT: call fastcc void @no_version_required2() #[[ATTR9:[0-9]+]], !noalias !8
+; CHECK-NEXT: [[ADD2_I]] = add nsw i32 [[DOTOMP_IV_02_I]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB_I]], align 4, !tbaa [[TBAA11]], !noalias !8
+; CHECK-NEXT: [[CMP1_NOT_NOT_I:%.*]] = icmp slt i32 [[DOTOMP_IV_02_I]], [[TMP11]]
+; CHECK-NEXT: br i1 [[CMP1_NOT_NOT_I]], label [[OMP_INNER_FOR_BODY_I]], label [[__OMP_OUTLINED___EXIT]]
+; CHECK: __omp_outlined__.exit:
+; CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @[[GLOB2]], i32 [[TMP1]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP6]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP5]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP4]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP3]]) #[[ATTR2]], !noalias !8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull [[TMP2]])
+; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i1 true, i1 false) #[[ATTR2]]
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
+; CHECK-SAME: (i32* noalias nocapture readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @__nv_foo()
+; CHECK-NEXT: call void @version_required1.internalized.parallel() #[[ATTR2]]
+; CHECK-NEXT: call void @no_version_required3.internalized() #[[ATTR9]]
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
+; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** nonnull [[GLOBAL_ARGS]]) #[[ATTR2]]
+; CHECK-NEXT: call void @__nv_foo()
+; CHECK-NEXT: call void @version_required1.internalized.parallel() #[[ATTR2]]
+; CHECK-NEXT: call void @no_version_required3.internalized() #[[ATTR9]]
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@version_required1.internalized.parallel
+; CHECK-SAME: () local_unnamed_addr #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add i32* @G, i32 1 monotonic, align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@version_required1.internalized
+; CHECK-SAME: () local_unnamed_addr #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]]
+; CHECK: region.check.tid:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK: region.guarded:
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add i32* @G, i32 1 monotonic, align 4
+; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]]
+; CHECK: region.guarded.end:
+; CHECK-NEXT: br label [[REGION_BARRIER]]
+; CHECK: region.barrier:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT: br label [[REGION_EXIT:%.*]]
+; CHECK: region.exit:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@version_required1
+; CHECK-SAME: () local_unnamed_addr #[[ATTR5]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add i32* @G, i32 1 monotonic, align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@no_version_required3.internalized
+; CHECK-SAME: () local_unnamed_addr #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call fastcc void @version_required3.parallel() #[[ATTR10:[0-9]+]]
+; CHECK-NEXT: call void @no_version_required1.internalized() #[[ATTR11:[0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add i32* @G, i32 11 monotonic, align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@no_version_required3
+; CHECK-SAME: () local_unnamed_addr #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call fastcc void @version_required3.parallel() #[[ATTR9]]
+; CHECK-NEXT: call void @no_version_required1() #[[ATTR2]]
+; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add i32* @G, i32 11 monotonic, align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@version_required3.parallel
+; CHECK-SAME: () unnamed_addr #[[ATTR8]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call fastcc void @version_required2.parallel()
+; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add i32* @G, i32 5 monotonic, align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@version_required3
+; CHECK-SAME: () unnamed_addr #[[ATTR7]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call fastcc void @version_required2() #[[ATTR11]]
+; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]]
+; CHECK: region.check.tid:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK: region.guarded:
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add i32* @G, i32 5 monotonic, align 4
+; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]]
+; CHECK: region.guarded.end:
+; CHECK-NEXT: br label [[REGION_BARRIER]]
+; CHECK: region.barrier:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT: br label [[REGION_EXIT:%.*]]
+; CHECK: region.exit:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@version_required2.parallel
+; CHECK-SAME: () unnamed_addr #[[ATTR5]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add i32* @G, i32 3 monotonic, align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@version_required2
+; CHECK-SAME: () unnamed_addr #[[ATTR6]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]]
+; CHECK: region.check.tid:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK: region.guarded:
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add i32* @G, i32 3 monotonic, align 4
+; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]]
+; CHECK: region.guarded.end:
+; CHECK-NEXT: br label [[REGION_BARRIER]]
+; CHECK: region.barrier:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT: br label [[REGION_EXIT:%.*]]
+; CHECK: region.exit:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@no_version_required1.internalized
+; CHECK-SAME: () local_unnamed_addr #[[ATTR6]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add i32* @G, i32 7 monotonic, align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@no_version_required1
+; CHECK-SAME: () local_unnamed_addr #[[ATTR5]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add i32* @G, i32 7 monotonic, align 4
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@no_version_required2
+; CHECK-SAME: () unnamed_addr #[[ATTR7]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call fastcc void @version_required3() #[[ATTR10]]
+; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]]
+; CHECK: region.check.tid:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
+; CHECK: region.guarded:
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add i32* @G, i32 13 monotonic, align 4
+; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]]
+; CHECK: region.guarded.end:
+; CHECK-NEXT: br label [[REGION_BARRIER]]
+; CHECK: region.barrier:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; CHECK-NEXT: br label [[REGION_EXIT:%.*]]
+; CHECK: region.exit:
+; CHECK-NEXT: ret void
+;
+;.
+; CHECK: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn }
+; CHECK: attributes #[[ATTR2]] = { nounwind }
+; CHECK: attributes #[[ATTR3]] = { alwaysinline convergent mustprogress nofree norecurse nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR4]] = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR5]] = { mustprogress nofree noinline norecurse nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR6]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR7]] = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR8]] = { convergent mustprogress nofree noinline norecurse nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR9]] = { convergent nounwind }
+; CHECK: attributes #[[ATTR10]] = { convergent nosync nounwind }
+; CHECK: attributes #[[ATTR11]] = { nosync nounwind }
+;.
+; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 42, i32 16425807, !"spmdizable_with_versions", i32 39, i32 0}
+; CHECK: [[META1:![0-9]+]] = !{void ()* @__omp_offloading_2a_faa34f_spmdizable_with_versions_l39, !"kernel", i32 1}
+; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META5:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
+; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
+; CHECK: [[META7:![0-9]+]] = !{!"clang version 14.0.0"}
+; CHECK: [[META8:![0-9]+]] = !{!9}
+; CHECK: [[META9:![0-9]+]] = distinct !{!9, !10, !"__omp_outlined__: [[DOTGLOBAL_TID_]]"}
+; CHECK: [[META10:![0-9]+]] = distinct !{!10, !"__omp_outlined__"}
+; CHECK: [[TBAA11]] = !{!12, !12, i64 0}
+; CHECK: [[META12:![0-9]+]] = !{!"int", !13, i64 0}
+; CHECK: [[META13:![0-9]+]] = !{!"omnipotent char", !14, i64 0}
+; CHECK: [[META14:![0-9]+]] = !{!"Simple C/C++ TBAA"}
+;.
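
Note below the patch, not part of it: a minimal sketch of how the extended
Attributor::internalizeFunctions entry point can be driven with a custom
suffix, the way the SPMDzation versioning above passes ".parallel". The
enclosing loop and the names `M`, `FnsToVersion`, and `FnMap` are
illustrative assumptions, not code from this change. With the FnMap.clear()
call removed, callers may also pre-seed FnMap, as VersionedFnsMap is
pre-seeded above.

    // Illustrative only; `M` is some device Module being optimized.
    SmallPtrSet<Function *, 16> FnsToVersion;
    DenseMap<Function *, Function *> FnMap; // out-param: original -> copy
    for (Function &F : M)
      if (Attributor::isInternalizable(F))
        FnsToVersion.insert(&F);
    // The new Suffix parameter defaults to ".internalized"; ".parallel"
    // creates the variants intended to be called from parallel regions.
    if (Attributor::internalizeFunctions(FnsToVersion, FnMap, ".parallel"))
      for (auto &It : FnMap)
        LLVM_DEBUG(dbgs() << It.first->getName() << " -> "
                          << It.second->getName() << "\n");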