diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -111,10 +111,12 @@ /// Todo: Update kmp.h to include this file, and remove the enums in kmp.h /// To complete this, more enum values will need to be moved here. enum class OMPScheduleType { - Static = 34, /**< static unspecialized */ + StaticChunked = 33, + Static = 34, + DistributeChunked = 91, + Distribute = 92, DynamicChunked = 35, - ModifierNonmonotonic = - (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */ + ModifierNonmonotonic = (1 << 30), LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierNonmonotonic) }; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -68,6 +68,9 @@ "Number of OpenMP runtime function uses identified"); STATISTIC(NumOpenMPTargetRegionKernels, "Number of OpenMP target region entry points (=kernels) identified"); +STATISTIC(NumOpenMPTargetRegionKernelsSPMD, + "Number of OpenMP target region entry points (=kernels) executed in " + "SPMD-mode instead of generic-mode"); STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachine, "Number of OpenMP target region entry points (=kernels) executed in " "generic-mode with customized state machines"); @@ -456,6 +459,9 @@ /// in the ParallelRegions set above. bool MayReachUnknownParallelRegion = false; + /// Flag to indicate if the associated function can be executed in SPMD mode. + bool IsSPMDCompatible = true; + /// Abstract State interface ///{ @@ -474,6 +480,7 @@ /// See AbstractState::indicatePessimisticFixpoint(...) 
ChangeStatus indicatePessimisticFixpoint() override { IsAtFixpoint = true; + IsSPMDCompatible = false; MayReachUnknownParallelRegion = true; return ChangeStatus::CHANGED; } @@ -489,7 +496,8 @@ const KernelInfoState &getAssumed() const { return *this; } bool operator==(const KernelInfoState &RHS) const { - if ((MayReachUnknownParallelRegion != RHS.MayReachUnknownParallelRegion)) + if ((MayReachUnknownParallelRegion != RHS.MayReachUnknownParallelRegion) | + (IsSPMDCompatible != RHS.IsSPMDCompatible)) return false; return ParallelRegions.size() == RHS.ParallelRegions.size(); } @@ -511,6 +519,7 @@ if (KIS.KernelInitCB || KIS.KernelDeinitCB) indicatePessimisticFixpoint(); MayReachUnknownParallelRegion |= KIS.MayReachUnknownParallelRegion; + IsSPMDCompatible &= KIS.IsSPMDCompatible; ParallelRegions.insert(KIS.ParallelRegions.begin(), KIS.ParallelRegions.end()); return *this; @@ -548,6 +557,7 @@ const int InitIsSPMDArgNo = 1; const int InitUseStateMachineArgNo = 2; + const int DeinitIsSPMDArgNo = 1; // Check if the current configuration is non-SPMD and generic state machine. // If we already have SPMD mode or a custom state machine we do not need to @@ -558,6 +568,22 @@ ConstantInt *IsSPMD = dyn_cast(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); + auto &Ctx = getAnchorValue().getContext(); + // First check if we can go to SPMD-mode, that is the best option. + if (canBeExecutedInSPMDMode() && IsSPMD && IsSPMD->isZero()) { + // Indicate we use SPMD mode now. 
+ A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo), + *ConstantInt::getBool(Ctx, 1)); + A.changeUseAfterManifest( + KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), + *ConstantInt::getBool(Ctx, 0)); + A.changeUseAfterManifest( + KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo), + *ConstantInt::getBool(Ctx, 1)); + ++NumOpenMPTargetRegionKernelsSPMD; + return ChangeStatus::CHANGED; + } + // If we are stuck with generic mode, try to create a custom device (=GPU) // state machine which is specialized for the parallel regions that are // reachable by the kernel. @@ -566,8 +592,7 @@ return ChangeStatus::UNCHANGED; } - // First, indicate we use a custom state machine now. - auto &Ctx = getAnchorValue().getContext(); + // If not SPMD mode, indicate we use a custom state machine now. auto *FalseVal = ConstantInt::getBool(Ctx, 0); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal); @@ -737,9 +762,13 @@ /// Statistics are tracked as part of manifest for now. void trackStatistics() const override {} + /// Returns true if the associated function can be executed in SPMD mode. + bool canBeExecutedInSPMDMode() const { return IsSPMDCompatible; } + /// See AbstractAttribute::getAsStr() const std::string getAsStr() const override { - return std::string("#PR: ") + std::to_string(ParallelRegions.size()) + + return std::string(canBeExecutedInSPMDMode() ? "SPMD" : "generic") + + std::string(" | #PR: ") + std::to_string(ParallelRegions.size()) + (MayReachUnknownParallelRegion ? " + Unknown PR" : ""); } @@ -771,6 +800,29 @@ ChangeStatus updateImpl(Attributor &A) override { KernelInfoState StateBefore = getState(); + // Callback to check a read/write instruction. + auto CheckRWInst = [&](Instruction &I) { + // We handle calls later. + if (isa(I)) + return true; + // We only care about write effects. 
+ if (!I.mayWriteToMemory()) + return true; + if (auto *SI = dyn_cast(&I)) { + SmallVector Objects; + getUnderlyingObjects(SI->getPointerOperand(), Objects); + if (llvm::all_of(Objects, + [](const Value *Obj) { return isa(Obj); })) + return true; + } + // For now we give up on everything but stores. + IsSPMDCompatible = false; + return true; + }; + if (IsSPMDCompatible && + !A.checkForAllReadWriteInstructions(CheckRWInst, *this)) + IsSPMDCompatible = false; + // Callback to check a call instruction. auto CheckCallInst = [&](Instruction &I) { auto &CB = cast(I); @@ -807,6 +859,7 @@ } // The callee is not known, not ipo-amendable (e.g., due to linkage), or // we can for some other reason not analyze it -> worst case. + IsSPMDCompatible = false; MayReachUnknownParallelRegion = true; return true; } @@ -814,6 +867,15 @@ const unsigned int WrapperFunctionArgNo = 6; RuntimeFunction RF = It->getSecond(); switch (RF) { + // All the functions we know are compatible with SPMD mode. + case OMPRTL___kmpc_for_static_fini: + case OMPRTL___kmpc_global_thread_num: + case OMPRTL___kmpc_single: + case OMPRTL___kmpc_end_single: + case OMPRTL___kmpc_master: + case OMPRTL___kmpc_end_master: + case OMPRTL___kmpc_barrier: + return true; case OMPRTL___kmpc_target_init: // The only reason we completely give up is when we see a second init or // deinit call. @@ -839,15 +901,39 @@ // worst. MayReachUnknownParallelRegion = true; return true; + case OMPRTL___kmpc_for_static_init_4: + case OMPRTL___kmpc_for_static_init_4u: + case OMPRTL___kmpc_for_static_init_8: + case OMPRTL___kmpc_for_static_init_8u: { + // Check the schedule and allow static schedule in SPMD mode. + unsigned ScheduleArgOpNo = 2; + auto *ScheduleTypeCI = + dyn_cast(CB.getArgOperand(ScheduleArgOpNo)); + unsigned ScheduleTypeVal = + ScheduleTypeCI ? 
ScheduleTypeCI->getZExtValue() : 0; + switch (OMPScheduleType(ScheduleTypeVal)) { + case OMPScheduleType::Static: + case OMPScheduleType::StaticChunked: + case OMPScheduleType::Distribute: + case OMPScheduleType::DistributeChunked: + return true; + default: + break; + }; + IsSPMDCompatible = false; + return true; + } case OMPRTL___kmpc_omp_task: // We do not look into tasks right now, just give up. MayReachUnknownParallelRegion = true; + IsSPMDCompatible = false; return true; default: break; } - // All other OpenMP runtime calls will not reach parallel regions so they - // can be safely ignored for now. + // All other OpenMP runtime calls will not reach parallel regions but they + // might not be compatible with SPMD mode. + IsSPMDCompatible = false; return true; }; if (!A.checkForAllCallLikeInstructions(CheckCallInst, *this)) diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -850,56 +850,21 @@ ret void } -; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. -; A user code state machine is build because we do need one. No fallback and only one pointer comparison is needed. +; The second argument of __kmpc_target_init and deinit is set to true to indicate that we can run in SPMD mode. 
define weak void @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61() #0 { ; CHECK-LABEL: @__omp_offloading_2c_389eb_simple_state_machine_no_openmp_attr_l61( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__10_wrapper -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) #[[ATTR8]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] ; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; CHECK-NEXT: call void @__omp_outlined__9(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: call void 
@__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) #[[ATTR8]] ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void @@ -1058,56 +1023,21 @@ ret void } -; The second to last argument of __kmpc_target_init is is set to false to indicate we do not need the generic runtime state machine. -; A user code state machine is build because we do need one. No fallback and only one pointer comparison is needed. +; The second argument of __kmpc_target_init and deinit is set to true to indicate that we can run in SPMD mode. define weak void @__omp_offloading_2c_389eb_simple_state_machine_pure_l72() #0 { ; CHECK-LABEL: @__omp_offloading_2c_389eb_simple_state_machine_pure_l72( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR8]] -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: 
worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__13_wrapper -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) #[[ATTR8]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: ; 
CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR9]] ; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; CHECK-NEXT: call void @__omp_outlined__12(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR9]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) #[[ATTR8]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) #[[ATTR8]] ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/spmd_mode.ll b/llvm/test/Transforms/OpenMP/spmd_mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/spmd_mode.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s + +;; void unknown(void); +;; +;; void sequential_loop() { +;; #pragma omp target teams +;; { +;; for (int i = 0; i < 100; ++i) { +;; #pragma omp parallel +;; { +;; unknown(); +;; } +;; } +;; } +;; } + +target triple = "nvptx64" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* undef +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 +@__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata" + +; The second argument of __kmpc_target_init and deinit is set to true to indicate that we can run in SPMD mode. 
+define weak void @__omp_offloading_2c_38c77_sequential_loop_l4() #0 { +; CHECK-LABEL: @__omp_offloading_2c_38c77_sequential_loop_l4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTTHREADID_TEMP_]], align 4 +; CHECK-NEXT: call void @__omp_outlined__(i32 [[TMP2]]) #[[ATTR4]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) #[[ATTR3]] +; CHECK-NEXT: ret void +; CHECK: worker.exit: +; CHECK-NEXT: ret void +; +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %0, -1 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit + +user_code.entry: ; preds = %entry + %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #2 + call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + ret void + +worker.exit: ; preds = %entry + ret void +} + +declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; CHECK-LABEL: @__omp_outlined__( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTGLOBAL_TID__PRIV:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[TMP0:%.*]], i32* [[DOTGLOBAL_TID__PRIV]], align 4 +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID__PRIV]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) #[[ATTR3]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; 
CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = load i32, i32* %.global_tid., align 4 + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %1, i64 0) + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond, !llvm.loop !6 + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; CHECK-LABEL: @__omp_outlined__1( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @unknown() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: ret void +; +entry: + call void @unknown() #3 + ret void +} + +; Function Attrs: convergent +declare void @unknown() #1 + +; Function Attrs: convergent norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #0 { +; CHECK-LABEL: @__omp_outlined__1_wrapper( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; CHECK-NEXT: store i32 [[TMP1:%.*]], i32* [[DOTADDR1]], align 4 +; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: ret void +; +entry: + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) #2 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) + +; Function Attrs: nounwind +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #2 + +declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) + +attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } +attributes #2 = { 
nounwind } +attributes #3 = { convergent } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3, !4} +!llvm.ident = !{!5} + +!0 = !{i32 0, i32 44, i32 232567, !"sequential_loop", i32 4, i32 0} +!1 = !{void ()* @__omp_offloading_2c_38c77_sequential_loop_l4, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 2} +!5 = !{!"clang version 13.0.0"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.mustprogress"}