diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2036,7 +2036,8 @@ UndefValue::get(Int8Ty), F->getName() + ".ID"); for (Use *U : ToBeReplacedStateMachineUses) - U->set(ConstantExpr::getBitCast(ID, U->get()->getType())); + U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast( + ID, U->get()->getType())); ++NumOpenMPParallelRegionsReplacedInGPUStateMachine; @@ -3422,10 +3423,14 @@ IsWorker->setDebugLoc(DLoc); BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); + Module &M = *Kernel->getParent(); + // Create local storage for the work function pointer. + const DataLayout &DL = M.getDataLayout(); Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); - AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "worker.work_fn.addr", - &Kernel->getEntryBlock().front()); + Instruction *WorkFnAI = + new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr, + "worker.work_fn.addr", &Kernel->getEntryBlock().front()); WorkFnAI->setDebugLoc(DLoc); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); @@ -3438,13 +3443,23 @@ Value *Ident = KernelInitCB->getArgOperand(0); Value *GTid = KernelInitCB; - Module &M = *Kernel->getParent(); FunctionCallee BarrierFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_barrier_simple_spmd); CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB) ->setDebugLoc(DLoc); + if (WorkFnAI->getType()->getPointerAddressSpace() != + (unsigned int)AddressSpace::Generic) { + WorkFnAI = new AddrSpaceCastInst( + WorkFnAI, + PointerType::getWithSamePointeeType( + cast<PointerType>(WorkFnAI->getType()), + (unsigned int)AddressSpace::Generic), + WorkFnAI->getName() + ".generic", StateMachineBeginBB); + WorkFnAI->setDebugLoc(DLoc); + } + FunctionCallee KernelParallelFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_kernel_parallel); diff --git 
a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --include-generated-funcs -; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s -; RUN: opt -openmp-opt-disable-state-machine-rewrite -S -passes=openmp-opt < %s | FileCheck %s --check-prefix=CHECK-DISABLED +; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU +; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX +; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -openmp-opt-disable-state-machine-rewrite -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU-DISABLED +; RUN: opt --mtriple=nvptx64-- -openmp-opt-disable-state-machine-rewrite -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX-DISABLED ;; void p0(void); ;; void p1(void); @@ -117,8 +119,6 @@ ;; { weak_callee_empty(); } ;; } -target triple = "nvptx64" - %struct.ident_t = type { i32, i32, i32, i32, i8* } @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 @@ -841,1696 +841,3397 @@ !16 = !{i32 1, !"wchar_size", i32 4} !17 = !{i32 7, !"openmp", i32 50} !18 = !{i32 7, !"openmp-device", i32 50} -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14 -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true) -; CHECK-NEXT: 
[[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ -; CHECK-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7:[0-9]+]] -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR8:[0-9]+]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized -; CHECK-SAME: () #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] -; CHECK: omp_if.then: -; CHECK-NEXT: store i32 0, i32* @G, 
align 4 -; CHECK-NEXT: call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] -; CHECK-NEXT: br label [[OMP_IF_END]] -; CHECK: omp_if.end: -; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@no_parallel_region_in_here -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] -; CHECK: omp_if.then: -; CHECK-NEXT: store i32 0, i32* @G, align 4 -; CHECK-NEXT: call void @__kmpc_end_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[OMP_IF_END]] -; CHECK: omp_if.end: -; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: 
[[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*) -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; 
CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1 -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR8]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* 
[[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p0() #[[ATTR9:[0-9]+]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** 
[[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p1() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; 
CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__17_wrapper -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*) -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] -; CHECK: worker_state_machine.parallel_region.execute5: -; CHECK-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check6: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; 
CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4 -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR8]] -; CHECK-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR7]] -; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR7]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before.internalized -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void 
@__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** [[TMP1]], i64 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p1() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; 
CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after.internalized -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** [[TMP1]], i64 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; 
CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*) -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: 
worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__8_wrapper.ID to void (i16, i32)*) -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.fallback.execute: -; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) 
[[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__6 -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR9]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7 -; CHECK-SAME: (i32* noalias nocapture 
nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p0() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p1() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; 
CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 
[[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__10_wrapper.ID to void (i16, i32)*) -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: store i32 [[TMP1]], i32* 
[[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__9 -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef 
@__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p0() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p1() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; 
CHECK-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 
[[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__13_wrapper.ID to void (i16, i32)*) -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.check1: -; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; CHECK: worker_state_machine.parallel_region.execute2: -; CHECK-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.check3: -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 
[[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__12 -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR8]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* 
[[CAPTURED_VARS_ADDRS1]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__13 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p0() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__14 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p1() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: 
[[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.check: -; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__19_wrapper -; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; CHECK: worker_state_machine.parallel_region.execute: -; CHECK-NEXT: call void @__omp_outlined__19_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.fallback.execute: -; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] 
-; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__15 -; CHECK-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #[[ATTR9]] -; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR7]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized -; CHECK-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[RETURN:%.*]] -; CHECK: if.end: -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[A]], 1 -; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 
[[SUB]]) #[[ATTR7]] -; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR7]] -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after -; CHECK-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[RETURN:%.*]] -; CHECK: if.end: -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 -; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR9]] -; CHECK-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR9]] -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.fallback.execute: -; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) 
#[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__16 -; CHECK-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @weak_callee_empty() #[[ATTR7]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@weak_callee_empty -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__17 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p0() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* 
[[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__18 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p0() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef 
@[[GLOB2]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline nounwind -; CHECK-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** [[TMP1]], i64 0) -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__19 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @p0() #[[ATTR9]] -; CHECK-NEXT: ret void -; -; -; CHECK: Function Attrs: convergent noinline norecurse nounwind -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: 
[[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14 -; CHECK-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; CHECK-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ -; 
CHECK-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7:[0-9]+]] -; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR8:[0-9]+]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized -; CHECK-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-DISABLED-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] -; CHECK-DISABLED: omp_if.then: -; CHECK-DISABLED-NEXT: store i32 0, i32* @G, align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: br label [[OMP_IF_END]] -; CHECK-DISABLED: omp_if.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@no_parallel_region_in_here -; CHECK-DISABLED-SAME: () #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -; 
CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-DISABLED-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] -; CHECK-DISABLED: omp_if.then: -; CHECK-DISABLED-NEXT: store i32 0, i32* @G, align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_end_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[OMP_IF_END]] -; CHECK-DISABLED: omp_if.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; 
CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR8]] -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]] -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function 
Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p0() #[[ATTR9:[0-9]+]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; 
CHECK-DISABLED-NEXT: call void @p1() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: 
call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR8]] -; CHECK-DISABLED-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR7]] -; CHECK-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]] -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-DISABLED-NEXT: call void 
@simple_state_machine_interprocedural_after.internalized() #[[ATTR7]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before.internalized -; CHECK-DISABLED-SAME: () #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before -; CHECK-DISABLED-SAME: () #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** [[TMP1]], i64 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree 
readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p1() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after.internalized -; CHECK-DISABLED-SAME: () #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 
noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after -; CHECK-DISABLED-SAME: () #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** [[TMP1]], i64 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: call void 
@__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-DISABLED-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR9]] -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 
[[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p0() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone 
[[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p1() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], 
label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), 
i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR8]] -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__10 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p0() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void 
@__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__11 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p1() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = 
alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__12 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR8]] -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; 
CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__13 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p0() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store 
i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__14 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p1() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse 
nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__15 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #[[ATTR9]] -; CHECK-DISABLED-NEXT: call void 
@simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR7]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized -; CHECK-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 -; CHECK-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK-DISABLED: if.then: -; CHECK-DISABLED-NEXT: br label [[RETURN:%.*]] -; CHECK-DISABLED: if.end: -; CHECK-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[A]], 1 -; CHECK-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR7]] -; CHECK-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR7]] -; CHECK-DISABLED-NEXT: br label [[RETURN]] -; CHECK-DISABLED: return: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after -; CHECK-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 -; CHECK-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK-DISABLED: if.then: -; CHECK-DISABLED-NEXT: br label [[RETURN:%.*]] -; CHECK-DISABLED: if.end: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 -; CHECK-DISABLED-NEXT: 
[[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 -; CHECK-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR9]] -; CHECK-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR9]] -; CHECK-DISABLED-NEXT: br label [[RETURN]] -; CHECK-DISABLED: return: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__16 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 
4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @weak_callee_empty() #[[ATTR7]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@weak_callee_empty -; CHECK-DISABLED-SAME: () #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__17 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p0() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void 
@__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__18 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p0() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized -; CHECK-DISABLED-SAME: () #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: 
[[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after -; CHECK-DISABLED-SAME: () #[[ATTR1]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** [[TMP1]], i64 0) -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__19 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @p0() #[[ATTR9]] -; CHECK-DISABLED-NEXT: ret void -; -; -; CHECK-DISABLED: Function Attrs: convergent 
noinline norecurse nounwind -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14 +; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; AMDGPU-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], 
i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__ +; AMDGPU-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8:[0-9]+]] +; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR9:[0-9]+]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized +; AMDGPU-SAME: () #[[ATTR1:[0-9]+]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; AMDGPU-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; AMDGPU: omp_if.then: +; AMDGPU-NEXT: store i32 0, i32* @G, align 4 +; AMDGPU-NEXT: call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] +; AMDGPU-NEXT: br label [[OMP_IF_END]] +; AMDGPU: omp_if.end: +; AMDGPU-NEXT: call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@no_parallel_region_in_here +; AMDGPU-SAME: () #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) 
+; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; AMDGPU-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; AMDGPU: omp_if.then: +; AMDGPU-NEXT: store i32 0, i32* @G, align 4 +; AMDGPU-NEXT: call void @__kmpc_end_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[OMP_IF_END]] +; AMDGPU: omp_if.end: +; AMDGPU-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label 
[[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: ret void +; AMDGPU: worker_state_machine.is_active.check: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check: +; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*) +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute: +; AMDGPU-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check1: +; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute2: +; AMDGPU-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.check3: +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 
[[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1 +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: call void 
@no_parallel_region_in_here.internalized() #[[ATTR8]] +; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p0() #[[ATTR10:[0-9]+]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone 
[[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: ret void +; AMDGPU: worker_state_machine.is_active.check: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check: +; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__17_wrapper +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute: +; AMDGPU-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check1: +; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*) +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; AMDGPU: 
worker_state_machine.parallel_region.execute2: +; AMDGPU-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.check3: +; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute5: +; AMDGPU-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.check6: +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; 
AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__4 +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR8]] +; AMDGPU-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before.internalized +; AMDGPU-SAME: () #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 
noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before +; AMDGPU-SAME: () #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** [[TMP1]], i64 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; 
AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after.internalized +; AMDGPU-SAME: () #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after +; AMDGPU-SAME: () #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** [[TMP1]], i64 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define 
{{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: ret void +; AMDGPU: worker_state_machine.is_active.check: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check: +; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*) +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute: +; AMDGPU-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check1: +; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__8_wrapper.ID to void (i16, i32)*) +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute2: +; AMDGPU-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__6(i32* noalias nocapture 
noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__6 +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]] +; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: 
ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__8 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper +; 
AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to 
void (i16, i32)* +; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: ret void +; AMDGPU: worker_state_machine.is_active.check: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check: +; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__10_wrapper.ID to void (i16, i32)*) +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute: +; AMDGPU-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check1: +; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute2: +; AMDGPU-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.check3: +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label 
[[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9 +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef 
@__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__10 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define 
{{[^@]+}}@__omp_outlined__11 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU: 
worker_state_machine.begin: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: ret void +; AMDGPU: worker_state_machine.is_active.check: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check: +; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__13_wrapper.ID to void (i16, i32)*) +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute: +; AMDGPU-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check1: +; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; AMDGPU: worker_state_machine.parallel_region.execute2: +; AMDGPU-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: 
br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.check3: +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__12 +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: 
[[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__13 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = 
alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__14 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: 
entry: +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: ret void +; AMDGPU: worker_state_machine.is_active.check: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU: worker_state_machine.parallel_region.check: +; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__19_wrapper +; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; AMDGPU: 
worker_state_machine.parallel_region.execute: +; AMDGPU-NEXT: call void @__omp_outlined__19_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__15 +; AMDGPU-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #[[ATTR10]] +; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR8]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized +; AMDGPU-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 +; AMDGPU-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; AMDGPU: if.then: +; AMDGPU-NEXT: br label [[RETURN:%.*]] +; AMDGPU: if.end: +; AMDGPU-NEXT: [[SUB:%.*]] = sub nsw i32 [[A]], 1 +; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR8]] +; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR8]] +; AMDGPU-NEXT: br label [[RETURN]] +; AMDGPU: return: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after +; AMDGPU-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; AMDGPU-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; AMDGPU: if.then: +; AMDGPU-NEXT: br label [[RETURN:%.*]] +; AMDGPU: if.end: +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; AMDGPU-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; AMDGPU-NEXT: call void 
@simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR10]] +; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR10]] +; AMDGPU-NEXT: br label [[RETURN]] +; AMDGPU: return: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: ret void +; AMDGPU: worker_state_machine.is_active.check: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__16 +; AMDGPU-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void 
@weak_callee_empty() #[[ATTR8]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@weak_callee_empty +; AMDGPU-SAME: () #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__17 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__18 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized +; AMDGPU-SAME: () #[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline nounwind +; AMDGPU-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after +; AMDGPU-SAME: () 
#[[ATTR1]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** [[TMP1]], i64 0) +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__19 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-NEXT: ret void +; +; +; AMDGPU: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; 
NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14 +; NVPTX-SAME: () #[[ATTR0:[0-9]+]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; NVPTX-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__ +; NVPTX-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8:[0-9]+]] +; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR9:[0-9]+]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized +; NVPTX-SAME: () #[[ATTR1:[0-9]+]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[TMP0:%.*]] = 
call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; NVPTX-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; NVPTX: omp_if.then: +; NVPTX-NEXT: store i32 0, i32* @G, align 4 +; NVPTX-NEXT: call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] +; NVPTX-NEXT: br label [[OMP_IF_END]] +; NVPTX: omp_if.end: +; NVPTX-NEXT: call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@no_parallel_region_in_here +; NVPTX-SAME: () #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; NVPTX-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; NVPTX-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; NVPTX: omp_if.then: +; NVPTX-NEXT: store i32 0, i32* @G, align 4 +; NVPTX-NEXT: call void @__kmpc_end_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[OMP_IF_END]] +; NVPTX: omp_if.end: +; NVPTX-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 
@__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: ret void +; NVPTX: worker_state_machine.is_active.check: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX: worker_state_machine.parallel_region.check: +; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*) +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute: +; NVPTX-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX: worker_state_machine.parallel_region.check1: +; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; NVPTX: 
worker_state_machine.parallel_region.execute2: +; NVPTX-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.check3: +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: thread.user_code.check: +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1 +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; 
NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] +; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p0() #[[ATTR10:[0-9]+]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; 
NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p1() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define 
{{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: ret void +; NVPTX: worker_state_machine.is_active.check: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX: worker_state_machine.parallel_region.check: +; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__17_wrapper +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute: +; NVPTX-NEXT: call void 
@__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX: worker_state_machine.parallel_region.check1: +; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*) +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute2: +; NVPTX-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.check3: +; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute5: +; NVPTX-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.check6: +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: thread.user_code.check: +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* 
[[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4 +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; NVPTX-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR8]] +; NVPTX-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define 
{{[^@]+}}@simple_state_machine_interprocedural_before.internalized +; NVPTX-SAME: () #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before +; NVPTX-SAME: () #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** [[TMP1]], i64 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p1() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent 
noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after.internalized +; NVPTX-SAME: () #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after +; NVPTX-SAME: () #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x 
i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** [[TMP1]], i64 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: ret void +; NVPTX: worker_state_machine.is_active.check: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX: worker_state_machine.parallel_region.check: +; 
NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*) +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute: +; NVPTX-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX: worker_state_machine.parallel_region.check1: +; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__8_wrapper.ID to void (i16, i32)*) +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute2: +; NVPTX-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: thread.user_code.check: +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 
@__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6 +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]] +; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef 
-1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__8 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p1() #[[ATTR10]] +; 
NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; 
NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: ret void +; NVPTX: worker_state_machine.is_active.check: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX: worker_state_machine.parallel_region.check: +; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__10_wrapper.ID to void (i16, i32)*) +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute: +; NVPTX-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX: worker_state_machine.parallel_region.check1: +; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute2: +; NVPTX-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.check3: +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: 
thread.user_code.check: +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9 +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: 
call void @unknown_no_openmp() #[[ATTR9]] +; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__10 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__11 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture 
nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p1() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 
@__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: ret void +; NVPTX: worker_state_machine.is_active.check: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX: worker_state_machine.parallel_region.check: +; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__13_wrapper.ID to void (i16, i32)*) +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute: +; NVPTX-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX: worker_state_machine.parallel_region.check1: +; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute2: +; NVPTX-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.check3: +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-NEXT: br label 
[[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: thread.user_code.check: +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__12 +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; 
NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__13 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; 
NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__14 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p1() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 
[[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: ret void +; NVPTX: worker_state_machine.is_active.check: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX: worker_state_machine.parallel_region.check: +; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__19_wrapper +; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; NVPTX: worker_state_machine.parallel_region.execute: +; NVPTX-NEXT: call void @__omp_outlined__19_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: 
worker_state_machine.done.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: thread.user_code.check: +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__15 +; NVPTX-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #[[ATTR10]] +; NVPTX-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR8]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized +; NVPTX-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: store i32 [[A]], 
i32* [[A_ADDR]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 +; NVPTX-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; NVPTX: if.then: +; NVPTX-NEXT: br label [[RETURN:%.*]] +; NVPTX: if.end: +; NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 [[A]], 1 +; NVPTX-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR8]] +; NVPTX-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR8]] +; NVPTX-NEXT: br label [[RETURN]] +; NVPTX: return: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after +; NVPTX-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; NVPTX-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; NVPTX: if.then: +; NVPTX-NEXT: br label [[RETURN:%.*]] +; NVPTX: if.end: +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; NVPTX-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR10]] +; NVPTX-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR10]] +; NVPTX-NEXT: br label [[RETURN]] +; NVPTX: return: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, 
align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: ret void +; NVPTX: worker_state_machine.is_active.check: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: thread.user_code.check: +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label 
[[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__16 +; NVPTX-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @weak_callee_empty() #[[ATTR8]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@weak_callee_empty +; NVPTX-SAME: () #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__17 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper +; NVPTX-SAME: (i16 zeroext 
[[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__18 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void 
+; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized +; NVPTX-SAME: () #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline nounwind +; NVPTX-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after +; NVPTX-SAME: () #[[ATTR1]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** [[TMP1]], i64 0) +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__19 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca 
i32*, align 8 +; NVPTX-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-NEXT: ret void +; +; +; NVPTX: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14 +; AMDGPU-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 
dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8:[0-9]+]] +; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9:[0-9]+]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized +; AMDGPU-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; AMDGPU-DISABLED-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; AMDGPU-DISABLED: omp_if.then: +; AMDGPU-DISABLED-NEXT: store i32 0, i32* @G, align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: br label [[OMP_IF_END]] +; AMDGPU-DISABLED: omp_if.end: +; AMDGPU-DISABLED-NEXT: call void 
@__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@no_parallel_region_in_here +; AMDGPU-DISABLED-SAME: () #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; AMDGPU-DISABLED-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; AMDGPU-DISABLED: omp_if.then: +; AMDGPU-DISABLED-NEXT: store i32 0, i32* @G, align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_end_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[OMP_IF_END]] +; AMDGPU-DISABLED: omp_if.end: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* 
[[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] +; 
AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR10:[0-9]+]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: 
ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = 
call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR8]] +; AMDGPU-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] +; AMDGPU-DISABLED-NEXT: 
[[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before.internalized +; AMDGPU-DISABLED-SAME: () #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before +; AMDGPU-DISABLED-SAME: () #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x 
i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** [[TMP1]], i64 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define 
{{[^@]+}}@simple_state_machine_interprocedural_after.internalized +; AMDGPU-DISABLED-SAME: () #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after +; AMDGPU-DISABLED-SAME: () #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** [[TMP1]], i64 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = 
alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* 
[[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 
+; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; 
AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: 
[[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__10 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper 
+; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__11 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; 
AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__12 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 
4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__13 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__14 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; 
AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; 
AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__15 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR8]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized +; AMDGPU-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; AMDGPU-DISABLED: if.then: +; AMDGPU-DISABLED-NEXT: br label [[RETURN:%.*]] +; AMDGPU-DISABLED: if.end: +; AMDGPU-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[A]], 1 +; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR8]] +; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR8]] +; AMDGPU-DISABLED-NEXT: br label [[RETURN]] +; AMDGPU-DISABLED: return: +; 
AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after +; AMDGPU-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; AMDGPU-DISABLED: if.then: +; AMDGPU-DISABLED-NEXT: br label [[RETURN:%.*]] +; AMDGPU-DISABLED: if.end: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: br label [[RETURN]] +; AMDGPU-DISABLED: return: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; 
AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__16 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @weak_callee_empty() #[[ATTR8]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@weak_callee_empty +; AMDGPU-DISABLED-SAME: () #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__17 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void 
@p0() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__18 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; 
AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized +; AMDGPU-DISABLED-SAME: () #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after +; AMDGPU-DISABLED-SAME: () #[[ATTR1]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x 
i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** [[TMP1]], i64 0) +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__19 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; AMDGPU-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: 
define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_needed_l14 +; NVPTX-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8:[0-9]+]] +; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9:[0-9]+]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: 
Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@no_parallel_region_in_here.internalized +; NVPTX-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2:[0-9]+]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; NVPTX-DISABLED-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; NVPTX-DISABLED: omp_if.then: +; NVPTX-DISABLED-NEXT: store i32 0, i32* @G, align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_end_single(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: br label [[OMP_IF_END]] +; NVPTX-DISABLED: omp_if.end: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier(%struct.ident_t* noundef @[[GLOB3:[0-9]+]], i32 [[TMP0]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@no_parallel_region_in_here +; NVPTX-DISABLED-SAME: () #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; NVPTX-DISABLED-NEXT: br i1 [[TMP2]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_END:%.*]] +; NVPTX-DISABLED: omp_if.then: +; NVPTX-DISABLED-NEXT: store i32 0, i32* @G, align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_end_single(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[OMP_IF_END]] +; NVPTX-DISABLED: omp_if.end: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: 
ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] 
= alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR10:[0-9]+]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: 
define {{[^@]+}}@__omp_outlined__2_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], 
align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 
dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_before.internalized() #[[ATTR8]] +; NVPTX-DISABLED-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before.internalized +; NVPTX-DISABLED-SAME: () #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void 
@__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_before +; NVPTX-DISABLED-SAME: () #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__17 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__17_wrapper to i8*), i8** [[TMP1]], i64 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: 
[[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after.internalized +; NVPTX-DISABLED-SAME: () #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_after +; NVPTX-DISABLED-SAME: () #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; NVPTX-DISABLED-NEXT: 
[[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__18 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__18_wrapper to i8*), i8** [[TMP1]], i64 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 
dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; 
NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; 
NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent 
noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__10 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__10_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__10(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__11 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p1() 
#[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__11_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__11(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__12(i32* noalias 
nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__12 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR9]] +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 
noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__13 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__13_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__13(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__14 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], 
i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p1() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__14_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__14(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label 
[[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__15 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @omp_get_thread_num to i32 ()*)() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[CALL]]) #[[ATTR8]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after.internalized +; NVPTX-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] 
+; NVPTX-DISABLED: if.then: +; NVPTX-DISABLED-NEXT: br label [[RETURN:%.*]] +; NVPTX-DISABLED: if.end: +; NVPTX-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[A]], 1 +; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after.internalized(i32 [[SUB]]) #[[ATTR8]] +; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after.internalized() #[[ATTR8]] +; NVPTX-DISABLED-NEXT: br label [[RETURN]] +; NVPTX-DISABLED: return: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after +; NVPTX-DISABLED-SAME: (i32 [[A:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], 0 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; NVPTX-DISABLED: if.then: +; NVPTX-DISABLED-NEXT: br label [[RETURN:%.*]] +; NVPTX-DISABLED: if.end: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 1 +; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after(i32 [[SUB]]) #[[ATTR10]] +; NVPTX-DISABLED-NEXT: call void @simple_state_machine_interprocedural_nested_recursive_after_after() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: br label [[RETURN]] +; NVPTX-DISABLED: return: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: 
[[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__16 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @weak_callee_empty() #[[ATTR8]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@weak_callee_empty +; NVPTX-DISABLED-SAME: () #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; 
NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__17 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__17_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__17(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__18 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; 
NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__18_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__18(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after.internalized +; NVPTX-DISABLED-SAME: () #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef @[[GLOB2]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB2]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline nounwind +; 
NVPTX-DISABLED-LABEL: define {{[^@]+}}@simple_state_machine_interprocedural_nested_recursive_after_after +; NVPTX-DISABLED-SAME: () #[[ATTR1]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__19 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__19_wrapper to i8*), i8** [[TMP1]], i64 0) +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__19 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @p0() #[[ATTR10]] +; NVPTX-DISABLED-NEXT: ret void +; +; +; NVPTX-DISABLED: Function Attrs: convergent noinline norecurse nounwind +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__19_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; 
NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__19(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: ret void ; diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s -; RUN: opt -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=CHECK-DISABLED +; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU +; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX +; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=AMDGPU-DISABLED +; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=NVPTX-DISABLED ;; void unknown(void); ;; void spmd_amenable(void) __attribute__((assume("ompx_spmd_amenable"))); @@ -74,8 +76,6 @@ ;; } ;; } -target triple = "nvptx64" - %struct.ident_t = type { i32, i32, i32, i32, i8* } @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 @@ -87,98 +87,191 @@ @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode = weak constant i8 1 @llvm.compiler.used = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* 
@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" + +;. +; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" +; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; AMDGPU: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 ;. 
-; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 -; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 -; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 -; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 -; CHECK: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" -; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 -; CHECK: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 +; NVPTX: 
@[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" +; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; NVPTX: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 ;. 
-; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; CHECK-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; CHECK-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; CHECK-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" -; CHECK-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 -; CHECK-DISABLED: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 -; CHECK-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; CHECK-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; CHECK-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; CHECK-DISABLED: 
@[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; AMDGPU-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +; AMDGPU-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" +; AMDGPU-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; AMDGPU-DISABLED: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; AMDGPU-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; AMDGPU-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; AMDGPU-DISABLED: 
@[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; AMDGPU-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +;. +; NVPTX-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +; NVPTX-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" +; NVPTX-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; NVPTX-DISABLED: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; NVPTX-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; NVPTX-DISABLED: 
@[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; NVPTX-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; NVPTX-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. define weak void @__omp_offloading_14_a34ca11_sequential_loop_l5() #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 -; CHECK-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: 
[[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.begin: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.finished: -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker_state_machine.is_active.check: -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.check: -; CHECK-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*) -; CHECK-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.execute: -; CHECK-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK-DISABLED: 
worker_state_machine.parallel_region.fallback.execute: -; CHECK-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK-DISABLED: worker_state_machine.parallel_region.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK-DISABLED: worker_state_machine.done.barrier: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK-DISABLED: thread.user_code.check: -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 +; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], 
-1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 +; NVPTX-SAME: () #[[ATTR0:[0-9]+]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 +; 
AMDGPU-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.begin: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.finished: +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker_state_machine.is_active.check: +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: +; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = 
icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*) +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED: worker_state_machine.done.barrier: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED: thread.user_code.check: +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void 
@__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 +; NVPTX-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.begin: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.finished: +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker_state_machine.is_active.check: +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX-DISABLED: 
worker_state_machine.parallel_region.check: +; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*) +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED: worker_state_machine.done.barrier: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED: thread.user_code.check: +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) 
[[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 @@ -202,61 +295,119 @@ declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-NEXT: br label [[FOR_INC:%.*]] -; CHECK: for.inc: -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* 
[[I]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 -; CHECK-NEXT: store i32 [[INC]], i32* [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: call void @spmd_amenable() #[[ATTR5:[0-9]+]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; CHECK-DISABLED: for.cond: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 -; CHECK-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK-DISABLED: for.body: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) -; CHECK-DISABLED-NEXT: br label [[FOR_INC:%.*]] -; CHECK-DISABLED: for.inc: -; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = load i32, i32* 
[[I]], align 4 -; CHECK-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 -; CHECK-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] -; CHECK-DISABLED: for.end: -; CHECK-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR5:[0-9]+]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__ +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[I:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: store i32 0, i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND:%.*]] +; AMDGPU: for.cond: +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; AMDGPU: for.body: +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU: for.inc: +; AMDGPU-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; AMDGPU-NEXT: 
store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +; AMDGPU: for.end: +; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__ +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: store i32 0, i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND:%.*]] +; NVPTX: for.cond: +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; NVPTX: for.body: +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: br label [[FOR_INC:%.*]] +; NVPTX: for.inc: +; NVPTX-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +; NVPTX: for.end: +; NVPTX-NEXT: call void 
@spmd_amenable() #[[ATTR6:[0-9]+]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] +; AMDGPU-DISABLED: for.cond: +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; AMDGPU-DISABLED: for.body: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU-DISABLED: for.inc: +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop 
[[LOOP13:![0-9]+]] +; AMDGPU-DISABLED: for.end: +; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] +; NVPTX-DISABLED: for.cond: +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; NVPTX-DISABLED: for.body: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; NVPTX-DISABLED: for.inc: +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; 
NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +; NVPTX-DISABLED: for.end: +; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -292,21 +443,39 @@ } define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @unknown() #[[ATTR6:[0-9]+]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; 
NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -320,33 +489,63 @@ declare void @unknown() #1 define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void 
@__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: 
[[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; 
NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.addr = alloca i16, align 2 @@ -372,70 +571,138 @@ declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20() #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; 
CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.begin: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.finished: -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker_state_machine.is_active.check: -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.check: -; CHECK-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__3_wrapper.ID to void (i16, i32)*) -; CHECK-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; CHECK-DISABLED: 
worker_state_machine.parallel_region.execute: -; CHECK-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; CHECK-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK-DISABLED: worker_state_machine.parallel_region.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK-DISABLED: worker_state_machine.done.barrier: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK-DISABLED: thread.user_code.check: -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; 
AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void 
@__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.begin: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.finished: +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker_state_machine.is_active.check: +; AMDGPU-DISABLED-NEXT: br i1 
[[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: +; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__3_wrapper.ID to void (i16, i32)*) +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED: worker_state_machine.done.barrier: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED: thread.user_code.check: +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias 
nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.begin: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.finished: +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: 
worker_state_machine.is_active.check: +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.check: +; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__3_wrapper.ID to void (i16, i32)*) +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED: worker_state_machine.done.barrier: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED: thread.user_code.check: +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call 
void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 @@ -457,67 +724,129 @@ } define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2 -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 -; CHECK-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* -; CHECK-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]] -; CHECK-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], 
i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) -; CHECK-NEXT: br label [[FOR_INC:%.*]] -; CHECK: for.inc: -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 -; CHECK-NEXT: store i32 [[INC]], i32* [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: call void @spmd_amenable() #[[ATTR5]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 -; CHECK-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* -; CHECK-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]] -; CHECK-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; CHECK-DISABLED: for.cond: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 -; CHECK-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK-DISABLED: for.body: -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* 
[[DOTGLOBAL_TID_]], align 4 -; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0) -; CHECK-DISABLED-NEXT: br label [[FOR_INC:%.*]] -; CHECK-DISABLED: for.inc: -; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 -; CHECK-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] -; CHECK-DISABLED: for.end: -; CHECK-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR5]] -; CHECK-DISABLED-NEXT: ret void +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2 +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[I:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* +; AMDGPU-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] +; AMDGPU-NEXT: store i32 0, i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND:%.*]] +; AMDGPU: for.cond: +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 +; AMDGPU-NEXT: br i1 [[CMP]], label 
[[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; AMDGPU: for.body: +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) +; AMDGPU-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU: for.inc: +; AMDGPU-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; AMDGPU: for.end: +; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2 +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; NVPTX-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* +; NVPTX-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] +; NVPTX-NEXT: store i32 0, i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND:%.*]] +; NVPTX: for.cond: +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 
+; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; NVPTX: for.body: +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) +; NVPTX-NEXT: br label [[FOR_INC:%.*]] +; NVPTX: for.inc: +; NVPTX-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; NVPTX: for.end: +; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 +; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* +; AMDGPU-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label 
[[FOR_COND:%.*]] +; AMDGPU-DISABLED: for.cond: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; AMDGPU-DISABLED: for.body: +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU-DISABLED: for.inc: +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; AMDGPU-DISABLED: for.end: +; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: 
[[TMP0:%.*]] = alloca i8, i64 4, align 1 +; NVPTX-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* +; NVPTX-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] +; NVPTX-DISABLED: for.cond: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; NVPTX-DISABLED: for.body: +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; NVPTX-DISABLED: for.inc: +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; NVPTX-DISABLED: for.end: +; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -561,21 +890,39 @@ declare void @use(i32* nocapture) #2 define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @unknown() #[[ATTR6]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @unknown() #[[ATTR6]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone 
[[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -587,33 +934,63 @@ } define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: 
[[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void 
@__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.addr = alloca i16, align 2 @@ -631,70 +1008,138 
@@ declare void @__kmpc_free_shared(i8* nocapture, i64) #3 define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35() #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br 
i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.begin: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.finished: -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker_state_machine.is_active.check: -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.check: -; CHECK-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*) -; CHECK-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.execute: -; CHECK-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; CHECK-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK-DISABLED: worker_state_machine.parallel_region.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK-DISABLED: worker_state_machine.done.barrier: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK-DISABLED: thread.user_code.check: -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = 
call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, 
addrspace(5) +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.begin: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.finished: +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker_state_machine.is_active.check: +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: +; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*) +; AMDGPU-DISABLED-NEXT: br i1 
[[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED: worker_state_machine.done.barrier: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED: thread.user_code.check: +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; 
NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.begin: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.finished: +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker_state_machine.is_active.check: +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.check: +; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* 
@__omp_outlined__5_wrapper.ID to void (i16, i32)*) +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED: worker_state_machine.done.barrier: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED: thread.user_code.check: +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: 
worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 @@ -716,65 +1161,127 @@ } define internal void @__omp_outlined__4(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__4 -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; CHECK-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 1) -; CHECK-NEXT: br label [[FOR_INC:%.*]] -; CHECK: for.inc: -; 
CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 -; CHECK-NEXT: store i32 [[INC]], i32* [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: call void @spmd_amenable() #[[ATTR5]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; CHECK-DISABLED: for.cond: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 -; CHECK-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK-DISABLED: for.body: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; CHECK-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], 
i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) -; CHECK-DISABLED-NEXT: br label [[FOR_INC:%.*]] -; CHECK-DISABLED: for.inc: -; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 -; CHECK-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] -; CHECK-DISABLED: for.end: -; CHECK-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR5]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__4 +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[I:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: store i32 0, i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND:%.*]] +; AMDGPU: for.cond: +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; AMDGPU: for.body: +; AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; AMDGPU-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], 
align 4 +; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 1) +; AMDGPU-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU: for.inc: +; AMDGPU-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; AMDGPU: for.end: +; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4 +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: store i32 0, i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND:%.*]] +; NVPTX: for.cond: +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; NVPTX: for.body: +; NVPTX-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; NVPTX-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] 
addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 1) +; NVPTX-NEXT: br label [[FOR_INC:%.*]] +; NVPTX: for.inc: +; NVPTX-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; NVPTX: for.end: +; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] +; AMDGPU-DISABLED: for.cond: +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], 
label [[FOR_END:%.*]] +; AMDGPU-DISABLED: for.body: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; AMDGPU-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) +; AMDGPU-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU-DISABLED: for.inc: +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; AMDGPU-DISABLED: for.end: +; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** 
[[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] +; NVPTX-DISABLED: for.cond: +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; NVPTX-DISABLED: for.body: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; NVPTX-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) +; NVPTX-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; NVPTX-DISABLED: for.inc: +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; NVPTX-DISABLED: for.end: +; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -816,31 +1323,59 @@ } define internal void @__omp_outlined__5(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %x) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5 -; CHECK-SAME: (i32* noalias nocapture nofree readnone 
[[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; CHECK-NEXT: store i32 [[INC]], i32* [[X]], align 4 -; CHECK-NEXT: call void @unknown() #[[ATTR6]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 -; CHECK-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; CHECK-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 -; CHECK-DISABLED-NEXT: call void @unknown() #[[ATTR6]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; 
AMDGPU-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; NVPTX-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] 
+; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -858,41 +1393,79 @@ } define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -; CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -; 
CHECK-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -; CHECK-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; 
AMDGPU-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +; AMDGPU-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; AMDGPU-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; AMDGPU-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +; NVPTX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; NVPTX-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; NVPTX-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; NVPTX-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; 
AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; AMDGPU-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; NVPTX-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.addr = alloca i16, align 2 @@ -912,70 +1485,138 @@ } define weak void 
@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50() #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label 
[[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.begin: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.finished: -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker_state_machine.is_active.check: -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.check: -; CHECK-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*) -; CHECK-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.execute: -; CHECK-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; CHECK-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; CHECK-DISABLED: 
worker_state_machine.parallel_region.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK-DISABLED: worker_state_machine.done.barrier: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK-DISABLED: thread.user_code.check: -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* 
@[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-DISABLED-NEXT: 
[[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.begin: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.finished: +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker_state_machine.is_active.check: +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: +; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*) +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED: worker_state_machine.done.barrier: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED: thread.user_code.check: +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define 
{{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.begin: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.finished: +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker_state_machine.is_active.check: +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.check: +; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void 
(i16, i32)*) +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED: worker_state_machine.done.barrier: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED: thread.user_code.check: +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void 
; entry: %.zero.addr = alloca i32, align 4 @@ -997,82 +1638,161 @@ } define internal void @__omp_outlined__6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__6 -; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32* -; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]] -; CHECK: region.check.tid: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] -; CHECK: region.guarded: -; CHECK-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 -; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] -; CHECK: region.guarded.end: -; CHECK-NEXT: br label [[REGION_BARRIER]] -; CHECK: region.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[REGION_EXIT:%.*]] -; CHECK: region.exit: -; CHECK-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 100 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body: 
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; CHECK-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP4]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP5]], i64 noundef 1) -; CHECK-NEXT: br label [[FOR_INC:%.*]] -; CHECK: for.inc: -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 -; CHECK-NEXT: store i32 [[INC]], i32* [[I]], align 4 -; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: call void @spmd_amenable() #[[ATTR5]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -; CHECK-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] 
addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32* -; CHECK-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; CHECK-DISABLED: for.cond: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 -; CHECK-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] -; CHECK-DISABLED: for.body: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; CHECK-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 -; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; CHECK-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) -; CHECK-DISABLED-NEXT: br label [[FOR_INC:%.*]] -; CHECK-DISABLED: for.inc: -; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 -; CHECK-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 -; CHECK-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK-DISABLED: for.end: -; CHECK-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR5]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__6 +; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 
dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[I:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*) to i32* +; AMDGPU-NEXT: br label [[REGION_CHECK_TID:%.*]] +; AMDGPU: region.check.tid: +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; AMDGPU-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; AMDGPU-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] +; AMDGPU: region.guarded: +; AMDGPU-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 +; AMDGPU-NEXT: br label [[REGION_GUARDED_END:%.*]] +; AMDGPU: region.guarded.end: +; AMDGPU-NEXT: br label [[REGION_BARRIER]] +; AMDGPU: region.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[REGION_EXIT:%.*]] +; AMDGPU: region.exit: +; AMDGPU-NEXT: store i32 0, i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND:%.*]] +; AMDGPU: for.cond: +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 100 +; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; AMDGPU: for.body: +; AMDGPU-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; AMDGPU-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*), i8** [[TMP3]], align 8 +; AMDGPU-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; 
AMDGPU-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP4]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP5]], i64 noundef 1) +; AMDGPU-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU: for.inc: +; AMDGPU-NEXT: [[TMP6:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +; AMDGPU: for.end: +; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6 +; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32* +; NVPTX-NEXT: br label [[REGION_CHECK_TID:%.*]] +; NVPTX: region.check.tid: +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; NVPTX-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +; NVPTX-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] +; NVPTX: region.guarded: +; NVPTX-NEXT: store i32 42, i32* [[X_ON_STACK]], 
align 4 +; NVPTX-NEXT: br label [[REGION_GUARDED_END:%.*]] +; NVPTX: region.guarded.end: +; NVPTX-NEXT: br label [[REGION_BARRIER]] +; NVPTX: region.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[REGION_EXIT:%.*]] +; NVPTX: region.exit: +; NVPTX-NEXT: store i32 0, i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND:%.*]] +; NVPTX: for.cond: +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 100 +; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; NVPTX: for.body: +; NVPTX-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; NVPTX-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP3]], align 8 +; NVPTX-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP4]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP5]], i64 noundef 1) +; NVPTX-NEXT: br label [[FOR_INC:%.*]] +; NVPTX: for.inc: +; NVPTX-NEXT: [[TMP6:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +; NVPTX: for.end: +; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* 
noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*) to i32* +; AMDGPU-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] +; AMDGPU-DISABLED: for.cond: +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; AMDGPU-DISABLED: for.body: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; AMDGPU-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) +; AMDGPU-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; 
AMDGPU-DISABLED: for.inc: +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +; AMDGPU-DISABLED: for.end: +; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32* +; NVPTX-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] +; NVPTX-DISABLED: for.cond: +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; NVPTX-DISABLED: for.body: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; NVPTX-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x 
i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) +; NVPTX-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; NVPTX-DISABLED: for.inc: +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +; NVPTX-DISABLED: for.end: +; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -1115,31 +1835,59 @@ } define internal void @__omp_outlined__7(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %x) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; CHECK-NEXT: store i32 [[INC]], i32* [[X]], align 4 -; CHECK-NEXT: call void @unknown() #[[ATTR6]] -; 
CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 -; CHECK-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; CHECK-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 -; CHECK-DISABLED-NEXT: call void @unknown() #[[ATTR6]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7 +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7 +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: 
+; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; NVPTX-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; 
NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -1157,41 +1905,79 @@ } define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -; CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -; CHECK-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; CHECK-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 -; CHECK-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; 
CHECK-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 -; CHECK-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 -; CHECK-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 -; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** -; CHECK-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -; CHECK-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +; AMDGPU-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; AMDGPU-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; AMDGPU-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-NEXT: 
[[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +; NVPTX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; NVPTX-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; NVPTX-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; NVPTX-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; AMDGPU-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* 
[[TMP5]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; NVPTX-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.addr = alloca i16, align 2 @@ -1211,86 +1997,171 @@ } define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 -; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], 
label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK: worker_state_machine.begin: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK: worker_state_machine.finished: -; CHECK-NEXT: ret void -; CHECK: worker_state_machine.is_active.check: -; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK: worker_state_machine.parallel_region.fallback.execute: -; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK: worker_state_machine.parallel_region.end: -; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK: worker_state_machine.done.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK: thread.user_code.check: -; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], 
i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-NEXT: ret void -; CHECK: worker.exit: -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 -; CHECK-DISABLED-SAME: () #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; CHECK-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; CHECK-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.begin: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; CHECK-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; CHECK-DISABLED: worker_state_machine.finished: -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker_state_machine.is_active.check: -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label 
[[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; CHECK-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; CHECK-DISABLED: worker_state_machine.parallel_region.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; CHECK-DISABLED: worker_state_machine.done.barrier: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; CHECK-DISABLED: thread.user_code.check: -; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] -; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) -; CHECK-DISABLED-NEXT: ret void -; CHECK-DISABLED: worker.exit: -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 +; AMDGPU-SAME: () #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 
@__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU: worker_state_machine.begin: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU: worker_state_machine.finished: +; AMDGPU-NEXT: ret void +; AMDGPU: worker_state_machine.is_active.check: +; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU: worker_state_machine.parallel_region.end: +; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU: worker_state_machine.done.barrier: +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq 
i32 [[TMP0]], -1 +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU: user_code.entry: +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 +; NVPTX-SAME: () #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX: worker_state_machine.begin: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX: worker_state_machine.finished: +; NVPTX-NEXT: 
ret void +; NVPTX: worker_state_machine.is_active.check: +; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX: worker_state_machine.parallel_region.end: +; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX: worker_state_machine.done.barrier: +; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX: thread.user_code.check: +; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX: user_code.entry: +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 +; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) +; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], 
align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.begin: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 +; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED: worker_state_machine.finished: +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker_state_machine.is_active.check: +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; AMDGPU-DISABLED: 
worker_state_machine.done.barrier: +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; AMDGPU-DISABLED: thread.user_code.check: +; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; AMDGPU-DISABLED: user_code.entry: +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 +; NVPTX-DISABLED-SAME: () #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.begin: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; 
NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED: worker_state_machine.finished: +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker_state_machine.is_active.check: +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; NVPTX-DISABLED: worker_state_machine.parallel_region.end: +; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; NVPTX-DISABLED: worker_state_machine.done.barrier: +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; NVPTX-DISABLED: thread.user_code.check: +; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +; NVPTX-DISABLED: user_code.entry: +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 
dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 @@ -1312,21 +2183,39 @@ } define internal void @__omp_outlined__8(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { -; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__8 -; CHECK-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-NEXT: call void @unknown() #[[ATTR6]] -; CHECK-NEXT: ret void -; -; CHECK-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 -; CHECK-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 -; CHECK-DISABLED-NEXT: call void @unknown() #[[ATTR6]] -; CHECK-DISABLED-NEXT: ret void +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__8 +; AMDGPU-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = 
alloca i32*, align 8 +; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__8 +; NVPTX-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: ret void ; entry: %.global_tid..addr = alloca i32*, align 8 @@ -1367,57 +2256,115 @@ !16 = distinct !{!16, !14} !17 = distinct !{!17, !14} ;. 
-; CHECK: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } -; CHECK: attributes #[[ATTR3]] = { nounwind } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nounwind } -; CHECK: attributes #[[ATTR5]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; CHECK: attributes #[[ATTR6]] = { convergent } +; AMDGPU: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU: attributes #[[ATTR2:[0-9]+]] = { alwaysinline } +; AMDGPU: attributes #[[ATTR3:[0-9]+]] = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU: attributes #[[ATTR4]] = { nounwind } +; AMDGPU: attributes #[[ATTR5:[0-9]+]] = { convergent nounwind } +; AMDGPU: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" } +; AMDGPU: attributes #[[ATTR7]] = { convergent } +;. 
+; NVPTX: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX: attributes #[[ATTR2:[0-9]+]] = { alwaysinline } +; NVPTX: attributes #[[ATTR3:[0-9]+]] = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX: attributes #[[ATTR4]] = { nounwind } +; NVPTX: attributes #[[ATTR5:[0-9]+]] = { convergent nounwind } +; NVPTX: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" } +; NVPTX: attributes #[[ATTR7]] = { convergent } +;. +; AMDGPU-DISABLED: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU-DISABLED: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU-DISABLED: attributes #[[ATTR2:[0-9]+]] = { alwaysinline } +; AMDGPU-DISABLED: attributes #[[ATTR3:[0-9]+]] = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU-DISABLED: attributes #[[ATTR4]] = { nounwind } +; AMDGPU-DISABLED: attributes #[[ATTR5:[0-9]+]] = { convergent nounwind } +; AMDGPU-DISABLED: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" } +; AMDGPU-DISABLED: attributes #[[ATTR7]] = { convergent } +;. 
+; NVPTX-DISABLED: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX-DISABLED: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX-DISABLED: attributes #[[ATTR2:[0-9]+]] = { alwaysinline } +; NVPTX-DISABLED: attributes #[[ATTR3:[0-9]+]] = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX-DISABLED: attributes #[[ATTR4]] = { nounwind } +; NVPTX-DISABLED: attributes #[[ATTR5:[0-9]+]] = { convergent nounwind } +; NVPTX-DISABLED: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" } +; NVPTX-DISABLED: attributes #[[ATTR7]] = { convergent } +;. +; AMDGPU: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +; AMDGPU: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +; AMDGPU: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} +; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +; AMDGPU: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +; AMDGPU: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +; AMDGPU: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +; AMDGPU: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, 
!"kernel", i32 1} +; AMDGPU: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +; AMDGPU: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU: [[LOOP13]] = distinct !{!13, !14} +; AMDGPU: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} +; AMDGPU: [[LOOP15]] = distinct !{!15, !14} +; AMDGPU: [[LOOP16]] = distinct !{!16, !14} +; AMDGPU: [[LOOP17]] = distinct !{!17, !14} ;. -; CHECK-DISABLED: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } -; CHECK-DISABLED: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } -; CHECK-DISABLED: attributes #[[ATTR2:[0-9]+]] = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } -; CHECK-DISABLED: attributes #[[ATTR3]] = { nounwind } -; CHECK-DISABLED: attributes #[[ATTR4:[0-9]+]] = { convergent nounwind } -; CHECK-DISABLED: attributes #[[ATTR5]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; CHECK-DISABLED: attributes #[[ATTR6]] = { convergent } +; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +; NVPTX: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} +; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, 
!"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +; NVPTX: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +; NVPTX: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +; NVPTX: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +; NVPTX: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} +; NVPTX: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +; NVPTX: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX: [[LOOP13]] = distinct !{!13, !14} +; NVPTX: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} +; NVPTX: [[LOOP15]] = distinct !{!15, !14} +; NVPTX: [[LOOP16]] = distinct !{!16, !14} +; NVPTX: [[LOOP17]] = distinct !{!17, !14} ;. 
-; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} -; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} -; CHECK: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} -; CHECK: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} -; CHECK: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; CHECK: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} -; CHECK: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; CHECK: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; CHECK: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; CHECK: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} -; CHECK: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[LOOP13]] = distinct !{!13, !14} -; CHECK: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; CHECK: [[LOOP15]] = distinct !{!15, !14} -; CHECK: [[LOOP16]] = distinct !{!16, !14} -; CHECK: [[LOOP17]] = distinct !{!17, !14} +; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +; AMDGPU-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +; AMDGPU-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, 
!"do_not_spmdize_target", i32 65, i32 4} +; AMDGPU-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU-DISABLED: [[LOOP13]] = distinct !{!13, !14} +; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} +; AMDGPU-DISABLED: [[LOOP15]] = distinct !{!15, !14} +; AMDGPU-DISABLED: [[LOOP16]] = distinct !{!16, !14} +; AMDGPU-DISABLED: [[LOOP17]] = distinct !{!17, !14} ;. 
-; CHECK-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} -; CHECK-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} -; CHECK-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} -; CHECK-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} -; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; CHECK-DISABLED: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} -; CHECK-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; CHECK-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; CHECK-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; CHECK-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} -; CHECK-DISABLED: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK-DISABLED: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK-DISABLED: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLED: [[LOOP13]] = distinct !{!13, !14} -; CHECK-DISABLED: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; CHECK-DISABLED: [[LOOP15]] = distinct !{!15, !14} -; CHECK-DISABLED: [[LOOP16]] = distinct !{!16, !14} -; CHECK-DISABLED: [[LOOP17]] = distinct !{!17, !14} +; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +; NVPTX-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 
20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +; NVPTX-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} +; NVPTX-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +; NVPTX-DISABLED: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +; NVPTX-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +; NVPTX-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +; NVPTX-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} +; NVPTX-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +; NVPTX-DISABLED: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX-DISABLED: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX-DISABLED: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX-DISABLED: [[LOOP13]] = distinct !{!13, !14} +; NVPTX-DISABLED: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} +; NVPTX-DISABLED: [[LOOP15]] = distinct !{!15, !14} +; NVPTX-DISABLED: [[LOOP16]] = distinct !{!16, !14} +; NVPTX-DISABLED: [[LOOP17]] = distinct !{!17, !14} ;.