diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -683,6 +683,12 @@ Constant *getOrCreateSrcLocStr(const LocationDescription &Loc, uint32_t &SrcLocStrSize); + /// Return an ident_t encoding the source location \p SrcLocStr and \p Flags. + Constant * + getOrCreateIdentInitializer(Constant *SrcLocStr, uint32_t SrcLocStrSize, + omp::IdentFlag Flags = omp::IdentFlag(0), + unsigned Reserve2Flags = 0); + /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags. /// TODO: Create a enum class for the Reserve2Flags Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -87,6 +87,13 @@ __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr) __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, Int8Ptr) +__OMP_STRUCT_TYPE(ConfigurationEnvironmentTy, _OMP::ConfigurationEnvironmentTy,\ + Int8, \ + Int8) +__OMP_STRUCT_TYPE(KernelEnvironmentTy, _OMP::KernelEnvironmentTy, \ + Ident, \ + ConfigurationEnvironmentTy, \ + Int16) #undef __OMP_STRUCT_TYPE #undef OMP_STRUCT_TYPE @@ -426,8 +433,8 @@ /* Int */ Int32, /* kmp_task_t */ VoidPtr) /// OpenMP Device runtime functions -__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1, Int1) -__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8, Int1) +__OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentTyPtr, Int1) +__OMP_RTL(__kmpc_target_deinit, false, Void, Int1) __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr) __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32, VoidPtr, VoidPtr, VoidPtrPtr, SizeTy) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -258,6 +258,19 @@ return GV; } +Constant *OpenMPIRBuilder::getOrCreateIdentInitializer(Constant *SrcLocStr, + uint32_t SrcLocStrSize, + omp::IdentFlag LocFlags, + unsigned Reserve2Flags) { + // Enable "C-mode". + LocFlags |= OMP_IDENT_FLAG_KMPC; + Constant *I32Null = ConstantInt::getNullValue(Int32); + Constant *IdentData[] = {I32Null, ConstantInt::get(Int32, uint32_t(LocFlags)), + ConstantInt::get(Int32, Reserve2Flags), + ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr}; + return ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData); +} + Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, IdentFlag LocFlags, @@ -268,13 +281,8 @@ Constant *&Ident = IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}]; if (!Ident) { - Constant *I32Null = ConstantInt::getNullValue(Int32); - Constant *IdentData[] = {I32Null, - ConstantInt::get(Int32, uint32_t(LocFlags)), - ConstantInt::get(Int32, Reserve2Flags), - ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr}; - Constant *Initializer = - ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData); + Constant *Initializer = getOrCreateIdentInitializer( + SrcLocStr, SrcLocStrSize, LocFlags, Reserve2Flags); // Look for existing encoding of the location + flags, not needed but // minimizes the difference to the existing solution while we transition. 
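The two __OMP_STRUCT_TYPE entries added above describe the IR side of a kernel environment that the device runtime reads. A rough C++ mirror of that layout follows (a sketch only; the authoritative definition lives in the new KernelEnvironment.h, the field names are inferred from the initializers built in the createTargetInit hunk below, and the name of the trailing i16 member is a pure guess):

#include <cstdint>

struct IdentTy { // %struct.ident_t = { i32, i32, i32, i32, i8* }
  int32_t Reserved1;
  int32_t Flags;         // e.g. OMP_IDENT_FLAG_KMPC
  int32_t Reserved2;
  int32_t SrcLocStrSize; // the fourth i32 carries the string size (see above)
  const char *PSource;   // ";file;func;line;col;;"
};

struct ConfigurationEnvironmentTy { // { i8, i8 }
  uint8_t UseGenericStateMachine; // does the kernel need the generic state machine?
  uint8_t ExecMode;               // OMP_TGT_EXEC_MODE_* bits
};

struct KernelEnvironmentTy { // { ident_t, ConfigurationEnvironmentTy, i16 }
  IdentTy Ident;                            // kernel source location
  ConfigurationEnvironmentTy Configuration; // compile-time configuration
  uint16_t Reserved;                        // trailing i16; hypothetical name
};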
@@ -2802,20 +2810,43 @@ uint32_t SrcLocStrSize; Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); - Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); - ConstantInt *IsSPMDVal = ConstantInt::getSigned( - IntegerType::getInt8Ty(Int8->getContext()), - IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); - ConstantInt *UseGenericStateMachine = - ConstantInt::getBool(Int32->getContext(), !IsSPMD); + Constant *Initializer = getOrCreateIdentInitializer(SrcLocStr, SrcLocStrSize); + Constant *IsSPMDVal = ConstantInt::get( + Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); + Constant *UseGenericStateMachine = ConstantInt::getSigned(Int8, !IsSPMD); ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime); Function *Fn = getOrCreateRuntimeFunctionPtr( omp::RuntimeFunction::OMPRTL___kmpc_target_init); - CallInst *ThreadKind = Builder.CreateCall( - Fn, {Ident, IsSPMDVal, UseGenericStateMachine, RequiresFullRuntimeVal}); + Function *Kernel = Builder.GetInsertBlock()->getParent(); + const DataLayout &DL = Fn->getParent()->getDataLayout(); + Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get( + ConfigurationEnvironmentTy, { + UseGenericStateMachine, + IsSPMDVal, + }); + Constant *KernelEnvironmentInitializer = ConstantStruct::get( + KernelEnvironmentTy, { + Initializer, + ConfigurationEnvironmentInitializer, + ConstantInt::getNullValue(Int16), + }); + std::string KernelEnvironmentName = + (Kernel->getName() + "_kernel_info").str(); + GlobalVariable *KernelEnvironment = new GlobalVariable( + M, KernelEnvironmentTy, /*IsConstant*/ true, + llvm::GlobalValue::ExternalLinkage, KernelEnvironmentInitializer, + KernelEnvironmentName, + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + DL.getDefaultGlobalsAddressSpace()); + auto *KernelEnvironmentCasted = + ConstantExpr::getPointerBitCastOrAddrSpaceCast(KernelEnvironment, + KernelEnvironmentTyPtr); + + CallInst *ThreadKind = + Builder.CreateCall(Fn, {KernelEnvironmentCasted, RequiresFullRuntimeVal}); Value *ExecUserCode = Builder.CreateICmpEQ( ThreadKind, ConstantInt::get(ThreadKind->getType(), -1), @@ -2854,19 +2885,13 @@ if (!updateToLocation(Loc)) return; - uint32_t SrcLocStrSize; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); - Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); - ConstantInt *IsSPMDVal = ConstantInt::getSigned( - IntegerType::getInt8Ty(Int8->getContext()), - IsSPMD ? 
OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
   ConstantInt *RequiresFullRuntimeVal =
       ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);
 
   Function *Fn = getOrCreateRuntimeFunctionPtr(
       omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
 
-  Builder.CreateCall(Fn, {Ident, IsSPMDVal, RequiresFullRuntimeVal});
+  Builder.CreateCall(Fn, {RequiresFullRuntimeVal});
 }
 
 std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -26,17 +26,22 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Frontend/OpenMP/KernelEnvironment.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/Assumptions.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
@@ -154,6 +159,71 @@
 namespace {
 
+/// Return the index of the element of \p StructC that contains the byte at
+/// \p Offset.
+template <int Offset>
+unsigned getConstantStructOffsetIdx(const DataLayout &DL,
+                                    ConstantStruct *StructC) {
+  StructType *ST = cast<StructType>(StructC->getType());
+  return DL.getStructLayout(ST)->getElementContainingOffset(Offset);
+}
+
+/// Return the address of the element of \p StructC at \p Offset, relative to
+/// \p Ptr.
+template <int Offset>
+Constant *getConstantStructOffsetElementAddr(const DataLayout &DL,
+                                             ConstantStruct *StructC,
+                                             Constant *Ptr) {
+  unsigned Idx = getConstantStructOffsetIdx<Offset>(DL, StructC);
+  StructType *ST = cast<StructType>(StructC->getType());
+  return ConstantExpr::getInBoundsGetElementPtr(
+      ST, Ptr,
+      ArrayRef<Constant *>{
+          ConstantInt::getNullValue(IntegerType::getInt32Ty(ST->getContext())),
+          ConstantInt::get(IntegerType::getInt32Ty(ST->getContext()), Idx)});
+}
+
+/// Return the constant element of \p StructC at \p Offset.
+template <int Offset>
+Constant *getConstantStructOffsetElement(const DataLayout &DL,
+                                         ConstantStruct *StructC) {
+  return StructC->getAggregateElement(
+      getConstantStructOffsetIdx<Offset>(DL, StructC));
+}
+
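These offset-based helpers let OpenMPOpt address struct members via offsetof on the shared C++ type instead of hard-coded element indices, so the layout has a single source of truth. A minimal usage sketch, assuming a Module M and a ConstantStruct *C typed as the kernel environment:

// Fetch the element of C that contains the Configuration member.
const DataLayout &DL = M.getDataLayout();
Constant *ConfigC =
    getConstantStructOffsetElement<offsetof(KernelEnvironmentTy,
                                            Configuration)>(DL, C);
// The byte offset is mapped back to an element index through
// DL.getStructLayout(...), so padding or member-order changes stay localized.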
+/// Set the constant element of \p StructC at \p Offset to be \p V.
+template <int Offset>
+static ConstantStruct *setConstantStructOffsetElement(const DataLayout &DL,
+                                                      ConstantStruct *StructC,
+                                                      Constant *V) {
+  unsigned Idxs[1] = {getConstantStructOffsetIdx<Offset>(DL, StructC)};
+  Constant *NewStructC = ConstantFoldInsertValueInstruction(StructC, V, Idxs);
+  assert(NewStructC && "Failed to create constant kernel environment!");
+  return cast<ConstantStruct>(NewStructC);
+}
+
+#define GET_KERNEL_ENVIRONMENT_MEMBER(Member, DL, StructC)                    \
+  getConstantStructOffsetElement<offsetof(KernelEnvironmentTy, Member)>(      \
+      DL, StructC)
+#define GET_KERNEL_ENVIRONMENT_MEMBER_ADDR(Member, DL, StructC, Ptr)          \
+  getConstantStructOffsetElementAddr<offsetof(KernelEnvironmentTy, Member)>(  \
+      DL, StructC, Ptr)
+#define SET_KERNEL_ENVIRONMENT_MEMBER(Member, DL, StructC, V)                 \
+  setConstantStructOffsetElement<offsetof(KernelEnvironmentTy, Member)>(      \
+      DL, StructC, V)
+#define GET_KERNEL_CONFIGURATION_ENVIRONMENT_MEMBER(Member, DL, StructC)      \
+  getConstantStructOffsetElement<offsetof(ConfigurationEnvironmentTy,         \
+                                          Member)>(                           \
+      DL, cast<ConstantStruct>(                                               \
+              GET_KERNEL_ENVIRONMENT_MEMBER(Configuration, DL, StructC)))
+#define SET_KERNEL_CONFIGURATION_ENVIRONMENT_MEMBER(Member, DL, StructC, V)   \
+  SET_KERNEL_ENVIRONMENT_MEMBER(                                              \
+      Configuration, DL, StructC,                                             \
+      setConstantStructOffsetElement<offsetof(ConfigurationEnvironmentTy,     \
+                                              Member)>(                       \
+          DL,                                                                 \
+          cast<ConstantStruct>(                                               \
+              GET_KERNEL_ENVIRONMENT_MEMBER(Configuration, DL, StructC)),     \
+          V))
+
 struct AAHeapToShared;
 struct AAICVTracker;
@@ -547,6 +617,10 @@
   /// one we abort as the kernel is malformed.
   CallBase *KernelInitCB = nullptr;
 
+  /// The constant kernel environment as taken from and passed to
+  /// __kmpc_target_init.
+  ConstantStruct *KernelEnvC = nullptr;
+
   /// The __kmpc_target_deinit call in this kernel, if any. If we find more than
   /// one we abort as the kernel is malformed.
   CallBase *KernelDeinitCB = nullptr;
@@ -638,6 +712,12 @@
                          "assumptions.");
       KernelInitCB = KIS.KernelInitCB;
     }
+    if (KIS.KernelEnvC) {
+      if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
+        llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
+                         "assumptions.");
+      KernelEnvC = KIS.KernelEnvC;
+    }
     if (KIS.KernelDeinitCB) {
       if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
         llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
@@ -2594,8 +2674,8 @@
     CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
     if (!CB)
       return false;
-    const int InitModeArgNo = 1;
-    auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitModeArgNo));
+    const int InitializeIsSPMD = 1;
+    auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitializeIsSPMD));
     return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC);
   }
 
@@ -2886,6 +2966,35 @@
     return GuardedInstructions;
   }
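For readability, this is approximately what one layer of these accessor macros expands to (illustrative only, shown for Member == ExecMode):

// GET_KERNEL_CONFIGURATION_ENVIRONMENT_MEMBER(ExecMode, DL, StructC) ~>
getConstantStructOffsetElement<offsetof(ConfigurationEnvironmentTy, ExecMode)>(
    DL, cast<ConstantStruct>(
            getConstantStructOffsetElement<offsetof(KernelEnvironmentTy,
                                                    Configuration)>(DL,
                                                                    StructC)));
// First the nested Configuration constant is extracted from the kernel
// environment, then the ExecMode element is extracted from it.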
+  /// Return the address of the Ident (ident_t) member of the associated
+  /// kernel's environment.
+  Constant *getKernelIdent(ConstantStruct *StructC) {
+    const DataLayout &DL = KernelInitCB->getModule()->getDataLayout();
+    GlobalVariable *KernelEnvGV = getKernelEnvironmentGlobalVariable();
+    return GET_KERNEL_ENVIRONMENT_MEMBER_ADDR(Ident, DL, StructC, KernelEnvGV);
+  }
+
+#define KERNEL_CONFIGURATION_GETTER_AND_SETTER_INT(Member)                    \
+  ConstantInt *getKernelConfiguration##Member(ConstantStruct *StructC) {      \
+    const DataLayout &DL = KernelInitCB->getModule()->getDataLayout();        \
+    return cast<ConstantInt>(                                                 \
+        GET_KERNEL_CONFIGURATION_ENVIRONMENT_MEMBER(Member, DL, StructC));    \
+  }                                                                           \
+  void setKernelConfiguration##Member(ConstantStruct *StructC, Constant *V) { \
+    const DataLayout &DL = KernelInitCB->getModule()->getDataLayout();        \
+    KernelEnvC =                                                              \
+        SET_KERNEL_CONFIGURATION_ENVIRONMENT_MEMBER(Member, DL, StructC, V);  \
+  }
+
+  KERNEL_CONFIGURATION_GETTER_AND_SETTER_INT(UseGenericStateMachine)
+  KERNEL_CONFIGURATION_GETTER_AND_SETTER_INT(ExecMode)
+
+  GlobalVariable *getKernelEnvironmentGlobalVariable() {
+    constexpr const int InitKernelEnvironmentArgNo = 0;
+    return cast<GlobalVariable>(
+        KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
+            ->stripPointerCasts());
+  }
+
   /// See AbstractAttribute::initialize(...).
   void initialize(Attributor &A) override {
     // This is a high-level transform that might change the constant arguments
@@ -2912,10 +3021,8 @@
           OMPInformationCache::RuntimeFunctionInfo &RFI,
           CallBase *&Storage) {
         CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
-        assert(CB &&
-               "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
-        assert(!Storage &&
-               "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
+        assert(CB && "Unexpected use of kernel (de)init!");
+        assert(!Storage && "Multiple uses of kernel (de)init!");
         Storage = CB;
         return false;
       };
@@ -2938,55 +3045,35 @@
       return;
     }
 
-    // For kernels we might need to initialize/finalize the IsSPMD state and
-    // we need to register a simplification callback so that the Attributor
-    // knows the constant arguments to __kmpc_target_init and
-    // __kmpc_target_deinit might actually change.
+    GlobalVariable *KernelEnvGV = getKernelEnvironmentGlobalVariable();
+    KernelEnvC = cast<ConstantStruct>(KernelEnvGV->getInitializer());
 
-    Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
-        [&](const IRPosition &IRP, const AbstractAttribute *AA,
-            bool &UsedAssumedInformation) -> Optional<Value *> {
-      // IRP represents the "use generic state machine" argument of an
-      // __kmpc_target_init call. We will answer this one with the internal
-      // state. As long as we are not in an invalid state, we will create a
-      // custom state machine so the value should be a `i1 false`. If we are
-      // in an invalid state, we won't change the value that is in the IR.
-      if (!ReachedKnownParallelRegions.isValidState())
-        return nullptr;
-      // If we have disabled state machine rewrites, don't make a custom one.
-      if (DisableOpenMPOptStateMachineRewrite)
-        return nullptr;
-      if (AA)
-        A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
-      UsedAssumedInformation = !isAtFixpoint();
-      auto *FalseVal =
-          ConstantInt::getBool(IRP.getAnchorValue().getContext(), false);
-      return FalseVal;
-    };
+    auto *ExecModeC = getKernelConfigurationExecMode(KernelEnvC);
+    auto *AssumedExecModeC = ConstantInt::get(
+        ExecModeC->getType(),
+        ExecModeC->getZExtValue() | OMP_TGT_EXEC_MODE_GENERIC_SPMD);
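The or-ing above works because the exec-mode values form a bitset (as defined in llvm/Frontend/OpenMP/OMPConstants.h), so adding GENERIC_SPMD keeps the original mode recorded while making the SPMD test succeed; a self-contained illustration:

#include <cassert>
#include <cstdint>

// Flag values per OMPConstants.h.
enum : uint8_t {
  OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
  OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
  OMP_TGT_EXEC_MODE_GENERIC_SPMD =
      OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
};

int main() {
  uint8_t ExecMode = OMP_TGT_EXEC_MODE_GENERIC;                // 0b01
  uint8_t Assumed = ExecMode | OMP_TGT_EXEC_MODE_GENERIC_SPMD; // 0b11
  assert(Assumed & OMP_TGT_EXEC_MODE_SPMD);    // SPMD check now passes
  assert(Assumed & OMP_TGT_EXEC_MODE_GENERIC); // generic origin is preserved
  return 0;
}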
-    Attributor::SimplifictionCallbackTy ModeSimplifyCB =
+    // Next rewrite the kernel configuration to indicate we use SPMD-mode now.
+    auto &Ctx = getAnchorValue().getContext();
+    if (!DisableOpenMPOptSPMDization)
+      setKernelConfigurationExecMode(KernelEnvC, AssumedExecModeC);
+    // If we have disabled state machine rewrites, don't make a custom one.
+    if (!DisableOpenMPOptStateMachineRewrite)
+      setKernelConfigurationUseGenericStateMachine(
+          KernelEnvC, ConstantInt::get(Type::getInt8Ty(Ctx), 0));
+
+    Attributor::SimplifictionCallbackTy KernelConfigurationSimplifyCB =
         [&](const IRPosition &IRP, const AbstractAttribute *AA,
             bool &UsedAssumedInformation) -> Optional<Value *> {
-      // IRP represents the "SPMDCompatibilityTracker" argument of an
-      // __kmpc_target_init or
-      // __kmpc_target_deinit call. We will answer this one with the internal
-      // state.
-      if (!SPMDCompatibilityTracker.isValidState())
-        return nullptr;
-      if (!SPMDCompatibilityTracker.isAtFixpoint()) {
-        if (AA)
-          A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
-        UsedAssumedInformation = true;
-      } else {
-        UsedAssumedInformation = false;
-      }
-      auto *Val = ConstantInt::getSigned(
-          IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()),
-          SPMDCompatibilityTracker.isAssumed() ? OMP_TGT_EXEC_MODE_SPMD
-                                               : OMP_TGT_EXEC_MODE_GENERIC);
-      return Val;
+      return KernelEnvC;
     };
 
+    A.registerSimplificationCallback(
+        IRPosition::value(*KernelEnvGV->getInitializer()),
+        KernelConfigurationSimplifyCB);
+
+    // TODO: These are only needed for the old runtime and should be removed
+    // with it:
+    //{
     Attributor::SimplifictionCallbackTy IsGenericModeSimplifyCB =
         [&](const IRPosition &IRP, const AbstractAttribute *AA,
             bool &UsedAssumedInformation) -> Optional<Value *> {
@@ -3008,20 +3095,8 @@
       return Val;
     };
 
-    constexpr const int InitModeArgNo = 1;
-    constexpr const int DeinitModeArgNo = 1;
-    constexpr const int InitUseStateMachineArgNo = 2;
-    constexpr const int InitRequiresFullRuntimeArgNo = 3;
-    constexpr const int DeinitRequiresFullRuntimeArgNo = 2;
-
-    A.registerSimplificationCallback(
-        IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
-        StateMachineSimplifyCB);
-    A.registerSimplificationCallback(
-        IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo),
-        ModeSimplifyCB);
-    A.registerSimplificationCallback(
-        IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
-        ModeSimplifyCB);
+    constexpr const int InitRequiresFullRuntimeArgNo = 1;
+    constexpr const int DeinitRequiresFullRuntimeArgNo = 0;
     A.registerSimplificationCallback(
         IRPosition::callsite_argument(*KernelInitCB,
                                       InitRequiresFullRuntimeArgNo),
@@ -3030,11 +3105,10 @@
         IRPosition::callsite_argument(*KernelDeinitCB,
                                       DeinitRequiresFullRuntimeArgNo),
         IsGenericModeSimplifyCB);
+    //}
 
     // Check if we know we are in SPMD-mode already.
-    ConstantInt *ModeArg =
-        dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
-    if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
+    if (ExecModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)
       SPMDCompatibilityTracker.indicateOptimisticFixpoint();
     // This is a generic region but SPMDization is disabled so stop tracking.
     else if (DisableOpenMPOptSPMDization)
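With the callback registered above, any abstract attribute that asks the Attributor to simplify the kernel environment initializer is answered with the assumed KernelEnvC rather than the IR constant. A sketch of the consumer side (assuming the Attributor::getAssumedSimplified interface of this LLVM version):

bool UsedAssumedInformation = false;
Optional<Value *> SimplifiedV = A.getAssumedSimplified(
    IRPosition::value(*KernelEnvGV->getInitializer()), *this,
    UsedAssumedInformation);
// Yields KernelEnvC while the state is valid; the Attributor records a
// dependence so the consumer is re-evaluated if KernelEnvC changes.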
@@ -3335,35 +3409,28 @@
     //    kernel is executed in.
     assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
            "Initially non-SPMD kernel has SPMD exec mode!");
-    ExecMode->setInitializer(
+    auto *ExecModeC =
         ConstantInt::get(ExecMode->getInitializer()->getType(),
-                         ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
-
-    // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
-    const int InitModeArgNo = 1;
-    const int DeinitModeArgNo = 1;
-    const int InitUseStateMachineArgNo = 2;
-    const int InitRequiresFullRuntimeArgNo = 3;
-    const int DeinitRequiresFullRuntimeArgNo = 2;
-
+                         ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD);
+    ExecMode->setInitializer(ExecModeC);
+
+    // Next rewrite the kernel configuration to indicate we use SPMD-mode now.
+    GlobalVariable *KernelEnvGV = getKernelEnvironmentGlobalVariable();
+    KernelEnvGV->setInitializer(KernelEnvC);
+
+    // TODO: These are only needed for the old runtime and should be removed
+    // with it:
+    //{
+    const int InitRequiresFullRuntimeArgNo = 1;
+    const int DeinitRequiresFullRuntimeArgNo = 0;
     auto &Ctx = getAnchorValue().getContext();
-    A.changeUseAfterManifest(
-        KernelInitCB->getArgOperandUse(InitModeArgNo),
-        *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
-                                OMP_TGT_EXEC_MODE_SPMD));
-    A.changeUseAfterManifest(
-        KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
-        *ConstantInt::getBool(Ctx, false));
-    A.changeUseAfterManifest(
-        KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
-        *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
-                                OMP_TGT_EXEC_MODE_SPMD));
     A.changeUseAfterManifest(
         KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
         *ConstantInt::getBool(Ctx, false));
     A.changeUseAfterManifest(
         KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo),
         *ConstantInt::getBool(Ctx, false));
+    //}
 
     ++NumOpenMPTargetRegionKernelsSPMD;
 
@@ -3372,7 +3439,7 @@
     };
     A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark);
     return true;
-  };
+  }
 
   ChangeStatus buildCustomStateMachine(Attributor &A) {
     // If we have disabled state machine rewrites, don't make a custom one
@@ -3383,30 +3450,13 @@
     if (!ReachedKnownParallelRegions.isValidState())
       return ChangeStatus::UNCHANGED;
 
-    const int InitModeArgNo = 1;
-    const int InitUseStateMachineArgNo = 2;
-
-    // Check if the current configuration is non-SPMD and generic state machine.
-    // If we already have SPMD mode or a custom state machine we do not need to
-    // go any further. If it is anything but a constant something is weird and
-    // we give up.
-    ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
-        KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
-    ConstantInt *Mode =
-        dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
-
     // If we are stuck with generic mode, try to create a custom device (=GPU)
     // state machine which is specialized for the parallel regions that are
     // reachable by the kernel.
-    if (!UseStateMachine || UseStateMachine->isZero() || !Mode ||
-        (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
-      return ChangeStatus::UNCHANGED;
 
     // If not SPMD mode, indicate we use a custom state machine now.
-    auto &Ctx = getAnchorValue().getContext();
-    auto *FalseVal = ConstantInt::getBool(Ctx, false);
-    A.changeUseAfterManifest(
-        KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
+    GlobalVariable *KernelEnvGV = getKernelEnvironmentGlobalVariable();
+    KernelEnvGV->setInitializer(KernelEnvC);
 
     // If we don't actually need a state machine we are done here. This can
    // happen if there simply are no parallel regions. In the resulting kernel
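The updateImpl hunk below wraps the update in an RAII object so that KernelEnvC falls back to the values actually stored in the IR whenever a sub-state turns invalid. Stripped of the OpenMP specifics, the pattern is this sketch (hypothetical names):

// Restore-on-invalid RAII: reconcile an optimistic cache with a pessimistic
// source once the analysis collapses.
struct FallbackOnInvalidRAII {
  bool &StateIsValid;
  int &AssumedValue;     // optimistic cache (cf. KernelEnvC)
  const int &KnownValue; // pessimistic source (cf. the GV initializer)

  FallbackOnInvalidRAII(bool &Valid, int &Assumed, const int &Known)
      : StateIsValid(Valid), AssumedValue(Assumed), KnownValue(Known) {}
  ~FallbackOnInvalidRAII() {
    if (!StateIsValid)
      AssumedValue = KnownValue; // drop all optimistic assumptions
  }
};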
@@ -3488,6 +3538,7 @@
     Function *Kernel = getAssociatedFunction();
     assert(Kernel && "Expected an associated function!");
 
+    auto &Ctx = Kernel->getContext();
     BasicBlock *InitBB = KernelInitCB->getParent();
     BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
         KernelInitCB->getNextNode(), "thread.user_code.check");
@@ -3566,7 +3617,7 @@
                                    StateMachineBeginBB->end()),
         DLoc));
 
-    Value *Ident = KernelInitCB->getArgOperand(0);
+    Value *Ident = getKernelIdent(KernelEnvC);
     Value *GTid = KernelInitCB;
 
     FunctionCallee BarrierFn =
@@ -3690,6 +3741,38 @@
   ChangeStatus updateImpl(Attributor &A) override {
     KernelInfoState StateBefore = getState();
 
+    // When we leave this function, this RAII object makes sure the member
+    // KernelEnvC is updated properly depending on the state. That member is
+    // used for simplification of values and needs to be up to date at all
+    // times.
+    struct UpdateKernelEnvCRAII {
+      UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
+
+      ~UpdateKernelEnvCRAII() {
+        if (!AA.KernelEnvC)
+          return;
+        GlobalVariable *KernelEnvGV = AA.getKernelEnvironmentGlobalVariable();
+        if (!AA.isValidState()) {
+          AA.KernelEnvC = cast<ConstantStruct>(KernelEnvGV->getInitializer());
+          return;
+        }
+        if (!AA.ReachedKnownParallelRegions.isValidState()) {
+          AA.setKernelConfigurationUseGenericStateMachine(
+              AA.KernelEnvC,
+              AA.getKernelConfigurationUseGenericStateMachine(
+                  cast<ConstantStruct>(KernelEnvGV->getInitializer())));
+        }
+        if (!AA.SPMDCompatibilityTracker.isValidState()) {
+          AA.setKernelConfigurationExecMode(
+              AA.KernelEnvC,
+              AA.getKernelConfigurationExecMode(
                  cast<ConstantStruct>(KernelEnvGV->getInitializer())));
+        }
+      }
+
+      AAKernelInfoFunction &AA;
+    } RAII(*this);
+
     // Callback to check a read/write instruction.
     auto CheckRWInst = [&](Instruction &I) {
       // We handle calls later.
diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -1,30 +1,42 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
 ; RUN: opt < %s -S -passes=openmp-opt -openmp-opt-inline-device | FileCheck %s
 
+%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 }
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
+%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 }
+
 @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
 @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode = weak constant i8 1
+@__omp_offloading_2b_fc9fc2_foo_l2_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
 @llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode], section "llvm.metadata"
 @G = external global i8
 
 ; Function Attrs: convergent norecurse nounwind
+;.
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; CHECK: @[[__OMP_OFFLOADING_FD02_C0934FC2_FOO_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; CHECK: @[[__OMP_OFFLOADING_2B_FC9FC2_FOO_L2_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode], section "llvm.metadata" +; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8 +;. define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 { ; CHECK: Function Attrs: convergent norecurse nounwind ; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 false, i1 true) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2b_fc9fc2_foo_l2_kernel_info, i1 true) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: ; CHECK-NEXT: store i8 0, i8* @G, align 1 -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void ; entry: - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2b_fc9fc2_foo_l2_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -36,7 +48,7 @@ %isSPMD = call i8 @__kmpc_is_spmd_exec_mode() store i8 %isSPMD, i8* @G call void @bar() #2 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry @@ -45,9 +57,9 @@ declare i8 @__kmpc_is_spmd_exec_mode() -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) +declare void @__kmpc_target_deinit(i1) ; Function Attrs: convergent nounwind define hidden void @bar() #1 { @@ -77,3 +89,16 @@ !5 = !{i32 7, !"PIC Level", i32 2} !6 = !{i32 7, !"frame-pointer", i32 2} !7 = !{!"clang version 14.0.0"} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { alwaysinline convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx32,+sm_70" } +;. 
+; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0} +; CHECK: [[META1:![0-9]+]] = !{void ()* @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1} +; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"PIC Level", i32 2} +; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META7:![0-9]+]] = !{!"clang version 14.0.0"} +;. diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -120,17 +120,27 @@ ;; } %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @__omp_offloading_14_a36502b_no_state_machine_needed_l14_exec_mode = weak constant i8 1 +@__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @__omp_offloading_14_a36502b_simple_state_machine_l22_exec_mode = weak constant i8 1 +@__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_exec_mode = weak constant i8 1 +@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_exec_mode = weak constant i8 1 +@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_exec_mode = weak constant i8 1 +@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_exec_mode = weak constant i8 1 +@__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 
0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_exec_mode = weak constant i8 1 +@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info= global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_exec_mode = weak constant i8 1 +@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @G = external global i32, align 4 @3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @@ -141,7 +151,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -149,15 +159,13 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry ret void } -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) - define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { entry: %.global_tid..addr = alloca i32*, align 8 @@ -190,14 +198,12 @@ declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #3 -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) - define weak void @__omp_offloading_14_a36502b_simple_state_machine_l22() #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -205,7 +211,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry @@ -292,7 +298,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -300,7 +306,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry @@ -372,7 +378,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -380,7 +386,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry @@ -460,7 +466,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -468,7 +474,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__9(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry @@ -546,7 +552,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -554,7 +560,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__12(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry @@ -635,7 +641,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -643,7 +649,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__15(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry @@ -690,7 +696,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -698,7 +704,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__16(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry @@ -807,6 +813,9 @@ ret void } +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) + attributes #0 = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } attributes #1 = { convergent noinline nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } attributes #2 = { convergent "frame-pointer"="none" "llvm.assume"="omp_no_openmp" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } @@ -847,13 +856,13 @@ ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 false, i1 true) +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_info, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) #[[ATTR3:[0-9]+]] ; AMDGPU-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, 
i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -911,23 +920,17 @@ ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: is_worker_check: +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i1 true) ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU: worker_state_machine.finished: ; AMDGPU-NEXT: ret void ; AMDGPU: worker_state_machine.is_active.check: @@ -949,7 +952,7 @@ ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 @@ -958,7 +961,7 @@ ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-NEXT: call void 
@__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -1036,23 +1039,17 @@ ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: is_worker_check: +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i1 true) ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU: worker_state_machine.finished: ; AMDGPU-NEXT: ret void ; AMDGPU: worker_state_machine.is_active.check: @@ -1080,7 +1077,7 @@ ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-NEXT: br label 
[[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 @@ -1089,7 +1086,7 @@ ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -1187,23 +1184,17 @@ ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: is_worker_check: +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i1 true) ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU: worker_state_machine.finished: ; AMDGPU-NEXT: ret void ; AMDGPU: worker_state_machine.is_active.check: @@ -1227,7 +1218,7 @@ ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void 
@__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU: thread.user_code.check:
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -1236,7 +1227,7 @@
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; AMDGPU-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-NEXT: ret void
; AMDGPU: worker.exit:
; AMDGPU-NEXT: ret void
@@ -1313,23 +1304,17 @@
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; AMDGPU: is_worker_check:
+; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i1 true)
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU: worker_state_machine.begin:
-; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU: worker_state_machine.finished:
; AMDGPU-NEXT: ret void
; AMDGPU: worker_state_machine.is_active.check:
@@ -1351,7 +1336,7 @@
; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU: worker_state_machine.done.barrier:
-; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU: thread.user_code.check:
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -1360,7 +1345,7 @@
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; AMDGPU-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-NEXT: ret void
; AMDGPU: worker.exit:
; AMDGPU-NEXT: ret void
@@ -1437,23 +1422,17 @@
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; AMDGPU: is_worker_check:
+; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i1 true)
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU: worker_state_machine.begin:
-; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU: worker_state_machine.finished:
; AMDGPU-NEXT: ret void
; AMDGPU: worker_state_machine.is_active.check:
@@ -1475,7 +1454,7 @@
; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU: worker_state_machine.done.barrier:
-; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU: thread.user_code.check:
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -1484,7 +1463,7 @@
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; AMDGPU-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-NEXT: ret void
; AMDGPU: worker.exit:
; AMDGPU-NEXT: ret void
@@ -1561,23 +1540,17 @@
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; AMDGPU: is_worker_check:
+; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i1 true)
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU: worker_state_machine.begin:
-; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU: worker_state_machine.finished:
; AMDGPU-NEXT: ret void
; AMDGPU: worker_state_machine.is_active.check:
@@ -1595,7 +1568,7 @@
; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU: worker_state_machine.done.barrier:
-; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU: thread.user_code.check:
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -1603,7 +1576,7 @@
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-NEXT: ret void
; AMDGPU: worker.exit:
; AMDGPU-NEXT: ret void
@@ -1667,23 +1640,17 @@
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; AMDGPU: is_worker_check:
+; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i1 true)
; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; AMDGPU: worker_state_machine.begin:
-; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; AMDGPU: worker_state_machine.finished:
; AMDGPU-NEXT: ret void
; AMDGPU: worker_state_machine.is_active.check:
@@ -1695,7 +1662,7 @@
; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel()
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; AMDGPU: worker_state_machine.done.barrier:
-; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; AMDGPU: thread.user_code.check:
; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -1703,7 +1670,7 @@
; AMDGPU: user_code.entry:
; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-NEXT: ret void
; AMDGPU: worker.exit:
; AMDGPU-NEXT: ret void
@@ -1823,13 +1790,13 @@
; NVPTX-NEXT: entry:
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 false, i1 true)
+; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_info, i1 true)
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX: user_code.entry:
-; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) #[[ATTR3:[0-9]+]]
; NVPTX-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-NEXT: ret void
; NVPTX: worker.exit:
; NVPTX-NEXT: ret void
@@ -1887,22 +1854,16 @@
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; NVPTX: is_worker_check:
+; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i1 true)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: worker_state_machine.begin:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
@@ -1924,7 +1885,7 @@
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -1933,7 +1894,7 @@
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-NEXT: ret void
; NVPTX: worker.exit:
; NVPTX-NEXT: ret void
@@ -2011,22 +1972,16 @@
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; NVPTX: is_worker_check:
+; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i1 true)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: worker_state_machine.begin:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
@@ -2054,7 +2009,7 @@
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -2063,7 +2018,7 @@
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-NEXT: ret void
; NVPTX: worker.exit:
; NVPTX-NEXT: ret void
@@ -2161,22 +2116,16 @@
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; NVPTX: is_worker_check:
+; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i1 true)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: worker_state_machine.begin:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
@@ -2200,7 +2149,7 @@
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -2209,7 +2158,7 @@
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-NEXT: ret void
; NVPTX: worker.exit:
; NVPTX-NEXT: ret void
@@ -2286,22 +2235,16 @@
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; NVPTX: is_worker_check:
+; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i1 true)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: worker_state_machine.begin:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
@@ -2323,7 +2266,7 @@
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -2332,7 +2275,7 @@
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-NEXT: ret void
; NVPTX: worker.exit:
; NVPTX-NEXT: ret void
@@ -2409,22 +2352,16 @@
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; NVPTX: is_worker_check:
+; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i1 true)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: worker_state_machine.begin:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
@@ -2446,7 +2383,7 @@
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -2455,7 +2392,7 @@
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-NEXT: ret void
; NVPTX: worker.exit:
; NVPTX-NEXT: ret void
@@ -2532,22 +2469,16 @@
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; NVPTX: is_worker_check:
+; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i1 true)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: worker_state_machine.begin:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
@@ -2565,7 +2496,7 @@
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -2573,7 +2504,7 @@
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-NEXT: ret void
; NVPTX: worker.exit:
; NVPTX-NEXT: ret void
@@ -2637,22 +2568,16 @@
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
-; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
-; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
-; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
-; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
-; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
-; NVPTX: is_worker_check:
+; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i1 true)
; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; NVPTX: worker_state_machine.begin:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; NVPTX: worker_state_machine.finished:
; NVPTX-NEXT: ret void
; NVPTX: worker_state_machine.is_active.check:
@@ -2664,7 +2589,7 @@
; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; NVPTX: worker_state_machine.done.barrier:
-; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
+; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i32 0, i32 0), i32 [[TMP0]])
; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; NVPTX: thread.user_code.check:
; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
@@ -2672,7 +2597,7 @@
; NVPTX: user_code.entry:
; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-NEXT: ret void
; NVPTX: worker.exit:
; NVPTX-NEXT: ret void
@@ -2792,13 +2717,13 @@
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_info, i1 true)
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
-; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) #[[ATTR3:[0-9]+]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker.exit:
; AMDGPU-DISABLED-NEXT: ret void
@@ -2855,14 +2780,14 @@
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i1 true)
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker.exit:
; AMDGPU-DISABLED-NEXT: ret void
@@ -2939,14 +2864,14 @@
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i1 true)
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker.exit:
; AMDGPU-DISABLED-NEXT: ret void
@@ -3043,14 +2968,14 @@
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i1 true)
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker.exit:
; AMDGPU-DISABLED-NEXT: ret void
@@ -3126,14 +3051,14 @@
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i1 true)
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker.exit:
; AMDGPU-DISABLED-NEXT: ret void
@@ -3209,14 +3134,14 @@
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i1 true)
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker.exit:
; AMDGPU-DISABLED-NEXT: ret void
@@ -3292,13 +3217,13 @@
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i1 true)
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker.exit:
; AMDGPU-DISABLED-NEXT: ret void
@@ -3361,13 +3286,13 @@
; AMDGPU-DISABLED-NEXT: entry:
; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i1 true)
; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; AMDGPU-DISABLED: user_code.entry:
; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; AMDGPU-DISABLED-NEXT: ret void
; AMDGPU-DISABLED: worker.exit:
; AMDGPU-DISABLED-NEXT: ret void
@@ -3487,13 +3412,13 @@
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_info, i1 true)
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
-; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
+; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) #[[ATTR3:[0-9]+]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker.exit:
; NVPTX-DISABLED-NEXT: ret void
@@ -3550,14 +3475,14 @@
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_info, i1 true)
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker.exit:
; NVPTX-DISABLED-NEXT: ret void
@@ -3634,14 +3559,14 @@
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_info, i1 true)
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker.exit:
; NVPTX-DISABLED-NEXT: ret void
@@ -3738,14 +3663,14 @@
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55_kernel_info, i1 true)
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker.exit:
; NVPTX-DISABLED-NEXT: ret void
@@ -3821,14 +3746,14 @@
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66_kernel_info, i1 true)
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker.exit:
; NVPTX-DISABLED-NEXT: ret void
@@ -3904,14 +3829,14 @@
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_pure_l77_kernel_info, i1 true)
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker.exit:
; NVPTX-DISABLED-NEXT: ret void
@@ -3987,13 +3912,13 @@
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_info, i1 true)
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker.exit:
; NVPTX-DISABLED-NEXT: ret void
@@ -4056,13 +3981,13 @@
; NVPTX-DISABLED-NEXT: entry:
; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
+; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_info, i1 true)
; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; NVPTX-DISABLED: user_code.entry:
; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
; NVPTX-DISABLED-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
-; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true)
; NVPTX-DISABLED-NEXT: ret void
; NVPTX-DISABLED: worker.exit:
; NVPTX-DISABLED-NEXT: ret void
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
--- a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
@@ -36,6 +36,8 @@
;; }
%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 }
+%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 }
@0 = private unnamed_addr constant [113 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, align 8
@@ -44,6 +46,7 @@
@4 = private unnamed_addr constant [114 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;25;;\00", align 1
@5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([114 x i8], [114 x i8]* @4, i32 0, i32 0) }, align 8
@__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode = weak constant i8 1
+@__omp_offloading_2a_d80d3d_test_fallback_l11_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
@6 = private unnamed_addr constant [116 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;1;;\00", align 1
@7 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([116 x i8], [116 x i8]* @6, i32 0, i32 0) }, align 8
@8 = private unnamed_addr constant [85 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;test_no_fallback;20;1;;\00", align 1
@@ -51,6 +54,7 @@
@10 = private unnamed_addr constant [117 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;25;;\00", align 1
@11 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([117 x i8], [117 x i8]* @10, i32 0, i32 0) }, align 8
@__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1
+@__omp_offloading_2a_d80d3d_test_no_fallback_l20_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([116 x i8], [116 x i8]* @6, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
@12 = private unnamed_addr constant [73 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;known;4;1;;\00", align 1
@13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([73 x i8], [73 x i8]* @12, i32 0, i32 0) }, align 8
@G = external global i32
@@ -60,7 +64,7 @@
define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11() local_unnamed_addr #0 !dbg !15 {
entry:
%captured_vars_addrs.i.i = alloca [0 x i8*], align 8
- %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 true, i1 true) #3, !dbg !18
+ %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_d80d3d_test_fallback_l11_kernel_info, i1 true) #3, !dbg !18
%exec_user_code = icmp eq i32 %0, -1, !dbg !18
br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !18
@@ -77,12 +81,10 @@
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !23
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !26
call void @unknown() #6, !dbg !27
- call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i8 1, i1 true) #3, !dbg !28
+ call void @__kmpc_target_deinit(i1 true) #3, !dbg !28
br label %common.ret
}
-declare i32
@__kmpc_target_init(%struct.ident_t*, i8, i1, i1) local_unnamed_addr - ; Function Attrs: convergent declare void @unknown() local_unnamed_addr #1 @@ -99,13 +101,11 @@ ; Function Attrs: nounwind declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #3 -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) local_unnamed_addr - ; Function Attrs: norecurse nounwind define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20() local_unnamed_addr #4 !dbg !32 { entry: %captured_vars_addrs.i2.i = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i8 1, i1 true, i1 true) #3, !dbg !33 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_kernel_info, i1 true) #3, !dbg !33 %exec_user_code = icmp eq i32 %0, -1, !dbg !33 br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !33 @@ -130,7 +130,7 @@ call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45 call void @no_openmp() call void @no_parallelism() - call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i8 1, i1 true) #3, !dbg !46 + call void @__kmpc_target_deinit(i1 true) #3, !dbg !46 br label %common.ret } @@ -163,6 +163,9 @@ declare void @no_openmp() #7 declare void @no_parallelism() #8 +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) + attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" } diff --git a/llvm/test/Transforms/OpenMP/deduplication.ll b/llvm/test/Transforms/OpenMP/deduplication.ll --- a/llvm/test/Transforms/OpenMP/deduplication.ll +++ b/llvm/test/Transforms/OpenMP/deduplication.ll @@ -19,6 +19,7 @@ ; CHECK-DAG: @.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 ; CHECK-DAG: @.str1 = private unnamed_addr constant [23 x i8] c";file001;loc0001;0;0;;\00", align 1 ; CHECK-DAG: @.str2 = private unnamed_addr constant [23 x i8] c";file002;loc0002;0;0;;\00", align 1 +; CHECK-DAG: @3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8 ; UTC_ARGS: --enable diff --git a/llvm/test/Transforms/OpenMP/deduplication_remarks.ll b/llvm/test/Transforms/OpenMP/deduplication_remarks.ll --- a/llvm/test/Transforms/OpenMP/deduplication_remarks.ll +++ b/llvm/test/Transforms/OpenMP/deduplication_remarks.ll @@ -7,7 +7,7 @@ %struct.ident_t = type { i32, i32, i32, i32, i8* } -@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8 @.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 ; CHECK: remark: deduplication_remarks.c:7:10: OpenMP runtime call 
__kmpc_global_thread_num deduplicated diff --git a/llvm/test/Transforms/OpenMP/deduplication_target.ll b/llvm/test/Transforms/OpenMP/deduplication_target.ll --- a/llvm/test/Transforms/OpenMP/deduplication_target.ll +++ b/llvm/test/Transforms/OpenMP/deduplication_target.ll @@ -6,11 +6,14 @@ target triple = "nvptx64" %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @__omp_offloading_50_a3e09bf8_foo_l2_exec_mode = weak constant i8 0 +@__omp_offloading_50_a3e09bf8_foo_l2_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_50_a3e09bf8_foo_l2_exec_mode], section "llvm.metadata" declare void @use(i32) @@ -20,37 +23,36 @@ ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_50_a3e09bf8_foo_l2_kernel_info, i1 true) ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void ; entry: %captured_vars_addrs = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 2, i1 false, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_50_a3e09bf8_foo_l2_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 2, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void worker.exit: ; preds = %entry ret void } -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) - declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #1 -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) attributes #0 = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+ptx32,+sm_20" } attributes #1 = { nounwind } diff --git a/llvm/test/Transforms/OpenMP/fold_generic_main_thread.ll b/llvm/test/Transforms/OpenMP/fold_generic_main_thread.ll --- a/llvm/test/Transforms/OpenMP/fold_generic_main_thread.ll +++ b/llvm/test/Transforms/OpenMP/fold_generic_main_thread.ll @@ -4,13 +4,40 @@ ; ModuleID = 'single_threaded_exeuction.c' %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } @0 = private unnamed_addr constant [1 x i8] c"\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, align 8 +@kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } define void @kernel() { ; CHECK-LABEL: define {{[^@]+}}@kernel() { -; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i8 1, i1 false, i1 false) +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) +; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[CALL]], -1 +; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK: worker_state_machine.begin: +; CHECK-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @kernel_info, i32 0, i32 0), i32 [[CALL]]) +; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK: worker_state_machine.finished: +; CHECK-NEXT: ret void +; CHECK: worker_state_machine.is_active.check: +; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK: worker_state_machine.parallel_region.fallback.execute: +; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[CALL]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK: worker_state_machine.parallel_region.end: +; CHECK-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK: worker_state_machine.done.barrier: +; CHECK-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @kernel_info, i32 0, i32 0), i32 [[CALL]]) +; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK: thread.user_code.check: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], -1 ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: @@ -20,11 
+47,35 @@ ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@kernel() { -; CHECK-DISABLED-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i8 1, i1 false, i1 false) +; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 +; CHECK-DISABLED-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) +; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[CALL]], -1 +; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] +; CHECK-DISABLED: worker_state_machine.begin: +; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @kernel_info, i32 0, i32 0), i32 [[CALL]]) +; CHECK-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) +; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 +; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* +; CHECK-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null +; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK-DISABLED: worker_state_machine.finished: +; CHECK-DISABLED-NEXT: ret void +; CHECK-DISABLED: worker_state_machine.is_active.check: +; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] +; CHECK-DISABLED: worker_state_machine.parallel_region.fallback.execute: +; CHECK-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[CALL]]) +; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] +; CHECK-DISABLED: worker_state_machine.parallel_region.end: +; CHECK-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() +; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] +; CHECK-DISABLED: worker_state_machine.done.barrier: +; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @kernel_info, i32 0, i32 0), i32 [[CALL]]) +; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] +; CHECK-DISABLED: thread.user_code.check: ; CHECK-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], -1 ; CHECK-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK-DISABLED: if.then: @@ -34,10 +85,10 @@ ; CHECK-DISABLED-NEXT: call void @bar() ; CHECK-DISABLED-NEXT: br label [[IF_END]] ; CHECK-DISABLED: if.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) +; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-DISABLED-NEXT: ret void ; - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 false, i1 false) + %call = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, 
label %if.else if.then: @@ -47,7 +98,7 @@ call void @bar() br label %if.end if.end: - call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void } @@ -135,9 +186,8 @@ declare i32 @__kmpc_get_hardware_thread_id() -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) - -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll +++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll @@ -3,60 +3,70 @@ target triple = "nvptx64" %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } +@0 = private unnamed_addr constant [1 x i8] c"\00", align 1 @kernel0_exec_mode = weak constant i8 1 +@kernel0_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @G = external global i32 ;. +; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [1 x i8] zeroinitializer, align 1 ; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; CHECK: @[[KERNEL0_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32 ; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; CHECK: @[[KERNEL1_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } ; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; CHECK: @[[KERNEL2_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB1]], i32 0, i32 0) }, align 8 ;. 
define weak void @kernel0() #0 {
; CHECK-LABEL: define {{[^@]+}}@kernel0
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel0_info, i1 false)
; CHECK-NEXT: call void @helper0() #[[ATTR1:[0-9]+]]
; CHECK-NEXT: call void @helper1() #[[ATTR1]]
; CHECK-NEXT: call void @helper2() #[[ATTR1]]
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false)
; CHECK-NEXT: ret void
;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+  %call = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel0_info, i1 true)
  call void @helper0()
  call void @helper1()
  call void @helper2()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+  call void @__kmpc_target_deinit(i1 true)
  ret void
}
@kernel1_exec_mode = weak constant i8 1
+@kernel1_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
define weak void @kernel1() #0 {
; CHECK-LABEL: define {{[^@]+}}@kernel1
; CHECK-SAME: () #[[ATTR0]] {
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel1_info, i1 false)
; CHECK-NEXT: call void @helper1() #[[ATTR1]]
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false)
; CHECK-NEXT: ret void
;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+  %call = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel1_info, i1 true)
  call void @helper1()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+  call void @__kmpc_target_deinit(i1 true)
  ret void
}
@kernel2_exec_mode = weak constant i8 1
+@kernel2_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
define weak void @kernel2() #0 {
; CHECK-LABEL: define {{[^@]+}}@kernel2
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel2_info, i1 false)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; CHECK: common.ret:
@@ -68,12 +78,12 @@
; CHECK-NEXT: call void @helper1() #[[ATTR1]]
; CHECK-NEXT: call void @helper2() #[[ATTR1]]
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0)
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false)
; CHECK-NEXT: ret void
;
entry:
  %captured_vars_addrs = alloca [0 x i8*], align 8
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 true, i1 true)
+  %i = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel2_info, i1 true)
  %exec_user_code = icmp eq i32 %i, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret
@@ -87,7 +97,7 @@
  call void @helper1()
  call void @helper2()
  call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0)
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true)
+  call void @__kmpc_target_deinit(i1 true)
  ret void
}
@@ -105,7 +115,7 @@
; CHECK: region.guarded.end:
; CHECK-NEXT: br label [[REGION_BARRIER]]
; CHECK: region.barrier:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
; CHECK-NEXT: br label [[REGION_EXIT:%.*]]
; CHECK: region.exit:
; CHECK-NEXT: ret void
@@ -148,7 +158,7 @@
; CHECK: region.guarded.end:
; CHECK-NEXT: br label [[REGION_BARRIER]]
; CHECK: region.barrier:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]])
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
; CHECK-NEXT: br label [[REGION_EXIT:%.*]]
; CHECK: region.exit:
; CHECK-NEXT: ret void
@@ -179,11 +189,11 @@
}
declare i32 @__kmpc_get_hardware_num_threads_in_block()
-declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1 zeroext, i1 zeroext) #1
-declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i8, i1 zeroext) #1
declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
+declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1)
+declare void @__kmpc_target_deinit(i1)
!llvm.module.flags = !{!0, !1}
!nvvm.annotations = !{!2, !3, !4}
diff --git a/llvm/test/Transforms/OpenMP/global_constructor.ll b/llvm/test/Transforms/OpenMP/global_constructor.ll
--- a/llvm/test/Transforms/OpenMP/global_constructor.ll
+++ b/llvm/test/Transforms/OpenMP/global_constructor.ll
@@ -2,15 +2,18 @@
; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 }
+%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 }
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
@_ZL6Device = internal global double 0.000000e+00, align 8
-@__omp_offloading_fd02_85283c04_main_l11_exec_mode = weak constant i8 0
+@__omp_offloading_fd02_85283c04_main_l11_exec_mode = weak constant i8 1
+@kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
define weak void @__omp_offloading_fd02_85283c04_main_l11(double* nonnull align 8 dereferenceable(8) %X) local_unnamed_addr {
entry:
-  %0 = tail call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 2, i1 false, i1 false) #0
+  %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 false)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret
@@ -29,13 +32,12 @@
region.barrier:
  tail call void @__kmpc_barrier_simple_spmd(%struct.ident_t* nonnull @1, i32 %2)
-  tail call void @__kmpc_target_deinit(%struct.ident_t* nonnull @1, i8 2, i1 false) #0
+  call void @__kmpc_target_deinit(i1 false)
  br label %common.ret
}
-declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) local_unnamed_addr
-
-declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) local_unnamed_addr
+declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1)
+declare void @__kmpc_target_deinit(i1)
define internal void @__omp_offloading__fd02_85283c04_Device_l6_ctor() {
entry:
@@ -78,21 +80,21 @@
; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_85283c04_main_l11
; CHECK-SAME: (double* nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 false)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; CHECK: common.ret:
; CHECK-NEXT: ret void
; CHECK: user_code.entry:
; CHECK-NEXT: [[TMP1:%.*]] = load double, double* @_ZL6Device, align 8, !tbaa [[TBAA11:![0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #[[ATTR1]]
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #[[ATTR1:[0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
; CHECK: region.guarded:
; CHECK-NEXT: store double [[TMP1]], double* [[X]], align 8, !tbaa [[TBAA11]]
; CHECK-NEXT: br label [[REGION_BARRIER]]
; CHECK: region.barrier:
-; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(%struct.ident_t* nonnull @[[GLOB1]], i32 [[TMP2]]) #[[ATTR1]]
-; CHECK-NEXT: tail call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR1]]
+; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i32 [[TMP2]]) #[[ATTR1]]
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false)
; CHECK-NEXT: br label [[COMMON_RET]]
;
diff --git a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
--- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
+++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
@@ -36,10 +36,13 @@
%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 }
+%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 }
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
@__omp_offloading_10301_87b2c_foo_l7_exec_mode = weak constant i8 1
+@__omp_offloading_10301_87b2c_foo_l7_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_10301_87b2c_foo_l7_exec_mode], section "llvm.metadata"
@@ -48,7 +51,7 @@
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  store i32 0, i32* %.zero.addr, align 4
-  %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true)
+  %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_10301_87b2c_foo_l7_kernel_info, i1 true)
  %exec_user_code = icmp eq i32 %0, -1
  br i1 %exec_user_code, label %user_code.entry, label %worker.exit
@@ -56,14 +59,16 @@
  %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
  store i32 %1, i32* %.threadid_temp., align 4
  call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr)
-  call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
+  call void @__kmpc_target_deinit(i1 true)
  ret void
worker.exit: ; preds = %entry
  ret void
}
-declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1)
+declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1)
+declare void @__kmpc_target_deinit(i1)
+
declare void @unknown()
define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
@@ -146,8 +151,6 @@
declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
-declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1)
-
define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
entry:
  %.global_tid..addr = alloca i32*, align 8
diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
--- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
+++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll
@@ -3,36 +3,48 @@
target triple = "nvptx64"
%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 }
+%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 }
-@is_spmd_exec_mode = weak constant i8 0
+@0 = private unnamed_addr constant [1 x i8] c"\00", align 1
+@is_spmd_exec_mode = weak constant i8 2
+@is_spmd_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 2 }, i16 0 }
@will_be_spmd_exec_mode = weak constant i8 1
+@will_be_spmd_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
@non_spmd_exec_mode = weak constant i8 1
+@non_spmd_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
@will_not_be_spmd_exec_mode = weak constant i8 1
+@will_not_be_spmd_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
@G = external global i8
@llvm.compiler.used = appending global [4 x i8*] [i8* @is_spmd_exec_mode, i8* @will_be_spmd_exec_mode, i8* @non_spmd_exec_mode, i8* @will_not_be_spmd_exec_mode ], section "llvm.metadata"
;.
-; CHECK: @[[IS_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+; CHECK: @[[IS_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[IS_SPMD_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 2 }, i16 0 }
; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
+; CHECK: @[[WILL_BE_SPMD_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 }
; CHECK: @[[NON_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[NON_SPMD_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 }
; CHECK: @[[WILL_NOT_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[WILL_NOT_BE_SPMD_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 }
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [4 x i8*] [i8* @is_spmd_exec_mode, i8* @will_be_spmd_exec_mode, i8* @non_spmd_exec_mode, i8* @will_not_be_spmd_exec_mode], section "llvm.metadata"
;.
define weak void @is_spmd() {
; CHECK-LABEL: define {{[^@]+}}@is_spmd() {
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @is_spmd_kernel_info, i1 false)
; CHECK-NEXT: call void @is_spmd_helper1()
; CHECK-NEXT: call void @is_spmd_helper2()
; CHECK-NEXT: call void @is_mixed_helper()
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false)
; CHECK-NEXT: ret void
;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @is_spmd_kernel_info, i1 false)
  call void @is_spmd_helper1()
  call void @is_spmd_helper2()
  call void @is_mixed_helper()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+  call void @__kmpc_target_deinit(i1 false)
  ret void
}
@@ -40,7 +52,7 @@
; CHECK-LABEL: define {{[^@]+}}@will_be_spmd() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @will_be_spmd_kernel_info, i1 false)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
; CHECK: common.ret:
@@ -50,12 +62,12 @@
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @is_spmd_helper2()
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0)
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false)
; CHECK-NEXT: ret void
;
entry:
  %captured_vars_addrs = alloca [0 x i8*], align 8
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 true, i1 true)
+  %i = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @will_be_spmd_kernel_info, i1 false)
  %exec_user_code = icmp eq i32 %i, -1
  br i1 %exec_user_code, label %user_code.entry, label %common.ret
@@ -67,41 +79,89 @@
  %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
  call void @is_spmd_helper2()
  call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0)
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true)
+  call void @__kmpc_target_deinit(i1 false)
  ret void
}
define weak void @non_spmd() {
; CHECK-LABEL: define {{[^@]+}}@non_spmd() {
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
+; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @non_spmd_kernel_info, i1 false)
+; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 undef, -1
+; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
+; CHECK: worker_state_machine.begin:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @non_spmd_kernel_info, i32 0, i32 0), i32 undef)
+; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
+; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
+; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
+; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
+; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; CHECK: worker_state_machine.finished:
+; CHECK-NEXT: ret void
+; CHECK: worker_state_machine.is_active.check:
+; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
+; CHECK: worker_state_machine.parallel_region.fallback.execute:
+; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 undef)
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
+; CHECK: worker_state_machine.parallel_region.end:
+; CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
+; CHECK: worker_state_machine.done.barrier:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @non_spmd_kernel_info, i32 0, i32 0), i32 undef)
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
+; CHECK: thread.user_code.check:
; CHECK-NEXT: call void @is_generic_helper1()
; CHECK-NEXT: call void @is_generic_helper2()
; CHECK-NEXT: call void @is_mixed_helper()
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false)
; CHECK-NEXT: ret void
;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @non_spmd_kernel_info, i1 false)
  call void @is_generic_helper1()
  call void @is_generic_helper2()
  call void @is_mixed_helper()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+  call void @__kmpc_target_deinit(i1 false)
  ret void
}
define weak void @will_not_be_spmd() {
; CHECK-LABEL: define {{[^@]+}}@will_not_be_spmd() {
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
+; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @will_not_be_spmd_kernel_info, i1 false)
+; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 undef, -1
+; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
+; CHECK: worker_state_machine.begin:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @will_not_be_spmd_kernel_info, i32 0, i32 0), i32 undef)
+; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
+; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
+; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
+; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
+; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; CHECK: worker_state_machine.finished:
+; CHECK-NEXT: ret void
+; CHECK: worker_state_machine.is_active.check:
+; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
+; CHECK: worker_state_machine.parallel_region.fallback.execute:
+; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 undef)
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
+; CHECK: worker_state_machine.parallel_region.end:
+; CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
+; CHECK: worker_state_machine.done.barrier:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @will_not_be_spmd_kernel_info, i32 0, i32 0), i32 undef)
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
+; CHECK: thread.user_code.check:
; CHECK-NEXT: call void @is_generic_helper1()
; CHECK-NEXT: call void @is_generic_helper2()
; CHECK-NEXT: call void @is_mixed_helper()
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false)
; CHECK-NEXT: ret void
;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @will_not_be_spmd_kernel_info, i1 false)
  call void @is_generic_helper1()
  call void @is_generic_helper2()
  call void @is_mixed_helper()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+  call void @__kmpc_target_deinit(i1 false)
  ret void
}
@@ -197,8 +257,8 @@
declare void @spmd_compatible() "llvm.assume"="ompx_spmd_amenable"
declare i8 @__kmpc_is_spmd_exec_mode()
-declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1 zeroext, i1 zeroext)
-declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i8, i1 zeroext)
+declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1)
+declare void @__kmpc_target_deinit(i1)
declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
declare void @foo()
@@ -217,6 +277,7 @@
; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { alwaysinline }
; CHECK: attributes #[[ATTR2]] = { nounwind }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
diff --git a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll
--- a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll
+++ b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll
@@ -3,62 +3,96 @@
target triple = "nvptx64"
%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 }
+%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 }
+@0 = private unnamed_addr constant [1 x i8] c"\00", align 1
@no_spmd_exec_mode = weak constant i8 1
-@spmd_exec_mode = weak constant i8 0
-@parallel_exec_mode = weak constant i8 0
+@no_spmd_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
+@spmd_exec_mode = weak constant i8 2
+@spmd_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 2 }, i16 0 }
+@parallel_exec_mode = weak constant i8 2
+@parallel_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 2 }, i16 0 }
@G = external global i8
@llvm.compiler.used = appending global [3 x i8*] [i8* @no_spmd_exec_mode, i8* @spmd_exec_mode, i8* @parallel_exec_mode], section "llvm.metadata"
;.
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
; CHECK: @[[NO_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
-; CHECK: @[[SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
-; CHECK: @[[PARALLEL_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[NO_SPMD_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 }
+; CHECK: @[[SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[SPMD_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 2 }, i16 0 }
+; CHECK: @[[PARALLEL_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
+; CHECK: @[[PARALLEL_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 2 }, i16 0 }
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [3 x i8*] [i8* @no_spmd_exec_mode, i8* @spmd_exec_mode, i8* @parallel_exec_mode], section "llvm.metadata"
;.
define weak void @none_spmd() {
; CHECK-LABEL: define {{[^@]+}}@none_spmd() {
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
+; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @no_spmd_kernel_info, i1 true)
+; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 undef, -1
+; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
+; CHECK: worker_state_machine.begin:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @no_spmd_kernel_info, i32 0, i32 0), i32 undef)
+; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
+; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
+; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
+; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
+; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
+; CHECK: worker_state_machine.finished:
+; CHECK-NEXT: ret void
+; CHECK: worker_state_machine.is_active.check:
+; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
+; CHECK: worker_state_machine.parallel_region.fallback.execute:
+; CHECK-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 undef)
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
+; CHECK: worker_state_machine.parallel_region.end:
+; CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
+; CHECK: worker_state_machine.done.barrier:
+; CHECK-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @no_spmd_kernel_info, i32 0, i32 0), i32 undef)
+; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
+; CHECK: thread.user_code.check:
; CHECK-NEXT: call void @none_spmd_helper()
; CHECK-NEXT: call void @mixed_helper()
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true)
; CHECK-NEXT: ret void
;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @no_spmd_kernel_info, i1 true)
  call void @none_spmd_helper()
  call void @mixed_helper()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
+  call void @__kmpc_target_deinit(i1 true)
  ret void
}
define weak void @spmd() {
; CHECK-LABEL: define {{[^@]+}}@spmd() {
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @spmd_kernel_info, i1 true)
; CHECK-NEXT: call void @spmd_helper()
; CHECK-NEXT: call void @mixed_helper()
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true)
; CHECK-NEXT: ret void
;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @spmd_kernel_info, i1 true)
  call void @spmd_helper()
  call void @mixed_helper()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+  call void @__kmpc_target_deinit(i1 true)
  ret void
}
define weak void @parallel() {
; CHECK-LABEL: define {{[^@]+}}@parallel() {
-; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* align 4294967296 null, i8 2, i1 false, i1 false)
+; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @parallel_kernel_info, i1 true)
; CHECK-NEXT: call void @spmd_helper()
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noalias noundef align 4294967296 null, i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 0, i8* noalias noundef align 4294967296 null, i8* noalias noundef align 4294967296 null, i8** noalias noundef align 4294967296 null, i64 noundef 0)
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true)
; CHECK-NEXT: ret void
;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @parallel_kernel_info, i1 true)
  call void @spmd_helper()
  call void @__kmpc_parallel_51(%struct.ident_t* null, i32 0, i32 0, i32 0, i32 0, i8* null, i8* null, i8** null, i64 0)
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+  call void @__kmpc_target_deinit(i1 true)
  ret void
}
@@ -130,8 +164,8 @@
declare void @foo()
declare void @bar()
declare i8 @__kmpc_parallel_level()
-declare i32 @__kmpc_target_init(%struct.ident_t*, i8 zeroext, i1 zeroext, i1 zeroext) #1
-declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i8 zeroext, i1 zeroext) #1
+declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1)
+declare void @__kmpc_target_deinit(i1)
!llvm.module.flags = !{!0, !1}
!nvvm.annotations = !{!2, !3, !4}
@@ -143,6 +177,7 @@
!4 = !{void ()* @parallel, !"kernel", i32 1}
;.
; CHECK: attributes #[[ATTR0]] = { alwaysinline }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll
--- a/llvm/test/Transforms/OpenMP/remove_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll
@@ -17,40 +17,49 @@
@S = external local_unnamed_addr global i8*
%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 }
+%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 }
-declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1)
-declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1)
+@0 = private unnamed_addr constant [1 x i8] c"\00", align 1
+@kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 }
+
+declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1)
+declare void @__kmpc_target_deinit(i1)
;.
; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global i8* +; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [1 x i8] zeroinitializer, align 1 +; CHECK: @[[KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } ;. ; CHECK-DISABLED: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global i8* +; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [1 x i8] zeroinitializer, align 1 +; CHECK-DISABLED: @[[KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } ;. define void @kernel() { ; CHECK-LABEL: define {{[^@]+}}@kernel() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i8 1, i1 false, i1 true) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) ; CHECK-NEXT: call void @foo() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: call void @bar() #[[ATTR4]] -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR3:[0-9]+]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i8 1, i1 true) +; CHECK-NEXT: call void @unknown_no_openmp() +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@kernel() { ; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i8 1, i1 false, i1 true) +; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) ; CHECK-DISABLED-NEXT: call void @foo() #[[ATTR4:[0-9]+]] ; CHECK-DISABLED-NEXT: call void @bar() #[[ATTR4]] -; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() #[[ATTR3:[0-9]+]] -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i8 1, i1 true) +; CHECK-DISABLED-NEXT: call void @unknown_no_openmp() +; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-DISABLED-NEXT: ret void ; entry: - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i8 1, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) call void @foo() call void @bar() call void @unknown_no_openmp() - call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void } @@ -183,14 +192,14 @@ ; CHECK: attributes #[[ATTR0]] = { nosync nounwind } ; CHECK: attributes #[[ATTR1]] = { nofree nosync nounwind readnone willreturn } ; CHECK: attributes #[[ATTR2]] = { nofree nosync nounwind willreturn writeonly } -; CHECK: attributes #[[ATTR3]] = { "llvm.assume"="omp_no_openmp" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } ; CHECK: attributes #[[ATTR4]] = { nounwind } ; CHECK: attributes #[[ATTR5]] = { nosync nounwind writeonly } ;. 
; CHECK-DISABLED: attributes #[[ATTR0]] = { nosync nounwind } ; CHECK-DISABLED: attributes #[[ATTR1]] = { nofree nosync nounwind readnone willreturn } ; CHECK-DISABLED: attributes #[[ATTR2]] = { nofree nosync nounwind willreturn writeonly } -; CHECK-DISABLED: attributes #[[ATTR3]] = { "llvm.assume"="omp_no_openmp" } +; CHECK-DISABLED: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } ; CHECK-DISABLED: attributes #[[ATTR4]] = { nounwind } ; CHECK-DISABLED: attributes #[[ATTR5]] = { nosync nounwind writeonly } ;. diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -11,32 +11,35 @@ ; UTC_ARGS: --enable %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } @S = external local_unnamed_addr global i8* @0 = private unnamed_addr constant [113 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, align 8 +@kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } define dso_local void @foo() { entry: - %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) - %x = call align 4 i8* @__kmpc_alloc_shared(i64 4) + %c = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) + %x = call i8* @__kmpc_alloc_shared(i64 4) call void @unknown_no_openmp() %x_on_stack = bitcast i8* %x to i32* %0 = bitcast i32* %x_on_stack to i8* call void @use(i8* %0) call void @__kmpc_free_shared(i8* %x, i64 4) - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void } define void @bar() { - %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + %c = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) call void @unknown_no_openmp() %cmp = icmp eq i32 %c, -1 br i1 %cmp, label %master1, label %exit master1: - %x = call align 4 i8* @__kmpc_alloc_shared(i64 16), !dbg !11 + %x = call i8* @__kmpc_alloc_shared(i64 16), !dbg !11 %x_on_stack = bitcast i8* %x to [4 x i32]* %a0 = bitcast [4 x i32]* %x_on_stack to i8* call void @use(i8* %a0) @@ -47,31 +50,31 @@ %b0 = icmp eq i32 %c, -1 br i1 %b0, label %master2, label %exit master2: - %y = call align 4 i8* @__kmpc_alloc_shared(i64 4), !dbg !12 + %y = call i8* @__kmpc_alloc_shared(i64 4), !dbg !12 %y_on_stack = bitcast i8* %y to [4 x i32]* %b1 = bitcast [4 x i32]* %y_on_stack to i8* call void @use(i8* %b1) call void @__kmpc_free_shared(i8* %y, i64 4) br label %exit exit: - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void } define void @baz_spmd() { - %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 2, i1 true, i1 true) + %c = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) call void @unknown_no_openmp() %c0 
= icmp eq i32 %c, -1 br i1 %c0, label %master3, label %exit master3: - %z = call align 4 i8* @__kmpc_alloc_shared(i64 24), !dbg !12 + %z = call i8* @__kmpc_alloc_shared(i64 24), !dbg !12 %z_on_stack = bitcast i8* %z to [6 x i32]* %c1 = bitcast [6 x i32]* %z_on_stack to i8* call void @use(i8* %c1) call void @__kmpc_free_shared(i8* %z, i64 24) br label %exit exit: - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 2, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void } @@ -91,9 +94,8 @@ declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) - -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" @@ -119,57 +121,57 @@ ; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global i8* ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [113 x i8] c" ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [16 x i8] undef, align 4 -; CHECK: @[[Y:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 +; CHECK: @[[KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = external global %"struct._OMP::KernelEnvironmentTy" +; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [16 x i8] undef, align 32 +; CHECK: @[[Y:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; CHECK: @[[Z:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [24 x i8] undef, align 32 ;. 
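; For reference, a sketch of the rewrite the @x/@y/@z check lines above stand
; for (illustrative; @shared_buf and @use_shared_buf are invented names): a
; guarded __kmpc_alloc_shared/__kmpc_free_shared pair of constant size is
; replaced by a module-level addrspace(3) array, so the shared-memory buffer
; is addressed directly instead of going through the runtime.

@shared_buf = internal addrspace(3) global [16 x i8] undef, align 32

define internal i8* @use_shared_buf() {
  ; Before the pass: %p = call i8* @__kmpc_alloc_shared(i64 16)
  ;                  ... call void @__kmpc_free_shared(i8* %p, i64 16)
  %p = addrspacecast i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @shared_buf, i32 0, i32 0) to i8*
  ret i8* %p
}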
; CHECK-LABEL: define {{[^@]+}}@foo() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; CHECK-NEXT: [[X:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 4) #[[ATTR5:[0-9]+]] -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR4:[0-9]+]] -; CHECK-NEXT: call void @use.internalized(i8* nofree writeonly align 4 [[X]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) +; CHECK-NEXT: [[X:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: call void @unknown_no_openmp() +; CHECK-NEXT: call void @use.internalized(i8* nofree writeonly [[X]]) #[[ATTR6:[0-9]+]] ; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[X]], i64 4) #[[ATTR5]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bar() { -; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR4]] +; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) +; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[C]], -1 ; CHECK-NEXT: br i1 [[CMP]], label [[MASTER1:%.*]], label [[EXIT:%.*]] ; CHECK: master1: -; CHECK-NEXT: call void @use.internalized(i8* nofree align 4 addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*)) #[[ATTR6]] +; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*)) #[[ATTR6]] ; CHECK-NEXT: br label [[NEXT:%.*]] ; CHECK: next: -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR4]] +; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: br label [[MASTER2:%.*]] ; CHECK: master2: -; CHECK-NEXT: call void @use.internalized(i8* nofree align 4 addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y, i32 0, i32 0) to i8*)) #[[ATTR6]] +; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @y, i32 0, i32 0) to i8*)) #[[ATTR6]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@baz_spmd() { -; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 true, i1 true) -; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR4]] +; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) +; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: [[C0:%.*]] = icmp eq i32 [[C]], -1 ; CHECK-NEXT: br i1 [[C0]], label [[MASTER3:%.*]], label [[EXIT:%.*]] ; CHECK: master3: -; CHECK-NEXT: [[Z:%.*]] = call align 4 i8* @__kmpc_alloc_shared(i64 24) #[[ATTR5]], !dbg [[DBG9:![0-9]+]] -; CHECK-NEXT: call void @use.internalized(i8* nofree align 4 [[Z]]) #[[ATTR6]] -; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[Z]], i64 24) #[[ATTR5]] +; CHECK-NEXT: call void @use.internalized(i8* nofree addrspacecast (i8 addrspace(3)* 
getelementptr inbounds ([24 x i8], [24 x i8] addrspace(3)* @z, i32 0, i32 0) to i8*)) #[[ATTR6]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 true) ; CHECK-NEXT: ret void ; ; ; CHECK: Function Attrs: nofree nosync nounwind willreturn writeonly ; CHECK-LABEL: define {{[^@]+}}@use.internalized -; CHECK-SAME: (i8* nofree writeonly align 4 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: store i8* [[X]], i8** @S, align 8 ; CHECK-NEXT: ret void @@ -186,7 +188,7 @@ ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nosync nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind readnone speculatable } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nofree nosync nounwind readnone speculatable willreturn } -; CHECK: attributes #[[ATTR4]] = { "llvm.assume"="omp_no_openmp" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { "llvm.assume"="omp_no_openmp" } ; CHECK: attributes #[[ATTR5]] = { nounwind } ; CHECK: attributes #[[ATTR6]] = { nounwind writeonly } ;. @@ -199,7 +201,4 @@ ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ; CHECK: [[META7:![0-9]+]] = !{void ()* @foo, !"kernel", i32 1} ; CHECK: [[META8:![0-9]+]] = !{void ()* @bar, !"kernel", i32 1} -; CHECK: [[DBG9]] = !DILocation(line: 5, column: 14, scope: !10) -; CHECK: [[META10:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -; CHECK: [[META11:![0-9]+]] = !DISubroutineType(types: !2) ;. diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll --- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -4,9 +4,12 @@ ; ModuleID = 'single_threaded_exeuction.c' %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } @0 = private unnamed_addr constant [1 x i8] c"\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, align 8 +@kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } ; CHECK-NOT: [openmp-opt] Basic block @kernel entry is executed by a single thread. @@ -14,7 +17,7 @@ ; CHECK-NOT: [openmp-opt] Basic block @kernel if.else is executed by a single thread. ; CHECK-NOT: [openmp-opt] Basic block @kernel if.end is executed by a single thread. 
define void @kernel() { - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 false, i1 false) + %call = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @kernel_info, i1 true) %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, label %if.else if.then: @@ -22,7 +25,7 @@ if.else: br label %if.end if.end: - call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) + call void @__kmpc_target_deinit(i1 true) ret void } @@ -105,9 +108,8 @@ declare void @__kmpc_kernel_prepare_parallel(i8*) -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) - -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) attributes #0 = { cold noinline } diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -75,500 +75,606 @@ ;; unknown(); ;; } ;; } -;; -;; void do_not_spmdize_task() { -;; #pragma omp target -;; { -;; #pragma omp task -;; spmd_amenable(); -;; #pragma omp parallel -;; unknown(); -;; } -;; } +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } %struct.ident_t = type { i32, i32, i32, i32, i8* } -%struct.kmp_task_t_with_privates = type { %struct.kmp_task_t } -%struct.kmp_task_t = type { i8*, i32 (i32, i8*)*, i32, %union.kmp_cmplrdata_t, %union.kmp_cmplrdata_t } -%union.kmp_cmplrdata_t = type { i32 (i32, i8*)* } -%struct.anon = type {} +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 -@__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode = weak constant i8 1 -@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode = weak constant i8 1 -@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode = weak constant i8 1 -@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode = weak constant i8 1 -@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode = weak constant i8 1 -@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode = weak constant i8 1 -@llvm.compiler.used = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" +@__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode = weak constant i8 1 +@__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } +@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode = weak constant i8 1 
+@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } +@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode = weak constant i8 1 +@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } +@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode = weak constant i8 1 +@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } +@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode = weak constant i8 1 +@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } +@llvm.compiler.used = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" + -; Function Attrs: alwaysinline convergent norecurse nounwind ;. 
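; The CHECK blocks that follow encode the expected transformation per prefix.
; In short (the byte meanings are an inference from how the constants pair
; with the *_exec_mode globals; names below are invented for illustration):
; SPMD-amenable kernels have their exec-mode global flipped from generic (1)
; to generic-SPMD (3) and their configuration rewritten to match, while
; do_not_spmdize_target stays in generic mode.

; Spmdized kernel, as checked by the AMDGPU/NVPTX prefixes:
@spmdized_example_exec_mode = weak constant i8 3
@spmdized_example_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* null }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 }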
; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; AMDGPU: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; AMDGPU: 
@[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; AMDGPU: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" ; AMDGPU: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 -; AMDGPU: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 -; AMDGPU: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; AMDGPU: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 ;. ; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; NVPTX: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds 
([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; NVPTX: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" ; NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 -; NVPTX: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 -; NVPTX: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef +; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; NVPTX: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 ;. 
; AMDGPU-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; AMDGPU-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; AMDGPU-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" -; AMDGPU-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 -; AMDGPU-DISABLED: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: 
@[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; AMDGPU-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" +; AMDGPU-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; AMDGPU-DISABLED: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. 
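; Note on the -DISABLED prefixes (an inference from the checks above): with
; SPMD-ization disabled the exec-mode globals stay at 1; only the
; state-machine byte of the configuration drops from 1 to 0, since the pass
; still replaces the runtime's generic state machine with a custom one that
; dispatches the __omp_outlined__*_wrapper.ID work functions kept above.
; Illustrative constant under those assumptions:
@disabled_example_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* null }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 }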
; NVPTX-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; NVPTX-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; NVPTX-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata" -; NVPTX-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 -; NVPTX-DISABLED: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: 
@[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; NVPTX-DISABLED: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } +; NVPTX-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata" +; NVPTX-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 +; NVPTX-DISABLED: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32 ; NVPTX-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; NVPTX-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; NVPTX-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ; NVPTX-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. 
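; Skeleton of the worker state machine whose shape the function checks below
; verify (a sketch; block and value names mirror the checks, the function name
; @state_machine_sketch is invented, and the type and __kmpc_target_*
; declarations are as earlier in this file): workers park in a barrier loop,
; fetch a work function via __kmpc_kernel_parallel, execute it when active,
; and loop; the thread for which init returns -1 falls through to user code.

declare void @__kmpc_barrier_simple_generic(%struct.ident_t*, i32)
declare i1 @__kmpc_kernel_parallel(i8**)
declare void @__kmpc_kernel_end_parallel()

define weak void @state_machine_sketch(%"struct._OMP::KernelEnvironmentTy"* %env) {
entry:
  %work_fn_addr = alloca i8*, align 8
  %tid = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %env, i1 true)
  %is_worker = icmp ne i32 %tid, -1
  br i1 %is_worker, label %worker.begin, label %user_code
worker.begin:
  ; The ident_t is the first member of the environment, hence the
  ; getelementptr-based barrier operands in the checks below.
  %ident = getelementptr inbounds %"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* %env, i32 0, i32 0
  call void @__kmpc_barrier_simple_generic(%struct.ident_t* %ident, i32 %tid)
  %is_active = call i1 @__kmpc_kernel_parallel(i8** %work_fn_addr)
  %work_fn = load i8*, i8** %work_fn_addr, align 8
  %is_done = icmp eq i8* %work_fn, null
  br i1 %is_done, label %worker.finished, label %worker.is_active.check
worker.is_active.check:
  br i1 %is_active, label %worker.execute, label %worker.done.barrier
worker.execute:
  %work_fn_cast = bitcast i8* %work_fn to void (i16, i32)*
  call void %work_fn_cast(i16 0, i32 %tid)
  call void @__kmpc_kernel_end_parallel()
  br label %worker.done.barrier
worker.done.barrier:
  call void @__kmpc_barrier_simple_generic(%struct.ident_t* %ident, i32 %tid)
  br label %worker.begin
worker.finished:
  ret void
user_code:
  ; ... user code would run here, then:
  call void @__kmpc_target_deinit(i1 true)
  ret void
}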
-define weak void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 +define weak void @__omp_offloading_14_a34ca11_sequential_loop_l5() #0 { +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 ; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] { ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i1 false) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] -; AMDGPU-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 false) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 ; NVPTX-SAME: () #[[ATTR0:[0-9]+]] { ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i1 false) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: -; NVPTX-NEXT: ret void +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] -; NVPTX-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) 
[[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 false) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 ; AMDGPU-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: is_worker_check: +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i1 true) ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; 
AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.finished: ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker_state_machine.is_active.check: ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] ; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*) -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; AMDGPU-DISABLED-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; AMDGPU-DISABLED: worker_state_machine.parallel_region.check1: ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU-DISABLED: thread.user_code.check: ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; 
AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void ; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_l5 ; NVPTX-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: is_worker_check: +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i1 true) ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.finished: ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker_state_machine.is_active.check: ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] ; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* 
[[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*) -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] +; NVPTX-DISABLED-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) +; NVPTX-DISABLED: worker_state_machine.parallel_region.check1: ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] ; NVPTX-DISABLED: worker_state_machine.parallel_region.end: ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX-DISABLED: thread.user_code.check: ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4:[0-9]+]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_l5_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 - br i1 %exec_user_code, label %user_code.entry, label %common.ret - -common.ret: ; preds = %entry, %user_code.entry - ret void + br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) - store i32 0, i32* %.zero.addr, align 4 - store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18 - call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #6 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) - br label %common.ret + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #3 + call void @__kmpc_target_deinit(i1 true) + ret void + +worker.exit: ; preds = %entry + ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) + define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__ ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[I:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: store i32 0, i32* [[I]], align 4 ; AMDGPU-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: -; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, 
i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU: for.inc: +; AMDGPU-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +; AMDGPU: for.end: +; AMDGPU-NEXT: call void @indirection() #[[ATTR7:[0-9]+]] +; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__ ; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: store i32 0, i32* [[I]], align 4 ; NVPTX-NEXT: br label [[FOR_COND:%.*]] ; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: -; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] -; NVPTX-NEXT: ret void +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) -; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: br label [[FOR_INC:%.*]] +; NVPTX: for.inc: +; NVPTX-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +; NVPTX: for.end: +; NVPTX-NEXT: call void @indirection() #[[ATTR7:[0-9]+]] +; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 
dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU-DISABLED: for.inc: +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +; AMDGPU-DISABLED: for.end: +; AMDGPU-DISABLED-NEXT: call void @indirection() #[[ATTR7:[0-9]+]] +; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], 
align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; NVPTX-DISABLED: for.inc: +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]] +; NVPTX-DISABLED: for.end: +; NVPTX-DISABLED-NEXT: call void @indirection() #[[ATTR7:[0-9]+]] +; NVPTX-DISABLED-NEXT: ret void ; entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %i = alloca i32, align 4 %captured_vars_addrs = alloca [0 x i8*], align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32 0, i32* %i, align 4 br label %for.cond -for.cond: ; preds = %for.body, %entry - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %cmp = icmp slt i32 %i.0, 100 - br i1 %cmp, label %for.body, label %for.cond.cleanup +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end -for.cond.cleanup: ; preds = %for.cond - call void @spmd_amenable() #10 +for.body: ; preds = %for.cond + %1 = load i32*, i32** %.global_tid..addr, align 8 + %2 = load i32, i32* %1, align 4 + %3 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %2, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* 
@__omp_outlined__1_wrapper to i8*), i8** %3, i64 0) + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !13 + +for.end: ; preds = %for.cond + call void @indirection() #4 ret void +} -for.body: ; preds = %for.cond - %0 = load i32, i32* %.global_tid., align 4, !tbaa !18 - %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** - call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %1, i64 0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond, !llvm.loop !22 +define internal void @indirection() { +; AMDGPU-LABEL: define {{[^@]+}}@indirection +; AMDGPU-SAME: () #[[ATTR1:[0-9]+]] { +; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR1]] +; AMDGPU-NEXT: ret void +; +; NVPTX-LABEL: define {{[^@]+}}@indirection +; NVPTX-SAME: () #[[ATTR1:[0-9]+]] { +; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR1]] +; NVPTX-NEXT: ret void +; +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@indirection +; AMDGPU-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { +; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR1]] +; AMDGPU-DISABLED-NEXT: ret void +; +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@indirection +; NVPTX-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { +; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR1]] +; NVPTX-DISABLED-NEXT: ret void +; + call void @spmd_amenable() + ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1 ; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1 ; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7:[0-9]+]] +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: 
call void @unknown() #[[ATTR7:[0-9]+]] +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; NVPTX-DISABLED-NEXT: ret void ; entry: - call void @unknown() #11 + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @unknown() #5 ret void } -; Function Attrs: convergent norecurse nounwind -define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { +declare void @unknown() #1 + +define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; AMDGPU-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; NVPTX-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void 
@__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED-NEXT: ret void ; entry: + %.addr = alloca i16, align 2 %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 %global_args = alloca i8**, align 8 - store i32 %1, i32* %.addr1, align 4, !tbaa !18 store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 call void @__kmpc_get_shared_variables(i8*** %global_args) - call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) #6 + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) #3 ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) + +declare void @spmd_amenable() + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #3 + +declare void @__kmpc_target_deinit(i1) + +define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20() #0 { +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i1 false) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: call void 
@__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 false) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i1 false) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: -; NVPTX-NEXT: ret void +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 false) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; 
AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: is_worker_check: +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i1 true) ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.finished: ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker_state_machine.is_active.check: @@ -586,42 +692,37 @@ ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU-DISABLED: thread.user_code.check: ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], 
label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void ; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: is_worker_check: +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i1 true) ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** 
[[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.finished: ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker_state_machine.is_active.check: @@ -639,297 +740,378 @@ ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX-DISABLED: thread.user_code.check: ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 - br i1 %exec_user_code, label %user_code.entry, label %common.ret - -common.ret: ; preds = %entry, %user_code.entry - ret void + br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) - store i32 0, i32* %.zero.addr, align 4 - store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18 - call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr) #6 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) - br label %common.ret + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr) #3 + call void @__kmpc_target_deinit(i1 true) + ret void + +worker.exit: ; preds = %entry + ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2 ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[I:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 ; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* -; AMDGPU-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] +; AMDGPU-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]] +; AMDGPU-NEXT: store i32 0, i32* [[I]], align 4 ; AMDGPU-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: -; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 +; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) -; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; AMDGPU-NEXT: 
[[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) +; AMDGPU-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU: for.inc: +; AMDGPU-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; AMDGPU: for.end: +; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] +; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2 ; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; NVPTX-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 ; NVPTX-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* -; NVPTX-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] +; NVPTX-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]] +; NVPTX-NEXT: store i32 0, i32* [[I]], align 4 ; NVPTX-NEXT: br label [[FOR_COND:%.*]] ; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: -; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] -; NVPTX-NEXT: ret void +; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 +; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) -; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP3]], i64 
noundef 0) +; NVPTX-NEXT: br label [[FOR_INC:%.*]] +; NVPTX: for.inc: +; NVPTX-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; NVPTX: for.end: +; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] +; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 ; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* -; AMDGPU-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] +; AMDGPU-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0) +; AMDGPU-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU-DISABLED: for.inc: +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: 
[[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; AMDGPU-DISABLED: for.end: +; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1 ; NVPTX-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* -; NVPTX-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]] +; NVPTX-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR7]] +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 100 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0) +; NVPTX-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; NVPTX-DISABLED: for.inc: +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label 
[[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]] +; NVPTX-DISABLED: for.end: +; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: ret void ; entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %i = alloca i32, align 4 %captured_vars_addrs = alloca [0 x i8*], align 8 - %x = call align 4 i8* @__kmpc_alloc_shared(i64 4) + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %x = call i8* @__kmpc_alloc_shared(i64 4) %x_on_stack = bitcast i8* %x to i32* - call void @use(i32* nocapture %x_on_stack) #10 + call void @use(i32* nocapture %x_on_stack) #4 + store i32 0, i32* %i, align 4 br label %for.cond -for.cond: ; preds = %for.body, %entry - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %cmp = icmp slt i32 %i.0, 100 - br i1 %cmp, label %for.body, label %for.cond.cleanup +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end -for.cond.cleanup: ; preds = %for.cond - call void @spmd_amenable() #10 +for.body: ; preds = %for.cond + %1 = load i32*, i32** %.global_tid..addr, align 8 + %2 = load i32, i32* %1, align 4 + %3 = bitcast [0 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %2, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %3, i64 0) + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !15 + +for.end: ; preds = %for.cond + call void @spmd_amenable() #4 call void @__kmpc_free_shared(i8* %x, i64 4) ret void - -for.body: ; preds = %for.cond - %0 = load i32, i32* %.global_tid., align 4, !tbaa !18 - %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** - call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %1, i64 0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond, !llvm.loop !25 } -; Function Attrs: alwaysinline convergent norecurse nounwind + +declare i8* @__kmpc_alloc_shared(i64) #3 + +declare void @use(i32* nocapture) + define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3 ; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3 ; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED-NEXT: ret void ; entry: - call void @unknown() #11 + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @unknown() #5 ret void } -; Function Attrs: convergent norecurse nounwind -define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { +define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; AMDGPU-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper 
-; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; NVPTX-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED-NEXT: ret void ; entry: + %.addr = alloca i16, align 2 %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 %global_args = alloca i8**, align 8 - store i32 %1, i32* %.addr1, align 4, !tbaa !18 store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 call void @__kmpc_get_shared_variables(i8*** %global_args) - call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) #6 + call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) #3 ret void } +declare void @__kmpc_free_shared(i8* nocapture, i64) #3 
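;
; A minimal sketch of the kernel-environment convention the checks below rely
; on, assuming only what is visible in this file: the ident_t, the execution
; mode byte, and the use-generic-state-machine flag that used to be passed to
; __kmpc_target_init directly now live in one constant per-kernel global
; ("<kernel>_kernel_info"), whose first field is the ident_t, so the init and
; deinit calls keep only the final i1 of the old signature:
;
;   @foo_kernel_info = constant %"struct._OMP::KernelEnvironmentTy" { ... }
;
;   %tid = call i32 @__kmpc_target_init(
;              %"struct._OMP::KernelEnvironmentTy"* @foo_kernel_info, i1 %rt)
;   ; -1 selects the main thread; any other value sends a worker into the
;   ; generic-mode state machine.
;   %exec_user_code = icmp eq i32 %tid, -1
;   ...
;   call void @__kmpc_target_deinit(i1 %rt)
;
; Where an ident_t* is still required, e.g. for __kmpc_barrier_simple_generic
; in the -DISABLED runs, it is recovered as a constant getelementptr to field
; zero of the kernel_info global. @foo_kernel_info and %rt are illustrative
; placeholders here, not names this test generates.
;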
-; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 +define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35() #0 { +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i1 false) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 false) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i1 false) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: -; NVPTX-NEXT: ret void +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, 
!tbaa [[TBAA18]] -; NVPTX-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 false) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: is_worker_check: +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i1 true) ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* 
[[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.finished: ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker_state_machine.is_active.check: @@ -947,42 +1129,37 @@ ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU-DISABLED: thread.user_code.check: ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void ; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: 
[[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: is_worker_check: +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i1 true) ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.finished: ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker_state_machine.is_active.check: @@ -1000,324 +1177,418 @@ ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX-DISABLED: thread.user_code.check: ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 
dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 - br i1 %exec_user_code, label %user_code.entry, label %common.ret - -common.ret: ; preds = %entry, %user_code.entry - ret void + br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) - store i32 0, i32* %.zero.addr, align 4 - store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18 - call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #6 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) - br label %common.ret + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #3 + call void @__kmpc_target_deinit(i1 true) + ret void + +worker.exit: ; preds = %entry + ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind define internal void @__omp_outlined__4(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[I:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-NEXT: store i32 0, i32* [[I]], align 4 ; AMDGPU-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: -; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; AMDGPU-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 1) -; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; AMDGPU-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 1) +; AMDGPU-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU: for.inc: +; AMDGPU-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; AMDGPU: for.end: +; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] +; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 
dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-NEXT: store i32 0, i32* [[I]], align 4 ; NVPTX-NEXT: br label [[FOR_COND:%.*]] ; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: -; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] -; NVPTX-NEXT: ret void +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; NVPTX-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26:![0-9]+]] -; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 1) -; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; NVPTX-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 1) +; NVPTX-NEXT: br label [[FOR_INC:%.*]] +; NVPTX: for.inc: +; NVPTX-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; NVPTX: for.end: +; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] +; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; 
AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; AMDGPU-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 1) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; AMDGPU-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) +; AMDGPU-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU-DISABLED: for.inc: +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; AMDGPU-DISABLED: for.end: +; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly 
align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; NVPTX-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26:![0-9]+]] -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 1) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; NVPTX-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) +; NVPTX-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; NVPTX-DISABLED: for.inc: +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] +; NVPTX-DISABLED: for.end: +; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: ret void ; entry: + 
%.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %i = alloca i32, align 4 %captured_vars_addrs = alloca [1 x i8*], align 8 - %x = call align 4 i8* @__kmpc_alloc_shared(i64 4) + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %x = call i8* @__kmpc_alloc_shared(i64 4) %x_on_stack = bitcast i8* %x to i32* + store i32 0, i32* %i, align 4 br label %for.cond -for.cond: ; preds = %for.body, %entry - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %cmp = icmp slt i32 %i.0, 100 - br i1 %cmp, label %for.body, label %for.cond.cleanup +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end -for.cond.cleanup: ; preds = %for.cond - call void @spmd_amenable() #10 +for.body: ; preds = %for.cond + %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %captured_vars_addrs, i64 0, i64 0 + %2 = bitcast i32* %x_on_stack to i8* + store i8* %2, i8** %1, align 8 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + %5 = bitcast [1 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %4, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** %5, i64 1) + br label %for.inc + +for.inc: ; preds = %for.body + %6 = load i32, i32* %i, align 4 + %inc = add nsw i32 %6, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !16 + +for.end: ; preds = %for.cond + call void @spmd_amenable() #4 call void @__kmpc_free_shared(i8* %x, i64 4) ret void - -for.body: ; preds = %for.cond - %0 = getelementptr inbounds [1 x i8*], [1 x i8*]* %captured_vars_addrs, i64 0, i64 0 - store i8* %x, i8** %0, align 8, !tbaa !26 - %1 = load i32, i32* %.global_tid., align 4, !tbaa !18 - %2 = bitcast [1 x i8*]* %captured_vars_addrs to i8** - call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** %2, i64 1) - %inc = add nsw i32 %i.0, 1 - br label %for.cond, !llvm.loop !28 } -; Function Attrs: alwaysinline convergent norecurse nounwind define internal void @__omp_outlined__5(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %x) #0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 -; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa 
[[TBAA18]] -; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 -; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 -; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 ; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 -; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; 
NVPTX-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 ; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED-NEXT: ret void ; entry: - %0 = load i32, i32* %x, align 4, !tbaa !18 - %inc = add nsw i32 %0, 1 - store i32 %inc, i32* %x, align 4, !tbaa !18 - call void @unknown() #11 + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %x.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %x, i32** %x.addr, align 8 + %0 = load i32*, i32** %x.addr, align 8 + %1 = load i32, i32* %0, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %0, align 4 + call void @unknown() #5 ret void } -; Function Attrs: convergent norecurse nounwind -define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { +define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** -; AMDGPU-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]] -; AMDGPU-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; AMDGPU-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; AMDGPU-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: 
[[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** -; NVPTX-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]] -; NVPTX-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; NVPTX-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; NVPTX-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; NVPTX-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** -; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; AMDGPU-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** -; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; NVPTX-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** 
[[TMP4]], align 8 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] ; NVPTX-DISABLED-NEXT: ret void ; entry: + %.addr = alloca i16, align 2 %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 %global_args = alloca i8**, align 8 - store i32 %1, i32* %.addr1, align 4, !tbaa !18 store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 call void @__kmpc_get_shared_variables(i8*** %global_args) %2 = load i8**, i8*** %global_args, align 8 - %3 = bitcast i8** %2 to i32** - %4 = load i32*, i32** %3, align 8, !tbaa !26 - call void @__omp_outlined__5(i32* %.addr1, i32* %.zero.addr, i32* %4) #6 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i32** + %5 = load i32*, i32** %4, align 8 + call void @__omp_outlined__5(i32* %.addr1, i32* %.zero.addr, i32* %5) #3 ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 +define weak void @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50() #0 { +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i1 false) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 false) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 +; NVPTX-LABEL: define 
{{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i1 false) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: -; NVPTX-NEXT: ret void +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 false) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: is_worker_check: +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i1 true) ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; 
AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.finished: ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker_state_machine.is_active.check: @@ -1335,42 +1606,37 @@ ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU-DISABLED: thread.user_code.check: ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias 
nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void ; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: is_worker_check: +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i1 true) ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.finished: ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker_state_machine.is_active.check: @@ -1388,45 +1654,50 @@ ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* 
@[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX-DISABLED: thread.user_code.check: ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 - br i1 %exec_user_code, label %user_code.entry, label %common.ret - -common.ret: ; preds = %entry, %user_code.entry - ret void + br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) - store i32 0, i32* %.zero.addr, align 4 - store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18 - call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #6 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) - br label %common.ret + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #3 + call void @__kmpc_target_deinit(i1 true) + ret void + +worker.exit: ; preds = %entry + ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind define internal void @__omp_outlined__6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__6 ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[I:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 ; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*) to i32* ; AMDGPU-NEXT: br label [[REGION_CHECK_TID:%.*]] ; AMDGPU: region.check.tid: @@ -1434,7 +1705,7 @@ ; AMDGPU-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 ; AMDGPU-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; AMDGPU: region.guarded: -; AMDGPU-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 ; AMDGPU-NEXT: br label [[REGION_GUARDED_END:%.*]] ; AMDGPU: region.guarded.end: ; AMDGPU-NEXT: br label [[REGION_BARRIER]] @@ -1442,27 +1713,36 @@ ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) ; AMDGPU-NEXT: br label [[REGION_EXIT:%.*]] ; AMDGPU: region.exit: +; AMDGPU-NEXT: store i32 0, i32* [[I]], align 4 ; AMDGPU-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU: for.cond: -; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[REGION_EXIT]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU: for.cond.cleanup: -; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 100 +; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; AMDGPU-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*), i8** [[TMP2]], align 8, !tbaa [[TBAA26]] -; AMDGPU-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP3]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP4]], i64 noundef 1) -; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; AMDGPU-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; AMDGPU-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*), i8** [[TMP3]], align 8 +; AMDGPU-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* 
[[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP4]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP5]], i64 noundef 1) +; AMDGPU-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU: for.inc: +; AMDGPU-NEXT: [[TMP6:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 +; AMDGPU-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +; AMDGPU: for.end: +; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] +; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[I:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 ; NVPTX-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32* ; NVPTX-NEXT: br label [[REGION_CHECK_TID:%.*]] ; NVPTX: region.check.tid: @@ -1470,7 +1750,7 @@ ; NVPTX-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 ; NVPTX-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; NVPTX: region.guarded: -; NVPTX-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 ; NVPTX-NEXT: br label [[REGION_GUARDED_END:%.*]] ; NVPTX: region.guarded.end: ; NVPTX-NEXT: br label [[REGION_BARRIER]] @@ -1478,235 +1758,314 @@ ; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]]) ; NVPTX-NEXT: br label [[REGION_EXIT:%.*]] ; NVPTX: region.exit: +; NVPTX-NEXT: store i32 0, i32* [[I]], align 4 ; NVPTX-NEXT: br label [[FOR_COND:%.*]] ; NVPTX: for.cond: -; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[REGION_EXIT]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX: for.cond.cleanup: -; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] -; NVPTX-NEXT: ret void +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 100 +; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; NVPTX-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP2]], align 8, !tbaa [[TBAA26]] -; NVPTX-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP3]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef 
bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP4]], i64 noundef 1) -; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; NVPTX-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; NVPTX-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP3]], align 8 +; NVPTX-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP4]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP5]], i64 noundef 1) +; NVPTX-NEXT: br label [[FOR_INC:%.*]] +; NVPTX: for.inc: +; NVPTX-NEXT: [[TMP6:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 +; NVPTX-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +; NVPTX: for.end: +; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] +; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 ; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*) to i32* -; AMDGPU-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; AMDGPU-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*), 
i8** [[TMP0]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 1) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; AMDGPU-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) +; AMDGPU-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; AMDGPU-DISABLED: for.inc: +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +; AMDGPU-DISABLED: for.end: +; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[I:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +; NVPTX-DISABLED-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 ; NVPTX-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32* -; NVPTX-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[I]], align 4 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* 
[[I]], align 4 +; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 100 +; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 -; NVPTX-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 1) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +; NVPTX-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP1]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP2]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 1) +; NVPTX-DISABLED-NEXT: br label [[FOR_INC:%.*]] +; NVPTX-DISABLED: for.inc: +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32, i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +; NVPTX-DISABLED: for.end: +; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: ret void ; entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %i = alloca i32, align 4 %captured_vars_addrs = alloca [1 x i8*], align 8 - %x = call align 4 i8* @__kmpc_alloc_shared(i64 4) + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + %x = call i8* @__kmpc_alloc_shared(i64 4) %x_on_stack = bitcast i8* %x to i32* - store i32 42, i32* %x_on_stack, align 4, !tbaa !18 + store i32 42, i32* %x_on_stack, align 4 + store i32 0, i32* %i, align 4 br label %for.cond -for.cond: ; preds = %for.body, %entry - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %cmp = icmp slt i32 %i.0, 100 - br i1 %cmp, label %for.body, label %for.cond.cleanup +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end -for.cond.cleanup: ; preds = %for.cond - call void @spmd_amenable() #10 +for.body: ; preds = %for.cond + %1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %captured_vars_addrs, i64 0, i64 0 + %2 = bitcast i32* %x_on_stack to i8* + store i8* %2, i8** %1, align 8 + %3 = 
load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + %5 = bitcast [1 x i8*]* %captured_vars_addrs to i8** + call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %4, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** %5, i64 1) + br label %for.inc + +for.inc: ; preds = %for.body + %6 = load i32, i32* %i, align 4 + %inc = add nsw i32 %6, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !17 + +for.end: ; preds = %for.cond + call void @spmd_amenable() #4 call void @__kmpc_free_shared(i8* %x, i64 4) ret void - -for.body: ; preds = %for.cond - %0 = getelementptr inbounds [1 x i8*], [1 x i8*]* %captured_vars_addrs, i64 0, i64 0 - store i8* %x, i8** %0, align 8, !tbaa !26 - %1 = load i32, i32* %.global_tid., align 4, !tbaa !18 - %2 = bitcast [1 x i8*]* %captured_vars_addrs to i8** - call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** %2, i64 1) - %inc = add nsw i32 %i.0, 1 - br label %for.cond, !llvm.loop !29 } -; Function Attrs: alwaysinline convergent norecurse nounwind define internal void @__omp_outlined__7(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %x) #0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7 -; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7 -; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; NVPTX-NEXT: [[TMP0:%.*]] = load 
i32, i32* [[X]], align 4 ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 -; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 ; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 -; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[X_ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: store i32* [[X]], i32** [[X_ADDR]], align 8 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4 ; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED-NEXT: ret void ; entry: - %0 = load i32, i32* %x, align 4, !tbaa !18 - %inc = add nsw i32 %0, 1 - store i32 %inc, i32* %x, align 4, !tbaa !18 - call void @unknown() #11 + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %x.addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %x, i32** %x.addr, align 8 + %0 = load i32*, i32** %x.addr, align 8 + %1 = load i32, i32* %0, align 4 + %inc = add nsw 
i32 %1, 1 + store i32 %inc, i32* %0, align 4 + call void @unknown() #5 ret void } -; Function Attrs: convergent norecurse nounwind -define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { +define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: +; AMDGPU-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** -; AMDGPU-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]] -; AMDGPU-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]] +; AMDGPU-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; AMDGPU-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; AMDGPU-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: +; NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** -; NVPTX-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]] -; NVPTX-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]] +; NVPTX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; NVPTX-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; NVPTX-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; NVPTX-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: +; AMDGPU-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: 
[[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** -; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]] +; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; AMDGPU-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { +; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: +; NVPTX-DISABLED-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32** -; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]] +; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 +; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** +; NVPTX-DISABLED-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] ; NVPTX-DISABLED-NEXT: ret void ; entry: + %.addr = alloca i16, align 2 %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 %global_args = alloca i8**, align 8 - store i32 %1, i32* %.addr1, align 4, !tbaa !18 store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 call void @__kmpc_get_shared_variables(i8*** %global_args) %2 = load i8**, i8*** %global_args, align 8 - %3 = bitcast i8** %2 to i32** - %4 = load i32*, i32** %3, align 8, !tbaa !26 - call void @__omp_outlined__7(i32* %.addr1, i32* %.zero.addr, i32* %4) #6 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i32** + %5 = load i32*, i32** %4, align 8 + call void @__omp_outlined__7(i32* %.addr1, i32* %.zero.addr, i32* %5) #3 ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void 
@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 +define weak void @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65() #0 { +; +; +; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: is_worker_check: +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i1 true) ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU: worker_state_machine.finished: ; AMDGPU-NEXT: ret void ; AMDGPU: worker_state_machine.is_active.check: @@ -1718,41 +2077,35 @@ ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU: thread.user_code.check: ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; 
AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: -; AMDGPU-NEXT: ret void +; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; AMDGPU-NEXT: br label [[COMMON_RET]] +; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-NEXT: call void @__kmpc_target_deinit(i1 true) +; AMDGPU-NEXT: ret void +; AMDGPU: worker.exit: +; AMDGPU-NEXT: ret void ; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 +; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX: is_worker_check: +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i1 true) ; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label 
[[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; NVPTX: worker_state_machine.finished: ; NVPTX-NEXT: ret void ; NVPTX: worker_state_machine.is_active.check: @@ -1764,42 +2117,37 @@ ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX: thread.user_code.check: ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX: common.ret: -; NVPTX-NEXT: ret void +; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; NVPTX-NEXT: br label [[COMMON_RET]] +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-NEXT: call void @__kmpc_target_deinit(i1 true) +; NVPTX-NEXT: ret void +; NVPTX: worker.exit: +; NVPTX-NEXT: ret void ; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 +; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: is_worker_check: +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i1 true) ; 
AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; AMDGPU-DISABLED: worker_state_machine.finished: ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker_state_machine.is_active.check: @@ -1811,41 +2159,36 @@ ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; AMDGPU-DISABLED: thread.user_code.check: ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] +; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone 
align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; AMDGPU-DISABLED-NEXT: ret void +; AMDGPU-DISABLED: worker.exit: +; AMDGPU-DISABLED-NEXT: ret void ; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 +; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_14_a34ca11_do_not_spmdize_target_l65 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: is_worker_check: +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i1 true) ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; NVPTX-DISABLED: worker_state_machine.finished: ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker_state_machine.is_active.check: @@ -1857,710 +2200,225 @@ ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", 
%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; NVPTX-DISABLED: thread.user_code.check: ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] +; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR4]] +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(i1 true) +; NVPTX-DISABLED-NEXT: ret void +; NVPTX-DISABLED: worker.exit: +; NVPTX-DISABLED-NEXT: ret void ; entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 - br i1 %exec_user_code, label %user_code.entry, label %common.ret - -common.ret: ; preds = %entry, %user_code.entry - ret void + br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) - store i32 0, i32* %.zero.addr, align 4 - store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18 - call void @__omp_outlined__8(i32* %.threadid_temp., i32* %.zero.addr) #6 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) - br label %common.ret + store i32 %1, i32* %.threadid_temp., align 4 + call void @__omp_outlined__8(i32* %.threadid_temp., i32* %.zero.addr) #3 + call void @__kmpc_target_deinit(i1 true) + ret void + +worker.exit: ; preds = %entry + ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind define internal void @__omp_outlined__8(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { +; +; ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__8 ; AMDGPU-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__8 ; NVPTX-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; AMDGPU-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED-NEXT: ret void ; ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] +; NVPTX-DISABLED-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED-NEXT: ret void ; entry: - call void @unknown() #11 + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + call void @unknown() #5 ret void } -; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; AMDGPU-SAME: () #[[ATTR0]] { -; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) -; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-NEXT: br i1 
[[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU: is_worker_check: -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__9_wrapper.ID to void (i16, i32)*) -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: -; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU: common.ret: -; AMDGPU-NEXT: ret void -; AMDGPU: user_code.entry: -; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR3]] -; AMDGPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR3]] -; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, 
i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0) -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; AMDGPU-NEXT: br label [[COMMON_RET]] -; -; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; NVPTX-SAME: () #[[ATTR0]] { -; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX: is_worker_check: -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.check: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__9_wrapper.ID to void (i16, i32)*) -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: -; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label 
[[COMMON_RET:%.*]] -; NVPTX: common.ret: -; NVPTX-NEXT: ret void -; NVPTX: user_code.entry: -; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR3]] -; NVPTX-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR3]] -; NVPTX-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0) -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; NVPTX-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* 
[[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__9_wrapper.ID to void (i16, i32)*) -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0) -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], 
-1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__9_wrapper.ID to void (i16, i32)*) -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to 
i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0) -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] -; -entry: - %captured_vars_addrs = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) - %exec_user_code = icmp eq i32 %0, -1 - br i1 %exec_user_code, label %user_code.entry, label %common.ret - -common.ret: ; preds = %entry, %user_code.entry - ret void - -user_code.entry: ; preds = %entry - %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) - %2 = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 %1, i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) - %3 = bitcast i8* %2 to %struct.kmp_task_t_with_privates* - %4 = call i32 @__kmpc_omp_task(%struct.ident_t* @1, i32 %1, i8* %2) - %5 = bitcast [0 x i8*]* %captured_vars_addrs to i8** - call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__9_wrapper to i8*), i8** %5, i64 0) - call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) - br label %common.ret -} - -; Function Attrs: alwaysinline convergent nounwind -define internal void @.omp_outlined.(i32 %.global_tid., i32* noalias %.part_id., i8* noalias %.privates., void (i8*, ...)* noalias %.copy_fn., i8* %.task_t., %struct.anon* noalias %__context) #9 { -; AMDGPU-LABEL: define {{[^@]+}}@.omp_outlined. -; AMDGPU-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR2:[0-9]+]] { -; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] -; AMDGPU-NEXT: ret void -; -; NVPTX-LABEL: define {{[^@]+}}@.omp_outlined. -; NVPTX-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR2:[0-9]+]] { -; NVPTX-NEXT: entry: -; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] -; NVPTX-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined. -; AMDGPU-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR2:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] -; AMDGPU-DISABLED-NEXT: ret void -; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined. 
-; NVPTX-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR2:[0-9]+]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]] -; NVPTX-DISABLED-NEXT: ret void -; -entry: - call void @spmd_amenable() #10 - ret void -} - -; Function Attrs: convergent norecurse nounwind -define internal i32 @"_omp_task_entry$"(i32 %0, %struct.kmp_task_t_with_privates* noalias %1) #3 { -entry: - %2 = getelementptr inbounds %struct.kmp_task_t_with_privates, %struct.kmp_task_t_with_privates* %1, i32 0, i32 0 - %3 = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* %2, i32 0, i32 2 - %4 = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* %2, i32 0, i32 0 - %5 = load i8*, i8** %4, align 8, !tbaa !30 - %6 = bitcast i8* %5 to %struct.anon* - %7 = bitcast %struct.kmp_task_t_with_privates* %1 to i8* - call void @.omp_outlined.(i32 %0, i32* %3, i8* null, void (i8*, ...)* null, i8* %7, %struct.anon* %6) #6 - ret i32 0 -} - -; Function Attrs: nounwind -declare i8* @__kmpc_omp_task_alloc(%struct.ident_t*, i32, i32, i64, i64, i32 (i32, i8*)*) #6 - -; Function Attrs: nounwind -declare i32 @__kmpc_omp_task(%struct.ident_t*, i32, i8*) #6 - -; Function Attrs: nosync nounwind -declare void @__kmpc_free_shared(i8* nocapture, i64) #8 - -; Function Attrs: nofree nosync nounwind -declare i8* @__kmpc_alloc_shared(i64) #7 - -; Function Attrs: convergent -declare void @use(i32* nocapture) #5 - -; Function Attrs: convergent -declare void @unknown() #2 - -; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn -declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 - -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) - -declare void @__kmpc_get_shared_variables(i8***) - -; Function Attrs: alwaysinline -declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) #4 - -; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn -declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 - -; Function Attrs: convergent -declare void @spmd_amenable() #5 - -; Function Attrs: nounwind -declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #6 - -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) - - -; Function Attrs: alwaysinline convergent norecurse nounwind -define internal void @__omp_outlined__9(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
#0 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9 -; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] -; AMDGPU-NEXT: ret void -; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9 -; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; NVPTX-NEXT: entry: -; NVPTX-NEXT: call void @unknown() #[[ATTR7]] -; NVPTX-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 -; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 -; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void -; -entry: - call void @unknown() #11 - ret void -} - -; Function Attrs: convergent norecurse nounwind -define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { -; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { -; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; AMDGPU-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: ret void -; -; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { -; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; NVPTX-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: ret void -; -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 -; 
NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: ret void -; -entry: - %.addr1 = alloca i32, align 4 - %.zero.addr = alloca i32, align 4 - %global_args = alloca i8**, align 8 - store i32 %1, i32* %.addr1, align 4, !tbaa !18 - store i32 0, i32* %.zero.addr, align 4 - call void @__kmpc_get_shared_variables(i8*** %global_args) - call void @__omp_outlined__9(i32* %.addr1, i32* %.zero.addr) #6 - ret void -} - -attributes #0 = { alwaysinline convergent norecurse nounwind } -attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn } -attributes #2 = { convergent } -attributes #3 = { convergent norecurse nounwind } -attributes #4 = { alwaysinline } -attributes #5 = { convergent "llvm.assume"="ompx_spmd_amenable" } -attributes #6 = { nounwind } -attributes #7 = { nofree nosync nounwind } -attributes #8 = { nosync nounwind } -attributes #9 = { alwaysinline convergent nounwind } -attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" } -attributes #11 = { convergent } - -!omp_offload.info = !{!0, !1, !2, !3, !4, !5} -!nvvm.annotations = !{!6, !7, !8, !9, !10, !11} -!llvm.module.flags = !{!12, !13, !14, !15, !16} -!llvm.ident = !{!17} - -!0 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -!1 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -!2 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -!3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -!4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -!5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -!6 = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -!7 = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -!8 = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -!9 = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -!10 = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -!11 = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -!12 = !{i32 1, !"wchar_size", i32 4} -!13 = !{i32 7, !"openmp", i32 50} -!14 = !{i32 7, !"openmp-device", i32 50} -!15 = !{i32 7, !"PIC Level", i32 2} -!16 = !{i32 7, !"frame-pointer", i32 2} -!17 = !{!"clang version 14.0.0"} -!18 = !{!19, !19, i64 0} -!19 = !{!"int", !20, i64 0} -!20 = !{!"omnipotent char", !21, i64 0} -!21 = !{!"Simple C/C++ TBAA"} -!22 = distinct !{!22, !23, !24} -!23 = !{!"llvm.loop.mustprogress"} -!24 = !{!"llvm.loop.unroll.disable"} -!25 = distinct !{!25, !23, !24} -!26 = !{!27, !27, i64 0} -!27 = !{!"any pointer", !20, i64 0} -!28 = distinct !{!28, !23, !24} -!29 = distinct !{!29, !23, !24} -!30 = !{!31, !27, i64 0} -!31 = !{!"kmp_task_t_with_privates", !32, i64 0} -!32 = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} +attributes #0 = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +attributes #1 = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+ptx32,+sm_20" } +attributes #2 = { convergent "frame-pointer"="none" "llvm.assume"="ompx_spmd_amenable" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +attributes #3 = { nounwind } +attributes #4 = { convergent "llvm.assume"="ompx_spmd_amenable" } +attributes #5 = { convergent } + +!omp_offload.info = !{!0, !1, !2, !3, !4} +!nvvm.annotations = !{!5, !6, !7, !8, !9} +!llvm.module.flags = !{!10, !11, !12} + +!0 = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +!1 = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +!2 = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +!3 = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} +!4 = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +!5 = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +!6 = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +!7 = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +!8 = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} +!9 = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +!10 = !{i32 1, !"wchar_size", i32 4} +!11 = !{i32 7, !"openmp", i32 50} +!12 = !{i32 7, !"openmp-device", i32 50} +!13 = distinct !{!13, !14} +!14 = !{!"llvm.loop.mustprogress"} +!15 = distinct !{!15, !14} +!16 = distinct !{!16, !14} +!17 = distinct !{!17, !14} ;. -; AMDGPU: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind } -; AMDGPU: attributes #[[ATTR1]] = { convergent norecurse nounwind } -; AMDGPU: attributes #[[ATTR2]] = { alwaysinline convergent nounwind } -; AMDGPU: attributes #[[ATTR3]] = { nounwind } -; AMDGPU: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind } -; AMDGPU: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind } -; AMDGPU: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; AMDGPU: attributes #[[ATTR7]] = { convergent } -; AMDGPU: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn } -; AMDGPU: attributes #[[ATTR9:[0-9]+]] = { alwaysinline } -; AMDGPU: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind } +; AMDGPU: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU: attributes #[[ATTR1]] = { "llvm.assume"="ompx_spmd_amenable" } +; AMDGPU: attributes #[[ATTR2:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU: attributes #[[ATTR3:[0-9]+]] = { alwaysinline } +; AMDGPU: attributes #[[ATTR4]] = { nounwind } +; AMDGPU: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind } +; AMDGPU: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind } +; AMDGPU: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" } +; AMDGPU: attributes #[[ATTR8]] = { convergent } ;. 
-; NVPTX: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind } -; NVPTX: attributes #[[ATTR1]] = { convergent norecurse nounwind } -; NVPTX: attributes #[[ATTR2]] = { alwaysinline convergent nounwind } -; NVPTX: attributes #[[ATTR3]] = { nounwind } -; NVPTX: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind } -; NVPTX: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind } -; NVPTX: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; NVPTX: attributes #[[ATTR7]] = { convergent } -; NVPTX: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn } -; NVPTX: attributes #[[ATTR9:[0-9]+]] = { alwaysinline } -; NVPTX: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind } +; NVPTX: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX: attributes #[[ATTR1]] = { "llvm.assume"="ompx_spmd_amenable" } +; NVPTX: attributes #[[ATTR2:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX: attributes #[[ATTR3:[0-9]+]] = { alwaysinline } +; NVPTX: attributes #[[ATTR4]] = { nounwind } +; NVPTX: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind } +; NVPTX: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind } +; NVPTX: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" } +; NVPTX: attributes #[[ATTR8]] = { convergent } ;. -; AMDGPU-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR1]] = { convergent norecurse nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR2]] = { alwaysinline convergent nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR3]] = { nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; AMDGPU-DISABLED: attributes #[[ATTR7]] = { convergent } -; AMDGPU-DISABLED: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn } -; AMDGPU-DISABLED: attributes #[[ATTR9:[0-9]+]] = { alwaysinline } -; AMDGPU-DISABLED: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind } +; AMDGPU-DISABLED: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU-DISABLED: attributes #[[ATTR1]] = { "llvm.assume"="ompx_spmd_amenable" } +; AMDGPU-DISABLED: attributes #[[ATTR2:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; AMDGPU-DISABLED: attributes #[[ATTR3:[0-9]+]] = { alwaysinline } +; AMDGPU-DISABLED: attributes #[[ATTR4]] = { nounwind } +; AMDGPU-DISABLED: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind } +; AMDGPU-DISABLED: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind } +; AMDGPU-DISABLED: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" } +; AMDGPU-DISABLED: attributes #[[ATTR8]] = { convergent } ;. 
-; NVPTX-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind } -; NVPTX-DISABLED: attributes #[[ATTR1]] = { convergent norecurse nounwind } -; NVPTX-DISABLED: attributes #[[ATTR2]] = { alwaysinline convergent nounwind } -; NVPTX-DISABLED: attributes #[[ATTR3]] = { nounwind } -; NVPTX-DISABLED: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind } -; NVPTX-DISABLED: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind } -; NVPTX-DISABLED: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; NVPTX-DISABLED: attributes #[[ATTR7]] = { convergent } -; NVPTX-DISABLED: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn } -; NVPTX-DISABLED: attributes #[[ATTR9:[0-9]+]] = { alwaysinline } -; NVPTX-DISABLED: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind } +; NVPTX-DISABLED: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX-DISABLED: attributes #[[ATTR1]] = { "llvm.assume"="ompx_spmd_amenable" } +; NVPTX-DISABLED: attributes #[[ATTR2:[0-9]+]] = { convergent "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } +; NVPTX-DISABLED: attributes #[[ATTR3:[0-9]+]] = { alwaysinline } +; NVPTX-DISABLED: attributes #[[ATTR4]] = { nounwind } +; NVPTX-DISABLED: attributes #[[ATTR5:[0-9]+]] = { nosync nounwind } +; NVPTX-DISABLED: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind } +; NVPTX-DISABLED: attributes #[[ATTR7]] = { convergent "llvm.assume"="ompx_spmd_amenable" } +; NVPTX-DISABLED: attributes #[[ATTR8]] = { convergent } ;. -; AMDGPU: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -; AMDGPU: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -; AMDGPU: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU: [[META10:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU: [[META11:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU: [[META15:![0-9]+]] = !{i32 7, !"PIC Level", i32 2} -; AMDGPU: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU: [[META17:![0-9]+]] = !{!"clang version 14.0.0"} -; AMDGPU: 
[[TBAA18]] = !{!19, !19, i64 0} -; AMDGPU: [[META19:![0-9]+]] = !{!"int", !20, i64 0} -; AMDGPU: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0} -; AMDGPU: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; AMDGPU: [[LOOP22]] = distinct !{!22, !23, !24} -; AMDGPU: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; AMDGPU: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU: [[LOOP25]] = distinct !{!25, !23, !24} -; AMDGPU: [[TBAA26]] = !{!27, !27, i64 0} -; AMDGPU: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} -; AMDGPU: [[LOOP28]] = distinct !{!28, !23, !24} -; AMDGPU: [[LOOP29]] = distinct !{!29, !23, !24} -; AMDGPU: [[META30:![0-9]+]] = !{!31, !27, i64 0} -; AMDGPU: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0} -; AMDGPU: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} +; AMDGPU: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +; AMDGPU: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +; AMDGPU: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} +; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +; AMDGPU: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +; AMDGPU: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +; AMDGPU: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +; AMDGPU: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} +; AMDGPU: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +; AMDGPU: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU: [[LOOP13]] = distinct !{!13, !14} +; AMDGPU: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} +; AMDGPU: [[LOOP15]] = distinct !{!15, !14} +; AMDGPU: [[LOOP16]] = distinct !{!16, !14} +; AMDGPU: [[LOOP17]] = distinct !{!17, !14} ;. 
-; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -; NVPTX: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX: [[META10:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX: [[META11:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX: [[META15:![0-9]+]] = !{i32 7, !"PIC Level", i32 2} -; NVPTX: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX: [[META17:![0-9]+]] = !{!"clang version 14.0.0"} -; NVPTX: [[TBAA18]] = !{!19, !19, i64 0} -; NVPTX: [[META19:![0-9]+]] = !{!"int", !20, i64 0} -; NVPTX: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0} -; NVPTX: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; NVPTX: [[LOOP22]] = distinct !{!22, !23, !24} -; NVPTX: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; NVPTX: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"} -; NVPTX: [[LOOP25]] = distinct !{!25, !23, !24} -; NVPTX: [[TBAA26]] = !{!27, !27, i64 0} -; NVPTX: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} -; NVPTX: [[LOOP28]] = distinct !{!28, !23, !24} -; NVPTX: [[LOOP29]] = distinct !{!29, !23, !24} -; NVPTX: [[META30:![0-9]+]] = !{!31, !27, i64 0} -; NVPTX: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0} -; NVPTX: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} +; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +; NVPTX: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} +; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +; NVPTX: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +; NVPTX: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +; NVPTX: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +; NVPTX: [[META8:![0-9]+]] = 
!{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} +; NVPTX: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +; NVPTX: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX: [[LOOP13]] = distinct !{!13, !14} +; NVPTX: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} +; NVPTX: [[LOOP15]] = distinct !{!15, !14} +; NVPTX: [[LOOP16]] = distinct !{!16, !14} +; NVPTX: [[LOOP17]] = distinct !{!17, !14} ;. -; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -; AMDGPU-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -; AMDGPU-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -; AMDGPU-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU-DISABLED: [[META15:![0-9]+]] = !{i32 7, !"PIC Level", i32 2} -; AMDGPU-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"} -; AMDGPU-DISABLED: [[TBAA18]] = !{!19, !19, i64 0} -; AMDGPU-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0} -; AMDGPU-DISABLED: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0} -; AMDGPU-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; AMDGPU-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24} -; AMDGPU-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; AMDGPU-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24} -; AMDGPU-DISABLED: [[TBAA26]] = !{!27, !27, i64 0} -; AMDGPU-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} -; AMDGPU-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24} -; AMDGPU-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24} -; AMDGPU-DISABLED: [[META30:![0-9]+]] = !{!31, !27, i64 0} -; AMDGPU-DISABLED: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0} -; AMDGPU-DISABLED: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, 
!27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} +; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +; AMDGPU-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +; AMDGPU-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} +; AMDGPU-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU-DISABLED: [[LOOP13]] = distinct !{!13, !14} +; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} +; AMDGPU-DISABLED: [[LOOP15]] = distinct !{!15, !14} +; AMDGPU-DISABLED: [[LOOP16]] = distinct !{!16, !14} +; AMDGPU-DISABLED: [[LOOP17]] = distinct !{!17, !14} ;. 
-; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -; NVPTX-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -; NVPTX-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -; NVPTX-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -; NVPTX-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX-DISABLED: [[META10:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX-DISABLED: [[META11:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX-DISABLED: [[META15:![0-9]+]] = !{i32 7, !"PIC Level", i32 2} -; NVPTX-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"} -; NVPTX-DISABLED: [[TBAA18]] = !{!19, !19, i64 0} -; NVPTX-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0} -; NVPTX-DISABLED: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0} -; NVPTX-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; NVPTX-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24} -; NVPTX-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; NVPTX-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"} -; NVPTX-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24} -; NVPTX-DISABLED: [[TBAA26]] = !{!27, !27, i64 0} -; NVPTX-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} -; NVPTX-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24} -; NVPTX-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24} -; NVPTX-DISABLED: [[META30:![0-9]+]] = !{!31, !27, i64 0} -; NVPTX-DISABLED: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0} -; NVPTX-DISABLED: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} +; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_stack_var", i32 20, i32 1} +; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop", i32 5, i32 0} +; NVPTX-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var", i32 35, i32 2} +; NVPTX-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"do_not_spmdize_target", i32 65, i32 4} +; NVPTX-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 20, i32 171231761, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} +; NVPTX-DISABLED: [[META5:![0-9]+]] = 
!{void ()* @__omp_offloading_14_a34ca11_sequential_loop_l5, !"kernel", i32 1} +; NVPTX-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20, !"kernel", i32 1} +; NVPTX-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35, !"kernel", i32 1} +; NVPTX-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} +; NVPTX-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65, !"kernel", i32 1} +; NVPTX-DISABLED: [[META10:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX-DISABLED: [[META11:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX-DISABLED: [[META12:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX-DISABLED: [[LOOP13]] = distinct !{!13, !14} +; NVPTX-DISABLED: [[META14:![0-9]+]] = !{!"llvm.loop.mustprogress"} +; NVPTX-DISABLED: [[LOOP15]] = distinct !{!15, !14} +; NVPTX-DISABLED: [[LOOP16]] = distinct !{!16, !14} +; NVPTX-DISABLED: [[LOOP17]] = distinct !{!17, !14} ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll --- a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll @@ -12,11 +12,14 @@ target triple = "nvptx64" +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @__omp_offloading_fd02_404433c2_main_l5_exec_mode = weak constant i8 1 +@__omp_offloading_fd02_404433c2_main_l5_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_fd02_404433c2_main_l5_exec_mode], section "llvm.metadata" ; Function Attrs: alwaysinline convergent norecurse nounwind @@ -24,6 +27,7 @@ ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_fd02_404433c2_main_l5_exec_mode], section "llvm.metadata" ; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ;. 
@@ -32,7 +36,7 @@ ; CHECK-SAME: (double* nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i8 2, i1 false, i1 false) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_fd02_404433c2_main_l5_kernel_info, i1 false) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] ; CHECK: common.ret: @@ -56,12 +60,12 @@ ; CHECK: region.exit: ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [0 x i8*], [0 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* nonnull @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** nonnull [[TMP4]], i64 0) #[[ATTR3]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR3]] +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false) #[[ATTR3]] ; CHECK-NEXT: br label [[COMMON_RET]] ; entry: %captured_vars_addrs = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 true, i1 true) #3 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_fd02_404433c2_main_l5_kernel_info, i1 true) #3 %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %common.ret @@ -74,12 +78,10 @@ store double %call.i, double* %x, align 8, !tbaa !8 %2 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs, i64 0, i64 0 call void @__kmpc_parallel_51(%struct.ident_t* nonnull @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** nonnull %2, i64 0) #3 - call void @__kmpc_target_deinit(%struct.ident_t* nonnull @1, i8 1, i1 true) #3 + call void @__kmpc_target_deinit(i1 true) #3 br label %common.ret } -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) local_unnamed_addr - ; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind readnone willreturn define internal void @__omp_outlined__(i32* noalias nocapture %.global_tid., i32* noalias nocapture %.bound_tid.) 
#1 { ; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ @@ -114,7 +116,8 @@ ; Function Attrs: alwaysinline declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) local_unnamed_addr #4 -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) local_unnamed_addr +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) ; Function Attrs: convergent declare double @__nv_sin(double) local_unnamed_addr #5 diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -34,11 +34,14 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64" +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode = weak constant i8 1 +@__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata" ; Function Attrs: convergent norecurse nounwind @@ -46,12 +49,14 @@ ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 +; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 3 }, i16 0 } ; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata" ; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ;. 
; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 ; CHECK-DISABLED: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 +; CHECK-DISABLED: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_KERNEL_INFO:[a-zA-Z0-9_$"\\.-]+]] = global %"struct._OMP::KernelEnvironmentTy" { [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 0, i8 1 }, i16 0 } ; CHECK-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata" ; CHECK-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. @@ -60,11 +65,11 @@ ; CHECK-SAME: (i32* [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[N_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[N]] to i32 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i8 2, i1 false, i1 false) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_info, i1 false) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @[[GLOB1]]) #[[ATTR4]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @[[GLOB1]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1 ; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32 ; CHECK-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32 @@ -167,7 +172,7 @@ ; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 ; CHECK-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 ; CHECK-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR4]] +; CHECK-NEXT: call void @__kmpc_target_deinit(i1 false) #[[ATTR4]] ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void @@ -177,22 +182,16 @@ ; CHECK-DISABLED-NEXT: entry: ; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; CHECK-DISABLED-NEXT: [[N_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[N]] to i32 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i8 1, i1 false, i1 true) #[[ATTR4:[0-9]+]] -; CHECK-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; CHECK-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; CHECK-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; CHECK-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] 
-; CHECK-DISABLED: is_worker_check: +; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_info, i1 true) ; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; CHECK-DISABLED: worker_state_machine.begin: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; CHECK-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) ; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 ; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* ; CHECK-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] +; CHECK-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] ; CHECK-DISABLED: worker_state_machine.finished: ; CHECK-DISABLED-NEXT: ret void ; CHECK-DISABLED: worker_state_machine.is_active.check: @@ -208,13 +207,13 @@ ; CHECK-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() ; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] ; CHECK-DISABLED: worker_state_machine.done.barrier: -; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) +; CHECK-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* getelementptr inbounds (%"struct._OMP::KernelEnvironmentTy", %"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_info, i32 0, i32 0), i32 [[TMP0]]) ; CHECK-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] ; CHECK-DISABLED: thread.user_code.check: ; CHECK-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK-DISABLED: user_code.entry: -; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @[[GLOB1]]) #[[ATTR4]] +; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @[[GLOB1]]) #[[ATTR4:[0-9]+]] ; CHECK-DISABLED-NEXT: store i32 0, i32* [[X]], align 4, !noalias !8 ; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1 ; CHECK-DISABLED-NEXT: store i32 1, i32* [[ARRAYIDX1_I]], align 4, !noalias !8 @@ -252,14 +251,14 @@ ; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 ; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 ; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i8 1, i1 true) #[[ATTR4]] +; CHECK-DISABLED-NEXT: call void 
@__kmpc_target_deinit(i1 true) #[[ATTR4]] ; CHECK-DISABLED-NEXT: ret void ; CHECK-DISABLED: worker.exit: ; CHECK-DISABLED-NEXT: ret void ; entry: %N.addr.sroa.0.0.extract.trunc = trunc i64 %N to i32 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 true, i1 true) #3 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_info, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -305,7 +304,7 @@ %call14.i = call i32 @no_openmp(i32* nonnull %x) #5, !noalias !8 %call15.i = call i32 @no_openmp(i32* nonnull %x) #5, !noalias !8 %call16.i = call i32 @no_openmp(i32* nonnull %x) #5, !noalias !8 - call void @__kmpc_target_deinit(%struct.ident_t* nonnull @1, i8 1, i1 true) #3 + call void @__kmpc_target_deinit(i1 true) #3 ret void worker.exit: ; preds = %entry @@ -337,8 +336,6 @@ declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) - ; Function Attrs: convergent declare i32 @no_openmp(i32*) #1 @@ -348,7 +345,8 @@ ; Function Attrs: nounwind declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #3 -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn declare void @llvm.experimental.noalias.scope.decl(metadata) #4 diff --git a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll --- a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll @@ -37,7 +37,9 @@ ;; } ;; } +%"struct._OMP::KernelEnvironmentTy" = type { %struct.ident_t, %"struct._OMP::ConfigurationEnvironmentTy", i16 } %struct.ident_t = type { i32, i32, i32, i32, i8* } +%"struct._OMP::ConfigurationEnvironmentTy" = type { i8, i8 } @0 = private unnamed_addr constant [103 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([103 x i8], [103 x i8]* @0, i32 0, i32 0) }, align 8 @@ -46,6 +48,7 @@ @4 = private unnamed_addr constant [104 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;25;;\00", align 1 @5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([104 x i8], [104 x i8]* @4, i32 0, i32 0) }, align 8 @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode = weak constant i8 1 +@__omp_offloading_2a_d80d3d_test_fallback_l11_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([103 x i8], [103 x i8]* @0, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @6 = private unnamed_addr constant [106 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;1;;\00", align 1 @7 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([106 x i8], [106 x i8]* @6, i32 0, i32 0) }, align 8 @8 = private unnamed_addr constant [75 x i8] 
c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;test_no_fallback;20;1;;\00", align 1 @@ -53,6 +56,7 @@ @10 = private unnamed_addr constant [107 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;25;;\00", align 1 @11 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([107 x i8], [107 x i8]* @10, i32 0, i32 0) }, align 8 @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1 +@__omp_offloading_2a_d80d3d_test_no_fallback_l20_kernel_info = global %"struct._OMP::KernelEnvironmentTy" { %struct.ident_t { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([106 x i8], [106 x i8]* @6, i32 0, i32 0) }, %"struct._OMP::ConfigurationEnvironmentTy" { i8 1, i8 1 }, i16 0 } @12 = private unnamed_addr constant [63 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;known;4;1;;\00", align 1 @13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([63 x i8], [63 x i8]* @12, i32 0, i32 0) }, align 8 @G = external global i32 @@ -62,7 +66,7 @@ define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11() local_unnamed_addr #0 !dbg !15 { entry: %captured_vars_addrs.i.i = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 true, i1 true) #3, !dbg !18 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_d80d3d_test_fallback_l11_kernel_info, i1 true) #3, !dbg !18 %exec_user_code = icmp eq i32 %0, -1, !dbg !18 br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !18 @@ -79,12 +83,10 @@ call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !23 call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !26 call void @unknown() #6, !dbg !27 - call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i8 1, i1 true) #3, !dbg !28 + call void @__kmpc_target_deinit(i1 true) #3, !dbg !28 br label %common.ret } -declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) local_unnamed_addr - ; Function Attrs: convergent declare void @unknown() local_unnamed_addr #1 @@ -101,13 +103,14 @@ ; Function Attrs: nounwind declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #3 -declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) local_unnamed_addr +declare i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* %KernelEnv, i1) +declare void @__kmpc_target_deinit(i1) ; Function Attrs: norecurse nounwind define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20() local_unnamed_addr #4 !dbg !32 { entry: %captured_vars_addrs.i2.i = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i8 1, i1 true, i1 true) #3, !dbg !33 + %0 = call i32 @__kmpc_target_init(%"struct._OMP::KernelEnvironmentTy"* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_kernel_info, i1 true) #3, !dbg !33 %exec_user_code = icmp eq i32 %0, -1, !dbg !33 br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !33 @@ -131,7 +134,7 @@ call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* 
@__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43 call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45 call void @spmd_amenable() - call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i8 1, i1 true) #3, !dbg !46 + call void @__kmpc_target_deinit(i1 true) #3, !dbg !46 br label %common.ret }