diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -781,7 +781,7 @@ emitGenericVarsEpilog(CGF); CGBuilderTy &Bld = CGF.Builder; - OMPBuilder.createTargetDeinit(Bld, IsSPMD); + OMPBuilder.createTargetDeinit(Bld); } void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1461,8 +1461,7 @@ /// Create a runtime call for kmpc_target_deinit /// /// \param Loc The insert and source location description. - /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not. - void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD); + void createTargetDeinit(const LocationDescription &Loc); ///} diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -96,6 +96,10 @@ Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32) __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr) __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8) +__OMP_STRUCT_TYPE(ConfigurationEnvironmentTy, ConfigurationEnvironmentTy, false, + Int8, Int8, Int8) +__OMP_STRUCT_TYPE(KernelEnvironmentTy, KernelEnvironmentTy, false, + ConfigurationEnvironmentTy, IdentPtr, Int16) #undef __OMP_STRUCT_TYPE #undef OMP_STRUCT_TYPE @@ -452,8 +456,8 @@ /* Int */ Int32, /* kmp_task_t */ VoidPtr) /// OpenMP Device runtime functions -__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1) -__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8) +__OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentTyPtr) +__OMP_RTL(__kmpc_target_deinit, false, Void,) __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr) __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32, VoidPtr, VoidPtr, VoidPtrPtr, SizeTy) @@ -1012,9 +1016,9 @@ ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs, SExt)) __OMP_RTL_ATTRS(__kmpc_target_init, AttributeSet(), SExt, - ParamAttrs(AttributeSet(), SExt, SExt)) + ParamAttrs(AttributeSet())) __OMP_RTL_ATTRS(__kmpc_target_deinit, AttributeSet(), AttributeSet(), - ParamAttrs(AttributeSet(), SExt)) + ParamAttrs()) __OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(), ParamAttrs(AttributeSet(), SExt, SExt, SExt, SExt, AttributeSet(), AttributeSet(), AttributeSet(), diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3878,14 +3878,39 @@ ConstantInt *IsSPMDVal = ConstantInt::getSigned( IntegerType::getInt8Ty(Int8->getContext()), IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); - ConstantInt *UseGenericStateMachine = - ConstantInt::getBool(Int32->getContext(), !IsSPMD); - + ConstantInt *UseGenericStateMachineVal = ConstantInt::getSigned( + IntegerType::getInt8Ty(Int8->getContext()), !IsSPMD); + ConstantInt *MayUseNestedParallelismVal = + ConstantInt::getSigned(IntegerType::getInt8Ty(Int8->getContext()), true); + ConstantInt *DebugIndentionLevelVal = + ConstantInt::getSigned(IntegerType::getInt16Ty(Int8->getContext()), 0); + + Function *Kernel = Builder.GetInsertBlock()->getParent(); Function *Fn = getOrCreateRuntimeFunctionPtr( omp::RuntimeFunction::OMPRTL___kmpc_target_init); - - CallInst *ThreadKind = Builder.CreateCall( - Fn, {Ident, IsSPMDVal, UseGenericStateMachine}); + const DataLayout &DL = Fn->getParent()->getDataLayout(); + + Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get( + ConfigurationEnvironmentTy, { + UseGenericStateMachineVal, + MayUseNestedParallelismVal, + IsSPMDVal, + }); + Constant *KernelEnvironmentInitializer = ConstantStruct::get( + KernelEnvironmentTy, { + ConfigurationEnvironmentInitializer, + Ident, + DebugIndentionLevelVal, + }); + Twine KernelEnvironmentName = Kernel->getName() + "_kernel_info"; + GlobalVariable *KernelEnvironment = new GlobalVariable( + M, KernelEnvironmentTy, /* IsConstant */ true, + GlobalValue::ExternalLinkage, KernelEnvironmentInitializer, + KernelEnvironmentName, + /* InsertBefore */ nullptr, llvm::GlobalValue::NotThreadLocal, + DL.getDefaultGlobalsAddressSpace()); + + CallInst *ThreadKind = Builder.CreateCall(Fn, {KernelEnvironment}); Value *ExecUserCode = Builder.CreateICmpEQ( ThreadKind, ConstantInt::get(ThreadKind->getType(), -1), @@ -3918,22 +3943,14 @@ return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt()); } -void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc, - bool IsSPMD) { +void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc) { if (!updateToLocation(Loc)) return; - uint32_t SrcLocStrSize; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); - Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); - ConstantInt *IsSPMDVal = ConstantInt::getSigned( - IntegerType::getInt8Ty(Int8->getContext()), - IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); - Function *Fn = getOrCreateRuntimeFunctionPtr( omp::RuntimeFunction::OMPRTL___kmpc_target_deinit); - Builder.CreateCall(Fn, {Ident, IsSPMDVal}); + Builder.CreateCall(Fn, {}); } void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes( diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -44,6 +44,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -588,6 +589,10 @@ /// one we abort as the kernel is malformed. CallBase *KernelInitCB = nullptr; + /// The constant kernel environement as taken from and passed to + /// __kmpc_target_init. + ConstantStruct *KernelEnvC = nullptr; + /// The __kmpc_target_deinit call in this kernel, if any. If we find more than /// one we abort as the kernel is malformed. CallBase *KernelDeinitCB = nullptr; @@ -692,6 +697,12 @@ "assumptions."); KernelDeinitCB = KIS.KernelDeinitCB; } + if (KIS.KernelEnvC) { + if (KernelEnvC && KernelEnvC != KIS.KernelEnvC) + llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt " + "assumptions."); + KernelEnvC = KIS.KernelEnvC; + } SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker; ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions; ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions; @@ -3386,6 +3397,50 @@ static const char ID; }; +/// Return the \p IdentTy pointer from the constant kernel environment \p +/// KernelEnvC. +Constant *getIdentFromKernelEnvironment(ConstantStruct *KernelEnvC) { + constexpr const unsigned IdentIdx = 1; + return KernelEnvC->getAggregateElement(IdentIdx); +} + +/// Return the \p Configurationnvironment struct from the constant kernel +/// environment \p KernelEnvC. +ConstantStruct * +getConfigurationFromKernelEnvironment(ConstantStruct *KernelEnvC) { + constexpr const unsigned ConfigurationIdx = 0; + return dyn_cast( + KernelEnvC->getAggregateElement(ConfigurationIdx)); +} + +/// Return the \p UseGenericStateMachine flag from the constant kernel +/// environment \p KernelEnvC. +ConstantInt * +getUseGenericStateMachineFromKernelEnvironment(ConstantStruct *KernelEnvC) { + ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC); + constexpr const unsigned UseGenericStateMachineIdx = 0; + return dyn_cast( + ConfigC->getAggregateElement(UseGenericStateMachineIdx)); +} + +/// Return the \p MayUseNestedParallelism flag from the constant kernel +/// environment \p KernelEnvC. +ConstantInt * +getMayUseNestedParallelismFromKernelEnvironment(ConstantStruct *KernelEnvC) { + ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC); + constexpr const unsigned MayUseNestedParallelismIdx = 1; + return cast( + ConfigC->getAggregateElement(MayUseNestedParallelismIdx)); +} + +/// Return the \p ExecMode flag from the constant kernel environment \p +/// KernelEnvC. +ConstantInt *getExecModeFromKernelEnvironment(ConstantStruct *KernelEnvC) { + ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC); + constexpr const unsigned ExecModeIdx = 2; + return dyn_cast(ConfigC->getAggregateElement(ExecModeIdx)); +} + /// The function kernel info abstract attribute, basically, what can we say /// about a function with regards to the KernelInfoState. struct AAKernelInfoFunction : AAKernelInfo { @@ -3398,6 +3453,48 @@ return GuardedInstructions; } + GlobalVariable *getKernelEnvironementGlobalVariable() { + constexpr const int InitKernelEnvironmentArgNo = 0; + return cast( + KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo) + ->stripPointerCasts()); + } + + void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) { + constexpr const unsigned ConfigurationIdx = 0; + Constant *NewKernelEnvC = ConstantFoldInsertValueInstruction( + KernelEnvC, ConfigC, {ConfigurationIdx}); + assert(NewKernelEnvC && "Failed to create new kernel environment"); + KernelEnvC = cast(NewKernelEnvC); + } + + void setUseGenericStateMachineOfKernelEnvironment(ConstantInt *NewVal) { + ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC); + constexpr const unsigned UseGenericStateMachineIdx = 2; + Constant *NewConfigC = ConstantFoldInsertValueInstruction( + ConfigC, NewVal, {UseGenericStateMachineIdx}); + assert(NewConfigC && "Failed to create new configuration environment"); + setConfigurationOfKernelEnvironment(cast(NewConfigC)); + } + + void setMayUseNestedParallelismOfKernelEnvironment(ConstantInt *NewVal) { + ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC); + constexpr const unsigned ExecModeIdx = 1; + Constant *NewConfigC = + ConstantFoldInsertValueInstruction(ConfigC, NewVal, {ExecModeIdx}); + assert(NewConfigC && "Failed to create new configuration environment"); + setConfigurationOfKernelEnvironment(cast(NewConfigC)); + } + + void setExecModeOfKernelEnvironment(ConstantInt *NewVal) { + ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC); + constexpr const unsigned ExecModeIdx = 2; + Constant *NewConfigC = + ConstantFoldInsertValueInstruction(ConfigC, NewVal, {ExecModeIdx}); + assert(NewConfigC && "Failed to create new configuration environment"); + setConfigurationOfKernelEnvironment(cast(NewConfigC)); + } + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { // This is a high-level transform that might change the constant arguments @@ -3446,6 +3543,9 @@ ReachingKernelEntries.insert(Fn); IsKernelEntry = true; + GlobalVariable *KernelEnvGV = getKernelEnvironementGlobalVariable(); + KernelEnvC = cast(KernelEnvGV->getInitializer()); + // For kernels we might need to initialize/finalize the IsSPMD state and // we need to register a simplification callback so that the Attributor // knows the constant arguments to __kmpc_target_init and @@ -3495,27 +3595,30 @@ return Val; }; - constexpr const int InitModeArgNo = 1; - constexpr const int DeinitModeArgNo = 1; - constexpr const int InitUseStateMachineArgNo = 2; - A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo), - StateMachineSimplifyCB); - A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo), - ModeSimplifyCB); + Attributor::SimplifictionCallbackTy KernelConfigurationSimplifyCB = + [&](const IRPosition &IRP, const AbstractAttribute *AA, + bool &UsedAssumedInformation) -> std::optional { + return KernelEnvC; + }; + A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo), - ModeSimplifyCB); + IRPosition::value(*KernelEnvGV->getInitializer()), + KernelConfigurationSimplifyCB); + + ConstantInt *ExecModeC = getExecModeFromKernelEnvironment(KernelEnvC); + auto *AssumedExecModeC = ConstantInt::get( + ExecModeC->getType(), + ExecModeC->getZExtValue() | OMP_TGT_EXEC_MODE_GENERIC_SPMD); // Check if we know we are in SPMD-mode already. - ConstantInt *ModeArg = - dyn_cast(KernelInitCB->getArgOperand(InitModeArgNo)); - if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)) + if (ExecModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD) SPMDCompatibilityTracker.indicateOptimisticFixpoint(); - // This is a generic region but SPMDization is disabled so stop tracking. else if (DisableOpenMPOptSPMDization) + // This is a generic region but SPMDization is disabled so stop + // tracking. SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + else + setExecModeOfKernelEnvironment(AssumedExecModeC); // Register virtual uses of functions we might need to preserve. auto RegisterVirtualUse = [&](RuntimeFunction RFKind, @@ -3616,21 +3719,29 @@ if (!KernelInitCB || !KernelDeinitCB) return ChangeStatus::UNCHANGED; - /// Insert nested Parallelism global variable - Function *Kernel = getAnchorScope(); - Module &M = *Kernel->getParent(); - Type *Int8Ty = Type::getInt8Ty(M.getContext()); - new GlobalVariable(M, Int8Ty, /* isConstant */ true, - GlobalValue::WeakAnyLinkage, - ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0), - Kernel->getName() + "_nested_parallelism"); + ChangeStatus Changed = ChangeStatus::UNCHANGED; // If we can we change the execution mode to SPMD-mode otherwise we build a // custom state machine. - ChangeStatus Changed = ChangeStatus::UNCHANGED; if (!changeToSPMDMode(A, Changed)) { if (!KernelInitCB->getCalledFunction()->isDeclaration()) - return buildCustomStateMachine(A); + Changed |= buildCustomStateMachine(A); + } + + if (KernelEnvC) { + ConstantInt *MayUseNestedParallelismC = + getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC); + ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get( + MayUseNestedParallelismC->getType(), NestedParallelism); + setMayUseNestedParallelismOfKernelEnvironment( + NewMayUseNestedParallelismC); + + // At last, update the KernelEnvc + GlobalVariable *KernelEnvGV = getKernelEnvironementGlobalVariable(); + if (KernelEnvGV->getInitializer() != KernelEnvC) { + KernelEnvGV->setInitializer(KernelEnvC); + Changed = ChangeStatus::CHANGED; + } } return Changed; @@ -3952,16 +4063,8 @@ assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!"); // Check if the kernel is already in SPMD mode, if so, return success. - GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( - (Kernel->getName() + "_exec_mode").str()); - assert(ExecMode && "Kernel without exec mode?"); - assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!"); - - // Set the global exec mode flag to indicate SPMD-Generic mode. - assert(isa(ExecMode->getInitializer()) && - "ExecMode is not an integer!"); - const int8_t ExecModeVal = - cast(ExecMode->getInitializer())->getSExtValue(); + auto *ExecModeC = getExecModeFromKernelEnvironment(KernelEnvC); + const int8_t ExecModeVal = ExecModeC->getSExtValue(); if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC) return true; @@ -3979,27 +4082,8 @@ // kernel is executed in. assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC && "Initially non-SPMD kernel has SPMD exec mode!"); - ExecMode->setInitializer( - ConstantInt::get(ExecMode->getInitializer()->getType(), - ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD)); - - // Next rewrite the init and deinit calls to indicate we use SPMD-mode now. - const int InitModeArgNo = 1; - const int DeinitModeArgNo = 1; - const int InitUseStateMachineArgNo = 2; - - auto &Ctx = getAnchorValue().getContext(); - A.changeUseAfterManifest( - KernelInitCB->getArgOperandUse(InitModeArgNo), - *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), - OMP_TGT_EXEC_MODE_SPMD)); - A.changeUseAfterManifest( - KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), - *ConstantInt::getBool(Ctx, false)); - A.changeUseAfterManifest( - KernelDeinitCB->getArgOperandUse(DeinitModeArgNo), - *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), - OMP_TGT_EXEC_MODE_SPMD)); + setExecModeOfKernelEnvironment(ConstantInt::get( + ExecModeC->getType(), ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD)); ++NumOpenMPTargetRegionKernelsSPMD; @@ -4019,30 +4103,29 @@ if (!ReachedKnownParallelRegions.isValidState()) return ChangeStatus::UNCHANGED; - const int InitModeArgNo = 1; - const int InitUseStateMachineArgNo = 2; + GlobalVariable *KernelEnvGV = getKernelEnvironementGlobalVariable(); + assert(KernelEnvGV && "missing kernel environment global variable"); + auto *ExistingKernelEnvC = + cast(KernelEnvGV->getInitializer()); // Check if the current configuration is non-SPMD and generic state machine. // If we already have SPMD mode or a custom state machine we do not need to // go any further. If it is anything but a constant something is weird and // we give up. - ConstantInt *UseStateMachine = dyn_cast( - KernelInitCB->getArgOperand(InitUseStateMachineArgNo)); - ConstantInt *Mode = - dyn_cast(KernelInitCB->getArgOperand(InitModeArgNo)); + ConstantInt *UseStateMachineC = + getUseGenericStateMachineFromKernelEnvironment(ExistingKernelEnvC); + ConstantInt *ModeC = getExecModeFromKernelEnvironment(ExistingKernelEnvC); // If we are stuck with generic mode, try to create a custom device (=GPU) // state machine which is specialized for the parallel regions that are // reachable by the kernel. - if (!UseStateMachine || UseStateMachine->isZero() || !Mode || - (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)) + if (!UseStateMachineC->isZero() || + (ModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)) return ChangeStatus::UNCHANGED; // If not SPMD mode, indicate we use a custom state machine now. - auto &Ctx = getAnchorValue().getContext(); - auto *FalseVal = ConstantInt::getBool(Ctx, false); - A.changeUseAfterManifest( - KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal); + setUseGenericStateMachineOfKernelEnvironment( + ConstantInt::get(UseStateMachineC->getType(), false)); // If we don't actually need a state machine we are done here. This can // happen if there simply are no parallel regions. In the resulting kernel @@ -4121,6 +4204,7 @@ // UserCodeEntryBB: // user code // __kmpc_target_deinit(...) // + auto &Ctx = getAnchorValue().getContext(); Function *Kernel = getAssociatedFunction(); assert(Kernel && "Expected an associated function!"); @@ -4204,7 +4288,7 @@ StateMachineBeginBB->end()), DLoc)); - Value *Ident = KernelInitCB->getArgOperand(0); + Value *Ident = getIdentFromKernelEnvironment(KernelEnvC); Value *GTid = KernelInitCB; FunctionCallee BarrierFn = @@ -4334,6 +4418,39 @@ ChangeStatus updateImpl(Attributor &A) override { KernelInfoState StateBefore = getState(); + // When we leave this function this RAII will make sure the member + // KernelEnvC is updated properly depending on the state. That member is + // used for simplification of values and needs to be up to date at all + // times. + struct UpdateKernelEnvCRAII { + AAKernelInfoFunction &AA; + + UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {} + + ~UpdateKernelEnvCRAII() { + if (!AA.KernelEnvC) + return; + + GlobalVariable *KernelEnvGV = AA.getKernelEnvironementGlobalVariable(); + ConstantStruct *ExistingKernelEnvC = + cast(KernelEnvGV->getInitializer()); + + if (!AA.isValidState()) { + AA.KernelEnvC = ExistingKernelEnvC; + return; + } + + if (!AA.ReachedKnownParallelRegions.isValidState()) + AA.setUseGenericStateMachineOfKernelEnvironment( + getUseGenericStateMachineFromKernelEnvironment( + ExistingKernelEnvC)); + + if (!AA.SPMDCompatibilityTracker.isValidState()) + AA.setExecModeOfKernelEnvironment( + getExecModeFromKernelEnvironment(ExistingKernelEnvC)); + } + } RAII(*this); + // Callback to check a read/write instruction. auto CheckRWInst = [&](Instruction &I) { // We handle calls later. diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -128,6 +128,7 @@ -nocudalib -nogpulib -nostdinc -fopenmp -fopenmp-cuda-mode -Wno-unknown-cuda-version + -DOMPTARGET_DEVICE_RUNTIME -I${include_directory} -I${devicertl_base_directory}/../include ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL} diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h --- a/openmp/libomptarget/DeviceRTL/include/Debug.h +++ b/openmp/libomptarget/DeviceRTL/include/Debug.h @@ -50,8 +50,6 @@ struct DebugEntryRAII { DebugEntryRAII(const char *File, const unsigned Line, const char *Function); ~DebugEntryRAII(); - - static void init(); }; #endif diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -214,12 +214,14 @@ /// Kernel /// ///{ +// Forward declaration +struct KernelEnvironmentTy; + int8_t __kmpc_is_spmd_exec_mode(); -int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode, - bool UseGenericStateMachine); +int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment); -void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode); +void __kmpc_target_deinit(); ///} diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h --- a/openmp/libomptarget/DeviceRTL/include/State.h +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -17,6 +17,9 @@ #include "Types.h" #include "Utils.h" +// Forward declaration. +struct KernelEnvironmentTy; + #pragma omp begin declare target device_type(nohost) namespace ompx { @@ -113,7 +116,10 @@ #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) /// Initialize the state machinery. Must be called by all threads. -void init(bool IsSPMD); +void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); + +/// Return the kernel environment associated with the current kernel. +KernelEnvironmentTy &getKernelEnvironment(); /// TODO enum ValueKind { diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #include "Configuration.h" -#include "DeviceEnvironment.h" +#include "Environment.h" #include "State.h" #include "Types.h" @@ -23,7 +23,6 @@ // defined by CGOpenMPRuntimeGPU extern uint32_t __omp_rtl_debug_kind; extern uint32_t __omp_rtl_assume_no_thread_state; -extern uint32_t __omp_rtl_assume_no_nested_parallelism; // This variable should be visibile to the plugin so we override the default // hidden visibility. @@ -53,7 +52,7 @@ bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } bool config::mayUseNestedParallelism() { - return !__omp_rtl_assume_no_nested_parallelism; + return state::getKernelEnvironment().Configuration.MayUseNestedParallelism; } #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp --- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -12,8 +12,10 @@ #include "Debug.h" #include "Configuration.h" +#include "Environment.h" #include "Interface.h" #include "Mapping.h" +#include "State.h" #include "Types.h" using namespace ompx; @@ -31,15 +33,13 @@ } } -/// Current indentation level for the function trace. Only accessed by thread 0. -__attribute__((loader_uninitialized)) static uint32_t Level; -#pragma omp allocate(Level) allocator(omp_pteam_mem_alloc) - DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line, const char *Function) { if (config::isDebugMode(config::DebugKind::FunctionTracing) && mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) { + uint16_t &Level = state::getKernelEnvironment().DebugIndentionLevel; + for (int I = 0; I < Level; ++I) PRINTF("%s", " "); @@ -51,10 +51,10 @@ DebugEntryRAII::~DebugEntryRAII() { if (config::isDebugMode(config::DebugKind::FunctionTracing) && - mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) + mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) { + uint16_t &Level = state::getKernelEnvironment().DebugIndentionLevel; Level--; + } } -void DebugEntryRAII::init() { Level = 0; } - #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "Debug.h" +#include "Environment.h" #include "Interface.h" #include "Mapping.h" #include "State.h" @@ -23,11 +24,12 @@ #pragma omp begin declare target device_type(nohost) -static void inititializeRuntime(bool IsSPMD) { +static void inititializeRuntime(bool IsSPMD, + KernelEnvironmentTy &KernelEnvironment) { // Order is important here. synchronize::init(IsSPMD); mapping::init(IsSPMD); - state::init(IsSPMD); + state::init(IsSPMD, KernelEnvironment); } /// Simple generic state machine for worker threads. @@ -67,16 +69,17 @@ /// /// \param Ident Source location identification, can be NULL. /// -int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode, - bool UseGenericStateMachine) { +int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment) { FunctionTracingRAII(); - const bool IsSPMD = - Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD; + ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; + bool IsSPMD = Configuration.ExecMode & + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD; + bool UseGenericStateMachine = Configuration.UseGenericStateMachine; if (IsSPMD) { - inititializeRuntime(/* IsSPMD */ true); + inititializeRuntime(/* IsSPMD */ true, KernelEnvironment); synchronize::threadsAligned(); } else { - inititializeRuntime(/* IsSPMD */ false); + inititializeRuntime(/* IsSPMD */ false, KernelEnvironment); // No need to wait since only the main threads will execute user // code and workers will run into a barrier right away. } @@ -104,7 +107,7 @@ // thread's warp, so none of its threads can ever be active worker threads. if (UseGenericStateMachine && mapping::getThreadIdInBlock() < mapping::getBlockSize(IsSPMD)) { - genericStateMachine(Ident); + genericStateMachine(KernelEnvironment.Ident); } else { // Retrieve the work function just to ensure we always call // __kmpc_kernel_parallel even if a custom state machine is used. @@ -128,10 +131,9 @@ /// /// \param Ident Source location identification, can be NULL. /// -void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) { +void __kmpc_target_deinit() { FunctionTracingRAII(); - const bool IsSPMD = - Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD; + bool IsSPMD = mapping::isSPMDMode(); state::assumeInitialState(IsSPMD); if (IsSPMD) return; diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -9,8 +9,8 @@ //===----------------------------------------------------------------------===// #include "State.h" -#include "Configuration.h" #include "Debug.h" +#include "Environment.h" #include "Interface.h" #include "Mapping.h" #include "Synchronization.h" @@ -34,6 +34,9 @@ extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment))); #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc) +/// The kernel environment passed to the init method by the compiler. +static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr); + namespace { /// Fallback implementations are missing to trigger a link time error. @@ -241,15 +244,19 @@ } // namespace -void state::init(bool IsSPMD) { +void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { SharedMemorySmartStack.init(IsSPMD); if (mapping::isInitialThreadInLevel0(IsSPMD)) { TeamState.init(IsSPMD); - DebugEntryRAII::init(); ThreadStates = nullptr; + KernelEnvironmentPtr = &KernelEnvironment; } } +KernelEnvironmentTy &state::getKernelEnvironment() { + return *KernelEnvironmentPtr; +} + void state::enterDataEnvironment(IdentTy *Ident) { ASSERT(config::mayUseThreadStates() && "Thread state modified while explicitly disabled!"); diff --git a/openmp/libomptarget/include/DeviceEnvironment.h b/openmp/libomptarget/include/DeviceEnvironment.h deleted file mode 100644 --- a/openmp/libomptarget/include/DeviceEnvironment.h +++ /dev/null @@ -1,25 +0,0 @@ -//===---- device_environment.h - OpenMP GPU device environment ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Global device environment -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_ -#define _OMPTARGET_DEVICE_ENVIRONMENT_H_ - -// deviceRTL uses and DeviceRTL uses explicit definitions - -struct DeviceEnvironmentTy { - uint32_t DebugKind; - uint32_t NumDevices; - uint32_t DeviceNum; - uint32_t DynamicMemSize; -}; - -#endif diff --git a/openmp/libomptarget/include/Environment.h b/openmp/libomptarget/include/Environment.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/include/Environment.h @@ -0,0 +1,53 @@ +//===------------ Environment.h - OpenMP GPU environments --------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Environments shared between host and device. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_ENVIRONMENT_H_ +#define _OMPTARGET_ENVIRONMENT_H_ + +#ifdef OMPTARGET_DEVICE_RUNTIME +#include "Types.h" +#else +#include "SourceInfo.h" + +#include + +using IdentTy = ident_t; +#endif + +#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" + +struct DeviceEnvironmentTy { + uint32_t DebugKind; + uint32_t NumDevices; + uint32_t DeviceNum; + uint32_t DynamicMemSize; +}; + +// NOTE: Please don't change the order of those members as they are used in the +// middle end. Always add the new data member at the end. +struct ConfigurationEnvironmentTy { + uint8_t UseGenericStateMachine; + uint8_t MayUseNestedParallelism; + llvm::omp::OMPTgtExecModeFlags ExecMode; +}; + +// NOTE: Please don't change the order of those members as they are used in the +// middle end. Always add the new data member at the end. +struct KernelEnvironmentTy { + ConfigurationEnvironmentTy Configuration; + IdentTy *Ident; + /// Current indentation level for the function trace. Only accessed by thread + /// 0. + uint16_t DebugIndentionLevel; +}; + +#endif // _OMPTARGET_ENVIRONMENT_H_ diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -21,7 +21,7 @@ #include #include "Debug.h" -#include "DeviceEnvironment.h" +#include "Environment.h" #include "GlobalHandler.h" #include "PluginInterface.h" #include "Utilities.h" @@ -431,7 +431,7 @@ /// Launch the AMDGPU kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, - uint64_t NumBlocks, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -19,7 +19,7 @@ #include #include "Debug.h" -#include "DeviceEnvironment.h" +#include "Environment.h" #include "GlobalHandler.h" #include "JIT.h" #include "MemoryManager.h" @@ -627,6 +627,11 @@ /// Map of host pinned allocations used for optimize device transfers. PinnedAllocationMapTy PinnedAllocs; + +private: + /// Return the kernel environment object for kernel \p Name. + Expected + getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image); }; /// Class implementing common functionalities of offload plugins. Each plugin diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -554,32 +554,43 @@ return Plugin::success(); } -Expected -GenericDeviceTy::getExecutionModeForKernel(StringRef Name, - DeviceImageTy &Image) { - // Create a metadata object for the exec mode global (auto-generated). - StaticGlobalTy ExecModeGlobal(Name.data(), - "_exec_mode"); +Expected +GenericDeviceTy::getKernelEnvironmentForKernel(StringRef Name, + DeviceImageTy &Image) { + // Create a metadata object for the kernel environment object. + StaticGlobalTy KernelEnv(Name.data(), "_kernel_info"); - // Retrieve execution mode for the kernel. This may fail since some kernels - // may not have an execution mode. + // Retrieve kernel environment object for the kernel. GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler(); - if (auto Err = GHandler.readGlobalFromImage(*this, Image, ExecModeGlobal)) { + if (auto Err = GHandler.readGlobalFromImage(*this, Image, KernelEnv)) { // Consume the error since it is acceptable to fail. [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); - DP("Failed to read execution mode for '%s': %s\n" - "Using default SPMD (2) execution mode\n", - Name.data(), ErrStr.data()); + DP("Failed to read kernel environment object for '%s': %s\n", Name.data(), + ErrStr.data()); - return OMP_TGT_EXEC_MODE_SPMD; + return createStringError(inconvertibleErrorCode(), ErrStr); } + return KernelEnv.getValue(); +} + +Expected +GenericDeviceTy::getExecutionModeForKernel(StringRef Name, + DeviceImageTy &Image) { + auto KernelEnvOrError = getKernelEnvironmentForKernel(Name, Image); + // We error out directly if we can't read the kernel environment object. + if (!KernelEnvOrError) + return KernelEnvOrError.takeError(); + + auto &KernelEnv = *KernelEnvOrError; + auto ExecMode = KernelEnv.Configuration.ExecMode; + // Check that the retrieved execution mode is valid. - if (!GenericKernelTy::isValidExecutionMode(ExecModeGlobal.getValue())) - return Plugin::error("Invalid execution mode %d for '%s'", - ExecModeGlobal.getValue(), Name.data()); + if (!GenericKernelTy::isValidExecutionMode(ExecMode)) + return Plugin::error("Invalid execution mode %d for '%s'", ExecMode, + Name.data()); - return ExecModeGlobal.getValue(); + return ExecMode; } Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr, diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -17,7 +17,7 @@ #include #include "Debug.h" -#include "DeviceEnvironment.h" +#include "Environment.h" #include "GlobalHandler.h" #include "PluginInterface.h" diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -17,7 +17,7 @@ #include #include "Debug.h" -#include "DeviceEnvironment.h" +#include "Environment.h" #include "GlobalHandler.h" #include "PluginInterface.h" #include "omptarget.h" diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -37,7 +37,7 @@ #include "internal.h" #include "rt.h" -#include "DeviceEnvironment.h" +#include "Environment.h" #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" #include "print_tracing.h" diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -23,7 +23,7 @@ #include #include "Debug.h" -#include "DeviceEnvironment.h" +#include "Environment.h" #include "omptarget.h" #include "omptargetplugin.h"