diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -2476,6 +2476,20 @@
   }
 };
 
+/// To encapsulate helper methods to be used by target specific specializations
+/// of CGOpenMPRuntimeGPU.
+class CodeGenUtil {
+public:
+  static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                         QualType FieldTy);
+
+  template <class... As>
+  static llvm::GlobalVariable *createGlobalStruct(CodeGenModule &CGM,
+                                                  QualType Ty, bool IsConstant,
+                                                  ArrayRef<llvm::Constant *> Data,
+                                                  const Twine &Name,
+                                                  As &&... Args);
+};
+
 } // namespace CodeGen
 } // namespace clang
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1048,17 +1048,6 @@
                          AlignmentSource::Decl);
 }
 
-static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
-                                       QualType FieldTy) {
-  auto *Field = FieldDecl::Create(
-      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
-      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
-      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
-  Field->setAccess(AS_public);
-  DC->addDecl(Field);
-  return Field;
-}
-
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM, StringRef FirstSeparator,
                                  StringRef Separator)
     : CGM(CGM), FirstSeparator(FirstSeparator), Separator(Separator),
@@ -1352,11 +1341,21 @@
   }
 }
 
+FieldDecl *clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(
+    ASTContext &C, DeclContext *DC, QualType FieldTy) {
+  auto *Field = FieldDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
+      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
+      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
+  Field->setAccess(AS_public);
+  DC->addDecl(Field);
+  return Field;
+}
+
 template <class... As>
-static llvm::GlobalVariable *
-createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
-                   ArrayRef<llvm::Constant *> Data, const Twine &Name,
-                   As &&... Args) {
+llvm::GlobalVariable *clang::CodeGen::CodeGenUtil::createGlobalStruct(
+    CodeGenModule &CGM, QualType Ty, bool IsConstant,
+    ArrayRef<llvm::Constant *> Data, const Twine &Name, As &&... Args) {
   const auto *RD = cast<RecordDecl>(Ty->getAsTagDecl());
   const CGRecordLayout &RL = CGM.getTypes().getCGRecordLayout(RD);
   ConstantInitBuilder CIBuilder(CGM);
@@ -3072,7 +3071,7 @@
       llvm::ConstantInt::get(CGM.Int32Ty, Flags),
       llvm::ConstantInt::get(CGM.Int32Ty, 0)};
   std::string EntryName = getName({"omp_offloading", "entry", ""});
-  llvm::GlobalVariable *Entry = createGlobalStruct(
+  llvm::GlobalVariable *Entry = CodeGenUtil::createGlobalStruct(
       CGM, getTgtOffloadEntryQTy(), /*IsConstant=*/true, Data,
       Twine(EntryName).concat(Name), llvm::GlobalValue::WeakAnyLinkage);
 
@@ -3350,12 +3349,12 @@
     ASTContext &C = CGM.getContext();
     RecordDecl *RD = C.buildImplicitRecord("__tgt_offload_entry");
     RD->startDefinition();
-    addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-    addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
-    addFieldToRecordDecl(C, RD, C.getSizeType());
-    addFieldToRecordDecl(
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType());
+    CodeGenUtil::addFieldToRecordDecl(
         C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
-    addFieldToRecordDecl(
+    CodeGenUtil::addFieldToRecordDecl(
         C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
     RD->completeDefinition();
     RD->addAttr(PackedAttr::CreateImplicit(C));
@@ -3399,7 +3398,7 @@
       if (Pair.second.isLocalPrivate() &&
          VD->getType()->isLValueReferenceType())
        Type = C.getPointerType(Type);
-      FieldDecl *FD = addFieldToRecordDecl(C, RD, Type);
+      FieldDecl *FD = CodeGenUtil::addFieldToRecordDecl(C, RD, Type);
       if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
@@ -3433,27 +3432,27 @@
   //       };
   RecordDecl *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
   UD->startDefinition();
-  addFieldToRecordDecl(C, UD, KmpInt32Ty);
-  addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, UD, KmpInt32Ty);
+  CodeGenUtil::addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
   UD->completeDefinition();
   QualType KmpCmplrdataTy = C.getRecordType(UD);
   RecordDecl *RD = C.buildImplicitRecord("kmp_task_t");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
-  addFieldToRecordDecl(C, RD, KmpInt32Ty);
-  addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
-  addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt32Ty);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
   if (isOpenMPTaskLoopDirective(Kind)) {
     QualType KmpUInt64Ty =
         CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
     QualType KmpInt64Ty =
         CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
-    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpUInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpInt64Ty);
-    addFieldToRecordDecl(C, RD, KmpInt32Ty);
-    addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpUInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpUInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt32Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
   }
   RD->completeDefinition();
   return RD;
@@ -3469,9 +3468,9 @@
   //       };
   RecordDecl *RD = C.buildImplicitRecord("kmp_task_t_with_privates");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, KmpTaskTQTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, KmpTaskTQTy);
   if (const RecordDecl *PrivateRD = createPrivatesRecordDecl(CGM, Privates))
-    addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD));
+    CodeGenUtil::addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD));
   RD->completeDefinition();
   return RD;
 }
@@ -4100,9 +4099,9 @@
    RecordDecl *KmpAffinityInfoRD =
        C.buildImplicitRecord("kmp_task_affinity_info_t");
    KmpAffinityInfoRD->startDefinition();
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType());
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType());
-    addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy);
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy);
    KmpAffinityInfoRD->completeDefinition();
    KmpTaskAffinityInfoTy = C.getRecordType(KmpAffinityInfoRD);
  }
@@ -4536,9 +4535,9 @@
   if (KmpDependInfoTy.isNull()) {
    RecordDecl *KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info");
    KmpDependInfoRD->startDefinition();
-    addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
-    addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
-    addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
+    CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
    KmpDependInfoRD->completeDefinition();
    KmpDependInfoTy = C.getRecordType(KmpDependInfoRD);
  }
@@ -5985,13 +5984,13 @@
   ASTContext &C = CGM.getContext();
   RecordDecl *RD = C.buildImplicitRecord("kmp_taskred_input_t");
   RD->startDefinition();
-  const FieldDecl *SharedFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *OrigFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *SizeFD = addFieldToRecordDecl(C, RD, C.getSizeType());
-  const FieldDecl *InitFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *FiniFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *CombFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  const FieldDecl *FlagsFD = addFieldToRecordDecl(
+  const FieldDecl *SharedFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *OrigFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *SizeFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType());
+  const FieldDecl *InitFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FiniFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *CombFD = CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FlagsFD = CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false));
   RD->completeDefinition();
   QualType RDType = C.getRecordType(RD);
@@ -11104,9 +11103,9 @@
    //  };
    RD = C.buildImplicitRecord("kmp_dim");
    RD->startDefinition();
-    addFieldToRecordDecl(C, RD, Int64Ty);
-    addFieldToRecordDecl(C, RD, Int64Ty);
-    addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
+    CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty);
    RD->completeDefinition();
    KmpDimTy = C.getRecordType(RD);
  } else {
@@ -11536,8 +11535,8 @@
   if (VI == I->getSecond().end()) {
    RecordDecl *RD = C.buildImplicitRecord("lasprivate.conditional");
    RD->startDefinition();
-    VDField = addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType());
-    FiredField = addFieldToRecordDecl(C, RD, C.CharTy);
+    VDField = CodeGenUtil::addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType());
+    FiredField = CodeGenUtil::addFieldToRecordDecl(C, RD, C.CharTy);
    RD->completeDefinition();
    NewType = C.getRecordType(RD);
    Address Addr = CGF.CreateMemTemp(NewType, C.getDeclAlign(VD), VD->getName());
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -27,6 +27,16 @@
 public:
   explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
 
+private:
+  /// Current nesting level of parallel region
+  int ParallelLevel = 0;
+
+  /// Maximum nesting level of parallel region
+  int MaxParallelLevel = 0;
+
+  /// Struct to store kernel descriptors
+  QualType TgtAttributeStructQTy;
+
   /// Get the GPU warp size.
   llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
 
@@ -35,6 +45,72 @@
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate global variable for TransferMedium
+  llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) override;
+
+  /// Allocate global variable for SharedStaticRD
+  llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Get global variable KernelStaticGlobalized which is a shared pointer for
+  /// the global memory in the global memory buffer used for the given kernel
+  llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Get target specific PrePostActionTy
+  PrePostActionTy *getPrePostActionTy() override;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName, llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                StringRef ParentName,
+                                llvm::Function *&OutlinedFn,
+                                llvm::Constant *&OutlinedFnID,
+                                bool IsOffloadEntry,
+                                const RegionCodeGenTy &CodeGen) override;
+
+  /// Create a unique global variable to indicate the flat-work-group-size
+  /// for this region. Values are [256..1024].
+  static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name,
+                                       unsigned WGSize);
+
+  /// Generate global variables _wg_size, kern_desc, __tgt_attribute_struct.
+  /// Also generate appropriate value of attribute amdgpu-flat-work-group-size
+  void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D,
+                        llvm::Function *&OutlinedFn, bool IsGeneric);
+
+  /// Returns __tgt_attribute_struct type.
+  QualType getTgtAttributeStructQTy();
+
+  /// Emit structure descriptor for a kernel
+  void emitStructureKernelDesc(CodeGenModule &CGM, StringRef Name,
+                               int16_t WG_Size, int8_t Mode,
+                               int8_t HostServices, int8_t MaxParallelLevel);
+
+  class AMDGCNPrePostActionTy final : public PrePostActionTy {
+    int &ParallelLevel;
+    int &MaxParallelLevel;
+
+  public:
+    AMDGCNPrePostActionTy(int &ParallelLevel, int &MaxParallelLevel)
+        : ParallelLevel(ParallelLevel), MaxParallelLevel(MaxParallelLevel) {}
+    void Enter(CodeGenFunction &CGF) override {
+      // Count the number of nested parallels.
+      if (ParallelLevel > MaxParallelLevel)
+        MaxParallelLevel = ParallelLevel;
+      ParallelLevel++;
+    }
+    void Exit(CodeGenFunction &CGF) override { ParallelLevel--; }
+  };
 };
 
 } // namespace CodeGen
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -30,6 +31,7 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+  StaticRDLinkage = llvm::GlobalValue::PrivateLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +61,176 @@
   return Bld.CreateTrunc(
       Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeAMDGCN::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::WeakAnyLinkage, llvm::UndefValue::get(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                    llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+      llvm::UndefValue::get(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_",
+      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized*/ true);
+}
+
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+                                                     StringRef Name,
+                                                     unsigned WGSize) {
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Name + Twine("_wg_size"),
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_device),
+      /*isExternallyInitialized*/ false);
+  CGM.addCompilerUsedGlobal(GVMode);
+}
+
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+                                             const OMPExecutableDirective &D,
+                                             llvm::Function *&OutlinedFn,
+                                             bool IsGeneric) {
+  int FlatAttr = 0;
+  bool FlatAttrEmitted = false;
+  unsigned DefaultWorkGroupSz =
+      CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+    const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+    unsigned MaxWorkGroupSz =
+        CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+    unsigned CompileTimeThreadLimit = 0;
+    // Only one of thread_limit or num_threads is used, can't do it for both
+    if (ThreadLimitClause && !NumThreadsClause) {
+      Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+      clang::Expr::EvalResult Result;
+      if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    } else if (!ThreadLimitClause && NumThreadsClause) {
+      Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+      clang::Expr::EvalResult Result;
+      if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    }
+
+    // Add kernel metadata if ThreadLimit Clause is compile time constant > 0
+    if (CompileTimeThreadLimit > 0) {
+      // Add the WarpSize to generic, to reflect what runtime dispatch does.
+      if (IsGeneric)
+        CompileTimeThreadLimit +=
+            CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+      if (CompileTimeThreadLimit > MaxWorkGroupSz)
+        CompileTimeThreadLimit = MaxWorkGroupSz;
+      std::string AttrVal = llvm::utostr(CompileTimeThreadLimit);
+      FlatAttr = CompileTimeThreadLimit;
+      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                            AttrVal + "," + AttrVal);
+      setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+                               CompileTimeThreadLimit);
+    }
+    FlatAttrEmitted = true;
+  } // end of amdgcn teams or parallel directive
+
+  // emit amdgpu-flat-work-group-size if not emitted already.
+  if (!FlatAttrEmitted) {
+    std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+    OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                          FlatAttrVal + "," + FlatAttrVal);
+  }
+  // Emit a kernel descriptor for runtime.
+  StringRef KernDescName = OutlinedFn->getName();
+  CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(CGM, KernDescName, FlatAttr,
+                                                 IsGeneric,
+                                                 1, // Uses HostServices
+                                                 MaxParallelLevel);
+  // Reset it to zero for any subsequent kernel
+  MaxParallelLevel = 0;
+}
+
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*SPMD*/ false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*Generic*/ true);
+}
+
+PrePostActionTy *CGOpenMPRuntimeAMDGCN::getPrePostActionTy() {
+  return new AMDGCNPrePostActionTy(ParallelLevel, MaxParallelLevel);
+}
+
+/// Emit structure descriptor for a kernel
+void CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(
+    CodeGenModule &CGM, StringRef Name, int16_t WG_Size, int8_t Mode,
+    int8_t HostServices, int8_t MaxParallelLevel) {
+
+  // Create all device images
+  llvm::Constant *AttrData[] = {
+      llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+      llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+      llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+      llvm::ConstantInt::get(CGM.Int8Ty, HostServices), // 1 => use HostServices
+      llvm::ConstantInt::get(CGM.Int8Ty, MaxParallelLevel)}; // number of nests
+
+  llvm::GlobalVariable *AttrImages =
+      clang::CodeGen::CodeGenUtil::createGlobalStruct(
+          CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(),
+          AttrData, Name + Twine("_kern_desc"),
+          llvm::GlobalValue::WeakAnyLinkage);
+  CGM.addCompilerUsedGlobal(AttrImages);
+}
+
+// Create Tgt Attribute Struct type.
+QualType CGOpenMPRuntimeAMDGCN::getTgtAttributeStructQTy() {
+  ASTContext &C = CGM.getContext();
+  QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*Width=*/8, /*Signed=*/1);
+  QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*Width=*/16, /*Signed=*/1);
+  if (TgtAttributeStructQTy.isNull()) {
+    RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+    RD->startDefinition();
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty); // Version
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(
+        C, RD, KmpInt16Ty); // Struct Size in bytes.
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty); // WG_size
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // Mode
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // HostServices
+    clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty); // MaxParallelLevel
+    RD->completeDefinition();
+    TgtAttributeStructQTy = C.getRecordType(RD);
+  }
+  return TgtAttributeStructQTy;
+}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,14 @@
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+
+protected:
+  /// true if we're definitely in the parallel region.
+  bool IsInParallelRegion = false;
+
+  /// Linkage type of StaticRD Global variable
+  llvm::GlobalValue::LinkageTypes StaticRDLinkage;
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,36 +107,6 @@
                          uint64_t Size, int32_t Flags,
                          llvm::GlobalValue::LinkageTypes Linkage) override;
 
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
-  /// Emit outlined function specialized for the Single Program
-  /// Multiple Data programming model for applicable target directives on the
-  /// NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// \param CodeGen Object containing the target statements.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
-
   /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -194,6 +172,36 @@
   /// Full/Lightweight runtime mode. Used for better optimization.
   unsigned getDefaultLocationReserved2Flags() const override;
 
+  /// Emit outlined function specialized for the Fork-Join
+  /// programming model for applicable target directives on the NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                         llvm::Function *&OutlinedFn,
+                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                         const RegionCodeGenTy &CodeGen);
+
+  /// Emit outlined function specialized for the Single Program
+  /// Multiple Data programming model for applicable target directives on the
+  /// NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                      llvm::Function *&OutlinedFn,
+                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                      const RegionCodeGenTy &CodeGen);
+
 public:
   explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
   void clear() override;
@@ -211,6 +219,39 @@
   /// Get the maximum number of threads in a block of the GPU.
   virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
 
+  /// Allocate global variable for TransferMedium
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) = 0;
+
+  /// Allocate global variable for SharedStaticRD
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) = 0;
+
+  /// Allocate global variable for KernelStaticGlobalized
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) = 0;
+
+  /// Get target specific PrePostAction
+  virtual PrePostActionTy *getPrePostActionTy() = 0;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) = 0;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        bool IsOffloadEntry,
+                                        const RegionCodeGenTy &CodeGen) = 0;
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
@@ -424,8 +465,6 @@
   /// true if currently emitting code for target/teams/distribute region, false
   /// - otherwise.
   bool IsInTTDRegion = false;
-  /// true if we're definitely in the parallel region.
-  bool IsInParallelRegion = false;
   /// Map between an outlined function and its wrapper.
   llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -1190,13 +1191,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1321,13 +1317,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1888,12 +1879,11 @@
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
+    emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                          IsOffloadEntry, CodeGen);
   else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
-
+    emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                             IsOffloadEntry, CodeGen);
   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
 
@@ -1963,22 +1953,9 @@
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
   // Emit target region as a standalone region.
-  class NVPTXPrePostActionTy : public PrePostActionTy {
-    bool &IsInParallelRegion;
-    bool PrevIsInParallelRegion;
-
-  public:
-    NVPTXPrePostActionTy(bool &IsInParallelRegion)
-        : IsInParallelRegion(IsInParallelRegion) {}
-    void Enter(CodeGenFunction &CGF) override {
-      PrevIsInParallelRegion = IsInParallelRegion;
-      IsInParallelRegion = true;
-    }
-    void Exit(CodeGenFunction &CGF) override {
-      IsInParallelRegion = PrevIsInParallelRegion;
-    }
-  } Action(IsInParallelRegion);
-  CodeGen.setAction(Action);
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+  std::unique_ptr<PrePostActionTy> Action(RT.getPrePostActionTy());
+  CodeGen.setAction(*Action);
   bool PrevIsInTTDRegion = IsInTTDRegion;
   IsInTTDRegion = false;
   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
@@ -3231,6 +3208,7 @@
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   CGBuilderTy &Bld = CGF.Builder;
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
 
   // This array is used as a medium to transfer, one reduce element at a time,
   // the data from the first lane of every warp to lanes in the first warp
@@ -3246,16 +3224,11 @@
   unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
   if (!TransferMedium) {
     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
-        llvm::Constant::getNullValue(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
+    TransferMedium =
+        RT.allocateTransferMediumGlobal(CGM, Ty, TransferMediumName);
     CGM.addCompilerUsedGlobal(TransferMedium);
   }
 
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
   // nvptx_lane_id = nvptx_id % warpsize
@@ -5104,6 +5077,7 @@
 }
 
 void CGOpenMPRuntimeGPU::clear() {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
   if (!GlobalizedRecords.empty() &&
       !CGM.getLangOpts().OpenMPCUDATargetParallel) {
     ASTContext &C = CGM.getContext();
@@ -5152,9 +5126,6 @@
             llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
       }
       // Allocate SharedMemorySize buffer for the shared memory.
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore this code as sson as nvlink is fixed.
       if (!SharedStaticRD->field_empty()) {
         llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
         QualType SubTy = C.getConstantArrayType(
@@ -5171,13 +5142,7 @@
     if (!SharedStaticRD->field_empty()) {
       QualType StaticTy = C.getRecordType(SharedStaticRD);
       llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
-          llvm::Constant::getNullValue(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
+      auto *GV = RT.allocateSharedStaticRDGlobal(CGM, LLVMStaticTy);
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
@@ -5198,14 +5163,11 @@
           C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                  /*IndexTypeQuals=*/0);
       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
+      auto *GV =
+          new llvm::GlobalVariable(CGM.getModule(), LLVMArr2Ty,
+                                   /*isConstant=*/false, RT.StaticRDLinkage,
+                                   llvm::Constant::getNullValue(LLVMArr2Ty),
+                                   "_openmp_static_glob_rd_$_");
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
@@ -5238,7 +5200,7 @@
       // Restore CommonLinkage as soon as nvlink is fixed.
       auto *GV = new llvm::GlobalVariable(
           CGM.getModule(), LLVMReductionsBufferTy,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
+          /*isConstant=*/false, RT.StaticRDLinkage,
          llvm::Constant::getNullValue(LLVMReductionsBufferTy),
          "_openmp_teams_reductions_buffer_$_");
      KernelTeamsReductionPtr->setInitializer(
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -35,6 +35,54 @@
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
 
+  /// Allocate global variable for TransferMedium
+  llvm::GlobalVariable *allocateTransferMediumGlobal(CodeGenModule &CGM,
+                                                     llvm::ArrayType *Ty,
+                                                     StringRef Name) override;
+
+  /// Allocate global variable for SharedStaticRD
+  llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate global variable for KernelStaticGlobalized
+  llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Get target specific PrePostAction
+  PrePostActionTy *getPrePostActionTy() override;
+
+  /// Target independent wrapper over target specific emitSPMDKernel()
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName, llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target independent wrapper over target specific emitNonSPMDKernel()
+  void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                StringRef ParentName,
+                                llvm::Function *&OutlinedFn,
+                                llvm::Constant *&OutlinedFnID,
+                                bool IsOffloadEntry,
+                                const RegionCodeGenTy &CodeGen) override;
+
+  /// NVPTX specific class for PrePostActionTy
+  class NVPTXPrePostActionTy final : public PrePostActionTy {
+    bool &IsInParallelRegion;
+    bool PrevIsInParallelRegion;
+
+  public:
+    NVPTXPrePostActionTy(bool &IsInParallelRegion)
+        : IsInParallelRegion(IsInParallelRegion) {}
+    void Enter(CodeGenFunction &CGF) override {
+      PrevIsInParallelRegion = IsInParallelRegion;
+      IsInParallelRegion = true;
+    }
+    void Exit(CodeGenFunction &CGF) override {
+      IsInParallelRegion = PrevIsInParallelRegion;
+    }
+  };
 };
 
 } // CodeGen namespace.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,11 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+  // FIXME: nvlink does not handle weak linkage correctly (object with the
+  // different size are reported as erroneous).
+  // Restore CommonLinkage as soon as nvlink is fixed.
+  StaticRDLinkage = llvm::GlobalValue::InternalLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +59,56 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeNVPTX::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                   llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
+      llvm::Constant::getNullValue(LLVMStaticTy),
+      "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+}
+
+PrePostActionTy *CGOpenMPRuntimeNVPTX::getPrePostActionTy() {
+  return new NVPTXPrePostActionTy(IsInParallelRegion);
+}
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -8,6 +8,29 @@
 
 #define N 1000
 
+// CHECK: @"_openmp_kernel_static_glob_rd$ptr" = weak addrspace(3) externally_initialized global i8* undef
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l[[LINE1:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l[[LINE1]]_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l[[LINE2:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l[[LINE2]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_max_parallel_levelv_l[[LINE3:.+]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 3 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_max_parallel_levelv_l[[LINE3]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4:.+]]_wg_size = weak addrspace(1) constant i16 10
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 10, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l[[LINE4]]_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5:.+]]_wg_size = weak addrspace(1) constant i16 74
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 74, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l[[LINE5]]_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6:.+]]_wg_size = weak addrspace(1) constant i16 1024
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6]]_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 1024, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_max_work_group_sizev_l[[LINE6]]_exec_mode = weak constant i8 1
+
 int test_amdgcn_target_tid_threads() {
 // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
 
@@ -40,4 +63,65 @@
   return arr[0];
 }
 
+int test_amdgcn_target_max_parallel_level() {
+  // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_max_parallel_level
+  int arr[N];
+
+#pragma omp target parallel for
+  for (int i = 0; i < N; i++)
+#pragma omp parallel for
+    for (int j = 0; j < N; j++)
+#pragma omp parallel for
+      for (int k = 0; k < N; k++)
+        for (int l = 0; l < N; l++)
+#pragma omp parallel for
+          for (int m = 0; m < N; m++)
+            arr[m] = 0;
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="10,10"
+#pragma omp target parallel num_threads(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_non_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="74,74"
+#pragma omp target teams thread_limit(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_max_work_group_size() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="1024,1024"
+#pragma omp target teams thread_limit(1500)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
 #endif
+
+// CHECK: !0 = !{i32 0, i32 [[ARG1:[0-9]+]], i32 [[ARG2:[0-9]+]], !"_Z37test_amdgcn_target_max_parallel_levelv", i32 [[LINE3]], i32 2}
+// CHECK: !1 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z30test_amdgcn_target_tid_threadsv", i32 [[LINE1]], i32 0}
+// CHECK: !2 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z35test_amdgcn_target_tid_threads_simdv", i32 [[LINE2]], i32 1}
+// CHECK: !3 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z38test_amdgcn_target_attributes_non_spmdv", i32 [[LINE5]], i32 4}
+// CHECK: !4 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z34test_amdgcn_target_attributes_spmdv", i32 [[LINE4]], i32 3}
+// CHECK: !5 = !{i32 0, i32 [[ARG1]], i32 [[ARG2]], !"_Z49test_amdgcn_target_attributes_max_work_group_sizev", i32 [[LINE6]], i32 5}
\ No newline at end of file