diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -2488,6 +2488,20 @@
   }
 };

+/// Encapsulates helper methods used by target-specific specializations
+/// of CGOpenMPRuntimeGPU.
+class CodeGenUtil {
+public:
+  static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
+                                         QualType FieldTy);
+
+  template <class... As>
+  static llvm::GlobalVariable *
+  createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
+                     ArrayRef<llvm::Constant *> Data, const Twine &Name,
+                     As &&... Args);
+};
+
 } // namespace CodeGen
 } // namespace clang
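For orientation, here is a minimal sketch (not part of the patch) of how a caller combines the two helpers: build an implicit record type field by field, then materialize a constant instance of it as a global variable. The function and record names are hypothetical, and since the createGlobalStruct template is defined in CGOpenMPRuntime.cpp, such a caller is assumed to live in the same translation unit.

    // Hypothetical helper, assumed to live in CGOpenMPRuntime.cpp so the
    // createGlobalStruct template definition is visible.
    static llvm::GlobalVariable *emitExampleDesc(CodeGenModule &CGM) {
      ASTContext &C = CGM.getContext();
      // Build an implicit record type: struct __example_desc { int a; void *b; };
      RecordDecl *RD = C.buildImplicitRecord("__example_desc");
      RD->startDefinition();
      CodeGenUtil::addFieldToRecordDecl(C, RD, C.IntTy);
      CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
      RD->completeDefinition();

      // Materialize a weak constant global of that type.
      llvm::Constant *Data[] = {llvm::ConstantInt::get(CGM.Int32Ty, 1),
                                llvm::Constant::getNullValue(CGM.VoidPtrTy)};
      return CodeGenUtil::createGlobalStruct(
          CGM, C.getRecordType(RD), /*IsConstant=*/true, Data,
          "__example_desc_var", llvm::GlobalValue::WeakAnyLinkage);
    }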
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1048,17 +1048,6 @@
                            AlignmentSource::Decl);
 }

-static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
-                                       QualType FieldTy) {
-  auto *Field = FieldDecl::Create(
-      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
-      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
-      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
-  Field->setAccess(AS_public);
-  DC->addDecl(Field);
-  return Field;
-}
-
 CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM, StringRef FirstSeparator,
                                  StringRef Separator)
     : CGM(CGM), FirstSeparator(FirstSeparator), Separator(Separator),
@@ -1352,11 +1341,21 @@
   }
 }

+FieldDecl *clang::CodeGen::CodeGenUtil::addFieldToRecordDecl(
+    ASTContext &C, DeclContext *DC, QualType FieldTy) {
+  auto *Field = FieldDecl::Create(
+      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
+      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
+      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
+  Field->setAccess(AS_public);
+  DC->addDecl(Field);
+  return Field;
+}
+
 template <class... As>
-static llvm::GlobalVariable *
-createGlobalStruct(CodeGenModule &CGM, QualType Ty, bool IsConstant,
-                   ArrayRef<llvm::Constant *> Data, const Twine &Name,
-                   As &&... Args) {
+llvm::GlobalVariable *clang::CodeGen::CodeGenUtil::createGlobalStruct(
+    CodeGenModule &CGM, QualType Ty, bool IsConstant,
+    ArrayRef<llvm::Constant *> Data, const Twine &Name, As &&... Args) {
   const auto *RD = cast<RecordDecl>(Ty->getAsTagDecl());
   const CGRecordLayout &RL = CGM.getTypes().getCGRecordLayout(RD);
   ConstantInitBuilder CIBuilder(CGM);
@@ -3082,7 +3081,7 @@
       llvm::ConstantInt::get(CGM.Int32Ty, Flags),
       llvm::ConstantInt::get(CGM.Int32Ty, 0)};
   std::string EntryName = getName({"omp_offloading", "entry", ""});
-  llvm::GlobalVariable *Entry = createGlobalStruct(
+  llvm::GlobalVariable *Entry = CodeGenUtil::createGlobalStruct(
       CGM, getTgtOffloadEntryQTy(), /*IsConstant=*/true, Data,
       Twine(EntryName).concat(Name), llvm::GlobalValue::WeakAnyLinkage);
@@ -3360,12 +3359,12 @@
   ASTContext &C = CGM.getContext();
   RecordDecl *RD = C.buildImplicitRecord("__tgt_offload_entry");
   RD->startDefinition();
-  addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
-  addFieldToRecordDecl(C, RD, C.getSizeType());
-  addFieldToRecordDecl(
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
+  CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType());
+  CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
-  addFieldToRecordDecl(
+  CodeGenUtil::addFieldToRecordDecl(
       C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
   RD->completeDefinition();
   RD->addAttr(PackedAttr::CreateImplicit(C));
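The five fields registered above line up with the host-side descriptor that libomptarget consumes; for reference, a C++ mirror of that layout, as declared in the OpenMP runtime's omptarget.h:

    #include <cstddef>
    #include <cstdint>

    // Mirror of the packed __tgt_offload_entry record built above.
    struct __tgt_offload_entry {
      void *addr;       // Address of global symbol or kernel entry point.
      char *name;       // Symbol name.
      size_t size;      // Size of the entry in bytes; 0 for functions.
      int32_t flags;    // OpenMP offload entry flags.
      int32_t reserved; // Reserved; currently unused.
    };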
C.buildImplicitRecord("kmp_task_t_with_privates"); RD->startDefinition(); - addFieldToRecordDecl(C, RD, KmpTaskTQTy); + CodeGenUtil::addFieldToRecordDecl(C, RD, KmpTaskTQTy); if (const RecordDecl *PrivateRD = createPrivatesRecordDecl(CGM, Privates)) - addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD)); + CodeGenUtil::addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD)); RD->completeDefinition(); return RD; } @@ -4128,9 +4127,9 @@ RecordDecl *KmpAffinityInfoRD = C.buildImplicitRecord("kmp_task_affinity_info_t"); KmpAffinityInfoRD->startDefinition(); - addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType()); - addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType()); - addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy); + CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getIntPtrType()); + CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, C.getSizeType()); + CodeGenUtil::addFieldToRecordDecl(C, KmpAffinityInfoRD, FlagsTy); KmpAffinityInfoRD->completeDefinition(); KmpTaskAffinityInfoTy = C.getRecordType(KmpAffinityInfoRD); } @@ -4568,9 +4567,9 @@ if (KmpDependInfoTy.isNull()) { RecordDecl *KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info"); KmpDependInfoRD->startDefinition(); - addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType()); - addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType()); - addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy); + CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType()); + CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType()); + CodeGenUtil::addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy); KmpDependInfoRD->completeDefinition(); KmpDependInfoTy = C.getRecordType(KmpDependInfoRD); } @@ -6017,13 +6016,19 @@ ASTContext &C = CGM.getContext(); RecordDecl *RD = C.buildImplicitRecord("kmp_taskred_input_t"); RD->startDefinition(); - const FieldDecl *SharedFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *OrigFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *SizeFD = addFieldToRecordDecl(C, RD, C.getSizeType()); - const FieldDecl *InitFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *FiniFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *CombFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); - const FieldDecl *FlagsFD = addFieldToRecordDecl( + const FieldDecl *SharedFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *OrigFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *SizeFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.getSizeType()); + const FieldDecl *InitFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *FiniFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *CombFD = + CodeGenUtil::addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *FlagsFD = CodeGenUtil::addFieldToRecordDecl( C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false)); RD->completeDefinition(); QualType RDType = C.getRecordType(RD); @@ -9058,9 +9063,9 @@ RecordDecl *RD; RD = C.buildImplicitRecord("descriptor_dim"); RD->startDefinition(); - addFieldToRecordDecl(C, RD, Int64Ty); - addFieldToRecordDecl(C, RD, Int64Ty); - addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); RD->completeDefinition(); QualType DimTy = C.getRecordType(RD); @@ -11669,9 +11674,9 @@ // }; RD = 
C.buildImplicitRecord("kmp_dim"); RD->startDefinition(); - addFieldToRecordDecl(C, RD, Int64Ty); - addFieldToRecordDecl(C, RD, Int64Ty); - addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); + CodeGenUtil::addFieldToRecordDecl(C, RD, Int64Ty); RD->completeDefinition(); KmpDimTy = C.getRecordType(RD); } else { @@ -12124,8 +12129,9 @@ if (VI == I->getSecond().end()) { RecordDecl *RD = C.buildImplicitRecord("lasprivate.conditional"); RD->startDefinition(); - VDField = addFieldToRecordDecl(C, RD, VD->getType().getNonReferenceType()); - FiredField = addFieldToRecordDecl(C, RD, C.CharTy); + VDField = CodeGenUtil::addFieldToRecordDecl( + C, RD, VD->getType().getNonReferenceType()); + FiredField = CodeGenUtil::addFieldToRecordDecl(C, RD, C.CharTy); RD->completeDefinition(); NewType = C.getRecordType(RD); Address Addr = CGF.CreateMemTemp(NewType, C.getDeclAlign(VD), VD->getName()); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h @@ -27,6 +27,10 @@ public: explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM); +private: + /// Struct to store kernel descriptors + QualType TgtAttributeStructQTy; + /// Get the GPU warp size. llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override; @@ -35,6 +39,38 @@ /// Get the maximum number of threads in a block of the GPU. llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; + + /// Target independent wrapper over target specific emitSPMDKernel() + void emitSPMDKernelWrapper(const OMPExecutableDirective &D, + StringRef ParentName, llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen) override; + + /// Target independent wrapper over target specific emitNonSPMDKernel() + void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D, + StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen) override; + + /// Create a unique global variable to indicate the flat-work-group-size + /// for this region. Values are [256..1024]. + static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name, + unsigned WGSize); + + /// Generate global variables _wg_size, kern_desc, __tgt_attribute_struct. + /// Also generate appropriate value of attribute amdgpu-flat-work-group-size + void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D, + llvm::Function *&OutlinedFn, bool IsGeneric); + + /// Returns __tgt_attribute_struct type. 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -26,10 +27,14 @@
 using namespace CodeGen;
 using namespace llvm::omp;

+//
+// Definitions of the virtual methods declared in CGOpenMPRuntimeGPU.
+//
 CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+  KernelStaticGlobalizedLinkage = llvm::GlobalValue::WeakAnyLinkage;
 }

 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +64,145 @@
   return Bld.CreateTrunc(
       Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
 }
+
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/true);
+}
+
+//
+// Definitions of AMDGCN-specific methods.
+//
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+                                                     StringRef Name,
+                                                     unsigned WGSize) {
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Twine(Name, "_wg_size"),
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
+  CGM.addCompilerUsedGlobal(GVMode);
+}
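setPropertyWorkGroupSize emits, for each kernel <name>, a weak 16-bit constant global <name>_wg_size in the device address space so the runtime can read the chosen work-group size at load time. The generateMetaData definition that follows computes that size; here is a standalone arithmetic sketch of its clamping rule, under assumed AMDGCN grid values (warp size 64, maximum work-group size 1024):

    #include <algorithm>
    #include <cstdio>

    int main() {
      const unsigned WarpSize = 64, MaxWG = 1024; // assumed AMDGCN grid values
      unsigned ThreadLimit = 960; // e.g. from a num_threads(960) clause
      bool IsGeneric = true;      // non-SPMD kernels reserve a master warp
      unsigned WG = ThreadLimit + (IsGeneric ? WarpSize : 0); // 1024
      WG = std::min(WG, MaxWG);                               // clamp to 1024
      // The kernel gets this attribute plus a <name>_wg_size global of WG.
      std::printf("amdgpu-flat-work-group-size=\"%u,%u\"\n", WG, WG);
    }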
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+                                             const OMPExecutableDirective &D,
+                                             llvm::Function *&OutlinedFn,
+                                             bool IsGeneric) {
+  int FlatAttr = 0;
+  bool FlatAttrEmitted = false;
+  unsigned DefaultWorkGroupSz =
+      CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+    const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+    unsigned MaxWorkGroupSz =
+        CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+    unsigned CompileTimeThreadLimit = 0;
+    // Only one of thread_limit or num_threads is used; we cannot handle both.
+    if (ThreadLimitClause && !NumThreadsClause) {
+      Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+      clang::Expr::EvalResult Result;
+      if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    } else if (!ThreadLimitClause && NumThreadsClause) {
+      Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+      clang::Expr::EvalResult Result;
+      if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+        CompileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    }
+
+    // Add kernel metadata if the thread limit is a compile-time constant > 0.
+    if (CompileTimeThreadLimit > 0) {
+      // Add the warp size in generic mode, to reflect what the runtime
+      // dispatch does.
+      if (IsGeneric)
+        CompileTimeThreadLimit +=
+            CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+      if (CompileTimeThreadLimit > MaxWorkGroupSz)
+        CompileTimeThreadLimit = MaxWorkGroupSz;
+      std::string AttrVal = llvm::utostr(CompileTimeThreadLimit);
+      FlatAttr = CompileTimeThreadLimit;
+      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                            AttrVal + "," + AttrVal);
+      setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+                               CompileTimeThreadLimit);
+    }
+    FlatAttrEmitted = true;
+  } // end of amdgcn teams or parallel directive
+
+  // Emit amdgpu-flat-work-group-size if it was not emitted above.
+  if (!FlatAttrEmitted) {
+    std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+    OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                          FlatAttrVal + "," + FlatAttrVal);
+  }
+  // Emit a kernel descriptor for the runtime.
+  StringRef KernDescName = OutlinedFn->getName();
+  CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(CGM, KernDescName, FlatAttr,
+                                                 IsGeneric,
+                                                 /*HostServices=*/1);
+}
+
+/// Emit a structure descriptor for a kernel.
+void CGOpenMPRuntimeAMDGCN::emitStructureKernelDesc(
+    CodeGenModule &CGM, StringRef Name, int16_t WG_Size, int8_t Mode,
+    int8_t HostServices) {
+  // Build the descriptor initializer.
+  llvm::Constant *AttrData[] = {
+      llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+      llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+      llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+      llvm::ConstantInt::get(CGM.Int8Ty, HostServices) // 1 => use HostServices
+  };
+
+  llvm::GlobalVariable *AttrImages = CodeGenUtil::createGlobalStruct(
+      CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(), AttrData,
+      Twine(Name, "_kern_desc"), llvm::GlobalValue::WeakAnyLinkage);
+  CGM.addCompilerUsedGlobal(AttrImages);
+}
+
+// Create the __tgt_attribute_struct type.
+QualType CGOpenMPRuntimeAMDGCN::getTgtAttributeStructQTy() {
+  ASTContext &C = CGM.getContext();
+  QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*DestWidth=*/8, /*Signed=*/1);
+  QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/1);
+  if (TgtAttributeStructQTy.isNull()) {
+    RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+    RD->startDefinition();
+    // Version
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty);
+    // Struct size in bytes.
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty);
+    // WG_Size
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt16Ty);
+    // Mode
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty);
+    // HostServices
+    CodeGenUtil::addFieldToRecordDecl(C, RD, KmpInt8Ty);
+    RD->completeDefinition();
+    TgtAttributeStructQTy = C.getRecordType(RD);
+  }
+  return TgtAttributeStructQTy;
+}
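Read together, emitStructureKernelDesc and getTgtAttributeStructQTy emit one <kernel>_kern_desc global per kernel whose layout mirrors this host-visible view (field meanings taken from the comments above; the C++ rendering is illustrative):

    #include <cstdint>

    // Mirror of the implicit __tgt_attribute_struct record built above.
    struct __tgt_attribute_struct {
      int16_t Version;     // Currently 2.
      int16_t Size;        // Descriptor size in bytes.
      int16_t WG_Size;     // Compile-time work-group size; 0 if unknown.
      int8_t Mode;         // 0 => SPMD, 1 => generic.
      int8_t HostServices; // 1 => kernel uses host services.
    };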
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,11 @@
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+
+protected:
+  /// Linkage type of the KernelStaticGlobalized variable.
+  llvm::GlobalValue::LinkageTypes KernelStaticGlobalizedLinkage;
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,6 +104,7 @@
                                   uint64_t Size, int32_t Flags,
                                   llvm::GlobalValue::LinkageTypes Linkage) override;

+protected:
   /// Emit outlined function specialized for the Fork-Join
   /// programming model for applicable target directives on the NVPTX device.
   /// \param D Directive to emit.
@@ -129,6 +135,7 @@
                          llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
                          const RegionCodeGenTy &CodeGen);

+private:
  /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -211,6 +218,22 @@
   /// Get the maximum number of threads in a block of the GPU.
   virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;

+  /// Target-independent wrapper over the target-specific emitSPMDKernel().
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) = 0;
+
+  /// Target-independent wrapper over the target-specific emitNonSPMDKernel().
+  virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        bool IsOffloadEntry,
+                                        const RegionCodeGenTy &CodeGen) = 0;
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -1099,10 +1100,10 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
     KernelStaticGlobalized = new llvm::GlobalVariable(
         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
+        RT.KernelStaticGlobalizedLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
         "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
         llvm::GlobalValue::NotThreadLocal,
         CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
@@ -1231,10 +1232,10 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
     KernelStaticGlobalized = new llvm::GlobalVariable(
         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
+        RT.KernelStaticGlobalizedLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
         "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
         llvm::GlobalValue::NotThreadLocal,
         CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
@@ -1478,11 +1479,11 @@
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
+    emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                          IsOffloadEntry, CodeGen);
   else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
+    emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                             IsOffloadEntry, CodeGen);

   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -27,6 +27,7 @@
 public:
   explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);

+private:
   /// Get the GPU warp size.
   llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;

@@ -35,6 +36,20 @@
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Target-independent wrapper over the target-specific emitSPMDKernel().
+  void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                             StringRef ParentName, llvm::Function *&OutlinedFn,
+                             llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                             const RegionCodeGenTy &CodeGen) override;
+
+  /// Target-independent wrapper over the target-specific emitNonSPMDKernel().
+  void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                StringRef ParentName,
+                                llvm::Function *&OutlinedFn,
+                                llvm::Constant *&OutlinedFnID,
+                                bool IsOffloadEntry,
+                                const RegionCodeGenTy &CodeGen) override;
 };

 } // CodeGen namespace.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,7 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
+  KernelStaticGlobalizedLinkage = llvm::GlobalValue::InternalLinkage;
 }

 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +55,19 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+}
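For context on how these subclasses are reached at all: CodeGenModule selects the concrete OpenMP runtime per target triple. A condensed sketch of that dispatch, simplified from CodeGenModule::createOpenMPRuntime as of this patch (assertions and the simd-only variants omitted):

    // Simplified from CodeGenModule.cpp; not part of this patch.
    void CodeGenModule::createOpenMPRuntime() {
      switch (getTriple().getArch()) {
      case llvm::Triple::nvptx:
      case llvm::Triple::nvptx64:
        OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this)); // InternalLinkage
        break;
      case llvm::Triple::amdgcn:
        OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this)); // WeakAnyLinkage
        break;
      default:
        OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
        break;
      }
    }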