diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -495,6 +495,8 @@
   QualType TgtOffloadEntryQTy;
+  /// Type of the __tgt_attribute_struct kernel descriptor.
+  QualType TgtAttributeStructQTy;
   /// Entity that registers the offloading constants that were emitted so
   /// far.
   class OffloadEntriesInfoManagerTy {
     CodeGenModule &CGM;
@@ -1755,6 +1757,14 @@
                                  llvm::FunctionCallee OutlinedFn,
                                  ArrayRef<llvm::Value *> Args = llvm::None) const;
 
+  /// Returns the __tgt_attribute_struct type.
+  QualType getTgtAttributeStructQTy();
+
+  /// Emit the structure descriptor for a kernel.
+  void emitStructureKernelDesc(CodeGenModule &CGM, StringRef Name,
+                               int16_t WG_Size, int8_t Mode,
+                               int8_t HostServices, int8_t MaxParallelLevel);
+
   /// Emits OpenMP-specific function prolog.
   /// Required for device constructs.
   virtual void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3335,6 +3335,47 @@
   }
 }
 
+/// Emit the structure descriptor for a kernel.
+void CGOpenMPRuntime::emitStructureKernelDesc(CodeGenModule &CGM,
+                                              StringRef Name, int16_t WG_Size,
+                                              int8_t Mode, int8_t HostServices,
+                                              int8_t MaxParallelLevel) {
+
+  // Initializer for the kernel descriptor; field order must match
+  // getTgtAttributeStructQTy() below.
+  llvm::Constant *AttrData[] = {
+      llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+      llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+      llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+      llvm::ConstantInt::get(CGM.Int8Ty, HostServices), // 1 => use HostServices
+      llvm::ConstantInt::get(CGM.Int8Ty, MaxParallelLevel)}; // deepest nesting
+
+  llvm::GlobalVariable *AttrImages = createGlobalStruct(
+      CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(), AttrData,
+      Name + Twine("_kern_desc"), llvm::GlobalValue::WeakAnyLinkage);
+  CGM.addCompilerUsedGlobal(AttrImages);
+}
+
+// Create the Tgt Attribute Struct type.
+QualType CGOpenMPRuntime::getTgtAttributeStructQTy() {
+  ASTContext &C = CGM.getContext();
+  QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*Width=*/8, /*Signed=*/1);
+  QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*Width=*/16, /*Signed=*/1);
+  if (TgtAttributeStructQTy.isNull()) {
+    RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+    RD->startDefinition();
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // Version
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // Struct Size in bytes.
+    addFieldToRecordDecl(C, RD, KmpInt16Ty); // WG_size
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // Mode
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // HostServices
+    addFieldToRecordDecl(C, RD, KmpInt8Ty);  // MaxParallelLevel
+    RD->completeDefinition();
+    TgtAttributeStructQTy = C.getRecordType(RD);
+  }
+  return TgtAttributeStructQTy;
+}
+
 QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() {
   // Make sure the type of the entry is already created. This is the type we
   // have to create:
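For orientation, a C++ mirror of the six-field descriptor emitted above. This is illustrative only: the runtime-side declaration is not part of this patch, and the packed attribute is an assumption chosen to match the hardcoded 9-byte Size field.

    #include <cstdint>

    // Hypothetical mirror of __tgt_attribute_struct (not from this patch).
    struct __attribute__((packed)) TgtAttributeStructMirror {
      int16_t Version;         // currently 2
      int16_t StructSize;      // size in bytes, currently 9
      int16_t WGSize;          // compile-time work-group size, 0 if unknown
      int8_t Mode;             // 0 => SPMD, 1 => GENERIC
      int8_t HostServices;     // 1 => kernel uses host services
      int8_t MaxParallelLevel; // deepest nesting of parallel regions
    };
    static_assert(sizeof(TgtAttributeStructMirror) == 9,
                  "must match the Size field emitted in AttrData");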
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -25,6 +25,10 @@
 
 class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
 public:
+  /// Current and maximum nesting levels of parallel regions.
+  int ParallelLevel = 0;
+  int MaxParallelLevel = 0;
+
   explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
 
   /// Get the GPU warp size.
@@ -35,6 +39,46 @@
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate the global variable for TransferMedium.
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) override;
+
+  /// Allocate the global variable for SharedStaticRD.
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate the KernelStaticGlobalized variable: a shared pointer to the
+  /// global memory buffer used for the given kernel.
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Emit a target-specific SPMD kernel.
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) override;
+
+  /// Emit a target-specific non-SPMD kernel.
+  virtual void
+  emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                           StringRef ParentName, llvm::Function *&OutlinedFn,
+                           llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                           const RegionCodeGenTy &CodeGen) override;
+
+  /// Create a unique global variable indicating the flat-work-group-size
+  /// for this region. Values are in [256..1024].
+  static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name,
+                                       unsigned WGSize);
+
+  /// Generate the _wg_size and _kern_desc (__tgt_attribute_struct) globals,
+  /// plus the matching value of the amdgpu-flat-work-group-size attribute.
+  void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D,
+                        llvm::Function *&OutlinedFn, bool IsGeneric);
 };
 
 } // namespace CodeGen
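ParallelLevel and MaxParallelLevel are public so that parallel-region codegen can maintain them; the hunks in this patch only consume MaxParallelLevel and reset it per kernel. A hypothetical sketch of the updating side (not code from this patch):

    #include <algorithm>

    // Hypothetical RAII guard: raise the level while a nested parallel
    // region is being emitted and record the deepest nesting, which
    // generateMetaData() later writes into the kernel descriptor.
    struct ParallelLevelRAII {
      int &Level;
      ParallelLevelRAII(int &Level, int &MaxLevel) : Level(Level) {
        MaxLevel = std::max(MaxLevel, ++Level);
      }
      ~ParallelLevelRAII() { --Level; }
    };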
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeGPU.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -30,6 +31,7 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+  StaticRDLinkage = llvm::GlobalValue::PrivateLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +61,133 @@
   return Bld.CreateTrunc(
       Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeAMDGCN::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::WeakAnyLinkage, llvm::UndefValue::get(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized=*/true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                    llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+      llvm::UndefValue::get(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_",
+      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized=*/true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+      /*isExternallyInitialized=*/true);
+}
+
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+                                                     StringRef Name,
+                                                     unsigned WGSize) {
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Name + Twine("_wg_size"),
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_device),
+      /*isExternallyInitialized=*/false);
+  CGM.addCompilerUsedGlobal(GVMode);
+}
+
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+                                             const OMPExecutableDirective &D,
+                                             llvm::Function *&OutlinedFn,
+                                             bool IsGeneric) {
+  if (!CGM.getTriple().isAMDGCN())
+    return;
+  int FlatAttr = 0;
+  bool flatAttrEmitted = false;
+  unsigned DefaultWorkGroupSz =
+      CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+    const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+    unsigned MaxWorkGroupSz =
+        CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+    unsigned compileTimeThreadLimit = 0;
+    // Only handle the case where exactly one of thread_limit or num_threads
+    // is present; with both, no single compile-time value can be chosen here.
+    if (ThreadLimitClause && !NumThreadsClause) {
+      Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+      clang::Expr::EvalResult Result;
+      if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+        compileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    } else if (!ThreadLimitClause && NumThreadsClause) {
+      Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+      clang::Expr::EvalResult Result;
+      if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+        compileTimeThreadLimit = Result.Val.getInt().getExtValue();
+    }
+
+    // Add kernel metadata if the thread limit is a compile-time constant > 0.
+    if (compileTimeThreadLimit > 0) {
+      // Add the warp size to generic kernels, to reflect what runtime
+      // dispatch does.
+      if (IsGeneric)
+        compileTimeThreadLimit +=
+            CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+      if (compileTimeThreadLimit > MaxWorkGroupSz)
+        compileTimeThreadLimit = MaxWorkGroupSz;
+      std::string AttrVal = llvm::utostr(compileTimeThreadLimit);
+      FlatAttr = compileTimeThreadLimit;
+      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                            AttrVal + "," + AttrVal);
+      setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+                               compileTimeThreadLimit);
+      flatAttrEmitted = true;
+    }
+  } // end of amdgcn teams or parallel directive
+
+  // Emit the default amdgpu-flat-work-group-size if not already emitted.
+  if (!flatAttrEmitted) {
+    std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+    OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+                          FlatAttrVal + "," + FlatAttrVal);
+  }
+
+  // Emit a kernel descriptor for the runtime.
+  StringRef KernDescName = OutlinedFn->getName();
+  CGOpenMPRuntime::emitStructureKernelDesc(CGM, KernDescName, FlatAttr,
+                                           IsGeneric,
+                                           1, // Uses HostServices
+                                           MaxParallelLevel);
+  // Reset it to zero for the next kernel.
+  MaxParallelLevel = 0;
+}
+
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/true);
+}
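The work-group-size selection in generateMetaData above reduces to a small pure function. The sketch below hardcodes the AMDGCN grid values that the test update at the end of this patch relies on (warp size 64, maximum work-group size 1024, default 256); the real code queries them via getGridValue():

    #include <algorithm>

    // Sketch of the clamping logic, assuming AMDGCN grid values.
    unsigned flatWorkGroupSize(unsigned CompileTimeThreadLimit, bool IsGeneric) {
      const unsigned WarpSize = 64, MaxWGSize = 1024, DefaultWGSize = 256;
      if (CompileTimeThreadLimit == 0) // no compile-time constant clause
        return DefaultWGSize;
      // Generic kernels get one extra warp, reflecting runtime dispatch.
      unsigned WGSize = CompileTimeThreadLimit + (IsGeneric ? WarpSize : 0);
      return std::min(WGSize, MaxWGSize);
    }

For example, thread_limit(10) on a generic kernel yields 74 and thread_limit(1500) clamps to 1024, matching the "74,74" and "1024,1024" attribute values checked in the test.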
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,9 @@
     /// Unknown execution mode (orphaned directive).
     EM_Unknown,
   };
+  /// Linkage type of the StaticRD global variable.
+  llvm::GlobalValue::LinkageTypes StaticRDLinkage;
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,36 +102,6 @@
                                   uint64_t Size, int32_t Flags,
                                   llvm::GlobalValue::LinkageTypes Linkage) override;
 
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
-  /// Emit outlined function specialized for the Single Program
-  /// Multiple Data programming model for applicable target directives on the
-  /// NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// \param CodeGen Object containing the target statements.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
-
   /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -211,6 +184,64 @@
   /// Get the maximum number of threads in a block of the GPU.
   virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
 
+  /// Allocate the global variable for TransferMedium.
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef TransferMediumName) = 0;
+
+  /// Allocate the global variable for SharedStaticRD.
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) = 0;
+
+  /// Allocate the global variable for KernelStaticGlobalized.
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) = 0;
+
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) = 0;
+
+  virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        bool IsOffloadEntry,
+                                        const RegionCodeGenTy &CodeGen) = 0;
+
+  /// Emit outlined function specialized for the Single Program
+  /// Multiple Data programming model for applicable target directives on the
+  /// NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                      llvm::Function *&OutlinedFn,
+                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                      const RegionCodeGenTy &CodeGen);
+
+  /// Emit outlined function specialized for the Fork-Join
+  /// programming model for applicable target directives on the NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                         llvm::Function *&OutlinedFn,
+                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                         const RegionCodeGenTy &CodeGen);
+
   /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
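The reshuffle in this header is a template-method split: emitSPMDKernel and emitNonSPMDKernel keep the shared emission logic in the common GPU runtime, while entry now goes through pure virtual per-target wrappers, so AMDGCN can append metadata generation and NVPTX stays a thin pass-through. In miniature (illustrative names only, not clang code):

    // Minimal sketch of the dispatch pattern introduced here.
    struct GPUBase {
      virtual ~GPUBase() = default;
      void emitSPMDKernel() { /* shared emission logic */ }
      virtual void emitSPMDKernelWrapper() = 0; // per-target entry point
    };
    struct AMDGCN final : GPUBase {
      void emitSPMDKernelWrapper() override {
        emitSPMDKernel();
        // generateMetaData(...): target-specific extra work
      }
    };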
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
@@ -1190,13 +1191,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1321,13 +1317,8 @@
   // Reserve place for the globalized memory.
   GlobalizedRecords.emplace_back();
   if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+    auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+    KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
   }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
@@ -1396,11 +1387,11 @@
 // warps participate in parallel work.
 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                      bool Mode) {
-  auto *GVMode =
-      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
-                               llvm::GlobalValue::WeakAnyLinkage,
-                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
-                               Twine(Name, "_exec_mode"));
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty,
+      /*isConstant=*/true, llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
+      Twine(Name, "_exec_mode"));
   CGM.addCompilerUsedGlobal(GVMode);
 }
@@ -1888,12 +1879,11 @@
   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
-    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                   CodeGen);
+    emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                          IsOffloadEntry, CodeGen);
   else
-    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                      CodeGen);
-
+    emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+                             IsOffloadEntry, CodeGen);
   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
@@ -3231,6 +3221,7 @@
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
 
   CGBuilderTy &Bld = CGF.Builder;
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
 
   // This array is used as a medium to transfer, one reduce element at a time,
   // the data from the first lane of every warp to lanes in the first warp
@@ -3246,16 +3237,11 @@
   unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
   if (!TransferMedium) {
     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
-        llvm::Constant::getNullValue(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
+    TransferMedium =
+        RT.allocateTransferMediumGlobal(CGM, Ty, TransferMediumName);
     CGM.addCompilerUsedGlobal(TransferMedium);
   }
 
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
   // nvptx_lane_id = nvptx_id % warpsize
@@ -5104,6 +5090,7 @@
 }
 
 void CGOpenMPRuntimeGPU::clear() {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
   if (!GlobalizedRecords.empty() &&
       !CGM.getLangOpts().OpenMPCUDATargetParallel) {
     ASTContext &C = CGM.getContext();
@@ -5152,9 +5139,6 @@
             llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
       }
       // Allocate SharedMemorySize buffer for the shared memory.
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore this code as sson as nvlink is fixed.
       if (!SharedStaticRD->field_empty()) {
         llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
         QualType SubTy = C.getConstantArrayType(
@@ -5171,13 +5155,7 @@
       if (!SharedStaticRD->field_empty()) {
         QualType StaticTy = C.getRecordType(SharedStaticRD);
         llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-        auto *GV = new llvm::GlobalVariable(
-            CGM.getModule(), LLVMStaticTy,
-            /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
-            llvm::Constant::getNullValue(LLVMStaticTy),
-            "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-            llvm::GlobalValue::NotThreadLocal,
-            C.getTargetAddressSpace(LangAS::cuda_shared));
+        auto *GV = RT.allocateSharedStaticRDGlobal(CGM, LLVMStaticTy);
         auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
             GV, CGM.VoidPtrTy);
         for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
@@ -5198,14 +5176,11 @@
           C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                  /*IndexTypeQuals=*/0);
       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
+      auto *GV =
+          new llvm::GlobalVariable(CGM.getModule(), LLVMArr2Ty,
+                                   /*isConstant=*/false, RT.StaticRDLinkage,
+                                   llvm::Constant::getNullValue(LLVMArr2Ty),
+                                   "_openmp_static_glob_rd_$_");
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
           GV, CGM.VoidPtrTy);
       for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
@@ -5238,7 +5213,7 @@
       // Restore CommonLinkage as soon as nvlink is fixed.
       auto *GV = new llvm::GlobalVariable(
           CGM.getModule(), LLVMReductionsBufferTy,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
+          /*isConstant=*/false, RT.StaticRDLinkage,
          llvm::Constant::getNullValue(LLVMReductionsBufferTy),
           "_openmp_teams_reductions_buffer_$_");
       KernelTeamsReductionPtr->setInitializer(
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -35,6 +35,34 @@
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+  /// Allocate the global variable for TransferMedium.
+  virtual llvm::GlobalVariable *
+  allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+                               StringRef Name) override;
+
+  /// Allocate the global variable for SharedStaticRD.
+  virtual llvm::GlobalVariable *
+  allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                               llvm::Type *LLVMStaticTy) override;
+
+  /// Allocate the global variable for KernelStaticGlobalized.
+  virtual llvm::GlobalVariable *
+  allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Emit a target-specific SPMD kernel.
+  virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+                                     StringRef ParentName,
+                                     llvm::Function *&OutlinedFn,
+                                     llvm::Constant *&OutlinedFnID,
+                                     bool IsOffloadEntry,
+                                     const RegionCodeGenTy &CodeGen) override;
+  /// Emit a target-specific non-SPMD kernel.
+  virtual void
+  emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+                           StringRef ParentName, llvm::Function *&OutlinedFn,
+                           llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                           const RegionCodeGenTy &CodeGen) override;
 };
 
 } // CodeGen namespace.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,11 @@
     : CGOpenMPRuntimeGPU(CGM) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+  // FIXME: nvlink does not handle weak linkage correctly (objects with
+  // different sizes are reported as erroneous).
+  // Restore CommonLinkage as soon as nvlink is fixed.
+  StaticRDLinkage = llvm::GlobalValue::InternalLinkage;
 }
 
 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +59,52 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
+
+llvm::GlobalVariable *CGOpenMPRuntimeNVPTX::allocateTransferMediumGlobal(
+    CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), Ty, /*isConstant=*/false,
+      llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty),
+      TransferMediumName,
+      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+                                                   llvm::Type *LLVMStaticTy) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), LLVMStaticTy,
+      /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
+      llvm::Constant::getNullValue(LLVMStaticTy),
+      "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+  return new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+      llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
+      "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+      llvm::GlobalValue::NotThreadLocal,
+      CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                 CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                    CodeGen);
+}
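Note the per-target split these two implementations establish: NVPTX keeps InternalLinkage/CommonLinkage globals with null initializers (per the nvlink FIXME above), while AMDGCN uses WeakAnyLinkage/PrivateLinkage globals that are undef and externally initialized, presumably so the AMD toolchain can materialize them in LDS at launch; the StaticRDLinkage member is what lets the shared code in CGOpenMPRuntimeGPU.cpp stay target-neutral.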
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -8,6 +8,22 @@
 
 #define N 1000
 
+// CHECK: @"_openmp_kernel_static_glob_rd$ptr" = weak addrspace(3) externally_initialized global i8* undef
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l36_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l36_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l52_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l52_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_wg_size = weak addrspace(1) constant i16 10
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 10, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_wg_size = weak addrspace(1) constant i16 74
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 74, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_exec_mode = weak constant i8 1
+
 int test_amdgcn_target_tid_threads() {
 // CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
@@ -40,4 +56,40 @@
   return arr[0];
 }
 
+int test_amdgcn_target_attributes_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="10,10"
+#pragma omp target parallel num_threads(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_non_spmd() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="74,74"
+#pragma omp target teams thread_limit(10)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_attributes_max_work_group_size() {
+  int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="1024,1024"
+#pragma omp target teams thread_limit(1500)
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
 #endif