diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -210,8 +210,8 @@ unsigned char RegParmMax, SSERegParmMax; TargetCXXABI TheCXXABI; const LangASMap *AddrSpaceMap; - const unsigned *GridValues = - nullptr; // Array of target-specific GPU grid values that must be + const llvm::omp::GV *GridValues = + nullptr; // target-specific GPU grid values that must be // consistent between host RTL (plugin), device RTL, and clang. mutable StringRef PlatformName; @@ -1410,10 +1410,10 @@ return LangAS::Default; } - /// Return a target-specific GPU grid value based on the GVIDX enum \p gv - unsigned getGridValue(llvm::omp::GVIDX gv) const { + /// Return the target-specific GPU grid values + const llvm::omp::GV &getGridValue() const { assert(GridValues != nullptr && "GridValues not initialized"); - return GridValues[gv]; + return *GridValues; } /// Retrieve the name of the platform as it is used in the diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -335,7 +335,7 @@ llvm::AMDGPU::getArchAttrR600(GPUKind)) { resetDataLayout(isAMDGCN(getTriple()) ? 
DataLayoutStringAMDGCN : DataLayoutStringR600); - GridValues = llvm::omp::AMDGPUGpuGridValues; + GridValues = &llvm::omp::AMDGPUGridValues; setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D || !isAMDGCN(Triple)); diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -65,7 +65,7 @@ TLSSupported = false; VLASupported = false; AddrSpaceMap = &NVPTXAddrSpaceMap; - GridValues = llvm::omp::NVPTXGpuGridValues; + GridValues = &llvm::omp::NVPTXGridValues; UseAddrSpaceMapMangling = true; // Define available target features diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp @@ -20,6 +20,7 @@ #include "clang/AST/StmtVisitor.h" #include "clang/Basic/Cuda.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/IR/IntrinsicsAMDGPU.h" using namespace clang; @@ -35,7 +36,7 @@ llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; // return constant compile-time target-specific warp size - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; return Bld.getInt32(WarpSize); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -17,7 +17,6 @@ #include "CGOpenMPRuntime.h" #include "CodeGenFunction.h" #include "clang/AST/StmtOpenMP.h" -#include "llvm/Frontend/OpenMP/OMPGridValues.h" namespace clang { namespace CodeGen { diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp 
@@ -339,7 +339,7 @@ assert(!GlobalizedRD && "Record for globalized variables is built already."); ArrayRef EscapedDeclsForParallel, EscapedDeclsForTeams; - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; if (IsInTTDRegion) EscapedDeclsForTeams = EscapedDecls.getArrayRef(); else @@ -535,8 +535,7 @@ /// on the NVPTX device, to generate more efficient code. static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDBits = - CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2); + unsigned LaneIDBits = CGF.getTarget().getGridValue().GV_Warp_Size_Log2; auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); } @@ -546,8 +545,8 @@ /// on the NVPTX device, to generate more efficient code. static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue( - llvm::omp::GV_Warp_Size_Log2_Mask); + unsigned LaneIDMask = + CGF.getContext().getTargetInfo().getGridValue().GV_Warp_Size_Log2_Mask; auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), "nvptx_lane_id"); @@ -1308,7 +1307,7 @@ const RecordDecl *GlobalizedRD = nullptr; llvm::SmallVector LastPrivatesReductions; llvm::SmallDenseMap MappedDeclsFields; - unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size; // Globalize team reductions variable unconditionally in all modes. 
 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); @@ -2089,7 +2088,7 @@ "__openmp_nvptx_data_transfer_temporary_storage"; llvm::GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName); - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; if (!TransferMedium) { auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -29,68 +29,69 @@ /// /// Example usage in clang: /// const unsigned slot_size = -/// ctx.GetTargetInfo().getGridValue(llvm::omp::GVIDX::GV_Warp_Size); +/// ctx.GetTargetInfo().getGridValue().GV_Warp_Size; /// /// Example usage in libomptarget/deviceRTLs: /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" /// #ifdef __AMDGPU__ -/// #define GRIDVAL AMDGPUGpuGridValues +/// #define GRIDVAL AMDGPUGridValues /// #else -/// #define GRIDVAL NVPTXGpuGridValues +/// #define GRIDVAL NVPTXGridValues /// #endif /// ... Then use this reference for GV_Warp_Size in the deviceRTL source. -/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] +/// llvm::omp::GRIDVAL.GV_Warp_Size /// /// Example usage in libomptarget hsa plugin: /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" -/// #define GRIDVAL AMDGPUGpuGridValues +/// #define GRIDVAL AMDGPUGridValues /// ... Then use this reference to access GV_Warp_Size in the hsa plugin. 
-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] +/// llvm::omp::GRIDVAL.GV_Warp_Size /// /// Example usage in libomptarget cuda plugin: /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" -/// #define GRIDVAL NVPTXGpuGridValues +/// #define GRIDVAL NVPTXGridValues /// ... Then use this reference to access GV_Warp_Size in the cuda plugin. -/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] +/// llvm::omp::GRIDVAL.GV_Warp_Size /// -enum GVIDX { + +struct GV { /// The maximum number of workers in a kernel. /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be issue for blockDim.z - GV_Threads, + const unsigned GV_Threads; /// The size reserved for data in a shared memory slot. - GV_Slot_Size, + const unsigned GV_Slot_Size; /// The default value of maximum number of threads in a worker warp. - GV_Warp_Size, + const unsigned GV_Warp_Size; /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size /// for NVPTX. - GV_Warp_Size_32, + const unsigned GV_Warp_Size_32; /// The number of bits required to represent the max number of threads in warp - GV_Warp_Size_Log2, + const unsigned GV_Warp_Size_Log2; /// GV_Warp_Size * GV_Slot_Size, - GV_Warp_Slot_Size, + const unsigned GV_Warp_Slot_Size; /// the maximum number of teams. - GV_Max_Teams, + const unsigned GV_Max_Teams; /// Global Memory Alignment - GV_Mem_Align, + const unsigned GV_Mem_Align; /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) - GV_Warp_Size_Log2_Mask, + const unsigned GV_Warp_Size_Log2_Mask; // An alternative to the heavy data sharing infrastructure that uses global // memory is one that uses device __shared__ memory. The amount of such space // (in bytes) reserved by the OpenMP runtime is noted here. 
- GV_SimpleBufferSize, + const unsigned GV_SimpleBufferSize; // The absolute maximum team size for a working group - GV_Max_WG_Size, + const unsigned GV_Max_WG_Size; // The default maximum team size for a working group - GV_Default_WG_Size, + const unsigned GV_Default_WG_Size; // This is GV_Max_WG_Size / GV_WarpSize. 32 for NVPTX and 16 for AMDGCN. - GV_Max_Warp_Number, + const unsigned GV_Max_Warp_Number; /// The slot size that should be reserved for a working warp. /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) - GV_Warp_Size_Log2_MaskL + const unsigned GV_Warp_Size_Log2_MaskL; }; /// For AMDGPU GPUs -static constexpr unsigned AMDGPUGpuGridValues[] = { +static constexpr GV AMDGPUGridValues = { 448, // GV_Threads 256, // GV_Slot_Size 64, // GV_Warp_Size @@ -108,7 +109,7 @@ }; /// For Nvidia GPUs -static constexpr unsigned NVPTXGpuGridValues[] = { +static constexpr GV NVPTXGridValues = { 992, // GV_Threads 256, // GV_Slot_Size 32, // GV_Warp_Size diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -501,14 +501,11 @@ static const unsigned HardTeamLimit = (1 << 16) - 1; // 64K needed to fit in uint16 static const int DefaultNumTeams = 128; - static const int Max_Teams = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams]; - static const int Warp_Size = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; - static const int Max_WG_Size = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size]; + static const int Max_Teams = llvm::omp::AMDGPUGridValues.GV_Max_Teams; + static const int Warp_Size = llvm::omp::AMDGPUGridValues.GV_Warp_Size; + static const int Max_WG_Size = llvm::omp::AMDGPUGridValues.GV_Max_WG_Size; static const int Default_WG_Size = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size]; + llvm::omp::AMDGPUGridValues.GV_Default_WG_Size; 
using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *, size_t size, hsa_agent_t); @@ -1058,9 +1055,8 @@ DeviceInfo.WarpSize[device_id] = wavefront_size; } else { DP("Default wavefront size: %d\n", - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]); - DeviceInfo.WarpSize[device_id] = - llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; + llvm::omp::AMDGPUGridValues.GV_Warp_Size); + DeviceInfo.WarpSize[device_id] = llvm::omp::AMDGPUGridValues.GV_Warp_Size; } // Adjust teams to the env variables