diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -15,6 +15,7 @@
 #define LLVM_CLANG_BASIC_TARGETINFO_H

 #include "clang/Basic/AddressSpaces.h"
+#include "clang/Basic/CodeGenOptions.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/Specifiers.h"
@@ -29,6 +30,7 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/VersionTuple.h"
 #include 
@@ -196,6 +198,8 @@
   unsigned char RegParmMax, SSERegParmMax;
   TargetCXXABI TheCXXABI;
   const LangASMap *AddrSpaceMap;
+  const unsigned *GridValues;
+  const uint64_t *LongGridValues;

   mutable StringRef PlatformName;
   mutable VersionTuple PlatformMinVersion;
@@ -1306,6 +1310,12 @@
     return LangAS::Default;
   }

+  unsigned getGridValue(llvm::GPU::GVIDX gv) const { return GridValues[gv]; }
+
+  uint64_t getLongGridValue(llvm::GPU::GVLIDX gv) const {
+    return LongGridValues[gv];
+  }
+
   /// Retrieve the name of the platform as it is used in the
   /// availability attribute.
   StringRef getPlatformName() const { return PlatformName; }
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -17,6 +17,7 @@
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/IR/DataLayout.h"

 using namespace clang;
@@ -286,6 +287,8 @@
   resetDataLayout(isAMDGCN(getTriple()) ? DataLayoutStringAMDGCN
                                         : DataLayoutStringR600);
   assert(DataLayout->getAllocaAddrSpace() == Private);
+  GridValues = &(llvm::GPU::AMDGPUGpuGridValues[0]);
+  LongGridValues = &(llvm::GPU::AMDGPUGpuLongGridValues[0]);

   setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D ||
                      !isAMDGCN(Triple));
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -16,6 +16,7 @@
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"

 using namespace clang;
 using namespace clang::targets;
@@ -62,6 +63,8 @@
   TLSSupported = false;
   VLASupported = false;
   AddrSpaceMap = &NVPTXAddrSpaceMap;
+  GridValues = &(llvm::GPU::NVPTXGpuGridValues[0]);
+  LongGridValues = &(llvm::GPU::NVPTXGpuLongGridValues[0]);
   UseAddrSpaceMapMangling = true;

   // Define available target features
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -0,0 +1,140 @@
+//====--- OMPGridValues.h - OpenMP GPU grid values -------------*- C++ -*-====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Provides definitions for target-specific GPU grid values.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_GRIDVALUES_H
+#define LLVM_OPENMP_GRIDVALUES_H
+
+#include <cstdint>
+
+namespace llvm {
+
+namespace GPU {
+
+/// \brief Defines various target-specific GPU grid values that must be
+/// consistent between the host RTL (plugin), the device RTL, and clang.
+/// Grid values can be changed per target of a "fat" binary so that each
+/// pass sees the correct values when generating code for a multi-target
+/// binary. Both amdgcn and nvptx values are stored in this file. In the
+/// future, should there be differences between GPUs of the same
+/// architecture, simply add a new array and use the new array name.
+///
+/// Example usage in clang:
+///   const unsigned warp_size =
+///       ctx.getTargetInfo().getGridValue(llvm::GPU::GV_Warp_Size);
+///
+/// Example usage in libomptarget/deviceRTLs:
+///   #include "OMPGridValues.h"
+///   #ifdef __AMDGPU__
+///     #define GRIDVAL AMDGPUGpuGridValues
+///   #else
+///     #define GRIDVAL NVPTXGpuGridValues
+///   #endif
+///   ... Then use this reference for GV_Warp_Size in the deviceRTL source.
+///     GRIDVAL[GV_Warp_Size]
+///
+/// Example usage in the libomptarget hsa plugin:
+///   #include "OMPGridValues.h"
+///   #define GRIDVAL AMDGPUGpuGridValues
+///   ... Then use this reference to access GV_Warp_Size in the hsa plugin.
+///     GRIDVAL[GV_Warp_Size]
+///
+/// Example usage in the libomptarget cuda plugin:
+///   #include "OMPGridValues.h"
+///   #define GRIDVAL NVPTXGpuGridValues
+///   ... Then use this reference to access GV_Warp_Size in the cuda plugin.
+///     GRIDVAL[GV_Warp_Size]
+///
+enum GVIDX {
+  /// The maximum number of workers in a kernel.
+  /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be an issue for
+  /// blockDim.z.
+  GV_Threads,
+  /// The size reserved for data in a shared memory slot.
+  GV_Slot_Size,
+  /// The default number of threads in a worker warp.
+  GV_Warp_Size,
+  /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size
+  /// for NVPTX.
+  GV_Warp_Size_32,
+  /// log2(GV_Warp_Size), i.e. the number of bits in a lane ID within a warp.
+  GV_Warp_Size_Log2,
+  /// GV_Warp_Size * GV_Slot_Size.
+  GV_Warp_Slot_Size,
+  /// The maximum number of teams.
+  GV_Max_Teams,
+  /// Global memory alignment.
+  GV_Mem_Align,
+  /// The mask of a lane ID within a warp, i.e. GV_Warp_Size - 1.
+  GV_Warp_Size_Log2_Mask,
+  /// An alternative to the heavy data sharing infrastructure that uses global
+  /// memory is one that uses device __shared__ memory. The amount of such
+  /// space (in bytes) reserved by the OpenMP runtime is noted here.
+  GV_SimpleBufferSize,
+  /// The absolute maximum team size for a work group.
+  GV_Max_WG_Size,
+  /// The default maximum team size for a work group.
+  GV_Default_WG_Size,
+  /// GV_Max_WG_Size / GV_Warp_Size: 32 for NVPTX and 16 for AMDGPU.
+  GV_Max_Warp_Number
+};
+
+enum GVLIDX {
+  /// The 64-bit mask of a lane ID within a warp, i.e. GV_Warp_Size - 1.
+  GV_Warp_Size_Log2_MaskL
+};
+
+/// Grid values for AMDGPU targets.
+static constexpr unsigned AMDGPUGpuGridValues[] = {
+    448,       // GV_Threads FIXME: How can we make this bigger?
+    256,       // GV_Slot_Size
+    64,        // GV_Warp_Size
+    32,        // GV_Warp_Size_32
+    6,         // GV_Warp_Size_Log2
+    64 * 256,  // GV_Warp_Slot_Size
+    128,       // GV_Max_Teams
+    256,       // GV_Mem_Align
+    63,        // GV_Warp_Size_Log2_Mask
+    896,       // GV_SimpleBufferSize
+    1024,      // GV_Max_WG_Size
+    256,       // GV_Default_WG_Size
+    1024 / 64  // GV_Max_Warp_Number (GV_Max_WG_Size / GV_Warp_Size)
+};
+
+static constexpr uint64_t AMDGPUGpuLongGridValues[] = {
+    63  // GV_Warp_Size_Log2_MaskL
+};
+
+/// Grid values for NVPTX targets.
+static constexpr unsigned NVPTXGpuGridValues[] = {
+    992,                // GV_Threads
+    256,                // GV_Slot_Size
+    32,                 // GV_Warp_Size
+    32,                 // GV_Warp_Size_32
+    5,                  // GV_Warp_Size_Log2
+    32 * 256,           // GV_Warp_Slot_Size
+    1024,               // GV_Max_Teams
+    256,                // GV_Mem_Align
+    (~0u >> (32 - 5)),  // GV_Warp_Size_Log2_Mask
+    896,                // GV_SimpleBufferSize
+    1024,               // GV_Max_WG_Size
+    128,                // GV_Default_WG_Size
+    1024 / 32           // GV_Max_Warp_Number (GV_Max_WG_Size / GV_Warp_Size)
+};
+
+static constexpr uint64_t NVPTXGpuLongGridValues[] = {
+    31  // GV_Warp_Size_Log2_MaskL
+};
+
+} // namespace GPU
+} // namespace llvm
+
+#endif // LLVM_OPENMP_GRIDVALUES_H
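
Note: below is a minimal sketch of how a clang consumer might read these values through the new TargetInfo accessors instead of hard-coding a warp size. The helper names (getOMPWarpSize, getOMPWarpLaneMask) are illustrative assumptions and are not part of this patch; only the accessors and the llvm::GPU enumerators come from the diff above.

// Hypothetical helpers; they rely only on what the diff above adds.
#include "clang/Basic/TargetInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include <cstdint>

namespace clang {

// Warp size selected by the target constructor: 64 on amdgcn, 32 on nvptx.
inline unsigned getOMPWarpSize(const TargetInfo &TI) {
  return TI.getGridValue(llvm::GPU::GV_Warp_Size);
}

// 64-bit lane mask within a warp: 0x3f on amdgcn, 0x1f on nvptx.
inline uint64_t getOMPWarpLaneMask(const TargetInfo &TI) {
  return TI.getLongGridValue(llvm::GPU::GV_Warp_Size_Log2_MaskL);
}

} // namespace clang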
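
Note: below is a minimal sketch of the GRIDVAL selection pattern described in the header's file comment, written as an ordinary host translation unit. The main() driver and its output are purely illustrative and not libomptarget code; only the __AMDGPU__ selection and the array/enum names come from the header above.

// Illustrative test program, not part of this patch.
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include <cstdio>

#ifdef __AMDGPU__
#define GRIDVAL llvm::GPU::AMDGPUGpuGridValues
#else
#define GRIDVAL llvm::GPU::NVPTXGpuGridValues
#endif

int main() {
  // Both values resolve at compile time for whichever target this TU is
  // built for, mirroring how the deviceRTL and plugins use GRIDVAL.
  constexpr unsigned WarpSize = GRIDVAL[llvm::GPU::GV_Warp_Size];
  constexpr unsigned SlotSize = GRIDVAL[llvm::GPU::GV_Slot_Size];
  std::printf("warp size = %u, slot size = %u bytes\n", WarpSize, SlotSize);
  return 0;
}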