AMDGPU: select the scratch WAVESIZE encoding by subtarget generation.

For GFX11 and later, this patch treats the WAVESIZE field of
COMPUTE_TMPRING_SIZE / SPI_TMPRING_SIZE as a 15-bit count of 64-dword blocks;
earlier generations keep the existing 13-bit count of 256-dword blocks.

  * AMDGPUAsmPrinter.cpp: pick the scratch block shift (8 vs 10) and the
    WAVESIZE macro by generation; query the maximum wave scratch size from
    the subtarget instead of a static constant.
  * GCNSubtarget.h: replace the static MaxWaveScratchSize constant with the
    generation-dependent getMaxWaveScratchSize().
  * SIDefines.h: split S_*_WAVESIZE into _PreGFX11 (mask 0x1FFF) and
    _GFX11Plus (mask 0x7FFF) variants.
  * mesa3d.ll: add a gfx1100 RUN line and its expected WAVESIZE value.

Note: whitespace inside context lines was reconstructed; apply with a
whitespace-tolerant option if an exact match fails.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -695,7 +695,7 @@
   ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
 
   const uint64_t MaxScratchPerWorkitem =
-      GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
+      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
   if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
     DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                           ProgInfo.ScratchSize,
@@ -879,15 +879,14 @@
   ProgInfo.LDSBlocks =
       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
 
-  // Scratch is allocated in 256 dword blocks.
-  unsigned ScratchAlignShift = 10;
+  // Scratch is allocated in 64-dword or 256-dword blocks.
+  unsigned ScratchAlignShift =
+      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
   // We need to program the hardware with the amount of scratch memory that
   // is used by the entire wave. ProgInfo.ScratchSize is the amount of
   // scratch memory used per thread.
-  ProgInfo.ScratchBlocks =
-      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
-              1ULL << ScratchAlignShift) >>
-      ScratchAlignShift;
+  ProgInfo.ScratchBlocks = divideCeil(
+      ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
 
   if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
     ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
@@ -946,6 +945,7 @@
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                          const SIProgramInfo &CurrentProgramInfo) {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
 
   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
@@ -957,7 +957,10 @@
     OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
 
     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
-    OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
+    OutStreamer->emitInt32(
+        STM.getGeneration() >= AMDGPUSubtarget::GFX11
+            ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+            : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
 
     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
     // 0" comment but I don't see a corresponding field in the register spec.
@@ -966,8 +969,10 @@
     OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
     OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
-    OutStreamer->emitIntValue(
-        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+    OutStreamer->emitInt32(
+        STM.getGeneration() >= AMDGPUSubtarget::GFX11
+            ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+            : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
   }
 
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -201,9 +201,6 @@
   SIFrameLowering FrameLowering;
 
 public:
-  // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
-  static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
-
   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                const GCNTargetMachine &TM);
   ~GCNSubtarget() override;
@@ -266,9 +263,19 @@
     return (Generation)Gen;
   }
 
+  unsigned getMaxWaveScratchSize() const {
+    // See COMPUTE_TMPRING_SIZE.WAVESIZE.
+    if (getGeneration() < GFX11) {
+      // 13-bit field in units of 256-dword.
+      return (256 * 4) * ((1 << 13) - 1);
+    }
+    // 15-bit field in units of 64-dword.
+    return (64 * 4) * ((1 << 15) - 1);
+  }
+
   /// Return the number of high bits known to be zero for a frame index.
   unsigned getKnownHighZeroBitsForFrameIndex() const {
-    return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
+    return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
   }
 
   int getLDSBankCount() const {
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1036,10 +1036,12 @@
 #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
 
 #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
-#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
+#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
 
 #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
-#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
+#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
 
 #define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
 #define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -1,10 +1,15 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
 
-; Check SPI_TMPRING_SIZE.WAVESIZE = 5
+; SPI_TMPRING_SIZE.WAVESIZE = 5
 ; GFX10: .long 165608
 ; GFX10-NEXT: .long 20480
 
+; SPI_TMPRING_SIZE.WAVESIZE = 17
+; GFX11: .long 165608
+; GFX11-NEXT: .long 69632
+
 ; GCN-LABEL: {{^}}scratch_ps:
 ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}}
 ; GCN-DAG: s_mov_b32 s6, -1{{$}}