Index: llvm/lib/Target/AMDGPU/SIFrameLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -61,9 +61,7 @@
                                         const DebugLoc &DL,
                                         Register ScratchWaveOffsetReg) const;
 
-  Register
-  getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF,
-                                         Register ScratchWaveOffsetReg) const;
+  Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
 
   void emitEntryFunctionScratchRsrcRegSetup(
       MachineFunction &MF, MachineBasicBlock &MBB,
Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -30,6 +30,11 @@
                       ST.getMaxNumSGPRs(MF) / 4);
 }
 
+static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
+                                       const MachineFunction &MF) {
+  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
+}
+
 // Find a scratch register that we can use at the start of the prologue to
 // re-align the stack pointer. We avoid using callee-save registers since they
 // may appear to be free when this is called from canUseAsPrologue (during
@@ -257,7 +262,7 @@
 
 // Shift down registers reserved for the scratch RSRC.
 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
-    MachineFunction &MF, Register ScratchWaveOffsetReg) const {
+    MachineFunction &MF) const {
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -294,15 +299,11 @@
   // reserved for VCC etc.
   for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
-    // reserved input we needed.
-    //
-    // FIXME: The preloaded SGPR count is not accurate for shaders as the
-    // scratch wave offset may be in a fixed SGPR or
-    // SITargetLowering::allocateSystemSGPRs may choose some free SGPR for the
-    // scratch wave offset. We explicitly avoid the scratch wave offset to
-    // account for this.
+    // reserved input we needed. Also for PAL, make sure we don't clobber
+    // the GIT pointer passed in SGPR0 or SGPR8.
     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
-        !TRI->isSubRegisterEq(Reg, ScratchWaveOffsetReg)) {
+        (!ST.isAmdPalOS() ||
+         !TRI->isSubRegisterEq(Reg, MFI->getGITPtrLoReg(MF)))) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -330,15 +331,16 @@
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
 
   assert(MFI->isEntryFunction());
 
-  Register ScratchWaveOffsetReg = MFI->getPreloadedReg(
+  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
   // FIXME: Hack to not crash in situations which emitted an error.
-  if (ScratchWaveOffsetReg == AMDGPU::NoRegister)
+  if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
     return;
 
   // We need to do the replacement of the private segment buffer register even
@@ -347,8 +349,7 @@
   //
   // This will return `AMDGPU::NoRegister` in cases where there are no actual
   // uses of the SRSRC.
-  Register ScratchRsrcReg =
-      getEntryFunctionReservedScratchRsrcReg(MF, ScratchWaveOffsetReg);
+  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
 
   // Make the selected register live throughout the function.
   if (ScratchRsrcReg != AMDGPU::NoRegister) {
@@ -379,6 +380,32 @@
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
+  // We found the SRSRC first because it needs four registers and has an
+  // alignment requirement. If the SRSRC that we found clobbers the scratch
+  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
+  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to
+  // a free SGPR.
+  Register ScratchWaveOffsetReg = AMDGPU::NoRegister;
+  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
+    ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
+    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+    AllSGPRs = AllSGPRs.slice(
+        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+    for (MCPhysReg Reg : AllSGPRs) {
+      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) &&
+          (!ST.isAmdPalOS() || MFI->getGITPtrLoReg(MF) != Reg)) {
+        ScratchWaveOffsetReg = Reg;
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
+            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+        break;
+      }
+    }
+  } else {
+    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
+  }
+  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
   if (MF.getFrameInfo().hasCalls()) {
     Register SPReg = MFI->getStackPtrOffsetReg();
     assert(SPReg != AMDGPU::SP_REG);
@@ -393,8 +420,8 @@
   }
 
   if (MFI->hasFlatScratchInit() || ScratchRsrcReg != AMDGPU::NoRegister) {
-    MRI.addLiveIn(ScratchWaveOffsetReg);
-    MBB.addLiveIn(ScratchWaveOffsetReg);
+    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
   }
 
   if (MFI->hasFlatScratchInit()) {
@@ -437,19 +464,7 @@
       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
     }
-    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
-    if (ST.hasMergedShaders()) {
-      switch (MF.getFunction().getCallingConv()) {
-      case CallingConv::AMDGPU_HS:
-      case CallingConv::AMDGPU_GS:
-        // Low GIT address is passed in s8 rather than s0 for an LS+HS or
-        // ES+GS merged shader on gfx9+.
-        GitPtrLo = AMDGPU::SGPR8;
-        break;
-      default:
-        break;
-      }
-    }
+    auto GitPtrLo = MFI->getGITPtrLoReg(MF);
     MF.getRegInfo().addLiveIn(GitPtrLo);
     MBB.addLiveIn(GitPtrLo);
     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -676,6 +676,8 @@
     return GITPtrHigh;
   }
 
+  Register getGITPtrLoReg(const MachineFunction &MF) const;
+
   uint32_t get32BitAddressHighBits() const {
     return HighBitsOf32BitAddress;
   }
Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -439,6 +439,27 @@
   return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
 }
 
+Register
+SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.isAmdPalOS())
+    return AMDGPU::NoRegister;
+  auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
+  if (ST.hasMergedShaders()) {
+    switch (MF.getFunction().getCallingConv()) {
+    case CallingConv::AMDGPU_HS:
+    case CallingConv::AMDGPU_GS:
+      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
+      // ES+GS merged shader on gfx9+.
+      GitPtrLo = AMDGPU::SGPR8;
+      return GitPtrLo;
+    default:
+      return GitPtrLo;
+    }
+  }
+  return GitPtrLo;
+}
+
 static yaml::StringValue regToString(Register Reg,
                                      const TargetRegisterInfo &TRI) {
   yaml::StringValue Dest;
Index: llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -print-after=prologepilog < %s 2>&1 | FileCheck -check-prefix=CHECK %s
+
+; On PAL, we need to ensure that the SRSRC does not clobber the GIT pointer,
+; which is passed in SGPR8 for HS or GS.
+
+; CHECK-NOT: $sgpr8_sgpr9 = S_GETPC_B64
+; CHECK-NOT: $sgpr8 = S_MOV_B32 $sgpr8
+; CHECK-NOT: $sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX4_IMM
+
+define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %mergedGroupInfo) {
+entry:
+  call void @llvm.amdgcn.init.exec(i64 -1)
+  %threadIdInWave = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %shr = lshr i32 %mergedGroupInfo, 12
+  %vertCountInSubgroup = and i32 %shr, 511
+  %shl = shl nuw nsw i32 undef, 5
+  %threadIdInSubgroup = add i32 %shl, %threadIdInWave
+  br label %endAllocReq
+
+endAllocReq:
+  br label %endExpPrim
+
+endExpPrim:
+  %cmp = icmp ult i32 %threadIdInSubgroup, %vertCountInSubgroup
+  br i1 %cmp, label %expVert, label %endExpVert
+
+expVert:
+  store float 1.0, float addrspace(5)* undef, align 4
+  br label %endExpVert
+
+endExpVert:
+  ret void
+}
+
+declare void @llvm.amdgcn.init.exec(i64 immarg)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
+
+
Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -14,14 +14,14 @@
 
 ;
 ; GCN-LABEL: {{^}}ps_main:
-; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s6, -1
-; SI-DAG: s_mov_b32 s7, 0xe8f000
-; VI-DAG: s_mov_b32 s7, 0xe80000
-; GFX9-DAG: s_mov_b32 s7, 0xe00000
-; GFX10_W32-DAG: s_mov_b32 s7, 0x31c16000
-; GFX10_W64-DAG: s_mov_b32 s7, 0x31e16000
+; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s2, -1
+; SI-DAG: s_mov_b32 s3, 0xe8f000
+; VI-DAG: s_mov_b32 s3, 0xe80000
+; GFX9-DAG: s_mov_b32 s3, 0xe00000
+; GFX10_W32-DAG: s_mov_b32 s3, 0x31c16000
+; GFX10_W64-DAG: s_mov_b32 s3, 0x31e16000
 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
 ; GCN-NOT: s_mov_b32 s0
@@ -39,7 +39,7 @@
 }
 
 ; GCN-LABEL: {{^}}vs_main:
-; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; GCN-NOT: s_mov_b32 s0
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
@@ -51,7 +51,7 @@
 }
 
 ; GCN-LABEL: {{^}}cs_main:
-; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 define amdgpu_cs float @cs_main(i32 %idx) {
@@ -62,7 +62,7 @@
 }
 
 ; GCN-LABEL: {{^}}hs_main:
-; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; SIVI-NOT: s_mov_b32 s0
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
@@ -79,7 +79,7 @@
 }
 
 ; GCN-LABEL: {{^}}gs_main:
-; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
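
As a quick illustration of the selection logic this patch adds to emitEntryFunctionPrologue, here is a minimal standalone sketch. It is plain C++ rather than the LLVM API; pickScratchWaveOffsetReg, aliasesRsrc, the Used vector, and all register numbers are invented for illustration. The scan skips the preloaded SGPRs, then takes the first free SGPR that neither aliases the four-register SRSRC quad nor, on PAL, holds the GIT pointer.

// Toy model of the new SGPR scan: skip preloaded SGPRs, then take the first
// free SGPR that does not alias the SRSRC quad and is not the GIT pointer.
#include <cstdio>
#include <vector>

constexpr int NoRegister = -1;

// Stand-in for TRI->isSubRegisterEq(ScratchRsrcReg, Reg): does s<Reg> lie
// inside the four-wide quad starting at s<RsrcLo>?
static bool aliasesRsrc(int Reg, int RsrcLo) {
  return Reg >= RsrcLo && Reg < RsrcLo + 4;
}

static int pickScratchWaveOffsetReg(int NumSGPRs, int NumPreloaded, int RsrcLo,
                                    int GitPtrLo, bool IsPal,
                                    const std::vector<bool> &Used) {
  // Mirrors AllSGPRs.slice(NumPreloaded): never steal a preloaded input.
  for (int Reg = NumPreloaded; Reg < NumSGPRs; ++Reg) {
    if (Used[Reg])                // stand-in for isPhysRegUsed/isAllocatable
      continue;
    if (aliasesRsrc(Reg, RsrcLo)) // would be clobbered by the SRSRC setup
      continue;
    if (IsPal && Reg == GitPtrLo) // would clobber the GIT pointer (s0/s8)
      continue;
    return Reg;                   // COPY the preloaded offset here
  }
  return NoRegister;
}

int main() {
  std::vector<bool> Used(104, false); // pretend no SGPR is otherwise in use
  // Hypothetical scenario: SRSRC at s4..s7, GIT pointer in s8 (merged HS/GS
  // on gfx9+), four preloaded user SGPRs. s4..s7 alias the SRSRC and s8 is
  // the GIT pointer, so the first acceptable register is s9.
  int Reg = pickScratchWaveOffsetReg(104, 4, /*RsrcLo=*/4, /*GitPtrLo=*/8,
                                     /*IsPal=*/true, Used);
  std::printf("scratch wave offset -> s%d\n", Reg); // prints s9
  return 0;
}

This also shows why the patch adds getAllSGPRs alongside the existing getAllSGPR128: the wave offset needs only a single register, so it can be relocated to any free SGPR_32 outside the quad and the GIT pointer, rather than requiring another aligned quad.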