diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -103,6 +103,11 @@
   const TargetRegisterClass *getPointerRegClass(
     const MachineFunction &MF, unsigned Kind = 0) const override;
 
+  void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
+                               int Offset, unsigned EltSize, Register VGPR,
+                               int64_t VGPRLanes, RegScavenger *RS,
+                               bool IsLoad) const;
+
   /// If \p OnlyToVGPR is true, this will only succeed if this
   bool spillSGPR(MachineBasicBlock::iterator MI,
                  int FI, RegScavenger *RS,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -863,6 +863,145 @@
   }
 }
 
+// Generate a VMEM access which loads or stores the VGPR containing an SGPR
+// spill such that all the lanes set in VGPRLanes are loaded or stored.
+// This generates exec mask manipulation and will use SGPRs available in MI
+// or VGPR lanes in the VGPR to save and restore the exec mask.
+void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
+                                             int Index, int Offset,
+                                             unsigned EltSize, Register VGPR,
+                                             int64_t VGPRLanes,
+                                             RegScavenger *RS,
+                                             bool IsLoad) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  Register SuperReg = MI->getOperand(0).getReg();
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+  unsigned FirstPart = isWave32 ? Offset * 16 : Offset * 32;
+
+  bool IsKill = MI->getOperand(0).isKill();
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  const bool SuperRegIsExec =
+      SuperReg == AMDGPU::EXEC || SuperReg == AMDGPU::EXEC_LO;
+
+  // If the exec mask is stored in the VGPR, make sure it is stored after
+  // any lanes used by the spill (16 lanes on Wave32, 32 lanes on Wave64).
+  const unsigned ExecLoLane = SuperRegIsExec ? 0 : (isWave32 ? 16 : 32);
+  const unsigned ExecHiLane = SuperRegIsExec ? 1 : (isWave32 ? 17 : 33);
+
+  // Always try to use the src/dst SGPRs to hold a copy of the exec mask.
+  // This is not possible when the src value must remain valid after the
+  // spill, or when the src is smaller than the exec mask; in those cases the
+  // VGPR is used instead.
+  bool StoreExecInVGPR = !IsLoad && (SuperRegIsExec || !IsKill);
+
+  // On Wave32 only handle EXEC_LO.
+  // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
+  bool OnlyExecLo = isWave32 || NumSubRegs == 1;
+
+  unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  Register SavedExecReg;
+
+  // Backup EXEC
+  if (SuperRegIsExec) {
+    // do nothing; exec is already stored in VGPR or will be overwritten
+  } else if (StoreExecInVGPR) {
+    BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            VGPR)
+        .addReg(AMDGPU::EXEC_LO)
+        .addImm(ExecLoLane)
+        .addReg(VGPR, getUndefRegState(IsLoad));
+
+    if (!isWave32) {
+      BuildMI(*MBB, MI, DL,
+              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), VGPR)
+          .addReg(AMDGPU::EXEC_HI)
+          .addImm(ExecHiLane)
+          .addReg(VGPR);
+    }
+  } else {
+    if (OnlyExecLo) {
+      SavedExecReg = NumSubRegs == 1
+                         ? SuperReg
+                         : getSubReg(SuperReg, SplitParts[FirstPart]);
+    } else {
+      SavedExecReg =
+          getMatchingSuperReg(getSubReg(SuperReg, SplitParts[FirstPart]),
+                              AMDGPU::sub0, &AMDGPU::SGPR_64RegClass);
+      // If src/dst is an odd size it is possible subreg0 is not aligned.
+      if (!SavedExecReg && NumSubRegs > 2)
+        SavedExecReg =
+            getMatchingSuperReg(getSubReg(SuperReg, SplitParts[FirstPart + 1]),
+                                AMDGPU::sub0, &AMDGPU::SGPR_64RegClass);
+    }
+
+    assert(SavedExecReg);
+    BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
+  }
+
+  // Setup EXEC
+  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
+
+  // Load/store VGPR
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
+
+  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
+                          ? getBaseRegister()
+                          : getFrameRegister(*MF);
+
+  Align Alignment = FrameInfo.getObjectAlign(Index);
+  MachinePointerInfo PtrInfo =
+      MachinePointerInfo::getFixedStack(*MF, Index);
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
+      EltSize, Alignment);
+
+  if (IsLoad) {
+    buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+                        Index,
+                        VGPR, false,
+                        MFI->getScratchRSrcReg(), FrameReg,
+                        Offset * EltSize, MMO,
+                        RS);
+  } else {
+    buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+                        Index,
+                        VGPR, !StoreExecInVGPR,
+                        MFI->getScratchRSrcReg(), FrameReg,
+                        Offset * EltSize, MMO,
+                        RS);
+    // This only ever adds one VGPR spill
+    MFI->addToSpilledVGPRs(1);
+  }
+
+  // Restore EXEC
+  if (SuperRegIsExec && IsLoad) {
+    // do nothing; exec will be overwritten
+  } else if (StoreExecInVGPR) {
+    BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+            AMDGPU::EXEC_LO)
+        .addReg(VGPR, getKillRegState(!IsLoad && isWave32))
+        .addImm(ExecLoLane);
+    if (!isWave32) {
+      BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+              AMDGPU::EXEC_HI)
+          .addReg(VGPR, getKillRegState(!IsLoad))
+          .addImm(ExecHiLane);
+    }
+  } else {
+    assert(SavedExecReg);
+    BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
+        .addReg(SavedExecReg, RegState::Kill);
+  }
+}
+
 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
                                RegScavenger *RS,
@@ -884,8 +1023,6 @@
   bool IsKill = MI->getOperand(0).isKill();
   const DebugLoc &DL = MI->getDebugLoc();
 
-  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-
   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
                          SuperReg != MFI->getFrameOffsetReg()));
 
@@ -897,17 +1034,10 @@
   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
 
-  // Scavenged temporary VGPR to use. It must be scavenged once for any number
-  // of spilled subregs.
-  Register TmpVGPR;
-
-  // SubReg carries the "Kill" flag when SubReg == SuperReg.
-  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
-
-  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
-    Register SubReg =
-        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
-    if (SpillToVGPR) {
+  if (SpillToVGPR) {
+    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+      Register SubReg =
+          NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
 
       // During SGPR spilling to VGPR, determine if the VGPR is defined. The
@@ -929,42 +1059,52 @@
       // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
-    } else {
-      // XXX - Can to VGPR spill fail for some subregisters but not others?
-      if (OnlyToVGPR)
-        return false;
-
-      // Spill SGPR to a frame index.
-      if (!TmpVGPR.isValid())
-        TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
-      MachineInstrBuilder Mov
-        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
-        .addReg(SubReg, SubKillState);
-
-      // There could be undef components of a spilled super register.
-      // TODO: Can we detect this and skip the spill?
-      if (NumSubRegs > 1) {
-        // The last implicit use of the SuperReg carries the "Kill" flag.
-        unsigned SuperKillState = 0;
-        if (i + 1 == e)
-          SuperKillState |= getKillRegState(IsKill);
-        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
+    }
+  } else {
+    // Scavenged temporary VGPR to use. It must be scavenged once for any number
+    // of spilled subregs.
+    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+
+    // SubReg carries the "Kill" flag when SubReg == SuperReg.
+    unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+
+    unsigned PerVGPR = isWave32 ? 16 : 32;
+    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
+    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+
+    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+      unsigned TmpVGPRFlags = RegState::Undef;
+
+      // Write sub registers into the VGPR
+      for (unsigned i = Offset * PerVGPR,
+                    e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+           i < e; ++i) {
+        Register SubReg =
+            NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+        MachineInstrBuilder WriteLane =
+            BuildMI(*MBB, MI, DL,
+                    TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+                    TmpVGPR)
+                .addReg(SubReg, SubKillState)
+                .addImm(i % PerVGPR)
+                .addReg(TmpVGPR, TmpVGPRFlags);
+        TmpVGPRFlags = 0;
+
+        // There could be undef components of a spilled super register.
+        // TODO: Can we detect this and skip the spill?
+        if (NumSubRegs > 1) {
+          // The last implicit use of the SuperReg carries the "Kill" flag.
+          unsigned SuperKillState = 0;
+          if (i + 1 == NumSubRegs)
+            SuperKillState |= getKillRegState(IsKill);
+          WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
+        }
       }
-      Align Alignment = FrameInfo.getObjectAlign(Index);
-      MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
-      MachineMemOperand *MMO =
-          MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize,
-                                   commonAlignment(Alignment, EltSize * i));
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
-        .addReg(TmpVGPR, RegState::Kill)      // src
-        .addFrameIndex(Index)                 // vaddr
-        .addReg(MFI->getScratchRSrcReg())     // srrsrc
-        .addReg(MFI->getStackPtrOffsetReg())  // soffset
-        .addImm(i * 4)                        // offset
-        .addMemOperand(MMO);
+
+      // Write out VGPR
+      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
+                              RS, false);
     }
   }
 
@@ -987,7 +1127,6 @@
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
 
-  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const DebugLoc &DL = MI->getDebugLoc();
 
@@ -1002,13 +1141,11 @@
   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
 
-  Register TmpVGPR;
-
-  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
-    Register SubReg =
-        NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+  if (SpillToVGPR) {
+    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+      Register SubReg =
+          NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
-    if (SpillToVGPR) {
       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
       auto MIB =
           BuildMI(*MBB, MI, DL,
                   TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
@@ -1018,36 +1155,36 @@
       if (NumSubRegs > 1 && i == 0)
         MIB.addReg(SuperReg, RegState::ImplicitDefine);
 
-    } else {
-      if (OnlyToVGPR)
-        return false;
-
-      // Restore SGPR from a stack slot.
-      // FIXME: We should use S_LOAD_DWORD here for VI.
-      if (!TmpVGPR.isValid())
-        TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-      Align Alignment = FrameInfo.getObjectAlign(Index);
-
-      MachinePointerInfo PtrInfo
-        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
-
-      MachineMemOperand *MMO =
-          MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, EltSize,
-                                   commonAlignment(Alignment, EltSize * i));
-
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
-        .addFrameIndex(Index)                 // vaddr
-        .addReg(MFI->getScratchRSrcReg())     // srsrc
-        .addReg(MFI->getStackPtrOffsetReg())  // soffset
-        .addImm(i * 4)                        // offset
-        .addMemOperand(MMO);
-
-      auto MIB =
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
-              .addReg(TmpVGPR, RegState::Kill);
-
-      if (NumSubRegs > 1)
-        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+    }
+  } else {
+    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+
+    unsigned PerVGPR = isWave32 ? 16 : 32;
+    unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
+    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+
+    for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+      // Load in VGPR data
+      buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
+                              RS, true);
+
+      // Unpack lanes
+      for (unsigned i = Offset * PerVGPR,
+                    e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+           i < e; ++i) {
+        Register SubReg =
+            NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+        bool LastSubReg = (i + 1 == e);
+        auto MIB =
+            BuildMI(*MBB, MI, DL,
+                    TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
+                .addReg(TmpVGPR, getKillRegState(LastSubReg))
+                .addImm(i);
+
+        if (NumSubRegs > 1 && i == 0)
+          MIB.addReg(SuperReg, RegState::ImplicitDefine);
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -28,10 +28,9 @@
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
-; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:20 ; 4-byte Folded Spill
-; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill
+; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
+; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
+; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:20 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -56,13 +55,10 @@
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:20 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:20 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
-; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
-; VMEM: s_waitcnt vmcnt(0)
-; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
+; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
+; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1
 
 ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
 
@@ -109,10 +105,9 @@
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
-; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill
-; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:28 ; 4-byte Folded Spill
+; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
+; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
+; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -132,13 +127,10 @@
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
-; VMEM: s_waitcnt vmcnt(0)
-; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:28 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]],
off, s[0:3], 0 offset:24 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] +; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 +; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload @@ -186,10 +178,9 @@ ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] -; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 +; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 +; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, [[CMP0]] @@ -202,13 +193,10 @@ ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET]] +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] ; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]] - -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload -; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]] +; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0 +; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} @@ -221,10 +209,9 @@ ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]] -; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]], 0 +; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]], 1 +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} @@ 
-249,13 +236,10 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload -; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] - -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] +; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 +; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -563,9 +563,8 @@ ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} ; GCN: buffer_load_dword v[[RESTORE_TMP:[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 -; GCN: v_readfirstlane_b32 s[[USE_TMP_LO:[0-9]+]], v[[RESTORE_TMP]] -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 -; GCN: v_readfirstlane_b32 s[[USE_TMP_HI:[0-9]+]], v[[RESTORE_TMP]] +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v[[RESTORE_TMP]], 0 +; GCN: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v[[RESTORE_TMP]], 1 ; GCN: ;;#ASMSTART ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir @@ -0,0 +1,445 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN64 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN32 %s + + +# CHECK-LABEL: name: check_spill + +# S32 with kill +# CHECK: V_WRITELANE +# CHECK: $sgpr12 = S_MOV_B32 $exec_lo +# CHECK: $exec_lo = S_MOV_B32 1 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 + +# S32 without kill +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: $exec_lo = S_MOV_B32 1 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# CHECK: $exec_lo = V_READLANE + +# S64 with kill +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 3 +# GCN64: $exec = S_MOV_B64 3 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 + +# S64 without kill +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN64: V_WRITELANE +# GCN32: $exec_lo = S_MOV_B32 3 +# GCN64: $exec = 
S_MOV_B64 3 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# CHECK: $exec_lo = V_READLANE +# GCN64: $exec_hi = V_READLANE + +# S96 +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 7 +# GCN64: $exec = S_MOV_B64 7 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 + +# S128 +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 15 +# GCN64: $exec = S_MOV_B64 15 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 + +# S160 +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 31 +# GCN64: $exec = S_MOV_B64 31 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 + +# S256 +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 255 +# GCN64: $exec = S_MOV_B64 255 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 + +# S512 +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 65535 +# GCN64: $exec = S_MOV_B64 65535 +# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 + +# S1024 +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN32: $sgpr64 = S_MOV_B32 $exec_lo +# GCN32: $exec_lo = S_MOV_B32 65535 +# GCN32: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: 
V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# CHECK: V_WRITELANE +# GCN32: $sgpr80 = S_MOV_B32 $exec_lo +# GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 65535 +# GCN64: $exec = S_MOV_B64 4294967295 +# GCN32: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 164 +# GCN64: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr80 +# GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 + +--- | + + define amdgpu_kernel void @check_spill() #0 { + ret void + } + + define amdgpu_kernel void @check_reload() #0 { + ret void + } + + attributes #0 = { "frame-pointer"="all" } +... +--- +name: check_spill +tracksRegLiveness: true +liveins: + - { reg: '$sgpr4_sgpr5' } + - { reg: '$sgpr6_sgpr7' } + - { reg: '$sgpr8' } +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } + - { id: 1, type: spill-slot, size: 8, alignment: 4 } + - { id: 2, type: spill-slot, size: 12, alignment: 4 } + - { id: 3, type: spill-slot, size: 16, alignment: 4 } + - { id: 4, type: spill-slot, size: 20, alignment: 4 } + - { id: 5, type: spill-slot, size: 32, alignment: 4 } + - { id: 6, type: spill-slot, size: 64, alignment: 4 } + - { id: 7, type: spill-slot, size: 128, alignment: 4 } +machineFunctionInfo: + explicitKernArgSize: 660 + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + bb.0: + liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 + + renamable $sgpr12 = IMPLICIT_DEF + SI_SPILL_S32_SAVE killed $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12 = IMPLICIT_DEF + SI_SPILL_S32_SAVE $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13 = IMPLICIT_DEF + SI_SPILL_S64_SAVE killed $sgpr12_sgpr13, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13 = IMPLICIT_DEF + SI_SPILL_S64_SAVE $sgpr12_sgpr13, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF + SI_SPILL_S96_SAVE killed $sgpr12_sgpr13_sgpr14, %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF + SI_SPILL_S128_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF + SI_SPILL_S160_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16, %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF + SI_SPILL_S256_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF + 
SI_SPILL_S512_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF + SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.7, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + +# CHECK-LABEL: name: check_reload + +# S32 +# CHECK: $sgpr12 = S_MOV_B32 $exec_lo +# CHECK: $exec_lo = S_MOV_B32 1 +# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 +# CHECK: $sgpr12 = V_READLANE + +# S64 +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 3 +# GCN64: $exec = S_MOV_B64 3 +# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 +# CHECK: $sgpr12 = V_READLANE +# CHECK: $sgpr13 = V_READLANE + +# S96 +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 7 +# GCN64: $exec = S_MOV_B64 7 +# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 +# CHECK: $sgpr12 = V_READLANE +# CHECK: $sgpr13 = V_READLANE +# CHECK: $sgpr14 = V_READLANE + +# S128 +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 15 +# GCN64: $exec = S_MOV_B64 15 +# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 +# CHECK: $sgpr12 = V_READLANE +# CHECK: $sgpr13 = V_READLANE +# CHECK: $sgpr14 = V_READLANE +# CHECK: $sgpr15 = V_READLANE + +# S160 +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 31 +# GCN64: $exec = S_MOV_B64 31 +# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 +# CHECK: $sgpr12 = V_READLANE +# CHECK: $sgpr13 = V_READLANE +# CHECK: $sgpr14 = V_READLANE +# CHECK: $sgpr15 = V_READLANE +# CHECK: $sgpr16 = V_READLANE + +# S256 +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 255 +# GCN64: $exec = S_MOV_B64 255 +# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 +# CHECK: $sgpr12 = V_READLANE +# CHECK: $sgpr13 = V_READLANE +# CHECK: $sgpr14 = V_READLANE +# CHECK: $sgpr15 = V_READLANE +# CHECK: $sgpr16 = V_READLANE +# CHECK: $sgpr17 = V_READLANE +# CHECK: $sgpr18 = V_READLANE +# CHECK: $sgpr19 = V_READLANE + +# S512 +# GCN32: $sgpr12 = S_MOV_B32 $exec_lo +# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 65535 +# GCN64: $exec = S_MOV_B64 65535 +# CHECK: 
BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 +# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 +# CHECK: $sgpr12 = V_READLANE +# CHECK: $sgpr13 = V_READLANE +# CHECK: $sgpr14 = V_READLANE +# CHECK: $sgpr15 = V_READLANE +# CHECK: $sgpr16 = V_READLANE +# CHECK: $sgpr17 = V_READLANE +# CHECK: $sgpr18 = V_READLANE +# CHECK: $sgpr19 = V_READLANE +# CHECK: $sgpr20 = V_READLANE +# CHECK: $sgpr21 = V_READLANE +# CHECK: $sgpr22 = V_READLANE +# CHECK: $sgpr23 = V_READLANE +# CHECK: $sgpr24 = V_READLANE +# CHECK: $sgpr25 = V_READLANE +# CHECK: $sgpr26 = V_READLANE +# CHECK: $sgpr27 = V_READLANE + +# S1024 +# GCN32: $sgpr64 = S_MOV_B32 $exec_lo +# GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec +# GCN32: $exec_lo = S_MOV_B32 65535 +# GCN64: $exec = S_MOV_B64 4294967295 +# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 +# GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 +# CHECK: $sgpr64 = V_READLANE +# CHECK: $sgpr65 = V_READLANE +# CHECK: $sgpr66 = V_READLANE +# CHECK: $sgpr67 = V_READLANE +# CHECK: $sgpr68 = V_READLANE +# CHECK: $sgpr69 = V_READLANE +# CHECK: $sgpr70 = V_READLANE +# CHECK: $sgpr71 = V_READLANE +# CHECK: $sgpr72 = V_READLANE +# CHECK: $sgpr73 = V_READLANE +# CHECK: $sgpr74 = V_READLANE +# CHECK: $sgpr75 = V_READLANE +# CHECK: $sgpr76 = V_READLANE +# CHECK: $sgpr77 = V_READLANE +# CHECK: $sgpr78 = V_READLANE +# CHECK: $sgpr79 = V_READLANE +# GCN32: $sgpr80 = S_MOV_B32 $exec_lo +# GCN32: $exec_lo = S_MOV_B32 65535 +# GCN32: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 164 +# GCN32: $exec_lo = S_MOV_B32 killed $sgpr80 +# CHECK: $sgpr80 = V_READLANE +# CHECK: $sgpr81 = V_READLANE +# CHECK: $sgpr82 = V_READLANE +# CHECK: $sgpr83 = V_READLANE +# CHECK: $sgpr84 = V_READLANE +# CHECK: $sgpr85 = V_READLANE +# CHECK: $sgpr86 = V_READLANE +# CHECK: $sgpr87 = V_READLANE +# CHECK: $sgpr88 = V_READLANE +# CHECK: $sgpr89 = V_READLANE +# CHECK: $sgpr90 = V_READLANE +# CHECK: $sgpr91 = V_READLANE +# CHECK: $sgpr92 = V_READLANE +# CHECK: $sgpr93 = V_READLANE +# CHECK: $sgpr94 = V_READLANE +# CHECK: $sgpr95 = V_READLANE + +--- +name: check_reload +tracksRegLiveness: true +liveins: + - { reg: '$sgpr4_sgpr5' } + - { reg: '$sgpr6_sgpr7' } + - { reg: '$sgpr8' } +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } + - { id: 1, type: spill-slot, size: 8, alignment: 4 } + - { id: 2, type: spill-slot, size: 12, alignment: 4 } + - { id: 3, type: spill-slot, size: 16, alignment: 4 } + - { id: 4, type: spill-slot, size: 20, alignment: 4 } + - { id: 5, type: spill-slot, size: 32, alignment: 4 } + - { id: 6, type: spill-slot, size: 64, alignment: 4 } + - { id: 7, type: spill-slot, size: 128, alignment: 4 } +machineFunctionInfo: + explicitKernArgSize: 660 + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + bb.0: + liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 + + renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit 
$sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14 = SI_SPILL_S96_RESTORE %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14_sgpr15 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = SI_SPILL_S160_RESTORE %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S256_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + + renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.7, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -6,9 +6,13 @@ ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. -; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 +; SGPR-NEXT: s_mov_b64 exec, s[0:1] ; SGPR-NEXT: s_waitcnt vmcnt(0) -; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] +; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 +; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 +; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 +; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 ; SGPR-NEXT: s_nop 4 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -13,7 +13,7 @@ ; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 2 ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 -; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] +; TOVMEM-DAG: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0 ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Spill ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] @@ -24,7 +24,7 @@ ; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) -; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]] +; TOVMEM: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]], 0 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -35,9 +35,9 @@ } ; CHECK-LABEL: test_limited_sgpr -; GFX6: s_add_u32 s32, s32, 0x84100 +; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9]+]] ; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, 
s[{{[0-9:]+}}], s32 -; GFX6-NEXT: s_sub_u32 s32, s32, 0x84100 +; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9]+]] ; GFX6: NumSgprs: 48 ; GFX6: ScratchSize: 8624 define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll @@ -11,11 +11,9 @@ ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 -; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; VMEM: s_cbranch_scc1 -; VMEM: buffer_load_dword ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 { %wide.sgpr = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -42,13 +40,9 @@ ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2 -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; VMEM: s_cbranch_scc1 -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x3(i32 addrspace(1)* %out, i32 %in) #0 { %wide.sgpr = call <3 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -77,15 +71,9 @@ ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3 -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; VMEM: s_cbranch_scc1 -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 { %wide.sgpr = call <4 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -116,17 +104,9 @@ ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4 -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; VMEM: s_cbranch_scc1 -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x5(i32 addrspace(1)* %out, i32 %in) #0 { %wide.sgpr = call <5 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -162,23 +142,9 @@ ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6 ; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7 -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword ; VMEM: buffer_store_dword ; VMEM: s_cbranch_scc1 -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword -; VMEM: buffer_load_dword ; VMEM: buffer_load_dword define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 { %wide.sgpr = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
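
Reviewer note: the lane-packing arithmetic shared by the new spill and restore paths (PerVGPR, NumVGPRs, VGPRLanes) is easy to sanity-check in isolation. The following is a minimal standalone C++ sketch, not LLVM code; the helper name describeSpill and its printed output are invented for illustration, but the three expressions mirror the ones added in SIRegisterInfo.cpp and reproduce the exec mask immediates (1, 3, 65535, 4294967295) that the sgpr-spill.mir checks expect.

```cpp
// Standalone illustration (not LLVM API) of the lane packing used by the
// spillSGPR/restoreSGPR changes above: a scavenged VGPR carries up to
// 16 (wave32) or 32 (wave64) spilled SGPRs, one per lane, and VGPRLanes is
// the exec mask installed around the BUFFER_STORE/BUFFER_LOAD of that VGPR.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static void describeSpill(bool IsWave32, unsigned NumSubRegs) {
  const unsigned PerVGPR = IsWave32 ? 16 : 32;
  const unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
  // Same expression as in the patch; note the mask is computed once and
  // reused for every VGPR batch, including a partially filled final one.
  const int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;

  std::printf("wave%u, %u SGPRs -> %u VGPR access(es), exec mask 0x%llx\n",
              IsWave32 ? 32u : 64u, NumSubRegs, NumVGPRs,
              static_cast<unsigned long long>(VGPRLanes));
  for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
    unsigned Begin = Offset * PerVGPR;
    unsigned End = std::min((Offset + 1) * PerVGPR, NumSubRegs);
    std::printf("  batch %u holds sub-registers [%u, %u) in lanes [0, %u)\n",
                Offset, Begin, End, End - Begin);
  }
}

int main() {
  describeSpill(/*IsWave32=*/true, 1);   // S32 on wave32: exec mask 1
  describeSpill(/*IsWave32=*/false, 2);  // S64 on wave64: exec mask 3
  describeSpill(/*IsWave32=*/false, 32); // S1024 on wave64: mask 0xffffffff
  describeSpill(/*IsWave32=*/true, 32);  // S1024 on wave32: two 16-lane batches
  return 0;
}
```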