diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -106,11 +106,6 @@ const TargetRegisterClass *getPointerRegClass( const MachineFunction &MF, unsigned Kind = 0) const override; - void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index, - int Offset, unsigned EltSize, Register VGPR, - int64_t VGPRLanes, RegScavenger *RS, - bool IsLoad) const; - /// If \p OnlyToVGPR is true, this will only succeed if this bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1048,123 +1048,6 @@ } } -// Generate a VMEM access which loads or stores the VGPR containing an SGPR -// spill such that all the lanes set in VGPRLanes are loaded or stored. -// This generates exec mask manipulation and will use SGPRs available in MI -// or VGPR lanes in the VGPR to save and restore the exec mask. -void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, - int Index, int Offset, - unsigned EltSize, Register VGPR, - int64_t VGPRLanes, - RegScavenger *RS, - bool IsLoad) const { - MachineBasicBlock *MBB = MI->getParent(); - MachineFunction *MF = MBB->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - - Register SuperReg = MI->getOperand(0).getReg(); - const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - ArrayRef SplitParts = getRegSplitParts(RC, EltSize); - unsigned NumSubRegs = SplitParts.empty() ? 
1 : SplitParts.size(); - unsigned FirstPart = Offset * 32; - unsigned ExecLane = 0; - - bool IsKill = MI->getOperand(0).isKill(); - const DebugLoc &DL = MI->getDebugLoc(); - - // Cannot handle load/store to EXEC - assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && - SuperReg != AMDGPU::EXEC && "exec should never spill"); - - // On Wave32 only handle EXEC_LO. - // On Wave64 only update EXEC_HI if there is sufficent space for a copy. - bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI; - - unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - Register SavedExecReg; - - // Backup EXEC - if (OnlyExecLo) { - SavedExecReg = - NumSubRegs == 1 - ? SuperReg - : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane])); - } else { - // If src/dst is an odd size it is possible subreg0 is not aligned. - for (; ExecLane < (NumSubRegs - 1); ++ExecLane) { - SavedExecReg = getMatchingSuperReg( - getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0, - &AMDGPU::SReg_64_XEXECRegClass); - if (SavedExecReg) - break; - } - } - assert(SavedExecReg); - BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg); - - // Setup EXEC - BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes); - - // Load/store VGPR - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); - - Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) - ? getBaseRegister() - : getFrameRegister(*MF); - - Align Alignment = FrameInfo.getObjectAlign(Index); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(*MF, Index); - MachineMemOperand *MMO = MF->getMachineMemOperand( - PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, - EltSize, Alignment); - - if (IsLoad) { - unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR - : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; - buildSpillLoadStore(MI, Opc, - Index, - VGPR, false, - FrameReg, - Offset * EltSize, MMO, - RS); - } else { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR - : AMDGPU::BUFFER_STORE_DWORD_OFFSET; - buildSpillLoadStore(MI, Opc, Index, VGPR, - IsKill, FrameReg, - Offset * EltSize, MMO, RS); - // This only ever adds one VGPR spill - MFI->addToSpilledVGPRs(1); - } - - // Restore EXEC - BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg) - .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill)); - - // Restore clobbered SGPRs - if (IsLoad) { - // Nothing to do; register will be overwritten - } else if (!IsKill) { - // Restore SGPRs from appropriate VGPR lanes - if (!OnlyExecLo) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), - getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1])) - .addReg(VGPR) - .addImm(ExecLane + 1); - } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), - NumSubRegs == 1 ? SavedExecReg - : Register(getSubReg( - SuperReg, SplitParts[FirstPart + ExecLane]))) - .addReg(VGPR, RegState::Kill) - .addImm(ExecLane); - } -} - bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, @@ -1229,6 +1112,7 @@ // it are fixed. } } else { + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); // Scavenged temporary VGPR to use. It must be scavenged once for any number // of spilled subregs. Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); @@ -1237,42 +1121,34 @@ // SubReg carries the "Kill" flag when SubReg == SuperReg. 
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); - unsigned PerVGPR = 32; - unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; - int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; - - for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) { - unsigned TmpVGPRFlags = RegState::Undef; - - // Write sub registers into the VGPR - for (unsigned i = Offset * PerVGPR, - e = std::min((Offset + 1) * PerVGPR, NumSubRegs); - i < e; ++i) { - Register SubReg = NumSubRegs == 1 - ? SuperReg - : Register(getSubReg(SuperReg, SplitParts[i])); - - MachineInstrBuilder WriteLane = - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR) - .addReg(SubReg, SubKillState) - .addImm(i % PerVGPR) - .addReg(TmpVGPR, TmpVGPRFlags); - TmpVGPRFlags = 0; - - // There could be undef components of a spilled super register. - // TODO: Can we detect this and skip the spill? - if (NumSubRegs > 1) { - // The last implicit use of the SuperReg carries the "Kill" flag. - unsigned SuperKillState = 0; - if (i + 1 == NumSubRegs) - SuperKillState |= getKillRegState(IsKill); - WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState); - } + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + Register SubReg = NumSubRegs == 1 + ? SuperReg + : Register(getSubReg(SuperReg, SplitParts[i])); + MachineInstrBuilder Mov = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(SubReg, SubKillState); + + if (NumSubRegs > 1) { + // The last implicit use of the SuperReg carries the "Kill" flag. 
+ unsigned SuperKillState = 0; + if (i + 1 == e) + SuperKillState |= getKillRegState(IsKill); + Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); } - // Write out VGPR - buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes, - RS, false); + Align Alignment = FrameInfo.getObjectAlign(Index); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); + MachineMemOperand *MMO = + MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, + commonAlignment(Alignment, EltSize * i)); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) + .addReg(TmpVGPR, RegState::Kill) // src + .addFrameIndex(Index) // vaddr + .addReg(MFI->getStackPtrOffsetReg()) // soffset + .addImm(i * 4) // offset + .addMemOperand(MMO); } } @@ -1325,34 +1201,37 @@ MIB.addReg(SuperReg, RegState::ImplicitDefine); } } else { + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + // Scavenged temporary VGPR to use. It must be scavenged once for any number + // of spilled subregs. Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); RS->setRegUsed(TmpVGPR); - unsigned PerVGPR = 32; - unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; - int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; - - for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) { - // Load in VGPR data - buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes, - RS, true); - - // Unpack lanes - for (unsigned i = Offset * PerVGPR, - e = std::min((Offset + 1) * PerVGPR, NumSubRegs); - i < e; ++i) { - Register SubReg = NumSubRegs == 1 - ? 
SuperReg - : Register(getSubReg(SuperReg, SplitParts[i])); - - bool LastSubReg = (i + 1 == e); - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) - .addReg(TmpVGPR, getKillRegState(LastSubReg)) - .addImm(i); - if (NumSubRegs > 1 && i == 0) - MIB.addReg(SuperReg, RegState::ImplicitDefine); - } + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + Register SubReg = NumSubRegs == 1 + ? SuperReg + : Register(getSubReg(SuperReg, SplitParts[i])); + Align Alignment = FrameInfo.getObjectAlign(Index); + + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); + + MachineMemOperand *MMO = + MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, EltSize, + commonAlignment(Alignment, EltSize * i)); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) + .addFrameIndex(Index) // vaddr + .addReg(MFI->getStackPtrOffsetReg()) // soffset + .addImm(i * 4) // offset + .addMemOperand(MMO); + + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) + .addReg(TmpVGPR, RegState::Kill); + + if (NumSubRegs > 1) + MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); } } diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -23,12 +23,14 @@ ; Spill saved exec ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec + ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] -; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 -; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: 
v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:8 ; 4-byte Folded Spill ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]] ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -52,10 +54,12 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; VMEM: s_waitcnt vmcnt(0) +; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:8 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 +; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} @@ -98,9 +102,10 @@ ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] -; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 -; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded 
Spill +; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:8 ; 4-byte Folded Spill ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]] @@ -121,10 +126,12 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; VMEM: s_waitcnt vmcnt(0) +; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:8 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 +; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload @@ -173,9 +180,10 @@ ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] -; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 -; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword 
v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, [[CMP0]] @@ -187,10 +195,12 @@ ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET]] ; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0 -; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1 +; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]] +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload +; VMEM: s_waitcnt vmcnt(0) +; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} @@ -205,9 +215,10 @@ ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]] -; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], 
s[[FLOW_AND_EXEC_LO]], 0 -; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1 -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]] +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]] +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}} ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]] @@ -230,11 +241,12 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] - -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload +; VMEM: s_waitcnt vmcnt(0) +; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 +; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll 
b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -15,8 +15,10 @@ ; SPILL-TO-VGPR: v_readlane_b32 s4, v40, 0 ; SPILL-TO-VGPR: v_readlane_b32 s5, v40, 1 -; NO-SPILL-TO-VGPR: v_readlane_b32 s4, v1, 0 -; NO-SPILL-TO-VGPR: v_readlane_b32 s5, v1, 1 +; NO-SPILL-TO-VGPR: v_mov_b32_e32 v1, s30 +; NO-SPILL-TO-VGPR: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; NO-SPILL-TO-VGPR: v_mov_b32_e32 v1, s31 +; NO-SPILL-TO-VGPR: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; SPILL-TO-VGPR: v_readlane_b32 s33, v40, 2 ; NO-SPILL-TO-VGPR: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -761,12 +761,10 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s2, 0 -; GCN-NEXT: v_writelane_b32 v0, s3, 1 -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 @@ -842,13 +840,12 @@ ; GCN-NEXT: v_readlane_b32 s17, v31, 61 ; GCN-NEXT: v_readlane_b32 s18, v31, 62 ; GCN-NEXT: v_readlane_b32 s19, v31, 63 -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: s_mov_b64 exec, 3 ; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, 
s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s1, v0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir +++ /dev/null @@ -1,467 +0,0 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,MUBUF %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN32,MUBUF %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,FLATSCR %s - - -# CHECK-LABEL: name: check_spill - -# FLATSCR: $sgpr33 = S_MOV_B32 0 -# FLATSCR: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc -# FLATSCR: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc - -# S32 with kill -# CHECK: V_WRITELANE -# CHECK: $sgpr12 = S_MOV_B32 $exec_lo -# CHECK: $exec_lo = S_MOV_B32 1 -# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 4 -# CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 - -# S32 without kill -# CHECK: V_WRITELANE -# CHECK: $sgpr12 = S_MOV_B32 $exec_lo -# CHECK: $exec_lo = S_MOV_B32 1 -# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 4 -# CHECK: $sgpr12 = V_READLANE - -# S64 with kill -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# 
GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 3 -# GCN64: $exec = S_MOV_B64 3 -# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 8 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 - -# S64 without kill -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 3 -# GCN64: $exec = S_MOV_B64 3 -# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 8 -# GCN32: $exec_lo = S_MOV_B32 $sgpr12 -# GCN64: $exec = S_MOV_B64 $sgpr12_sgpr13 -# GCN64: $sgpr13 = V_READLANE -# CHECK: $sgpr12 = V_READLANE - -# S96 -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 7 -# GCN64: $exec = S_MOV_B64 7 -# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 16 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 - -# S128 -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 15 -# GCN64: $exec = S_MOV_B64 15 -# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 28 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 - -# S160 -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE 
-# CHECK: V_WRITELANE -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 31 -# GCN64: $exec = S_MOV_B64 31 -# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 44 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 - -# S256 -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 255 -# GCN64: $exec = S_MOV_B64 255 -# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 64 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 - -# S512 -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 65535 -# GCN64: $exec = S_MOV_B64 65535 -# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 96 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 - -# S1024 -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# 
CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# CHECK: V_WRITELANE -# GCN32: $sgpr64 = S_MOV_B32 $exec_lo -# GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 4294967295 -# GCN64: $exec = S_MOV_B64 4294967295 -# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 -# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 160 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 -# GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 - ---- | - - define amdgpu_kernel void @check_spill() #0 { - ret void - } - - define amdgpu_kernel void @check_reload() #0 { - ret void - } - - attributes #0 = { "frame-pointer"="all" } -... 
---- -name: check_spill -tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - - { reg: '$sgpr6_sgpr7' } - - { reg: '$sgpr8' } -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4 } - - { id: 1, type: spill-slot, size: 8, alignment: 4 } - - { id: 2, type: spill-slot, size: 12, alignment: 4 } - - { id: 3, type: spill-slot, size: 16, alignment: 4 } - - { id: 4, type: spill-slot, size: 20, alignment: 4 } - - { id: 5, type: spill-slot, size: 32, alignment: 4 } - - { id: 6, type: spill-slot, size: 64, alignment: 4 } - - { id: 7, type: spill-slot, size: 128, alignment: 4 } -machineFunctionInfo: - explicitKernArgSize: 660 - maxKernArgAlign: 4 - isEntryFunction: true - waveLimiter: true - scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - argumentInfo: - flatScratchInit: { reg: '$sgpr0_sgpr1' } - dispatchPtr: { reg: '$sgpr2_sgpr3' } - privateSegmentBuffer: { reg: '$sgpr4_sgpr5_sgpr6_sgpr7' } - kernargSegmentPtr: { reg: '$sgpr8_sgpr9' } - workGroupIDX: { reg: '$sgpr10' } - privateSegmentWaveByteOffset: { reg: '$sgpr11' } -body: | - bb.0: - liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 - - renamable $sgpr12 = IMPLICIT_DEF - SI_SPILL_S32_SAVE killed $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12 = IMPLICIT_DEF - SI_SPILL_S32_SAVE $sgpr12, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13 = IMPLICIT_DEF - SI_SPILL_S64_SAVE killed $sgpr12_sgpr13, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13 = IMPLICIT_DEF - SI_SPILL_S64_SAVE $sgpr12_sgpr13, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14 = IMPLICIT_DEF - SI_SPILL_S96_SAVE killed $sgpr12_sgpr13_sgpr14, %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, 
implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14_sgpr15 = IMPLICIT_DEF - SI_SPILL_S128_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = IMPLICIT_DEF - SI_SPILL_S160_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16, %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF - SI_SPILL_S256_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF - SI_SPILL_S512_SAVE killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = IMPLICIT_DEF - SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.7, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - -# CHECK-LABEL: name: check_reload - -# FLATSCR: $sgpr33 = S_MOV_B32 0 -# FLATSCR: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc -# FLATSCR: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc - -# S32 -# CHECK: $sgpr12 = S_MOV_B32 $exec_lo -# CHECK: $exec_lo = S_MOV_B32 1 -# MUBUF: BUFFER_LOAD_DWORD_OFFSET 
${{(sgpr[0-9_]+)*}}, $sgpr33, 4 -# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4 -# CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 -# CHECK: $sgpr12 = V_READLANE - -# S64 -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 3 -# GCN64: $exec = S_MOV_B64 3 -# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 -# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 -# CHECK: $sgpr12 = V_READLANE -# CHECK: $sgpr13 = V_READLANE - -# S96 -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 7 -# GCN64: $exec = S_MOV_B64 7 -# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 -# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 -# CHECK: $sgpr12 = V_READLANE -# CHECK: $sgpr13 = V_READLANE -# CHECK: $sgpr14 = V_READLANE - -# S128 -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 15 -# GCN64: $exec = S_MOV_B64 15 -# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 -# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 28 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 -# CHECK: $sgpr12 = V_READLANE -# CHECK: $sgpr13 = V_READLANE -# CHECK: $sgpr14 = V_READLANE -# CHECK: $sgpr15 = V_READLANE - -# S160 -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 31 -# GCN64: $exec = S_MOV_B64 31 -# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 -# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 -# CHECK: $sgpr12 = V_READLANE -# CHECK: $sgpr13 = V_READLANE -# CHECK: $sgpr14 = 
V_READLANE -# CHECK: $sgpr15 = V_READLANE -# CHECK: $sgpr16 = V_READLANE - -# S256 -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 255 -# GCN64: $exec = S_MOV_B64 255 -# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 -# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 -# CHECK: $sgpr12 = V_READLANE -# CHECK: $sgpr13 = V_READLANE -# CHECK: $sgpr14 = V_READLANE -# CHECK: $sgpr15 = V_READLANE -# CHECK: $sgpr16 = V_READLANE -# CHECK: $sgpr17 = V_READLANE -# CHECK: $sgpr18 = V_READLANE -# CHECK: $sgpr19 = V_READLANE - -# S512 -# GCN32: $sgpr12 = S_MOV_B32 $exec_lo -# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 65535 -# GCN64: $exec = S_MOV_B64 65535 -# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 -# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 96 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 -# GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 -# CHECK: $sgpr12 = V_READLANE -# CHECK: $sgpr13 = V_READLANE -# CHECK: $sgpr14 = V_READLANE -# CHECK: $sgpr15 = V_READLANE -# CHECK: $sgpr16 = V_READLANE -# CHECK: $sgpr17 = V_READLANE -# CHECK: $sgpr18 = V_READLANE -# CHECK: $sgpr19 = V_READLANE -# CHECK: $sgpr20 = V_READLANE -# CHECK: $sgpr21 = V_READLANE -# CHECK: $sgpr22 = V_READLANE -# CHECK: $sgpr23 = V_READLANE -# CHECK: $sgpr24 = V_READLANE -# CHECK: $sgpr25 = V_READLANE -# CHECK: $sgpr26 = V_READLANE -# CHECK: $sgpr27 = V_READLANE - -# S1024 -# GCN32: $sgpr64 = S_MOV_B32 $exec_lo -# GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec -# GCN32: $exec_lo = S_MOV_B32 4294967295 -# GCN64: $exec = S_MOV_B64 4294967295 -# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 -# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160 -# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 -# GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 -# CHECK: $sgpr64 = V_READLANE -# CHECK: 
$sgpr65 = V_READLANE -# CHECK: $sgpr66 = V_READLANE -# CHECK: $sgpr67 = V_READLANE -# CHECK: $sgpr68 = V_READLANE -# CHECK: $sgpr69 = V_READLANE -# CHECK: $sgpr70 = V_READLANE -# CHECK: $sgpr71 = V_READLANE -# CHECK: $sgpr72 = V_READLANE -# CHECK: $sgpr73 = V_READLANE -# CHECK: $sgpr74 = V_READLANE -# CHECK: $sgpr75 = V_READLANE -# CHECK: $sgpr76 = V_READLANE -# CHECK: $sgpr77 = V_READLANE -# CHECK: $sgpr78 = V_READLANE -# CHECK: $sgpr79 = V_READLANE -# CHECK: $sgpr80 = V_READLANE -# CHECK: $sgpr81 = V_READLANE -# CHECK: $sgpr82 = V_READLANE -# CHECK: $sgpr83 = V_READLANE -# CHECK: $sgpr84 = V_READLANE -# CHECK: $sgpr85 = V_READLANE -# CHECK: $sgpr86 = V_READLANE -# CHECK: $sgpr87 = V_READLANE -# CHECK: $sgpr88 = V_READLANE -# CHECK: $sgpr89 = V_READLANE -# CHECK: $sgpr90 = V_READLANE -# CHECK: $sgpr91 = V_READLANE -# CHECK: $sgpr92 = V_READLANE -# CHECK: $sgpr93 = V_READLANE -# CHECK: $sgpr94 = V_READLANE -# CHECK: $sgpr95 = V_READLANE - ---- -name: check_reload -tracksRegLiveness: true -liveins: - - { reg: '$sgpr4_sgpr5' } - - { reg: '$sgpr6_sgpr7' } - - { reg: '$sgpr8' } -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4 } - - { id: 1, type: spill-slot, size: 8, alignment: 4 } - - { id: 2, type: spill-slot, size: 12, alignment: 4 } - - { id: 3, type: spill-slot, size: 16, alignment: 4 } - - { id: 4, type: spill-slot, size: 20, alignment: 4 } - - { id: 5, type: spill-slot, size: 32, alignment: 4 } - - { id: 6, type: spill-slot, size: 64, alignment: 4 } - - { id: 7, type: spill-slot, size: 128, alignment: 4 } -machineFunctionInfo: - explicitKernArgSize: 660 - maxKernArgAlign: 4 - isEntryFunction: true - waveLimiter: true - scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - argumentInfo: - flatScratchInit: { reg: '$sgpr0_sgpr1' } - dispatchPtr: { reg: '$sgpr2_sgpr3' } - privateSegmentBuffer: { reg: '$sgpr4_sgpr5_sgpr6_sgpr7' } - kernargSegmentPtr: { reg: '$sgpr8_sgpr9' 
} - workGroupIDX: { reg: '$sgpr10' } - privateSegmentWaveByteOffset: { reg: '$sgpr11' } -body: | - bb.0: - liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 - - renamable $sgpr12 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14 = SI_SPILL_S96_RESTORE %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14_sgpr15 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 = SI_SPILL_S160_RESTORE %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S256_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 - - renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.7, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -6,13 +6,9 @@ ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. 
-; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 -; SGPR-NEXT: s_mov_b64 exec, s[0:1] +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SGPR-NEXT: s_waitcnt vmcnt(0) -; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 -; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 -; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 -; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 +; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] ; SGPR-NEXT: s_nop 4 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -14,11 +14,8 @@ ; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], [[M0_LANE:[0-9]+]] -; TOVMEM: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0 -; TOVMEM: s_mov_b32 [[COPY_EXEC_LO:s[0-9]+]], exec_lo -; TOVMEM: s_mov_b32 exec_lo, 1 +; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] ; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Spill -; TOVMEM: s_mov_b32 exec_lo, [[COPY_EXEC_LO]] ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] @@ -28,7 +25,7 @@ ; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) -; TOVMEM: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]], 0 +; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]] ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir --- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir @@ -46,27 +46,19 @@ ; GFX9: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def 
$sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9: $vcc = IMPLICIT_DEF - ; GFX9: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc - ; GFX9: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc - ; GFX9: $vcc = S_MOV_B64 $exec - ; GFX9: $exec = S_MOV_B64 3 - ; GFX9: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) - ; GFX9: $exec = S_MOV_B64 $vcc - ; GFX9: $vcc_hi = V_READLANE_B32 $vgpr0, 1 - ; GFX9: $vcc_lo = V_READLANE_B32 killed $vgpr0, 0 + ; GFX9: $vgpr0 = V_MOV_B32_e32 $vcc_lo, implicit $exec, implicit $vcc + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX9: $vgpr0 = V_MOV_B32_e32 $vcc_hi, implicit $exec, implicit $vcc + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0 + 4, addrspace 5) ; GFX9: $vcc = IMPLICIT_DEF - ; GFX9: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc - ; GFX9: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc - ; GFX9: $vcc = S_MOV_B64 $exec - ; GFX9: $exec = S_MOV_B64 3 + ; GFX9: $vgpr0 = V_MOV_B32_e32 $vcc_lo, implicit $exec, implicit $vcc ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) - ; GFX9: $exec = S_MOV_B64 killed $vcc - ; GFX9: $vcc = S_MOV_B64 $exec - ; GFX9: $exec = S_MOV_B64 3 + ; GFX9: $vgpr0 = V_MOV_B32_e32 $vcc_hi, implicit $exec, implicit killed $vcc + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0 + 4, addrspace 5) ; GFX9: $vgpr0 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) - ; GFX9: $exec = S_MOV_B64 killed $vcc - ; GFX9: $vcc_lo = V_READLANE_B32 $vgpr0, 0, implicit-def $vcc - ; GFX9: $vcc_hi = V_READLANE_B32 killed $vgpr0, 1 + ; GFX9: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc + ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; GFX9: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc ; GFX10-LABEL: name: check_vcc ; GFX10: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr9 ; GFX10: $sgpr33 = S_MOV_B32 0 @@ -77,27 +69,19 @@ ; GFX10: $sgpr96 = S_ADD_U32 $sgpr96, $sgpr9, implicit-def $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99 ; GFX10: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99 ; GFX10: $vcc = IMPLICIT_DEF - ; GFX10: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc - ; GFX10: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc - ; GFX10: $vcc = S_MOV_B64 $exec - ; GFX10: $exec = S_MOV_B64 3 - ; GFX10: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) - ; GFX10: $exec = S_MOV_B64 $vcc - ; GFX10: $vcc_hi = V_READLANE_B32 $vgpr0, 1 - ; GFX10: $vcc_lo = V_READLANE_B32 killed $vgpr0, 0 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vcc_lo, implicit $exec, implicit $vcc + ; GFX10: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vcc_hi, implicit $exec, implicit $vcc + ; GFX10: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into 
%stack.0 + 4, addrspace 5) ; GFX10: $vcc = IMPLICIT_DEF - ; GFX10: $vgpr0 = V_WRITELANE_B32 $vcc_lo, 0, undef $vgpr0, implicit $vcc - ; GFX10: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc - ; GFX10: $vcc = S_MOV_B64 $exec - ; GFX10: $exec = S_MOV_B64 3 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vcc_lo, implicit $exec, implicit $vcc ; GFX10: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) - ; GFX10: $exec = S_MOV_B64 killed $vcc - ; GFX10: $vcc = S_MOV_B64 $exec - ; GFX10: $exec = S_MOV_B64 3 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vcc_hi, implicit $exec, implicit killed $vcc + ; GFX10: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0 + 4, addrspace 5) ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) - ; GFX10: $exec = S_MOV_B64 killed $vcc - ; GFX10: $vcc_lo = V_READLANE_B32 $vgpr0, 0, implicit-def $vcc - ; GFX10: $vcc_hi = V_READLANE_B32 killed $vgpr0, 1 + ; GFX10: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc + ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; GFX10: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc $vcc = IMPLICIT_DEF SI_SPILL_S64_SAVE $vcc, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32