diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -491,6 +491,9 @@
   Register SGPRForBPSaveRestoreCopy;
   Optional<int> BasePointerSaveIndex;
 
+  /// When spilling SGPRs, we may need a temporary stack slot to free a VGPR.
+  Optional<int> SpillSGPRTmpIndex;
+
   Register VGPRReservedForSGPRSpill;
 
   bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -106,6 +106,17 @@
   const TargetRegisterClass *getPointerRegClass(
     const MachineFunction &MF, unsigned Kind = 0) const override;
 
+  void buildWaveVGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
+                                   Register VGPR, RegScavenger *RS, bool IsLoad,
+                                   bool VGPRLive = false,
+                                   Register FreeSGPR = 0) const;
+
+  void buildVGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
+                               int Offset, unsigned EltSize, Register VGPR,
+                               RegScavenger *RS, bool IsLoad,
+                               bool UseKillFromMI = true,
+                               bool IsKill = true) const;
+
   void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
                                int Offset, unsigned EltSize, Register VGPR,
                                int64_t VGPRLanes, RegScavenger *RS,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1020,6 +1020,140 @@
   }
 }
 
+/// Save or restore all lanes of a VGPR to a stack slot without using any
+/// SGPRs.
+///
+/// We need to save all lanes if we overwrite some lanes when storing an SGPR
+/// into a VGPR with v_writelane. If we are currently trying to spill an SGPR,
+/// we do not have a free SGPR to save EXEC to, so we save all currently
+/// active lanes, then flip EXEC (EXEC = EXEC ^ -1), then save the rest of the
+/// lanes and flip EXEC again to restore its original value.
+void SIRegisterInfo::buildWaveVGPRSpillLoadStore(MachineBasicBlock::iterator MI,
+                                                 int Index, Register VGPR,
+                                                 RegScavenger *RS, bool IsLoad,
+                                                 bool VGPRLive,
+                                                 Register FreeSGPR) const {
+  unsigned EltSize = 4;
+  MachineBasicBlock *MBB = MI->getParent();
+  const DebugLoc &DL = MI->getDebugLoc();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // If we have a free SGPR, use it to save EXEC.
+  Register SavedExecReg = AMDGPU::NoRegister;
+  if (FreeSGPR) {
+    const TargetRegisterClass *RC = getPhysRegClass(FreeSGPR);
+
+    ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+    unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+    if (isWave32) {
+      SavedExecReg = NumSubRegs == 1
+                         ? FreeSGPR
+                         : Register(getSubReg(FreeSGPR, SplitParts[0]));
+    } else {
+      // If src/dst is an odd size it is possible subreg0 is not aligned.
+      for (unsigned ExecLane = 0; ExecLane < (NumSubRegs - 1); ++ExecLane) {
+        SavedExecReg =
+            getMatchingSuperReg(getSubReg(FreeSGPR, SplitParts[ExecLane]),
+                                AMDGPU::sub0, &AMDGPU::SReg_64_XEXECRegClass);
+        if (SavedExecReg)
+          break;
+      }
+    }
+  }
+
+  if (!IsLoad && !VGPRLive) {
+    // FIXME: LLVM may not know that the VGPR is live in other lanes; we need
+    // to mark it as live here, otherwise the MachineIR verifier complains.
+    // Only add this if the VGPR is currently not live.
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::INLINEASM))
+        .addExternalSymbol("")
+        .addImm(0) // flags
+        .addReg(VGPR, RegState::ImplicitDefine);
+  }
+
+  if (SavedExecReg) {
+    // Use SGPRs to save exec
+    Register ExecReg = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    unsigned ExecMovOpc = isWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned ExecOrOpc =
+        isWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+    // Save exec and activate all lanes
+    BuildMI(*MBB, MI, DL, TII->get(ExecOrOpc), SavedExecReg).addImm(-1);
+
+    // Save/restore VGPR
+    buildVGPRSpillLoadStore(MI, Index, 0, EltSize, VGPR, RS, IsLoad);
+
+    // Restore exec
+    // FIXME: This often creates unnecessary exec moves
+    BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addReg(SavedExecReg);
+  } else {
+    // We cannot set exec to -1 because we do not have a free SGPR, so save
+    // the currently active lanes, flip exec, save the rest of the lanes, and
+    // flip exec again.
+    Register ExecReg = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+    unsigned ExecNotOpc = isWave32 ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
+    for (unsigned I = 0; I < 2; I++) {
+      // Mark the second store as kill
+      buildVGPRSpillLoadStore(MI, Index, 0, EltSize, VGPR, RS, IsLoad, false,
+                              !IsLoad && I == 1);
+      // Flip exec
+      BuildMI(*MBB, MI, DL, TII->get(ExecNotOpc), ExecReg).addReg(ExecReg);
+    }
+  }
+
+  if (IsLoad && !VGPRLive) {
+    // FIXME: LLVM may not know that the VGPR is used in other lanes; we need
+    // to mark it as used here, otherwise it can be removed.
+    // Only add this if the VGPR was not live before.
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::INLINEASM))
+        .addExternalSymbol("")
+        .addImm(0) // flags
+        .addReg(VGPR, RegState::Implicit);
+  }
+}
+
+void SIRegisterInfo::buildVGPRSpillLoadStore(MachineBasicBlock::iterator MI,
+                                             int Index, int Offset,
+                                             unsigned EltSize, Register VGPR,
+                                             RegScavenger *RS, bool IsLoad,
+                                             bool UseKillFromMI,
+                                             bool IsKill) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+  if (UseKillFromMI)
+    IsKill = MI->getOperand(0).isKill();
+
+  // Load/store VGPR
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
+
+  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
+                          ? getBaseRegister()
+                          : getFrameRegister(*MF);
+
+  Align Alignment = FrameInfo.getObjectAlign(Index);
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, Index);
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
+      EltSize, Alignment);
+
+  if (IsLoad) {
+    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+    buildSpillLoadStore(MI, Opc, Index, VGPR, false, FrameReg, Offset * EltSize,
+                        MMO, RS);
+  } else {
+    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+    buildSpillLoadStore(MI, Opc, Index, VGPR, IsKill, FrameReg,
+                        Offset * EltSize, MMO, RS);
+    // This only ever adds one VGPR spill
+    MFI->addToSpilledVGPRs(1);
+  }
+}
+
 // Generate a VMEM access which loads or stores the VGPR containing an SGPR
 // spill such that all the lanes set in VGPRLanes are loaded or stored.
 // This generates exec mask manipulation and will use SGPRs available in MI
@@ -1031,8 +1165,6 @@ RegScavenger *RS, bool IsLoad) const {
   MachineBasicBlock *MBB = MI->getParent();
-  MachineFunction *MF = MBB->getParent();
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
   Register SuperReg = MI->getOperand(0).getReg();
@@ -1079,39 +1211,7 @@ // Setup EXEC
   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
 
-  // Load/store VGPR
-  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
-
-  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
-                          ? getBaseRegister()
-                          : getFrameRegister(*MF);
-
-  Align Alignment = FrameInfo.getObjectAlign(Index);
-  MachinePointerInfo PtrInfo =
-      MachinePointerInfo::getFixedStack(*MF, Index);
-  MachineMemOperand *MMO = MF->getMachineMemOperand(
-      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
-      EltSize, Alignment);
-
-  if (IsLoad) {
-    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
-                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
-    buildSpillLoadStore(MI, Opc,
-                        Index,
-                        VGPR, false,
-                        FrameReg,
-                        Offset * EltSize, MMO,
-                        RS);
-  } else {
-    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
-                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
-    buildSpillLoadStore(MI, Opc, Index, VGPR,
-                        IsKill, FrameReg,
-                        Offset * EltSize, MMO, RS);
-    // This only ever adds one VGPR spill
-    MFI->addToSpilledVGPRs(1);
-  }
+  buildVGPRSpillLoadStore(MI, Index, Offset, EltSize, VGPR, RS, IsLoad);
 
   // Restore EXEC
   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
       .addReg(
@@ -1203,9 +1303,30 @@ } else {
     // Scavenged temporary VGPR to use. It must be scavenged once for any number
     // of spilled subregs.
-    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+    // FIXME: The liveness analysis is limited and does not tell if a register
+    // is in use in lanes that are currently inactive. We can never be sure if
+    // a register is actually in use in another lane, so we need to save all
+    // lanes of the chosen VGPR. Pick v0 because it doesn't make a difference.
+    Register TmpVGPR = AMDGPU::VGPR0;
     RS->setRegUsed(TmpVGPR);
+    // Reserve temporary stack slot
+    if (!MFI->SpillSGPRTmpIndex.hasValue()) {
+      MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+      MFI->SpillSGPRTmpIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
+    }
+    unsigned TmpVGPRIndex = *MFI->SpillSGPRTmpIndex;
+
+    // Check if TmpVGPR is currently live according to LLVM liveness info
+    RegScavenger TmpRS;
+    TmpRS.enterBasicBlock(*MBB);
+    TmpRS.forward(MI);
+    bool TmpVGPRLive = TmpRS.isRegUsed(TmpVGPR);
+
+    // Save TmpVGPR
+    buildWaveVGPRSpillLoadStore(MI, TmpVGPRIndex, TmpVGPR, RS, false,
+                                TmpVGPRLive);
+
     // SubReg carries the "Kill" flag when SubReg == SuperReg.
     unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
@@ -1246,6 +1367,10 @@ buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR,
                               VGPRLanes, RS, false);
     }
+
+    // Restore temporary VGPR
+    buildWaveVGPRSpillLoadStore(MI, TmpVGPRIndex, TmpVGPR, RS, true,
+                                TmpVGPRLive, SuperReg);
   }
 
   MI->eraseFromParent();
@@ -1297,9 +1422,31 @@ MIB.addReg(SuperReg, RegState::ImplicitDefine);
     }
   } else {
-    Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+    // Scavenged temporary VGPR to use. It must be scavenged once for any number
+    // of spilled subregs.
+ // FIXME: The liveness analysis is limited and does not tell if a register + // is in use in lanes that are currently inactive. We can never be sure if + // a register as actually in use in another lane, so we need to save all + // lanes of the chosen VGPR. Pick v0 because it doesn't make a difference. + Register TmpVGPR = AMDGPU::VGPR0; RS->setRegUsed(TmpVGPR); + if (!MFI->SpillSGPRTmpIndex.hasValue()) { + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + MFI->SpillSGPRTmpIndex = FrameInfo.CreateSpillStackObject(4, Align(4)); + } + unsigned TmpVGPRIndex = MFI->SpillSGPRTmpIndex.getValue(); + + // Check if TmpVGPR is currently live according to LLVM liveness info + RegScavenger TmpRS; + TmpRS.enterBasicBlock(*MBB); + TmpRS.forward(MI); + bool TmpVGPRLive = TmpRS.isRegUsed(TmpVGPR); + + // Save temporary VGPR + buildWaveVGPRSpillLoadStore(MI, TmpVGPRIndex, TmpVGPR, RS, false, + TmpVGPRLive, SuperReg); + unsigned PerVGPR = 32; unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; @@ -1326,6 +1473,10 @@ MIB.addReg(SuperReg, RegState::ImplicitDefine); } } + + // Restore TmpVGPR + buildWaveVGPRSpillLoadStore(MI, TmpVGPRIndex, TmpVGPR, RS, true, + TmpVGPRLive); } MI->eraseFromParent(); diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -112,6 +112,9 @@ ; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] ; GCN: s_cmp_lg_u32 +; VMEM: buffer_store_dword +; VMEM: buffer_store_dword +; VMEM: buffer_store_dword ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN-NEXT: s_cbranch_scc1 [[LOOP]] diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -15,8 +15,8 @@ ; SPILL-TO-VGPR: v_readlane_b32 s4, v40, 0 ; SPILL-TO-VGPR: v_readlane_b32 s5, v40, 1 -; NO-SPILL-TO-VGPR: v_readlane_b32 s4, v1, 0 -; NO-SPILL-TO-VGPR: v_readlane_b32 s5, v1, 1 +; NO-SPILL-TO-VGPR: v_readlane_b32 s4, v0, 0 +; NO-SPILL-TO-VGPR: v_readlane_b32 s5, v0, 1 ; SPILL-TO-VGPR: v_readlane_b32 s33, v40, 2 ; NO-SPILL-TO-VGPR: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -761,12 +761,24 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: s_mov_b64 s[2:3], exec ; GCN-NEXT: s_mov_b64 exec, 3 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 
exec, s[2:3] +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 @@ -842,6 +854,11 @@ ; GCN-NEXT: v_readlane_b32 s17, v31, 61 ; GCN-NEXT: v_readlane_b32 s18, v31, 62 ; GCN-NEXT: v_readlane_b32 s19, v31, 63 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], exec ; GCN-NEXT: s_mov_b64 exec, 3 ; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload @@ -849,6 +866,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -887,5 +911,262 @@ ret void } +; Same as @no_vgprs_last_sgpr_spill, some SGPR spills must go to memory. +; Additionally, v0 is live throughout the function. +define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { +; GCN-LABEL: no_vgprs_last_sgpr_spill_live_v0: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s54, -1 +; GCN-NEXT: s_mov_b32 s55, 0xe8f000 +; GCN-NEXT: s_add_u32 s52, s52, s3 +; GCN-NEXT: s_addc_u32 s53, s53, 0 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v31, s4, 0 +; GCN-NEXT: v_writelane_b32 v31, s5, 1 +; GCN-NEXT: v_writelane_b32 v31, s6, 2 +; GCN-NEXT: v_writelane_b32 v31, s7, 3 +; GCN-NEXT: v_writelane_b32 v31, s8, 4 +; GCN-NEXT: v_writelane_b32 v31, s9, 5 +; GCN-NEXT: v_writelane_b32 v31, s10, 6 +; GCN-NEXT: v_writelane_b32 v31, s11, 7 +; GCN-NEXT: v_writelane_b32 v31, s12, 8 +; GCN-NEXT: v_writelane_b32 v31, s13, 9 +; GCN-NEXT: v_writelane_b32 v31, s14, 10 +; GCN-NEXT: v_writelane_b32 v31, s15, 11 +; GCN-NEXT: v_writelane_b32 v31, s16, 12 +; GCN-NEXT: v_writelane_b32 v31, s17, 13 +; GCN-NEXT: v_writelane_b32 v31, s18, 14 +; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v31, s4, 16 +; GCN-NEXT: v_writelane_b32 v31, s5, 17 +; GCN-NEXT: v_writelane_b32 v31, s6, 18 +; GCN-NEXT: v_writelane_b32 v31, s7, 19 +; GCN-NEXT: v_writelane_b32 v31, s8, 20 +; GCN-NEXT: v_writelane_b32 v31, s9, 21 +; GCN-NEXT: v_writelane_b32 v31, s10, 22 +; GCN-NEXT: v_writelane_b32 v31, s11, 23 +; GCN-NEXT: v_writelane_b32 v31, s12, 24 +; GCN-NEXT: v_writelane_b32 v31, s13, 25 +; GCN-NEXT: v_writelane_b32 v31, s14, 26 +; GCN-NEXT: v_writelane_b32 v31, s15, 27 +; GCN-NEXT: v_writelane_b32 v31, s16, 28 +; GCN-NEXT: 
v_writelane_b32 v31, s17, 29 +; GCN-NEXT: v_writelane_b32 v31, s18, 30 +; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v31, s4, 32 +; GCN-NEXT: v_writelane_b32 v31, s5, 33 +; GCN-NEXT: v_writelane_b32 v31, s6, 34 +; GCN-NEXT: v_writelane_b32 v31, s7, 35 +; GCN-NEXT: v_writelane_b32 v31, s8, 36 +; GCN-NEXT: v_writelane_b32 v31, s9, 37 +; GCN-NEXT: v_writelane_b32 v31, s10, 38 +; GCN-NEXT: v_writelane_b32 v31, s11, 39 +; GCN-NEXT: v_writelane_b32 v31, s12, 40 +; GCN-NEXT: v_writelane_b32 v31, s13, 41 +; GCN-NEXT: v_writelane_b32 v31, s14, 42 +; GCN-NEXT: v_writelane_b32 v31, s15, 43 +; GCN-NEXT: v_writelane_b32 v31, s16, 44 +; GCN-NEXT: v_writelane_b32 v31, s17, 45 +; GCN-NEXT: v_writelane_b32 v31, s18, 46 +; GCN-NEXT: v_writelane_b32 v31, s19, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v31, s4, 48 +; GCN-NEXT: v_writelane_b32 v31, s5, 49 +; GCN-NEXT: v_writelane_b32 v31, s6, 50 +; GCN-NEXT: v_writelane_b32 v31, s7, 51 +; GCN-NEXT: v_writelane_b32 v31, s8, 52 +; GCN-NEXT: v_writelane_b32 v31, s9, 53 +; GCN-NEXT: v_writelane_b32 v31, s10, 54 +; GCN-NEXT: v_writelane_b32 v31, s11, 55 +; GCN-NEXT: v_writelane_b32 v31, s12, 56 +; GCN-NEXT: v_writelane_b32 v31, s13, 57 +; GCN-NEXT: v_writelane_b32 v31, s14, 58 +; GCN-NEXT: v_writelane_b32 v31, s15, 59 +; GCN-NEXT: v_writelane_b32 v31, s16, 60 +; GCN-NEXT: v_writelane_b32 v31, s17, 61 +; GCN-NEXT: v_writelane_b32 v31, s18, 62 +; GCN-NEXT: v_writelane_b32 v31, s19, 63 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[2:3] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: v_writelane_b32 v0, s2, 0 +; GCN-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, s1 +; GCN-NEXT: s_cbranch_scc1 BB3_2 +; GCN-NEXT: ; %bb.1: ; %bb0 +; GCN-NEXT: v_readlane_b32 s36, v31, 32 +; GCN-NEXT: v_readlane_b32 s37, v31, 33 +; GCN-NEXT: v_readlane_b32 s38, v31, 34 +; GCN-NEXT: v_readlane_b32 s39, v31, 35 +; GCN-NEXT: v_readlane_b32 s40, v31, 36 +; GCN-NEXT: v_readlane_b32 s41, v31, 37 +; GCN-NEXT: v_readlane_b32 s42, v31, 38 +; GCN-NEXT: v_readlane_b32 s43, v31, 39 +; GCN-NEXT: v_readlane_b32 s44, v31, 40 +; GCN-NEXT: v_readlane_b32 s45, v31, 41 +; GCN-NEXT: v_readlane_b32 s46, v31, 42 +; GCN-NEXT: v_readlane_b32 s47, v31, 43 +; GCN-NEXT: v_readlane_b32 s48, v31, 44 +; GCN-NEXT: v_readlane_b32 s49, v31, 45 +; GCN-NEXT: v_readlane_b32 s50, v31, 46 +; GCN-NEXT: v_readlane_b32 s51, v31, 47 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: 
v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 +; GCN-NEXT: v_readlane_b32 s16, v31, 0 +; GCN-NEXT: v_readlane_b32 s17, v31, 1 +; GCN-NEXT: v_readlane_b32 s18, v31, 2 +; GCN-NEXT: v_readlane_b32 s19, v31, 3 +; GCN-NEXT: v_readlane_b32 s20, v31, 4 +; GCN-NEXT: v_readlane_b32 s21, v31, 5 +; GCN-NEXT: v_readlane_b32 s22, v31, 6 +; GCN-NEXT: v_readlane_b32 s23, v31, 7 +; GCN-NEXT: v_readlane_b32 s24, v31, 8 +; GCN-NEXT: v_readlane_b32 s25, v31, 9 +; GCN-NEXT: v_readlane_b32 s26, v31, 10 +; GCN-NEXT: v_readlane_b32 s27, v31, 11 +; GCN-NEXT: v_readlane_b32 s28, v31, 12 +; GCN-NEXT: v_readlane_b32 s29, v31, 13 +; GCN-NEXT: v_readlane_b32 s30, v31, 14 +; GCN-NEXT: v_readlane_b32 s31, v31, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def v0 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[16:31] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[0:15] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s4, v31, 48 +; GCN-NEXT: v_readlane_b32 s5, v31, 49 +; GCN-NEXT: v_readlane_b32 s6, v31, 50 +; GCN-NEXT: v_readlane_b32 s7, v31, 51 +; GCN-NEXT: v_readlane_b32 s8, v31, 52 +; GCN-NEXT: v_readlane_b32 s9, v31, 53 +; GCN-NEXT: v_readlane_b32 s10, v31, 54 +; GCN-NEXT: v_readlane_b32 s11, v31, 55 +; GCN-NEXT: v_readlane_b32 s12, v31, 56 +; GCN-NEXT: v_readlane_b32 s13, v31, 57 +; GCN-NEXT: v_readlane_b32 s14, v31, 58 +; GCN-NEXT: v_readlane_b32 s15, v31, 59 +; GCN-NEXT: v_readlane_b32 s16, v31, 60 +; GCN-NEXT: v_readlane_b32 s17, v31, 61 +; GCN-NEXT: v_readlane_b32 s18, v31, 62 +; GCN-NEXT: v_readlane_b32 s19, v31, 63 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload +; GCN-NEXT: s_not_b64 exec, exec +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[36:51] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[0:1] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use v0 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: BB3_2: ; %ret +; GCN-NEXT: s_endpgm + call void asm sideeffect "", "~{v[0:7]}" () #0 + call void asm sideeffect "", "~{v[8:15]}" () #0 + call void asm sideeffect "", "~{v[16:23]}" () #0 + call void asm sideeffect "", "~{v[24:27]}"() #0 + call void asm sideeffect "", "~{v[28:29]}"() #0 + call void asm sideeffect "", "~{v30}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () 
#0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + %vgpr0 = call i32 asm sideeffect "; def $0", "=v" () #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + call void asm sideeffect "; use $0", "v"(i32 %vgpr0) #0 + br label %ret + +ret: + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-waves-per-eu"="8,8" } diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -13,7 +13,11 @@ ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 -; SGPR-NEXT: s_nop 4 +; SGPR-NEXT: buffer_load_dword v0, off, s[{{[0-9]+:[0-9]+}}], 0 +; SGPR-NEXT: s_not_b64 exec, exec +; SGPR-NEXT: buffer_load_dword v0, off, s[96:99], 0 ; 4-byte Folded Reload +; SGPR-NEXT: s_not_b64 exec, exec +; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 ; ALL: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -7,6 +7,8 @@ ; GCN-LABEL: {{^}}spill_m0: +; TOVMEM: #ASMSTART +; TOVMEM: #ASMSTART ; GCN: #ASMSTART ; GCN-NEXT: s_mov_b32 m0, 0 ; GCN-NEXT: #ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -46,6 +46,7 @@ ; CHECK-LABEL: test_limited_sgpr ; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32 ; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6: NumSgprs: 48
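For reference, a sketch of the wave64 sequence this patch generates around one SGPR spill to memory when no SGPR is free to save EXEC, mirroring the check lines above; the scratch descriptor s[52:55], the offsets, and v0 as the temporary VGPR are simply the values these tests happen to use:

  ; save every lane of the temporary VGPR without touching an SGPR
  buffer_store_dword v0, off, s[52:55], 0        ; store the active lanes
  s_not_b64 exec, exec                           ; flip EXEC
  buffer_store_dword v0, off, s[52:55], 0        ; store the remaining lanes
  s_not_b64 exec, exec                           ; restore EXEC
  ; move the SGPR pair into lanes of v0 and store only those lanes
  v_writelane_b32 v0, s2, 0
  v_writelane_b32 v0, s3, 1
  s_mov_b64 s[2:3], exec                         ; s[2:3] is now saved in v0, so reuse it
  s_mov_b64 exec, 3                              ; enable only lanes 0-1
  buffer_store_dword v0, off, s[52:55], 0 offset:4
  s_mov_b64 exec, s[2:3]
  ; reload the temporary VGPR; the spilled SGPRs are dead at this point, so one
  ; pair can hold EXEC and the flip trick is not needed
  s_or_saveexec_b64 s[2:3], -1
  buffer_load_dword v0, off, s[52:55], 0
  s_mov_b64 exec, s[2:3]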