Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -963,6 +963,12 @@
                                     const DebugLoc &DL,
                                     unsigned DestReg) const;
 
+  MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    const DebugLoc &DL,
+                                    Register DestReg,
+                                    RegScavenger &RS) const;
+
   static bool isKillTerminator(unsigned Opcode);
   const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
 
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6081,6 +6081,23 @@
     .addReg(UnusedCarry, RegState::Define | RegState::Dead);
 }
 
+MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator I,
+                                               const DebugLoc &DL,
+                                               Register DestReg,
+                                               RegScavenger &RS) const {
+  if (ST.hasAddNoCarry())
+    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
+
+  Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
+  // TODO: Users need to deal with this.
+  if (!UnusedCarry.isValid())
+    report_fatal_error("failed to scavenge unused carry-out SGPR");
+
+  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+    .addReg(UnusedCarry, RegState::Define | RegState::Dead);
+}
+
 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
   switch (Opcode) {
   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
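For context when reading the SIRegisterInfo.cpp changes that follow, this is roughly how the new overload is used. The snippet is an illustrative sketch, not part of the patch; the names (TII, MBB, MI, DL, ResultReg, ScaledReg, Offset, RS) are assumed to be in scope as they are in SIRegisterInfo::eliminateFrameIndex:

  // Illustrative only. On subtargets with a carry-less VALU add
  // (ST.hasAddNoCarry()) the helper emits V_ADD_U32_e64 directly; otherwise
  // it scavenges the unused carry-out register itself instead of creating a
  // frame virtual register, which PEI can no longer usefully spill.
  TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
      .addImm(Offset)
      .addReg(ScaledReg, RegState::Kill)
      .addImm(0); // clamp bit
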
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -301,32 +301,17 @@
 
 bool SIRegisterInfo::requiresFrameIndexScavenging(
   const MachineFunction &MF) const {
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  if (MFI.hasStackObjects())
-    return true;
-
-  // May need to deal with callee saved registers.
-  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  return !Info->isEntryFunction();
+  // Do not use frame virtual registers. They used to be used for SGPRs, but
+  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
+  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
+  // spill.
+  return false;
 }
 
 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
   const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  if (!MFI.hasStackObjects())
-    return false;
-
-  // The scavenger is used for large frames which may require finding a free
-  // register for large offsets.
-  if (!isUInt<12>(MFI.getStackSize()))
-    return true;
-
-  // If using scalar stores, for spills, m0 is needed for the scalar store
-  // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
-  // register for it during frame index elimination, so the scavenger is
-  // directly needed.
-  return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
-    MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
+  return MFI.hasStackObjects();
 }
 
 bool SIRegisterInfo::requiresVirtualBaseRegisters(
@@ -803,7 +788,6 @@
   if (OnlyToVGPR && !SpillToVGPR)
     return false;
 
-  MachineRegisterInfo &MRI = MF->getRegInfo();
   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
@@ -830,7 +814,7 @@
 
   if (SpillToSMEM) {
     if (RS->isRegUsed(AMDGPU::M0)) {
-      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      M0CopyReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
         .addReg(AMDGPU::M0);
     }
 
@@ -849,6 +833,10 @@
   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
 
+  // Scavenged temporary VGPR to use. It only needs to be scavenged once for
+  // any number of spilled subregs.
+  Register TmpVGPR;
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -925,14 +913,14 @@
 
       // Spill SGPR to a frame index.
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
-      Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      if (!TmpVGPR.isValid())
+        TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
 
       // TODO: Should VI try to spill to VGPR and then spill to SMEM?
       MachineInstrBuilder Mov
-        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
         .addReg(SubReg, SubKillState);
 
-
       // There could be undef components of a spilled super register.
       // TODO: Can we detect this and skip the spill?
       if (NumSubRegs > 1) {
@@ -950,7 +938,7 @@
         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                    EltSize, MinAlign(Align, EltSize * i));
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
-        .addReg(TmpReg, RegState::Kill)         // src
+        .addReg(TmpVGPR, RegState::Kill)        // src
         .addFrameIndex(Index)                   // vaddr
         .addReg(MFI->getScratchRSrcReg())       // srrsrc
         .addReg(MFI->getStackPtrOffsetReg())    // soffset
@@ -974,7 +962,6 @@
                                  RegScavenger *RS,
                                  bool OnlyToVGPR) const {
   MachineFunction *MF = MI->getParent()->getParent();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineBasicBlock *MBB = MI->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 
@@ -1001,7 +988,7 @@
 
   if (SpillToSMEM) {
     if (RS->isRegUsed(AMDGPU::M0)) {
-      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      M0CopyReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
         .addReg(AMDGPU::M0);
     }
 
@@ -1026,6 +1013,8 @@
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   int64_t FrOffset = FrameInfo.getObjectOffset(Index);
 
+  Register TmpVGPR;
+
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     Register SubReg =
       NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
@@ -1080,7 +1069,8 @@
 
       // Restore SGPR from a stack slot.
       // FIXME: We should use S_LOAD_DWORD here for VI.
-      Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      if (!TmpVGPR.isValid())
+        TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
 
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
@@ -1090,7 +1080,7 @@
                                    MachineMemOperand::MOLoad, EltSize,
                                    MinAlign(Align, EltSize * i));
 
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
         .addFrameIndex(Index)                   // vaddr
         .addReg(MFI->getScratchRSrcReg())       // srsrc
         .addReg(MFI->getStackPtrOffsetReg())    // soffset
@@ -1099,7 +1089,7 @@
 
       auto MIB =
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
-        .addReg(TmpReg, RegState::Kill);
+        .addReg(TmpVGPR, RegState::Kill);
 
       if (NumSubRegs > 1)
         MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
@@ -1150,7 +1140,6 @@
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
   MachineFunction *MF = MI->getParent()->getParent();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineBasicBlock *MBB = MI->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
@@ -1264,14 +1253,18 @@
 
       // In an entry function/kernel the offset is already the absolute
       // address relative to the frame register.
 
-      Register DiffReg =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      Register TmpDiffReg =
+        RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+
+      // If there's no free SGPR, in-place modify the FP.
+      Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
 
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
       Register ResultReg = IsCopy ?
         MI->getOperand(0).getReg() :
-        MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
 
+      // This may modify the FP in place if no SGPR was scavenged above.
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
         .addReg(FrameReg)
         .addReg(MFI->getScratchWaveOffsetReg());
@@ -1284,7 +1277,7 @@
           .addReg(DiffReg);
       } else {
         Register ScaledReg =
-          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
 
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
           .addImm(Log2_32(ST.getWavefrontSize()))
           .addReg(DiffReg, RegState::Kill);
@@ -1292,23 +1285,32 @@
 
         // TODO: Fold if use instruction is another add of a constant.
         if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
-          TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
+
+          // FIXME: This can fail
+          TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
             .addImm(Offset)
             .addReg(ScaledReg, RegState::Kill)
             .addImm(0); // clamp bit
         } else {
           Register ConstOffsetReg =
-            MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+            RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
 
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
             .addImm(Offset);
 
-          TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
+          TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
             .addReg(ConstOffsetReg, RegState::Kill)
             .addReg(ScaledReg, RegState::Kill)
             .addImm(0); // clamp bit
         }
       }
 
+      if (!TmpDiffReg.isValid()) {
+        // Restore the FP.
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
+          .addReg(FrameReg)
+          .addReg(MFI->getScratchWaveOffsetReg());
+      }
+
       // Don't introduce an extra copy if we're just materializing in a mov.
       if (IsCopy)
         MI->eraseFromParent();
@@ -1346,7 +1348,7 @@
       int64_t Offset = FrameInfo.getObjectOffset(Index);
       FIOp.ChangeToImmediate(Offset);
       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
-        Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
           .addImm(Offset);
         FIOp.ChangeToRegister(TmpReg, false, false, true);
Index: test/CodeGen/AMDGPU/frame-index-elimination.ll
===================================================================
--- test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -7,7 +7,7 @@
 ; Materialize into a mov. Make sure there isn't an unnecessary copy.
 ; GCN-LABEL: {{^}}func_mov_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
+; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 ; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]]
 
@@ -24,22 +24,20 @@
 
 ; GCN-LABEL: {{^}}func_mov_fi_i32_offset:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 
-; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
-
-; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
+; CI: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; CI-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB1]], 6
+; CI-NEXT: v_lshr_b32_e64 v0, [[SUB0]], 6
 ; CI-NEXT: v_add_i32_e64 v1, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]]
 ; CI-NOT: v_mov
 ; CI: ds_write_b32 v0, v0
 ; CI-NEXT: ds_write_b32 v0, v1
 
-; GFX9: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]]
+; GFX9: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; GFX9-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB0]]
+; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB1]]
 ; GFX9-DAG: ds_write_b32 v0, v0
-
-; GFX9-DAG: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
 ; GFX9-NEXT: ds_write_b32 v0, v0
 define void @func_mov_fi_i32_offset() #0 {
@@ -55,7 +53,7 @@
 
 ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
+; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
 ; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]]
 
@@ -77,7 +75,7 @@
 ; into.
 
 ; GCN-LABEL: {{^}}func_other_fi_user_i32:
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
+; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 
 ; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
 
@@ -112,7 +110,7 @@
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
 ; GCN: s_waitcnt
-; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
+; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 
 ; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
 ; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
 
@@ -177,11 +175,11 @@
 
 ; Added offset can't be used with VOP3 add
 ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
-; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x200
+; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
+; GCN-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
 
 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
-; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[SCALED]]
+; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]]
 
 ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[K]], [[SCALED]]
 
@@ -258,7 +256,7 @@
 
 ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
 ; GCN: s_and_saveexec_b64
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
-; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
+; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
 
 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
 ; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
 
Index: test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
+
+# Frame virtual SGPRs should not be used, as the register scavenger cannot usefully spill them anymore.
+# Spilling is also worse than increment and restore of a frame register. There should be no spills remaining.
+
+---
+name: scavenge_register_position
+tracksRegLiveness: true
+
+stack:
+  - { id: 0, type: default, offset: 4096, size: 4, alignment: 8192 }
+
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  scratchWaveOffsetReg: $sgpr34
+  frameOffsetReg: $sgpr33
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr1
+
+    ; CHECK-LABEL: name: scavenge_register_position
+    ; CHECK: liveins: $vgpr1
+    ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
+    ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
+    ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
+    ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1048576, implicit-def $scc
+    ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
+    ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
+    ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
+    ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
+    ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1048576, implicit-def $scc
+    ; CHECK: $sgpr33 = frame-setup COPY $sgpr27
+    ; CHECK: S_ENDPGM 0, implicit $vcc
+    S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
+    $vgpr0 = V_OR_B32_e32 %stack.0, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    S_ENDPGM 0, implicit $vcc
+...
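The s_sub_u32/s_add_u32 pair on $sgpr33 in the checks above is the fallback added in eliminateFrameIndex: with every SGPR clobbered, no temporary can be scavenged, so the frame pointer itself is adjusted and then restored. Condensed from the hunks above for reference (illustrative only, not additional patch content):

  // Try to scavenge an SGPR for FrameReg - ScratchWaveOffsetReg; the final
  // 'false' forbids the scavenger from spilling, so this can come back
  // invalid.
  Register TmpDiffReg =
      RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);

  // With no free SGPR, reuse the frame pointer as the destination, i.e.
  // modify it in place...
  Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
      .addReg(FrameReg)
      .addReg(MFI->getScratchWaveOffsetReg());

  // ... (uses of DiffReg elided) ...

  // ...and undo the in-place modification afterwards so the FP is unchanged.
  if (!TmpDiffReg.isValid()) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
        .addReg(FrameReg)
        .addReg(MFI->getScratchWaveOffsetReg());
  }
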
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-m0.ll
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -120,10 +120,10 @@
 
 ; GCN: ; clobber m0
 
-; TOSMEM: s_mov_b32 s2, m0
+; TOSMEM: s_mov_b32 vcc_hi, m0
 ; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
-; TOSMEM: s_mov_b32 m0, s2
+; TOSMEM: s_mov_b32 m0, vcc_hi
 
 ; TOSMEM: s_mov_b64 exec,
 ; TOSMEM: s_cbranch_execz
@@ -171,10 +171,10 @@
 
 ; TOSMEM: s_mov_b32 m0, -1
 
-; TOSMEM: s_mov_b32 s0, m0
+; TOSMEM: s_mov_b32 vcc_hi, m0
 ; TOSMEM: s_add_u32 m0, s3, 0x200
 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
-; TOSMEM: s_mov_b32 m0, s0
+; TOSMEM: s_mov_b32 m0, vcc_hi
 ; TOSMEM: s_waitcnt lgkmcnt(0)
 
 ; TOSMEM: ds_write_b64
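The spill-m0.ll updates fall out of the same change: the temporary that preserves m0 around an SMEM spill is now scavenged rather than created as a virtual register, and with the other SGPRs tied up in these tests the scavenger hands back vcc_hi. Condensed from the spillSGPR/restoreSGPR hunks above (illustrative only):

  // m0 is unallocatable, so the copy that preserves it must be a real
  // physical SGPR. Scavenging with spilling disallowed ('false') is why an
  // otherwise unused register such as vcc_hi can be chosen here.
  Register M0CopyReg = AMDGPU::NoRegister;
  if (RS->isRegUsed(AMDGPU::M0)) {
    M0CopyReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
  }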