diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -839,7 +839,7 @@ BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) .addFrameIndex(FrameIdx); - if (ST.enableFlatScratch() ) { + if (ST.enableFlatScratch()) { BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) .addReg(OffsetReg, RegState::Kill) .addReg(FIReg); @@ -2210,6 +2210,20 @@ return; } + bool NeedSaveSCC = false; + MachineBasicBlock::iterator I(MI); + if (RS->isRegUsed(AMDGPU::SCC)) { + while (I != MBB->end()) { + if (*I != MI && I->readsRegister(AMDGPU::SCC)) { + NeedSaveSCC = true; + break; + } + if (*I != MI && I->definesRegister(AMDGPU::SCC)) + break; + I++; + } + } + Register TmpSReg = UseSGPR ? TmpReg : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, @@ -2227,22 +2241,42 @@ FIOp.setIsKill(false); } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) - .addReg(FrameReg) - .addImm(Offset); + I = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg) + .addReg(FrameReg) + .addImm(Offset); + MachineBasicBlock::iterator P = I; if (!UseSGPR) - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) - .addReg(TmpSReg, RegState::Kill); + I = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addReg(TmpSReg, RegState::Kill); if (TmpSReg == FrameReg) { // Undo frame register modification. - BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), - FrameReg) - .addReg(FrameReg) - .addImm(-Offset); + I = BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32), + FrameReg) + .addReg(FrameReg) + .addImm(-Offset); } + if (NeedSaveSCC) { + Register SCCCopy = + RS->scavengeRegister(getBoolRC(), std::prev(P), 0, !UseSGPR); + if (!SCCCopy) + report_fatal_error("Cannot scavenge register in FI elimination!"); + BuildMI(*MBB, P, DL, + TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), + SCCCopy) + .addImm(-1) + .addImm(0); + unsigned Opcode = + ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(*MBB, std::next(I), DL, TII->get(Opcode)) + .addReg(SCCCopy, getDefRegState(true)) + .addReg(SCCCopy) + .addReg(Exec); + } return; } @@ -2347,11 +2381,11 @@ // If there were truly no free SGPRs, we need to undo everything. if (!TmpScaledReg.isValid()) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) - .addReg(ScaledReg, RegState::Kill) - .addImm(-Offset); + .addReg(ScaledReg, RegState::Kill) + .addImm(-Offset); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) - .addReg(FrameReg) - .addImm(ST.getWavefrontSizeLog2()); + .addReg(FrameReg) + .addImm(ST.getWavefrontSizeLog2()); } } } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir @@ -1055,8 +1055,10 @@ ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr32, 8200, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vcc_hi, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_AND_B64 $sgpr4_sgpr5, $exec, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX9-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; GFX9-FLATSCR-NEXT: {{ $}} @@ -1072,8 +1074,10 @@ ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; GFX10-FLATSCR-NEXT: $sgpr4 = S_CSELECT_B32 -1, 0, implicit $scc ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADD_I32 $sgpr32, 8200, implicit-def $scc ; GFX10-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vcc_lo, implicit $exec + ; GFX10-FLATSCR-NEXT: $sgpr4 = S_AND_B32 $sgpr4, $exec_lo, implicit-def $scc ; GFX10-FLATSCR-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX10-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; GFX10-FLATSCR-NEXT: {{ $}} @@ -1156,8 +1160,10 @@ ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr32, 8200, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vcc_hi, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_AND_B64 $sgpr4_sgpr5, $exec, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX9-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) ; GFX9-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc @@ -1176,8 +1182,10 @@ ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; GFX10-FLATSCR-NEXT: $sgpr4 = S_CSELECT_B32 -1, 0, implicit $scc ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADD_I32 $sgpr32, 8200, implicit-def $scc ; GFX10-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vcc_lo, implicit $exec + ; GFX10-FLATSCR-NEXT: $sgpr4 = S_AND_B32 $sgpr4, $exec_lo, implicit-def $scc ; GFX10-FLATSCR-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX10-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) ; GFX10-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc @@ -1260,7 +1268,9 @@ ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr32, 8200, implicit-def $scc + ; GFX9-FLATSCR-NEXT: $sgpr4_sgpr5 = S_AND_B64 $sgpr4_sgpr5, $exec, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vcc_hi, implicit $exec ; GFX9-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; GFX9-FLATSCR-NEXT: {{ $}} @@ -1278,7 +1288,9 @@ ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc + ; GFX10-FLATSCR-NEXT: $sgpr4 = S_CSELECT_B32 -1, 0, implicit $scc ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADD_I32 $sgpr32, 8200, implicit-def $scc + ; GFX10-FLATSCR-NEXT: $sgpr4 = S_AND_B32 $sgpr4, $exec_lo, implicit-def $scc ; GFX10-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vcc_lo, implicit $exec ; GFX10-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; GFX10-FLATSCR-NEXT: {{ $}}