diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -90,6 +90,12 @@
     return 100;
   }
 
+  // Maximum number of instructions to scan for an SCC use/def when deciding
+  // whether the code inserted by frame lowering has to preserve SCC.
+  unsigned getSCCLivenessScanMaxLength() const {
+    return 100;
+  }
+
   const TargetRegisterClass *
   getLargestLegalSuperClass(const TargetRegisterClass *RC,
                             const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -839,7 +839,7 @@
   BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
     .addFrameIndex(FrameIdx);
 
-  if (ST.enableFlatScratch() ) {
+  if (ST.enableFlatScratch()) {
     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
         .addReg(OffsetReg, RegState::Kill)
         .addReg(FIReg);
@@ -2209,6 +2209,28 @@
 
           return;
         }
+
+        // Decide whether the offset computation below has to preserve SCC.
+        // Scan forward from MI: a reader of SCC seen before any redefinition
+        // means SCC is live; reaching the scan limit is treated the same way.
+        // NOTE(review): running off the end of the block leaves NeedSaveSCC
+        // false, which presumes SCC is never live-out of MBB -- confirm.
+        bool NeedSaveSCC = false;
+
+        MachineBasicBlock::iterator I(MI);
+        if (RS->isRegUsed(AMDGPU::SCC)) {
+          unsigned Depth = 0;
+          while (I != MBB->end()) {
+            if ((I != MI && I->readsRegister(AMDGPU::SCC)) ||
+                (++Depth == getSCCLivenessScanMaxLength())) {
+              NeedSaveSCC = true;
+              break;
+            }
+            if (I != MI && I->definesRegister(AMDGPU::SCC))
+              break;
+            ++I;
+          }
+        }
 
         Register TmpSReg =
             UseSGPR ? TmpReg
@@ -2227,22 +2249,52 @@
           FIOp.setIsKill(false);
         }
 
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
-          .addReg(FrameReg)
-          .addImm(Offset);
+        if (NeedSaveSCC) {
+          // SCC is live across this point. S_ADDC_U32 adds the incoming SCC
+          // into the sum, S_BITCMP1_B32 reads bit 0 back into SCC and
+          // S_BITSET0_B32 then clears bit 0 again. This presumes that
+          // FrameReg + Offset has bit 0 clear.
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
+              .addReg(FrameReg)
+              .addImm(Offset);
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
+              .addReg(TmpSReg)
+              .addImm(0);
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
+              .addImm(0)
+              .addReg(TmpSReg);
+        } else {
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
+              .addReg(FrameReg)
+              .addImm(Offset);
+        }
 
         if (!UseSGPR)
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
-            .addReg(TmpSReg, RegState::Kill);
+              .addReg(TmpSReg, RegState::Kill);
 
         if (TmpSReg == FrameReg) {
           // Undo frame register modification.
-          BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
-                  FrameReg)
-            .addReg(FrameReg)
-            .addImm(-Offset);
+          if (NeedSaveSCC) {
+            // Same SCC-preserving sequence as above, but subtracting Offset
+            // from FrameReg and placed after MI in program order.
+            MachineBasicBlock::iterator UndoPt = std::next(MI);
+            BuildMI(*MBB, UndoPt, DL, TII->get(AMDGPU::S_ADDC_U32), FrameReg)
+                .addReg(FrameReg)
+                .addImm(-Offset);
+            BuildMI(*MBB, UndoPt, DL, TII->get(AMDGPU::S_BITCMP1_B32))
+                .addReg(FrameReg)
+                .addImm(0);
+            BuildMI(*MBB, UndoPt, DL, TII->get(AMDGPU::S_BITSET0_B32), FrameReg)
+                .addImm(0)
+                .addReg(FrameReg);
+          } else {
+            BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
+                    FrameReg)
+                .addReg(FrameReg)
+                .addImm(-Offset);
+          }
         }
-
         return;
       }
 
@@ -2347,11 +2399,11 @@
           // If there were truly no free SGPRs, we need to undo everything.
           if (!TmpScaledReg.isValid()) {
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-              .addReg(ScaledReg, RegState::Kill)
-              .addImm(-Offset);
+                .addReg(ScaledReg, RegState::Kill)
+                .addImm(-Offset);
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
-              .addReg(FrameReg)
-              .addImm(ST.getWavefrontSizeLog2());
+                .addReg(FrameReg)
+                .addImm(ST.getWavefrontSizeLog2());
           }
         }
       }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir
@@ -1055,7 +1055,9 @@
   ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; GFX9-FLATSCR-NEXT: {{ $}}
   ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
-  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr32, 8200, implicit-def $scc
+  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc
+  ; GFX9-FLATSCR-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
   ; GFX9-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vcc_hi, implicit $exec
   ; GFX9-FLATSCR-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
   ; GFX9-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
@@ -1072,7 +1074,9 @@
   ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; GFX10-FLATSCR-NEXT: {{ $}}
   ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
-  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADD_I32 $sgpr32, 8200, implicit-def $scc
+  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc
+  ; GFX10-FLATSCR-NEXT: S_BITCMP1_B32 $vcc_lo, 0, implicit-def $scc
+  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_BITSET0_B32 0, $vcc_lo
   ; GFX10-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vcc_lo, implicit $exec
   ; GFX10-FLATSCR-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
   ; GFX10-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
@@ -1156,7 +1160,9 @@
   ; GFX9-FLATSCR-NEXT: {{ $}}
   ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
   ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
-  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr32, 8200, implicit-def $scc
+  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc
+  ; GFX9-FLATSCR-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
   ; GFX9-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vcc_hi, implicit $exec
   ; GFX9-FLATSCR-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
   ; GFX9-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
@@ -1176,7 +1182,9 @@
   ; GFX10-FLATSCR-NEXT: {{ $}}
   ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
   ; GFX10-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
-  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADD_I32 $sgpr32, 8200, implicit-def $scc
+  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc
+  ; GFX10-FLATSCR-NEXT: S_BITCMP1_B32 $vcc_lo, 0, implicit-def $scc
+  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_BITSET0_B32 0, $vcc_lo
   ; GFX10-FLATSCR-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vcc_lo, implicit $exec
   ; GFX10-FLATSCR-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
   ; GFX10-FLATSCR-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
@@ -1260,7 +1268,9 @@
   ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; GFX9-FLATSCR-NEXT: {{ $}}
   ; GFX9-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
-  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADD_I32 $sgpr32, 8200, implicit-def $scc
+  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc
+  ; GFX9-FLATSCR-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+  ; GFX9-FLATSCR-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
   ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vcc_hi, implicit $exec
   ; GFX9-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
   ; GFX9-FLATSCR-NEXT: {{ $}}
@@ -1278,7 +1288,9 @@
   ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; GFX10-FLATSCR-NEXT: {{ $}}
   ; GFX10-FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
-  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADD_I32 $sgpr32, 8200, implicit-def $scc
+  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_ADDC_U32 $sgpr32, 8200, implicit-def $scc, implicit $scc
+  ; GFX10-FLATSCR-NEXT: S_BITCMP1_B32 $vcc_lo, 0, implicit-def $scc
+  ; GFX10-FLATSCR-NEXT: $vcc_lo = S_BITSET0_B32 0, $vcc_lo
   ; GFX10-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vcc_lo, implicit $exec
   ; GFX10-FLATSCR-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
   ; GFX10-FLATSCR-NEXT: {{ $}}