diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2232,75 +2232,96 @@
 
     if (!IsMUBUF && !MFI->isEntryFunction()) {
       // Convert to a swizzled stack address by scaling by the wave size.
-      //
       // In an entry function/kernel the offset is already swizzled.
-
+      bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
+      bool LiveSCC = RS->isRegUsed(AMDGPU::SCC);
+      const TargetRegisterClass *RC = IsSALU && !LiveSCC
+                                          ? &AMDGPU::SReg_32RegClass
+                                          : &AMDGPU::VGPR_32RegClass;
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
-      Register ResultReg =
-          IsCopy ? MI->getOperand(0).getReg()
-                 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+      Register ResultReg = IsCopy ? MI->getOperand(0).getReg()
+                                  : RS->scavengeRegister(RC, MI, 0);
 
       int64_t Offset = FrameInfo.getObjectOffset(Index);
       if (Offset == 0) {
+        unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
+                                             : AMDGPU::V_LSHRREV_B32_e64;
         // XXX - This never happens because of emergency scavenging slot at 0?
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
-          .addImm(ST.getWavefrontSizeLog2())
-          .addReg(FrameReg);
+        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg)
+                         .addImm(ST.getWavefrontSizeLog2())
+                         .addReg(FrameReg);
+        if (IsSALU && !LiveSCC)
+          Shift.getInstr()->getOperand(3).setIsDead(true); // Mark SCC as dead.
+        if (IsSALU && LiveSCC) {
+          Register NewDest =
+              RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0);
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                  NewDest)
+              .addReg(ResultReg);
+          ResultReg = NewDest;
+        }
       } else {
-        if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
-          // Reuse ResultReg in intermediate step.
-          Register ScaledReg = ResultReg;
-
-          BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
-                  ScaledReg)
-            .addImm(ST.getWavefrontSizeLog2())
-            .addReg(FrameReg);
-
-          const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
-
-          // TODO: Fold if use instruction is another add of a constant.
-          if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
-            // FIXME: This can fail
-            MIB.addImm(Offset);
-            MIB.addReg(ScaledReg, RegState::Kill);
-            if (!IsVOP2)
+        MachineInstrBuilder MIB;
+        if (!IsSALU) {
+          if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS))) {
+            // Reuse ResultReg in intermediate step.
+            Register ScaledReg = ResultReg;
+
+            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                    ScaledReg)
+                .addImm(ST.getWavefrontSizeLog2())
+                .addReg(FrameReg);
+
+            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
+
+            // TODO: Fold if use instruction is another add of a constant.
+            if (IsVOP2 || AMDGPU::isInlinableLiteral32(
+                              Offset, ST.hasInv2PiInlineImm())) {
+              // FIXME: This can fail
+              MIB.addImm(Offset);
+              MIB.addReg(ScaledReg, RegState::Kill);
+              if (!IsVOP2)
+                MIB.addImm(0); // clamp bit
+            } else {
+              assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
+                     "Need to reuse carry out register");
+
+              // Use scavenged unused carry out as offset register.
+              Register ConstOffsetReg;
+              if (!isWave32)
+                ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
+              else
+                ConstOffsetReg = MIB.getReg(1);
+
+              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
+                      ConstOffsetReg)
+                  .addImm(Offset);
+              MIB.addReg(ConstOffsetReg, RegState::Kill);
+              MIB.addReg(ScaledReg, RegState::Kill);
               MIB.addImm(0); // clamp bit
-          } else {
-            assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
-                   "Need to reuse carry out register");
-
-            // Use scavenged unused carry out as offset register.
-            Register ConstOffsetReg;
-            if (!isWave32)
-              ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
-            else
-              ConstOffsetReg = MIB.getReg(1);
-
-            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
-              .addImm(Offset);
-            MIB.addReg(ConstOffsetReg, RegState::Kill);
-            MIB.addReg(ScaledReg, RegState::Kill);
-            MIB.addImm(0); // clamp bit
+            }
           }
-        } else {
+        }
+        if (!MIB || IsSALU) {
           // We have to produce a carry out, and there isn't a free SGPR pair
           // for it. We can keep the whole computation on the SALU to avoid
           // clobbering an additional register at the cost of an extra mov.
 
           // We may have 1 free scratch SGPR even though a carry out is
           // unavailable. Only one additional mov is needed.
-          Register TmpScaledReg =
-              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
-          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpScaledReg = RS->scavengeRegister(
+              &AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+          Register ScaledReg =
+              TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
 
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-            .addReg(FrameReg)
-            .addImm(ST.getWavefrontSizeLog2());
+              .addReg(FrameReg)
+              .addImm(ST.getWavefrontSizeLog2());
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
               .addReg(ScaledReg, RegState::Kill)
               .addImm(Offset);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-            .addReg(ScaledReg, RegState::Kill);
+              .addReg(ScaledReg, RegState::Kill);
 
           // If there were truly no free SGPRs, we need to undo everything.
           if (!TmpScaledReg.isValid()) {
@@ -2308,8 +2329,8 @@
                 .addReg(ScaledReg, RegState::Kill)
                 .addImm(-Offset);
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
-              .addReg(FrameReg)
-              .addImm(ST.getWavefrontSizeLog2());
+                .addReg(FrameReg)
+                .addImm(ST.getWavefrontSizeLog2());
           }
         }
       }
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -0,0 +1,109 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: func_add_constant_to_fi_divergent_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 8, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+
+body: |
+  bb.0:
+    liveins: $vgpr31, $sgpr30_sgpr31
+
+    ; GCN-LABEL: name: func_add_constant_to_fi_divergent_i32
+    ; GCN: liveins: $vgpr31, $sgpr30_sgpr31
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: renamable $vgpr0 = V_AND_B32_e32 1023, killed $vgpr31, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
+    ; GCN-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr1, killed $vgpr0, implicit-def dead $vcc, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0, dead renamable $vcc = nuw V_ADD_CO_U32_e64 4, killed $vgpr0, 0, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 -1
+    ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+    renamable $vgpr0 = V_AND_B32_e32 1023, killed $vgpr31, implicit $exec
+    renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
+    renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
+    renamable $vgpr0, dead renamable $vcc = nuw V_ADD_CO_U32_e64 4, killed $vgpr0, 0, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
+---
+name: func_add_constant_to_fi_uniform_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 8, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+    liveins: $sgpr30_sgpr31
+
+    ; GCN-LABEL: name: func_add_constant_to_fi_uniform_i32
+    ; GCN: liveins: $sgpr30_sgpr31
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vcc_hi = S_LSHR_B32 6, $sgpr32, implicit-def dead $scc
+    ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_I32 killed $vcc_hi, 4, implicit-def dead $scc
+    ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 -1
+    ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+    renamable $sgpr4 = nuw S_ADD_I32 %stack.0, 4, implicit-def dead $scc
+    renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
+---
+name: func_other_fi_user_non_inline_imm_offset_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 512, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, type: default, offset: 0, size: 32, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 512, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+    liveins: $sgpr30_sgpr31
+
+    ; GCN-LABEL: name: func_other_fi_user_non_inline_imm_offset_i32
+    ; GCN: liveins: $sgpr30_sgpr31
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 260, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: $vcc_hi = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GCN-NEXT: $vcc_hi = S_ADD_I32 killed $vcc_hi, 512, implicit-def $scc
+    ; GCN-NEXT: $vcc_hi = COPY killed $vcc_hi
+    ; GCN-NEXT: renamable $sgpr4 = S_MUL_I32 killed $vcc_hi, 9
+    ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 -1
+    ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+    renamable $vgpr0 = V_MOV_B32_e32 7, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, implicit $exec
+    renamable $sgpr4 = S_MUL_I32 %stack.1, 9
+    renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
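
Editor's note, not part of the patch: the SIRegisterInfo.cpp change above stops uniform frame-index materialization from bouncing through a VGPR. In the Offset == 0 path, when the frame-index operand must land in an SGPR (IsSALU) and SCC is free, the scaling shift stays on the SALU with the SCC def marked dead; if SCC is live, the shift runs on the VALU and the result is copied back with V_READFIRSTLANE_B32. The standalone C++ sketch below (plain C++, no LLVM headers; the Expansion enum and pickExpansion are illustrative names, not from the patch, though IsSALU and LiveSCC are) mirrors only that decision logic.

#include <cstdio>
#include <initializer_list>

enum class Expansion {
  SALU,         // S_LSHR_B32, with its SCC def marked dead
  VALU,         // V_LSHRREV_B32_e64 into a VGPR
  VALUReadLane, // V_LSHRREV_B32_e64, then V_READFIRSTLANE_B32 into an SGPR
};

// IsSALU: the rewritten frame-index operand must end up in an SGPR.
// LiveSCC: SCC is live across the insertion point, so SALU instructions
// that clobber it cannot be used here.
static Expansion pickExpansion(bool IsSALU, bool LiveSCC) {
  if (!IsSALU)
    return Expansion::VALU;       // divergent user: stay on the VALU
  if (!LiveSCC)
    return Expansion::SALU;       // uniform user and SCC is free
  return Expansion::VALUReadLane; // uniform user, but SCC must survive
}

int main() {
  const char *Names[] = {"SALU", "VALU", "VALU+readfirstlane"};
  for (bool IsSALU : {false, true})
    for (bool LiveSCC : {false, true})
      std::printf("IsSALU=%d LiveSCC=%d -> %s\n", IsSALU, LiveSCC,
                  Names[static_cast<int>(pickExpansion(IsSALU, LiveSCC))]);
}

The MIR tests line up with these cases: the divergent test keeps the V_LSHRREV_B32/V_ADD_CO_U32 form, while both uniform tests stay entirely on the SALU (S_LSHR_B32 + S_ADD_I32) with no detour through a VGPR.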