diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2234,73 +2234,84 @@
       // Convert to a swizzled stack address by scaling by the wave size.
       //
       // In an entry function/kernel the offset is already swizzled.
-
+      bool IsSALU = TII->isSALU(*MI);
+      const TargetRegisterClass *RC = IsSALU ? &AMDGPU::SReg_32RegClass
+                                             : &AMDGPU::VGPR_32RegClass;
       bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
       Register ResultReg =
           IsCopy ? MI->getOperand(0).getReg()
-                 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+                 : RS->scavengeRegister(RC, MI, 0);
 
       int64_t Offset = FrameInfo.getObjectOffset(Index);
       if (Offset == 0) {
+        unsigned OpCode =
+            IsSALU ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
         // XXX - This never happens because of emergency scavenging slot at 0?
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
-          .addImm(ST.getWavefrontSizeLog2())
-          .addReg(FrameReg);
-      } else {
-        if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
-          // Reuse ResultReg in intermediate step.
-          Register ScaledReg = ResultReg;
-
-          BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
-                  ScaledReg)
+        BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg)
          .addImm(ST.getWavefrontSizeLog2())
          .addReg(FrameReg);
-
-          const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
-
-          // TODO: Fold if use instruction is another add of a constant.
-          if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
-            // FIXME: This can fail
-            MIB.addImm(Offset);
-            MIB.addReg(ScaledReg, RegState::Kill);
-            if (!IsVOP2)
+      } else {
+        MachineInstrBuilder MIB;
+        if (!IsSALU) {
+          if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS))) {
+            // Reuse ResultReg in intermediate step.
+            Register ScaledReg = ResultReg;
+
+            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                    ScaledReg)
+                .addImm(ST.getWavefrontSizeLog2())
+                .addReg(FrameReg);
+
+            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
+
+            // TODO: Fold if use instruction is another add of a constant.
+            if (IsVOP2 || AMDGPU::isInlinableLiteral32(
+                              Offset, ST.hasInv2PiInlineImm())) {
+              // FIXME: This can fail
+              MIB.addImm(Offset);
+              MIB.addReg(ScaledReg, RegState::Kill);
+              if (!IsVOP2)
+                MIB.addImm(0); // clamp bit
+            } else {
+              assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
+                     "Need to reuse carry out register");
+
+              // Use scavenged unused carry out as offset register.
+              Register ConstOffsetReg;
+              if (!isWave32)
+                ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
+              else
+                ConstOffsetReg = MIB.getReg(1);
+
+              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
+                      ConstOffsetReg)
+                  .addImm(Offset);
+              MIB.addReg(ConstOffsetReg, RegState::Kill);
+              MIB.addReg(ScaledReg, RegState::Kill);
               MIB.addImm(0); // clamp bit
-          } else {
-            assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
-                   "Need to reuse carry out register");
-
-            // Use scavenged unused carry out as offset register.
-            Register ConstOffsetReg;
-            if (!isWave32)
-              ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
-            else
-              ConstOffsetReg = MIB.getReg(1);
-
-            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
-              .addImm(Offset);
-            MIB.addReg(ConstOffsetReg, RegState::Kill);
-            MIB.addReg(ScaledReg, RegState::Kill);
-            MIB.addImm(0); // clamp bit
+            }
           }
-        } else {
+        }
+        if (!MIB || IsSALU) {
           // We have to produce a carry out, and there isn't a free SGPR pair
           // for it. We can keep the whole computation on the SALU to avoid
          // clobbering an additional register at the cost of an extra mov.
          // We may have 1 free scratch SGPR even though a carry out is
          // unavailable. Only one additional mov is needed.
-          Register TmpScaledReg =
-            RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
-          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+          Register TmpScaledReg = RS->scavengeRegister(
+              &AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+          Register ScaledReg =
+              TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
 
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
-            .addReg(FrameReg)
-            .addImm(ST.getWavefrontSizeLog2());
+              .addReg(FrameReg)
+              .addImm(ST.getWavefrontSizeLog2());
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
             .addReg(ScaledReg, RegState::Kill)
             .addImm(Offset);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-            .addReg(ScaledReg, RegState::Kill);
+              .addReg(ScaledReg, RegState::Kill);
 
           // If there were truly no free SGPRs, we need to undo everything.
           if (!TmpScaledReg.isValid()) {
@@ -2308,8 +2319,8 @@
             .addReg(ScaledReg, RegState::Kill)
             .addImm(-Offset);
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
-            .addReg(FrameReg)
-            .addImm(ST.getWavefrontSizeLog2());
+              .addReg(FrameReg)
+              .addImm(ST.getWavefrontSizeLog2());
           }
         }
       }
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -0,0 +1,118 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+
+  declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+  define void @func_add_constant_to_fi_divergent_i32() {
+    %alloca = alloca [2 x i32], align 4, addrspace(5)
+    %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    %1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %alloca, i32 0, i32 %tid
+    %gep02 = getelementptr inbounds i32, i32 addrspace(5)* %1, i32 1
+    store volatile i32 addrspace(5)* %gep02, i32 addrspace(5)* addrspace(3)* undef, align 4
+    ret void
+  }
+
+  define void @func_add_constant_to_fi_uniform_i32() {
+    %alloca = alloca [2 x i32], align 4, addrspace(5)
+    %gep0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %alloca, i32 0, i32 1
+    store volatile i32 addrspace(5)* %gep0, i32 addrspace(5)* addrspace(3)* undef, align 4
+    ret void
+  }
+
+  define void @func_other_fi_user_non_inline_imm_offset_i32() {
+    %alloca0 = alloca [128 x i32], align 4, addrspace(5)
+    %alloca1 = alloca [8 x i32], align 4, addrspace(5)
+    %gep0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca0, i32 0, i32 65
+    %gep11 = bitcast [8 x i32] addrspace(5)* %alloca1 to i32 addrspace(5)*
+    store volatile i32 7, i32 addrspace(5)* %gep0, align 4
+    %ptrtoint = ptrtoint i32 addrspace(5)* %gep11 to i32
+    %mul = mul i32 %ptrtoint, 9
+    store volatile i32 %mul, i32 addrspace(3)* undef, align 4
+    ret void
+  }
+
+...
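+# The three tests below cover the paths this patch touches in
+# SIRegisterInfo.cpp: a divergent address where the shift and add stay on
+# the VALU, a uniform address where both move to the SALU, and a uniform
+# address whose offset is too large for an inline immediate.
+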
+# GCN-LABEL: name: func_add_constant_to_fi_divergent_i32{{$}}
+# GCN: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32
+# GCN: $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr1, killed $vgpr0
+---
+name: func_add_constant_to_fi_divergent_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 8, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+
+body: |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr31, $sgpr30_sgpr31
+
+    renamable $vgpr0 = V_AND_B32_e32 1023, killed $vgpr31, implicit $exec
+    renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
+    renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0.alloca, killed $vgpr0, implicit-def dead $vcc, implicit $exec
+    renamable $vgpr0, dead renamable $vcc = nuw V_ADD_CO_U32_e64 4, killed $vgpr0, 0, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile store (s32) into `i32 addrspace(5)* addrspace(3)* undef`, addrspace 3)
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
+# GCN-LABEL: name: func_add_constant_to_fi_uniform_i32{{$}}
+# GCN: $vcc_hi = S_LSHR_B32 6, $sgpr32, implicit-def $scc
+# GCN: $sgpr4 = nuw S_ADD_I32 killed $vcc_hi, 4
+---
+name: func_add_constant_to_fi_uniform_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 8, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr30_sgpr31
+
+    renamable $sgpr4 = nuw S_ADD_I32 %stack.0.alloca, 4, implicit-def dead $scc
+    renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile store (s32) into `i32 addrspace(5)* addrspace(3)* undef`, addrspace 3)
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
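+# The 512-byte offset of alloca1 below is not an inline immediate, but SOP2
+# instructions accept a 32-bit literal, so the SALU path folds it straight
+# into S_ADD_I32 with no extra materialization.
+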
+# GCN-LABEL: name: func_other_fi_user_non_inline_imm_offset_i32
+# GCN: $vcc_hi = S_LSHR_B32 $sgpr32, 6
+# GCN: $vcc_hi = S_ADD_I32 killed $vcc_hi, 512
+# GCN: $sgpr4 = S_MUL_I32 killed $vcc_hi, 9
+---
+name: func_other_fi_user_non_inline_imm_offset_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: alloca0, type: default, offset: 0, size: 512, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: alloca1, type: default, offset: 0, size: 32, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 512, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr30_sgpr31
+
+    renamable $vgpr0 = V_MOV_B32_e32 7, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, %stack.0.alloca0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, implicit $exec :: (volatile store (s32) into %ir.gep0, addrspace 5)
+    renamable $sgpr4 = S_MUL_I32 %stack.1.alloca1, 9
+    renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile store (s32) into `i32 addrspace(3)* undef`, addrspace 3)
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
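
Reviewer note (not part of the patch): both expansions materialize the same
value, ResultReg = (FrameReg >> WavefrontSizeLog2) + Offset; the patch only
chooses whether the shift and add run on the SALU or the VALU. A minimal
standalone C++ model of that computation follows; swizzledFrameAddress is an
illustrative name, not an LLVM API.

  #include <cstdint>

  // Model of the address both lowerings compute. The SALU path emits
  // S_LSHR_B32 + S_ADD_I32; the VALU path emits V_LSHRREV_B32 +
  // V_ADD_{CO_}U32, as checked in frame-index.mir above.
  static uint32_t swizzledFrameAddress(uint32_t FrameReg, int32_t Offset,
                                       unsigned WavefrontSizeLog2) {
    // Scale the wave-granular stack pointer down to a per-lane byte offset.
    uint32_t Scaled = FrameReg >> WavefrontSizeLog2;
    // Then add the frame object's byte offset.
    return Scaled + static_cast<uint32_t>(Offset);
  }

For example, with wave64 (WavefrontSizeLog2 == 6), an incoming $sgpr32 of
0x400 and an Offset of 512 give (0x400 >> 6) + 512 = 528, matching the
S_LSHR_B32/S_ADD_I32 pair checked in the last test.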