// Patch overview (commentary only; this text precedes the first file header
// and is not part of any hunk).
//
// Files touched:
//   * lib/Target/AMDGPU/SIInstrInfo.cpp          - three hunks, described below.
//   * test/CodeGen/AMDGPU/fmac-fma-sgpr-copy.ll  - new test (created from /dev/null).
//
// SIInstrInfo.cpp, hunk 1 (@@ -2539 @@):
//   Adds a local `bool ConstantBus = false;` next to `IsF16` / `IsFMA`.
//
// SIInstrInfo.cpp, hunk 2 (@@ -2582 @@):
//   The existing `if` condition is wrapped in an assignment, so `ConstantBus`
//   records the value of the whole condition:
//     no Src0Mods, no Src1Mods, no Clamp, no Omod, AND
//     (ST.getConstantBusLimit(Opc) > 1, OR Src0 is not a register,
//      OR Src0 is not an SGPR).
//   The branch taken by the `if` is unchanged; only the result is now also
//   stored for later use.
//
// SIInstrInfo.cpp, hunk 3 (@@ -2623 @@):
//   After `NewOpc` has been chosen (V_FMA_F16/F32 or V_MAD_F16/F32) and before
//   the `pseudoToMCOpcode(NewOpc) == -1` bail-out: when `ConstantBus` is true,
//   the unique virtual-register def of Src2's register is looked up, and if
//   that def is an AMDGPU::COPY, `Src2` is repointed at operand 1 of the COPY.
//
// fmac-fma-sgpr-copy.ll:
//   Runs llc with -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs and checks
//   for `v_fma_f32 v0, v1, v0, s0`. `@test1` takes element 0 of an
//   llvm.amdgcn.s.buffer.load.v4i32 result, uses it in an fsub, and also as
//   the addend of llvm.fma.f32.
//
// NOTE(review): points to confirm before landing
//   * `ConstantBus` is false whenever any of Src0Mods/Src1Mods/Clamp/Omod is
//     present, not only when the constant bus would be exceeded, so the name
//     is broader than what it stores.
//   * The stored condition inspects Src0 only. Whether Src1 can also be an
//     SGPR in the instruction built from `NewOpc` is not visible here; confirm
//     that substituting an SGPR for Src2 still respects the constant bus limit.
//   * Hunk 3 checks only `Def->getOpcode() == AMDGPU::COPY`. No visible check
//     establishes that operand 1 of the COPY is an SGPR (or a virtual register
//     without a subregister index), although the in-code comment describes an
//     sgpr->vgpr copy.
//   * `Src2->getReg()` is called without a visible `isReg()` guard in this
//     hunk; presumably guaranteed by code outside the view - TODO confirm.
//   * `Src2` now points at the COPY's own source operand; presumably any flags
//     on that operand travel with it when it is added to the new instruction -
//     verify this is intended while the COPY is still present.
//   * The test has a single CHECK line for a single subtarget (gfx1010).
//
Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2539,6 +2539,7 @@ LiveVariables *LV) const { unsigned Opc = MI.getOpcode(); bool IsF16 = false; + bool ConstantBus = false; bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; @@ -2582,11 +2583,11 @@ const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!Src0Mods && !Src1Mods && !Clamp && !Omod && + if ((ConstantBus = !Src0Mods && !Src1Mods && !Clamp && !Omod && // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || - !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { + !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg())))) { if (auto Imm = getFoldableImm(Src2)) { unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) @@ -2623,6 +2624,19 @@ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) : (IsF16 ? 
AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + + // If the source of the tied reg is a sgpr->vgpr copy then replace + // it back with the sgpr, to potentially kill the copy, if + // this doesn't violate the constant bus restriction + if (ConstantBus) { + const MachineFunction *MF = Src2->getParent()->getMF(); + const MachineRegisterInfo *MRI = &MF->getRegInfo(); + if (auto *Def = MRI->getUniqueVRegDef(Src2->getReg())) { + if (Def->getOpcode() == AMDGPU::COPY) + Src2 = &Def->getOperand(1); + } + } + if (pseudoToMCOpcode(NewOpc) == -1) return nullptr; Index: test/CodeGen/AMDGPU/fmac-fma-sgpr-copy.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fmac-fma-sgpr-copy.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s + +; CHECK: v_fma_f32 v0, v1, v0, s0 +define amdgpu_cs float @test1(<4 x i32> inreg %a, float %b, float %y) { +entry: + %buf.load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %a, i32 0, i32 0) + %vec1 = bitcast <4 x i32> %buf.load to <4 x float> + %.i095 = extractelement <4 x float> %vec1, i32 0 + %.i098 = fsub nnan arcp float %b, %.i095 + %fma1 = call float @llvm.fma.f32(float %y, float %.i098, float %.i095) #3 + ret float %fma1 +} + +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) #2 +declare float @llvm.fma.f32(float, float, float) #1 + +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nounwind readnone } +attributes #3 = { nounwind }