Index: llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -592,6 +592,24 @@ return m_GXor(Src, m_AllOnesInt()); } +// Helper for ignoring a copy when matching +template <typename SubPatternT> struct IgnoreCopy_match { + SubPatternT SubPat; + IgnoreCopy_match(const SubPatternT &SP) : SubPat(SP) {} + + bool match(const MachineRegisterInfo &MRI, Register Reg) { + MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (Def) + Reg = Def->getOperand(0).getReg(); + return SubPat.match(MRI, Reg); + } +}; + +template <typename SubPat> +inline IgnoreCopy_match<SubPat> m_IgnoreCopy(const SubPat &SP) { + return SP; +} + } // namespace MIPatternMatch } // namespace llvm Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -96,6 +96,7 @@ bool selectG_CONSTANT(MachineInstr &I) const; bool selectG_FNEG(MachineInstr &I) const; bool selectG_FABS(MachineInstr &I) const; + bool selectG_FMA_FMAD(MachineInstr &MI) const; bool selectG_AND_OR_XOR(MachineInstr &I) const; bool selectG_ADD_SUB(MachineInstr &I) const; bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const; @@ -180,6 +181,9 @@ InstructionSelector::ComplexRendererFns selectVOP3PMods(MachineOperand &Root) const; + std::pair<Register, unsigned> + SelectVOP3PMadMixModsImpl(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2309,6 +2309,62 @@ return true; } +static bool hasFP32Denormals(const MachineFunction &MF) { 
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + return Info->getMode().allFP32Denormals(); +} + +bool AMDGPUInstructionSelector::selectG_FMA_FMAD(MachineInstr &MI) const { + bool IsFMA = MI.getOpcode() == TargetOpcode::G_FMA; + bool HasMixInsts = + IsFMA ? Subtarget->hasFmaMixInsts() : Subtarget->hasMadMixInsts(); + Register Dst = MI.getOperand(0).getReg(); + LLT DstTy = MRI->getType(Dst); + if (DstTy.getSizeInBits() != 32 || !HasMixInsts) + return false; + + Register Src0, Src1, Src2; + unsigned Src0Mods, Src1Mods, Src2Mods; + std::tie(Src0, Src0Mods) = SelectVOP3PMadMixModsImpl(MI.getOperand(1)); + std::tie(Src1, Src1Mods) = SelectVOP3PMadMixModsImpl(MI.getOperand(2)); + std::tie(Src2, Src2Mods) = SelectVOP3PMadMixModsImpl(MI.getOperand(3)); + + assert((IsFMA || !hasFP32Denormals(*MI.getMF())) && + "fmad selected with denormals enabled"); + // TODO: We can select this with f32 denormals enabled if all the sources are + // converted from f16 (in which case fmad isn't legal). + + // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand + // using the conversion from f16. + if (!(Src0Mods & SISrcMods::OP_SEL_1 || Src1Mods & SISrcMods::OP_SEL_1 || + Src2Mods & SISrcMods::OP_SEL_1)) + return false; + + if (!RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI) || + !RBI.constrainGenericRegister(Src1, AMDGPU::VGPR_32RegClass, *MRI) || + !RBI.constrainGenericRegister(Src2, AMDGPU::VGPR_32RegClass, *MRI) || + !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI)) + return false; + + MachineBasicBlock *BB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opcode = IsFMA ? 
AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32; + + BuildMI(*BB, &MI, DL, TII.get(Opcode), Dst) + .addImm(Src0Mods) + .addReg(Src0) + .addImm(Src1Mods) + .addReg(Src1) + .addImm(Src2Mods) + .addReg(Src2) + .addImm(0) + .addImm(0) + .addImm(0); + + MI.eraseFromParent(); + return true; +} + static bool isConstant(const MachineInstr &MI) { return MI.getOpcode() == TargetOpcode::G_CONSTANT; } @@ -3165,6 +3221,11 @@ if (selectImpl(I, *CoverageInfo)) return true; return selectG_FABS(I); + case TargetOpcode::G_FMA: + case TargetOpcode::G_FMAD: + if (selectG_FMA_FMAD(I)) + return true; + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_EXTRACT: return selectG_EXTRACT(I); case TargetOpcode::G_MERGE_VALUES: @@ -3443,6 +3504,72 @@ }}; } +static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, + Register &Out) { + Register LShlSrc; + if (mi_match(In, MRI, + m_IgnoreCopy(m_GTrunc(m_GLShr( + m_Reg(LShlSrc), m_IgnoreCopy(m_SpecificICst(16))))))) { + Out = LShlSrc; + return true; + } + return false; +} + +std::pair<Register, unsigned> +AMDGPUInstructionSelector::SelectVOP3PMadMixModsImpl( + MachineOperand &Root) const { + unsigned Mods = 0; + Register Src; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); + + if (MI && MI->getOpcode() == AMDGPU::G_FPEXT && + MRI->getType(Src) == LLT::scalar(32) && + MRI->getType(MI->getOperand(1).getReg()) == LLT::scalar(16)) { + Src = MI->getOperand(1).getReg(); + + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + std::tie(Src, ModsTmp) = selectVOP3ModsImpl(MI->getOperand(1)); + + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; + + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } + + // op_sel/op_sel_hi decide the source type and source. 
+ // If the source's op_sel_hi is set, it indicates to do a conversion from + // fp16. + // If the source's op_sel is set, it picks the high half of the source + // register. + Mods |= SISrcMods::OP_SEL_1; + if (isExtractHiElt(*MRI, Src, Src)) { + Mods |= SISrcMods::OP_SEL_0; + // TODO: Should we try to look for neg/abs here? + } + } + + // If we looked through copies to find source modifiers on an SGPR operand, + // we now have an SGPR register source. To avoid potentially violating the + // constant bus restriction, we need to insert a copy to a VGPR. + if (Mods != 0 && + RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { + Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg()); + MachineInstr *OrigMI = Root.getParent(); + BuildMI(*OrigMI->getParent(), OrigMI, OrigMI->getDebugLoc(), + TII.get(AMDGPU::COPY), VGPRSrc) + .addReg(Src); + Src = VGPRSrc; + } + + return std::make_pair(Src, Mods); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { SmallVector AddrInfo; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll @@ -8,9 +8,7 @@ define amdgpu_vs float @test_f16_f32_add_fma_ext_mul(float %x, float %y, float %z, half %u, half %v) { ; GFX9-DENORM-LABEL: test_f16_f32_add_fma_ext_mul: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX9-DENORM-NEXT: v_mad_f32 v2, v3, v4, v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, v4, v2 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 ; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-DENORM-NEXT: ; return to shader part epilog @@ -18,25 +16,22 @@ ; GFX10-LABEL: test_f16_f32_add_fma_ext_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: 
v_mul_f16_e32 v3, v3, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-NEXT: v_add_f32_e32 v0, v3, v2 +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_f16_f32_add_fma_ext_mul: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v3, v2 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_f32_add_fma_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v3, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul half %u, %v @@ -50,12 +45,8 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z, half %u, half %v) { ; GFX9-DENORM-LABEL: test_f16_f32_add_ext_fma_mul: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v3, v2 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v5, v1 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, v4, v2 op_sel_hi:[1,1,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_f16_f32_add_ext_fma_mul: @@ -94,34 +85,29 @@ define amdgpu_vs float 
@test_f16_f32_add_fma_ext_mul_rhs(float %x, float %y, float %z, half %u, half %v) { ; GFX9-DENORM-LABEL: test_f16_f32_add_fma_ext_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v3, v4 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v1, v2 ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_f16_f32_add_fma_ext_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v2 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_f16_f32_add_fma_ext_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v1, v2 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_f32_add_fma_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v3, v1, v2 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul half %u, %v @@ -135,12 +121,8 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half %z, half %u, half %v) { ; GFX9-DENORM-LABEL: test_f16_f32_add_ext_fma_mul_rhs: ; 
GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v3, v4 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v1, v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_f16_f32_add_ext_fma_mul_rhs: @@ -181,72 +163,56 @@ ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v14, v0, v4 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v12, v1, v5 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v15, v2, v6 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v13, v3, v7 -; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v14, v8 -; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v12, v9 -; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v15, v10 -; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v13, v11 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10: ; %bb.0: ; %.entry ; 
GFX10-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_fmac_f32_e32 v14, v0, v4 -; GFX10-NEXT: v_fmac_f32_e32 v12, v1, v5 -; GFX10-NEXT: v_fmac_f32_e32 v15, v2, v6 -; GFX10-NEXT: v_fmac_f32_e32 v13, v3, v7 -; GFX10-NEXT: v_add_f32_e32 v0, v14, v8 -; GFX10-NEXT: v_add_f32_e32 v1, v12, v9 -; GFX10-NEXT: v_add_f32_e32 v2, v15, v10 -; GFX10-NEXT: v_add_f32_e32 v3, v13, v11 +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v14, v0, v4 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v12, v1, v5 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v15, v2, v6 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v13, v3, v7 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v14, v8 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v12, v9 -; 
GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v15, v10 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v13, v11 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v14, v0, v4 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v12, v1, v5 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v15, v2, v6 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v13, v3, v7 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v14, v8 -; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v12, v9 -; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v15, v10 -; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v13, v11 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, 
v10 +; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul <4 x half> %u, %v @@ -339,72 +305,56 @@ ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v14, v4, v8 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v12, v5, v9 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v15, v6, v10 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v13, v7, v11 -; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v12 -; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v15 -; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v13 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_fmac_f32_e32 v14, v4, v8 -; 
GFX10-NEXT: v_fmac_f32_e32 v12, v5, v9 -; GFX10-NEXT: v_fmac_f32_e32 v15, v6, v10 -; GFX10-NEXT: v_fmac_f32_e32 v13, v7, v11 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v12 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v15 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v13 +; GFX10-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v14, v4, v8 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v12, v5, v9 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v15, v6, v10 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v13, v7, v11 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v12 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v15 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v13 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] 
op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v14, v4, v8 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v12, v5, v9 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v15, v6, v10 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v13, v7, v11 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v12 -; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v15 -; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v13 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul <4 x half> %u, %v Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll +++ 
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll @@ -8,16 +8,15 @@ define amdgpu_vs float @test_f16_f32_add_ext_mul(half inreg %x, half inreg %y, float inreg %z) { ; GFX9-FAST-DENORM-LABEL: test_f16_f32_add_ext_mul: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX9-FAST-DENORM-NEXT: v_mad_f32 v0, v0, v1, s2 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v0, s0, v0, v1 op_sel_hi:[1,1,0] ; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-FAST-DENORM-LABEL: test_f16_f32_add_ext_mul: ; GFX10-FAST-DENORM: ; %bb.0: ; %.entry -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v0, v0, v1, s2 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s1, v0 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast half %x, %y @@ -29,16 +28,15 @@ define amdgpu_vs float @test_f16_f32_add_ext_mul_rhs(half inreg %x, half inreg %y, float inreg %z) { ; GFX9-FAST-DENORM-LABEL: test_f16_f32_add_ext_mul_rhs: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX9-FAST-DENORM-NEXT: v_mad_f32 v0, v0, v1, s2 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v0, s0, v0, v1 op_sel_hi:[1,1,0] ; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-FAST-DENORM-LABEL: test_f16_f32_add_ext_mul_rhs: ; GFX10-FAST-DENORM: ; %bb.0: ; %.entry -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v0, v0, v1, s2 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 
v0, s2 +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s1, v0 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast half %x, %y @@ -74,25 +72,16 @@ ; ; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul: ; GFX10-FAST-DENORM: ; %bb.0: ; %.entry -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s11 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, s12 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, s3 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v6, s13 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, s4 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v8, s14 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v9, s5 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v0, v0, v5, s6 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v1, v1, v6, s7 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v2, v2, v7, s8 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v3, v3, v8, s9 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v4, v4, v9, s10 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast 
<5 x half> %x, %y @@ -126,30 +115,18 @@ ; ; GFX10-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs: ; GFX10-FAST-DENORM: ; %bb.0: ; %.entry -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s2, 16 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s2, s5, 16 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s12 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, s13 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, s14 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v6, s3 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v8, s4 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v9, s1 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v10, s5 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v11, s2 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v0, v0, v6, s6 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v1, v1, v7, s7 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v2, v2, v8, s8 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v3, v3, v9, s9 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v4, v4, v10, s10 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v5, v5, v11, s11 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11 +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] 
op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast <6 x half> %x, %y Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll @@ -7,16 +7,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_ext_mul(half %x, half %y, float %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_mul: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, -v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %x, %y @@ -29,16 +25,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_ext_mul_rhs(float %x, half %y, half %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-DENORM-NEXT: v_mad_f32 v0, -v1, v2, v0 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: 
v_cvt_f32_f16_e32 v2, v2 -; GFX10-DENORM-NEXT: v_fma_f32 v0, -v1, v2, v0 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast half %y, %z @@ -65,18 +57,12 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, v8, v0, -v4 -; GFX10-DENORM-NEXT: v_fma_f32 v1, v9, v1, -v5 -; GFX10-DENORM-NEXT: v_fma_f32 v2, v10, v2, -v6 -; GFX10-DENORM-NEXT: v_fma_f32 v3, v11, v3, -v7 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v4, v0, v2, -v4 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v0, v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v1, v3, -v6 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v1, v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %x, %y @@ -103,18 +89,10 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, -v8, v10, v0 -; GFX10-DENORM-NEXT: v_fma_f32 v1, -v4, v6, v1 -; GFX10-DENORM-NEXT: v_fma_f32 v2, -v9, v11, v2 -; GFX10-DENORM-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v4, v6, v0 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, -v4, v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, -v5, v7, v2 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, -v5, v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast <4 x half> %y, %z Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll @@ -7,18 +7,14 @@ define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul(half %x, half %y, float %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-DENORM-NEXT: v_xor_b32_e32 v0, 0x80000000, v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX9-DENORM-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, 
v0, v1 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v0, 0x80000000, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %x, %y @@ -32,18 +28,14 @@ define amdgpu_vs float @test_f16_to_f32_sub_neg_ext_mul(half %x, half %y, float %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-DENORM-NEXT: v_xor_b32_e32 v0, 0x80000000, v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX9-DENORM-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v0, 0x80000000, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %x, %y @@ -58,16 +50,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul2(float %x, half %y, half %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul2: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v1, v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX10-DENORM-NEXT: 
v_fmac_f32_e32 v0, v1, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %y, %z @@ -81,16 +69,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_neg_ext_mul2(float %x, half %y, half %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul2: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v1, v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v0, v1, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %y, %z @@ -121,23 +105,15 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v2 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v11, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v4, v0, v2, v4 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v0, v2, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v1, v3, v6 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v1, v3, 
v7 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80000000 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v4, v8, v10 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v5, v0, v2 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v6, v9, v11 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v7, v1, v3 ; GFX10-DENORM-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v1, s0, v5 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v7 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v1, s0, v2 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v5 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %x, %y @@ -165,23 +141,15 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v2 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v11, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v4, v0, v2, v4 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v0, v2, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v1, v3, v6 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v1, v3, v7 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80000000 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v4, v8, v10 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v5, v0, v2 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v6, v9, v11 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v7, v1, v3 ; GFX10-DENORM-NEXT: v_xor_b32_e32 v0, s0, v4 -; 
GFX10-DENORM-NEXT: v_xor_b32_e32 v1, s0, v5 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v7 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v1, s0, v2 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v5 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %x, %y @@ -213,18 +181,10 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v0, v8, v10 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v1, v4, v6 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v9, v11 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v3, v5, v7 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v4, v6, v0 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v4, v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v5, v7, v2 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v5, v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %y, %z @@ -252,18 +212,10 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 -; 
GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v0, v8, v10 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v1, v4, v6 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v9, v11 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v3, v5, v7 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v4, v6, v0 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v4, v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v5, v7, v2 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v5, v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %y, %z Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fma-mix.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fma-mix.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn --denormal-fp-math=preserve-sign -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn --denormal-fp-math=preserve-sign -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_vs float @test_op_sel_hi_src2(float %x, float %y, half %z) { +; GFX9-LABEL: test_op_sel_hi_src2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_op_sel_hi_src2: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %XY = fmul fast float %x, %y + %FpextZ = 
fpext half %z to float + %fma = fadd fast float %XY, %FpextZ + ret float %fma +} + +define amdgpu_vs float @test_op_sel_hi_src0_src1(half %x, half %y, float %z) { +; GFX9-LABEL: test_op_sel_hi_src0_src1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_op_sel_hi_src0_src1: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %XY = fmul fast half %x, %y + %FpextXY = fpext half %XY to float + %fma = fadd fast float %FpextXY, %z + ret float %fma +} + +define amdgpu_vs float @test_neg_after_src2(float %x, float %y, half %z) { +; GFX9-LABEL: test_neg_after_src2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[0,0,1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_neg_after_src2: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[0,0,1] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %XY = fmul fast float %x, %y + %FpextZ = fpext half %z to float + %NegFpextZ = fneg float %FpextZ + %fma = fadd fast float %XY, %NegFpextZ + ret float %fma +} + +define amdgpu_vs float @test_neg_before_src2(float %x, float %y, half %z) { +; GFX9-LABEL: test_neg_before_src2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[0,0,1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_neg_before_src2: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[0,0,1] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %XY = fmul fast float %x, %y + %NegZ = fneg half %z + %FpextNegZ = fpext half %NegZ to float + %fma = fadd fast float %XY, %FpextNegZ + ret float %fma +} + +define amdgpu_vs float @test_abs_before_src2(float %x, float %y, half %z) { +; GFX9-LABEL: test_abs_before_src2: +; GFX9: ; %bb.0: ; %.entry +; 
GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[0,0,1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_abs_before_src2: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[0,0,1] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %XY = fmul fast float %x, %y + %AbsZ = call half @llvm.fabs.f16(half %z) + %FpextAbsZ = fpext half %AbsZ to float + %fma = fadd fast float %XY, %FpextAbsZ + ret float %fma +} + +define amdgpu_vs float @test_abs_after_src2(float %x, float %y, half %z) { +; GFX9-LABEL: test_abs_after_src2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[0,0,1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_abs_after_src2: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[0,0,1] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %XY = fmul fast float %x, %y + %FpextZ = fpext half %z to float + %AbsFpExtZ = call float @llvm.fabs.f32(float %FpextZ) + %fma = fadd fast float %XY, %AbsFpExtZ + ret float %fma +} + +define amdgpu_vs float @test_abs_neg_src2(float %x, float %y, half %z) { +; GFX9-LABEL: test_abs_neg_src2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[0,0,1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_abs_neg_src2: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[0,0,1] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %XY = fmul fast float %x, %y + %FpextZ = fpext half %z to float + %AbsZ = call float @llvm.fabs.f32(float %FpextZ) + %negZ = fneg float %AbsZ + %fma = fadd fast float %XY, %negZ + ret float %fma +} + +define amdgpu_vs float @test_op_sel_op_sel_hi_src2(float %x, float %y, <2 x half> %vecz) { +; GFX9-LABEL: test_op_sel_op_sel_hi_src2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-NEXT: ; return to shader 
part epilog +; +; GFX10-LABEL: test_op_sel_op_sel_hi_src2: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %XY = fmul fast float %x, %y + %z = extractelement <2 x half> %vecz, i32 1 + %FpextZ = fpext half %z to float + %fma = fadd fast float %XY, %FpextZ + ret float %fma +} + +define amdgpu_vs float @test_all_modifiers(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) { +; GFX9-LABEL: test_all_modifiers: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mad_mix_f32 v0, |v0|, |v1|, -|v2| op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_all_modifiers: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_fma_mix_f32 v0, |v0|, |v1|, -|v2| op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX10-NEXT: ; return to shader part epilog +.entry: + %x = extractelement <2 x half> %vecx, i32 1 + %y = extractelement <2 x half> %vecy, i32 1 + %z = extractelement <2 x half> %vecz, i32 1 + %AbsX = call half @llvm.fabs.f16(half %x) + %AbsY = call half @llvm.fabs.f16(half %y) + %negX = fneg half %AbsX + %negY = fneg half %AbsY + %XY = fmul fast half %negX, %negY + %FpextXY = fpext half %XY to float + %AbsZ = call half @llvm.fabs.f16(half %z) + %FpextZ = fpext half %AbsZ to float + %negZ = fneg float %FpextZ + %fma = fadd fast float %FpextXY, %negZ + ret float %fma +} + +declare float @llvm.fabs.f32(float) +declare half @llvm.fabs.f16(half) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma-mix.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fma-mix.mir @@ -0,0 +1,242 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: test_op_sel_hi_src2 +legalized: true +regBankSelected: true +body: | + 
bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_op_sel_hi_src2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 0, [[COPY]], 0, [[COPY1]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %5:vgpr(s32) = G_FMA %0:vgpr, %1:vgpr, %4:vgpr + $vgpr0 = COPY %5:vgpr(s32) + +... +--- +name: test_op_sel_hi_src0_src1 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_op_sel_hi_src0_src1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 8, [[COPY]], 8, [[COPY1]], 0, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s16) = G_TRUNC %0:vgpr(s32) + %2:vgpr(s32) = COPY $vgpr1 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = COPY $vgpr2 + %5:vgpr(s32) = G_FPEXT %1:vgpr(s16) + %6:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %7:vgpr(s32) = G_FMA %5:vgpr, %6:vgpr, %4:vgpr + $vgpr0 = COPY %7:vgpr(s32) + +... 
+--- +name: test_neg_after_src2 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_neg_after_src2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 0, [[COPY]], 0, [[COPY1]], 9, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %5:vgpr(s32) = G_FNEG %4:vgpr + %6:vgpr(s32) = G_FMA %0:vgpr, %1:vgpr, %5:vgpr + $vgpr0 = COPY %6:vgpr(s32) + +... +--- +name: test_neg_before_src2 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_neg_before_src2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 0, [[COPY]], 0, [[COPY1]], 9, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s16) = G_FNEG %3:vgpr + %5:vgpr(s32) = G_FPEXT %4:vgpr(s16) + %6:vgpr(s32) = G_FMA %0:vgpr, %1:vgpr, %5:vgpr + $vgpr0 = COPY %6:vgpr(s32) + +... 
+--- +name: test_abs_before_src2 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_abs_before_src2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 0, [[COPY]], 0, [[COPY1]], 10, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s16) = G_FABS %3:vgpr + %5:vgpr(s32) = G_FPEXT %4:vgpr(s16) + %6:vgpr(s32) = G_FMA %0:vgpr, %1:vgpr, %5:vgpr + $vgpr0 = COPY %6:vgpr(s32) + +... +--- +name: test_abs_after_src2 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_abs_after_src2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 0, [[COPY]], 0, [[COPY1]], 10, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %5:vgpr(s32) = G_FABS %4:vgpr + %6:vgpr(s32) = G_FMA %0:vgpr, %1:vgpr, %5:vgpr + $vgpr0 = COPY %6:vgpr(s32) + +... 
+--- +name: test_abs_neg_src2 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_abs_neg_src2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 0, [[COPY]], 0, [[COPY1]], 11, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %5:vgpr(s32) = G_FABS %4:vgpr + %6:vgpr(s32) = G_FNEG %5:vgpr + %7:vgpr(s32) = G_FMA %0:vgpr, %1:vgpr, %6:vgpr + $vgpr0 = COPY %7:vgpr(s32) + +... +--- +name: test_op_sel_op_sel_hi_src2 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_op_sel_op_sel_hi_src2 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 0, [[COPY]], 0, [[COPY1]], 12, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(<2 x s16>) = COPY $vgpr2 + %3:vgpr(s32) = G_BITCAST %2:vgpr(<2 x s16>) + %4:sgpr(s32) = G_CONSTANT i32 16 + %5:vgpr(s32) = COPY %4:sgpr(s32) + %6:vgpr(s32) = G_LSHR %3:vgpr, %5:vgpr(s32) + %7:vgpr(s16) = G_TRUNC %6:vgpr(s32) + %8:vgpr(s32) = G_FPEXT %7:vgpr(s16) + %9:vgpr(s32) = G_FMA %0:vgpr, %1:vgpr, %8:vgpr + $vgpr0 = COPY %9:vgpr(s32) + +... 
+--- +name: test_all_modifiers +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_all_modifiers + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[V_FMA_MIX_F32_:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32 15, [[COPY]], 15, [[COPY1]], 15, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIX_F32_]] + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = COPY $vgpr1 + %2:vgpr(<2 x s16>) = COPY $vgpr2 + %3:vgpr(s32) = G_BITCAST %0:vgpr(<2 x s16>) + %4:sgpr(s32) = G_CONSTANT i32 16 + %5:vgpr(s32) = COPY %4:sgpr(s32) + %6:vgpr(s32) = G_LSHR %3:vgpr, %5:vgpr(s32) + %7:vgpr(s16) = G_TRUNC %6:vgpr(s32) + %8:vgpr(s32) = G_BITCAST %1:vgpr(<2 x s16>) + %9:vgpr(s32) = COPY %4:sgpr(s32) + %10:vgpr(s32) = G_LSHR %8:vgpr, %9:vgpr(s32) + %11:vgpr(s16) = G_TRUNC %10:vgpr(s32) + %12:vgpr(s32) = G_BITCAST %2:vgpr(<2 x s16>) + %13:vgpr(s32) = COPY %4:sgpr(s32) + %14:vgpr(s32) = G_LSHR %12:vgpr, %13:vgpr(s32) + %15:vgpr(s16) = G_TRUNC %14:vgpr(s32) + %16:vgpr(s16) = G_FABS %7:vgpr + %17:vgpr(s16) = G_FABS %11:vgpr + %18:vgpr(s16) = G_FNEG %16:vgpr + %19:vgpr(s16) = G_FNEG %17:vgpr + %20:vgpr(s16) = G_FABS %15:vgpr + %21:vgpr(s32) = G_FPEXT %20:vgpr(s16) + %22:vgpr(s32) = G_FNEG %21:vgpr + %23:vgpr(s32) = G_FPEXT %18:vgpr(s16) + %24:vgpr(s32) = G_FPEXT %19:vgpr(s16) + %25:vgpr(s32) = G_FMA %23:vgpr, %24:vgpr, %22:vgpr + $vgpr0 = COPY %25:vgpr(s32) + +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmad-mix.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmad-mix.mir @@ -0,0 +1,278 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +--- +name: test_op_sel_hi_src2 +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_op_sel_hi_src2 + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 0, [[COPY]], 0, [[COPY1]], 8, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %5:vgpr(s32) = G_FMAD %0:vgpr, %1:vgpr, %4:vgpr + $vgpr0 = COPY %5:vgpr(s32) + +... 
+--- +name: test_op_sel_hi_src0_src1 +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_op_sel_hi_src0_src1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 8, [[COPY]], 8, [[COPY1]], 0, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s16) = G_TRUNC %0:vgpr(s32) + %2:vgpr(s32) = COPY $vgpr1 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = COPY $vgpr2 + %5:vgpr(s32) = G_FPEXT %1:vgpr(s16) + %6:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %7:vgpr(s32) = G_FMAD %5:vgpr, %6:vgpr, %4:vgpr + $vgpr0 = COPY %7:vgpr(s32) + +... +--- +name: test_neg_after_src2 +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_neg_after_src2 + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 0, [[COPY]], 0, [[COPY1]], 9, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %5:vgpr(s32) = G_FNEG %4:vgpr + %6:vgpr(s32) = G_FMAD %0:vgpr, %1:vgpr, %5:vgpr + $vgpr0 = COPY %6:vgpr(s32) + +... 
+--- +name: test_neg_before_src2 +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_neg_before_src2 + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 0, [[COPY]], 0, [[COPY1]], 9, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s16) = G_FNEG %3:vgpr + %5:vgpr(s32) = G_FPEXT %4:vgpr(s16) + %6:vgpr(s32) = G_FMAD %0:vgpr, %1:vgpr, %5:vgpr + $vgpr0 = COPY %6:vgpr(s32) + +... +--- +name: test_abs_before_src2 +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_abs_before_src2 + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 0, [[COPY]], 0, [[COPY1]], 10, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s16) = G_FABS %3:vgpr + %5:vgpr(s32) = G_FPEXT %4:vgpr(s16) + %6:vgpr(s32) = G_FMAD %0:vgpr, %1:vgpr, %5:vgpr + $vgpr0 = COPY %6:vgpr(s32) + +... 
+--- +name: test_abs_after_src2 +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_abs_after_src2 + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 0, [[COPY]], 0, [[COPY1]], 10, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %5:vgpr(s32) = G_FABS %4:vgpr + %6:vgpr(s32) = G_FMAD %0:vgpr, %1:vgpr, %5:vgpr + $vgpr0 = COPY %6:vgpr(s32) + +... +--- +name: test_abs_neg_src2 +legalized: true +regBankSelected: true +machineFunctionInfo: + mode: + fp32-input-denormals: false + fp32-output-denormals: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_abs_neg_src2 + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 0, [[COPY]], 0, [[COPY1]], 11, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %2:vgpr(s32) + %4:vgpr(s32) = G_FPEXT %3:vgpr(s16) + %5:vgpr(s32) = G_FABS %4:vgpr + %6:vgpr(s32) = G_FNEG %5:vgpr + %7:vgpr(s32) = G_FMAD %0:vgpr, %1:vgpr, %6:vgpr + $vgpr0 = COPY %7:vgpr(s32) + +... 
+# test_op_sel_op_sel_hi_src2: $vgpr2 is read as <2 x s16>; src2 of the G_FMAD
+# is G_FPEXT(G_TRUNC(G_LSHR(G_BITCAST %2, 16))), i.e. the upper 16 bits of the
+# register extended to s32. The shift amount is an sgpr G_CONSTANT that is
+# copied to a vgpr before use. The checks expect a single V_MAD_MIX_F32 that
+# reads the original $vgpr2 copy, with src2 modifier operand 12 and src0/src1
+# modifier operands 0.
+# NOTE(review): 12 presumably encodes OP_SEL_1 | OP_SEL_0 (8 + 4) - confirm
+# against the SISrcMods definition.
+---
+name: test_op_sel_op_sel_hi_src2
+legalized: true
+regBankSelected: true
+machineFunctionInfo:
+  mode:
+    fp32-input-denormals: false
+    fp32-output-denormals: false
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX10-LABEL: name: test_op_sel_op_sel_hi_src2
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 0, [[COPY]], 0, [[COPY1]], 12, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]]
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(s32) = COPY $vgpr1
+    %2:vgpr(<2 x s16>) = COPY $vgpr2
+    %3:vgpr(s32) = G_BITCAST %2:vgpr(<2 x s16>)
+    %4:sgpr(s32) = G_CONSTANT i32 16
+    %5:vgpr(s32) = COPY %4:sgpr(s32)
+    %6:vgpr(s32) = G_LSHR %3:vgpr, %5:vgpr(s32)
+    %7:vgpr(s16) = G_TRUNC %6:vgpr(s32)
+    %8:vgpr(s32) = G_FPEXT %7:vgpr(s16)
+    %9:vgpr(s32) = G_FMAD %0:vgpr, %1:vgpr, %8:vgpr
+    $vgpr0 = COPY %9:vgpr(s32)
+
+...
+# test_all_modifiers: all three inputs are read as <2 x s16>, and each G_FMAD
+# source is built from the upper 16 bits of its register
+# (G_TRUNC(G_LSHR(G_BITCAST x, 16))). The three shifts share one sgpr
+# G_CONSTANT (%4), each through its own vgpr COPY (%5, %9, %13).
+#   src0 = G_FPEXT(G_FNEG(G_FABS(hi(%0))))  - abs and neg before the extension
+#   src1 = G_FPEXT(G_FNEG(G_FABS(hi(%1))))  - abs and neg before the extension
+#   src2 = G_FNEG(G_FPEXT(G_FABS(hi(%2))))  - abs before, neg after
+# The checks expect a single V_MAD_MIX_F32 reading the three original copies
+# with modifier operand 15 on every source.
+# NOTE(review): 15 presumably encodes OP_SEL_1 | OP_SEL_0 | ABS | NEG
+# (8 + 4 + 2 + 1) - confirm against the SISrcMods definition.
+---
+name: test_all_modifiers
+legalized: true
+regBankSelected: true
+machineFunctionInfo:
+  mode:
+    fp32-input-denormals: false
+    fp32-output-denormals: false
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX10-LABEL: name: test_all_modifiers
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[V_MAD_MIX_F32_:%[0-9]+]]:vgpr_32 = V_MAD_MIX_F32 15, [[COPY]], 15, [[COPY1]], 15, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX10-NEXT: $vgpr0 = COPY [[V_MAD_MIX_F32_]]
+    %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %1:vgpr(<2 x s16>) = COPY $vgpr1
+    %2:vgpr(<2 x s16>) = COPY $vgpr2
+    %3:vgpr(s32) = G_BITCAST %0:vgpr(<2 x s16>)
+    %4:sgpr(s32) = G_CONSTANT i32 16
+    %5:vgpr(s32) = COPY %4:sgpr(s32)
+    %6:vgpr(s32) = G_LSHR %3:vgpr, %5:vgpr(s32)
+    %7:vgpr(s16) = G_TRUNC %6:vgpr(s32)
+    %8:vgpr(s32) = G_BITCAST %1:vgpr(<2 x s16>)
+    %9:vgpr(s32) = COPY %4:sgpr(s32)
+    %10:vgpr(s32) = G_LSHR %8:vgpr, %9:vgpr(s32)
+    %11:vgpr(s16) = G_TRUNC %10:vgpr(s32)
+    %12:vgpr(s32) = G_BITCAST %2:vgpr(<2 x s16>)
+    %13:vgpr(s32) = COPY %4:sgpr(s32)
+    %14:vgpr(s32) = G_LSHR %12:vgpr, %13:vgpr(s32)
+    %15:vgpr(s16) = G_TRUNC %14:vgpr(s32)
+    %16:vgpr(s16) = G_FABS %7:vgpr
+    %17:vgpr(s16) = G_FABS %11:vgpr
+    %18:vgpr(s16) = G_FNEG %16:vgpr
+    %19:vgpr(s16) = G_FNEG %17:vgpr
+    %20:vgpr(s16) = G_FABS %15:vgpr
+    %21:vgpr(s32) = G_FPEXT %20:vgpr(s16)
+    %22:vgpr(s32) = G_FNEG %21:vgpr
+    %23:vgpr(s32) = G_FPEXT %18:vgpr(s16)
+    %24:vgpr(s32) = G_FPEXT %19:vgpr(s16)
+    %25:vgpr(s32) = G_FMAD %23:vgpr, %24:vgpr, %22:vgpr
+    $vgpr0 = COPY %25:vgpr(s32)
+
+...