diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -153,6 +153,10 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vop3_mad_mix_mods : + GIComplexOperandMatcher<s64, "selectVOP3PMadMixMods">, + GIComplexPatternEquiv<VOP3PMadMixMods>; + // Separate load nodes are defined to glue m0 initialization in // SelectionDAG. The GISel selector can just insert m0 initialization // directly before selecting a glue-less load, so hide this diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -97,6 +97,7 @@ bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const; bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const; bool selectG_EXTRACT(MachineInstr &I) const; + bool selectG_FMA_FMAD(MachineInstr &I) const; bool selectG_MERGE_VALUES(MachineInstr &I) const; bool selectG_UNMERGE_VALUES(MachineInstr &I) const; bool selectG_BUILD_VECTOR(MachineInstr &I) const; @@ -293,6 +294,10 @@ ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const; ComplexRendererFns selectSMRDBufferSgprImm(MachineOperand &Root) const; + std::pair<Register, unsigned> selectVOP3PMadMixModsImpl(MachineOperand &Root, + bool &Matched) const; + ComplexRendererFns selectVOP3PMadMixMods(MachineOperand &Root) const; + void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -522,6 +522,60 @@ return true; } +bool AMDGPUInstructionSelector::selectG_FMA_FMAD(MachineInstr &I) const { + assert(I.getOpcode() == AMDGPU::G_FMA || I.getOpcode() == AMDGPU::G_FMAD); + + // Try to manually 
select MAD_MIX/FMA_MIX. + Register Dst = I.getOperand(0).getReg(); + LLT ResultTy = MRI->getType(Dst); + bool IsFMA = I.getOpcode() == AMDGPU::G_FMA; + if (ResultTy != LLT::scalar(32) || + (IsFMA ? !Subtarget->hasFmaMixInsts() : !Subtarget->hasMadMixInsts())) + return false; + + // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand + // using the conversion from f16. + bool MatchedSrc0, MatchedSrc1, MatchedSrc2; + auto [Src0, Src0Mods] = + selectVOP3PMadMixModsImpl(I.getOperand(1), MatchedSrc0); + auto [Src1, Src1Mods] = + selectVOP3PMadMixModsImpl(I.getOperand(2), MatchedSrc1); + auto [Src2, Src2Mods] = + selectVOP3PMadMixModsImpl(I.getOperand(3), MatchedSrc2); + +#ifndef NDEBUG + const SIMachineFunctionInfo *MFI = + I.getMF()->getInfo<SIMachineFunctionInfo>(); + AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); + assert((IsFMA || !Mode.allFP32Denormals()) && + "fmad selected with denormals enabled"); +#endif + + // TODO: We can select this with f32 denormals enabled if all the sources are + // converted from f16 (in which case fmad isn't legal). + if (!MatchedSrc0 && !MatchedSrc1 && !MatchedSrc2) + return false; + + const unsigned OpC = IsFMA ? 
AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32; + MachineInstr *MixInst = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpC), Dst) + .addImm(Src0Mods) + .addReg(Src0) + .addImm(Src1Mods) + .addReg(Src1) + .addImm(Src2Mods) + .addReg(Src2) + .addImm(0) + .addImm(0) + .addImm(0); + + if (!constrainSelectedInstRegOperands(*MixInst, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { MachineBasicBlock *BB = MI.getParent(); Register DstReg = MI.getOperand(0).getReg(); @@ -3228,6 +3282,11 @@ return selectG_FABS(I); case TargetOpcode::G_EXTRACT: return selectG_EXTRACT(I); + case TargetOpcode::G_FMA: + case TargetOpcode::G_FMAD: + if (selectG_FMA_FMAD(I)) + return true; + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_MERGE_VALUES: case TargetOpcode::G_CONCAT_VECTORS: return selectG_MERGE_VALUES(I); @@ -4679,6 +4738,137 @@ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}}; } +// Variant of stripBitCast that returns the instruction instead of a +// MachineOperand. +static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) { + if (MI->getOpcode() == AMDGPU::G_BITCAST) + return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI); + return MI; +} + +// Figure out if this is really an extract of the high 16-bits of a dword, +// returns nullptr if it isn't. 
+static MachineInstr *isExtractHiElt(MachineInstr *Inst, + MachineRegisterInfo &MRI) { + Inst = stripBitCast(Inst, MRI); + + if (Inst->getOpcode() != AMDGPU::G_TRUNC) + return nullptr; + + MachineInstr *TruncOp = + getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI); + TruncOp = stripBitCast(TruncOp, MRI); + + // G_LSHR x, (G_CONSTANT i32 16) + if (TruncOp->getOpcode() == AMDGPU::G_LSHR) { + auto SrlAmount = getIConstantVRegValWithLookThrough( + TruncOp->getOperand(2).getReg(), MRI); + if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) { + MachineInstr *SrlOp = + getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI); + return stripBitCast(SrlOp, MRI); + } + } + + // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0) + // 1, 0 swaps the low/high 16 bits. + // 1, 1 sets the high 16 bits to be the same as the low 16. + // In either case, it selects the high elts. + if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) { + assert(MRI.getType(TruncOp->getOperand(0).getReg()) == + LLT::fixed_vector(2, 16)); + + ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask(); + assert(Mask.size() == 2); + + if (Mask[0] == 1 && Mask[1] <= 1) { + MachineInstr *LHS = + getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI); + return stripBitCast(LHS, MRI); + } + } + + return nullptr; +} + +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, + bool &Matched) const { + Matched = false; + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + + MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); + if (MI->getOpcode() == AMDGPU::G_FPEXT) { + MachineOperand *MO = &MI->getOperand(1); + Src = MO->getReg(); + MI = getDefIgnoringCopies(Src, *MRI); + + assert(MRI->getType(Src) == LLT::scalar(16)); + + // See through bitcasts. + // FIXME: Would be nice to use stripBitCast here. 
+ if (MI->getOpcode() == AMDGPU::G_BITCAST) { + MO = &MI->getOperand(1); + Src = MO->getReg(); + MI = getDefIgnoringCopies(Src, *MRI); + } + + const auto CheckAbsNeg = [&]() { + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO); + MI = getDefIgnoringCopies(Src, *MRI); + + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; + + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } + }; + + CheckAbsNeg(); + + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from + // fp16. If the source's op_sel is set, it picks the high half of the + // source register. + + Mods |= SISrcMods::OP_SEL_1; + + if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) { + Mods |= SISrcMods::OP_SEL_0; + MI = ExtractHiEltMI; + MO = &MI->getOperand(0); + Src = MO->getReg(); + + CheckAbsNeg(); + } + + Matched = true; + } + + return {Src, Mods}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const { + Register Src; + unsigned Mods; + bool Matched; + std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -168,7 +168,7 @@ $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - $elt0)) + VGPR_32:$elt0)) >; def : GCNPat < @@ -181,7 +181,7 @@ 
$src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, - $elt0)) + VGPR_32:$elt0)) >; def : GCNPat < diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll @@ -8,9 +8,7 @@ define amdgpu_vs float @test_f16_f32_add_fma_ext_mul(float %x, float %y, float %z, half %u, half %v) { ; GFX9-DENORM-LABEL: test_f16_f32_add_fma_ext_mul: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX9-DENORM-NEXT: v_mad_f32 v2, v3, v4, v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, v4, v2 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 ; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-DENORM-NEXT: ; return to shader part epilog @@ -18,25 +16,22 @@ ; GFX10-LABEL: test_f16_f32_add_fma_ext_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-NEXT: v_add_f32_e32 v0, v3, v2 +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_f16_f32_add_fma_ext_mul: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v3, v2 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_f32_add_fma_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; 
GFX10-DENORM-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v3, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul half %u, %v @@ -50,12 +45,8 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z, half %u, half %v) { ; GFX9-DENORM-LABEL: test_f16_f32_add_ext_fma_mul: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v3, v2 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v5, v1 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, v4, v2 op_sel_hi:[1,1,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_f16_f32_add_ext_fma_mul: @@ -94,34 +85,29 @@ define amdgpu_vs float @test_f16_f32_add_fma_ext_mul_rhs(float %x, float %y, float %z, half %u, half %v) { ; GFX9-DENORM-LABEL: test_f16_f32_add_fma_ext_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v3, v4 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v1, v2 ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_f16_f32_add_fma_ext_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v2 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_f16_f32_add_fma_ext_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; 
%.entry ; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v1, v2 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_f32_add_fma_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v3, v1, v2 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul half %u, %v @@ -135,12 +121,8 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half %z, half %u, half %v) { ; GFX9-DENORM-LABEL: test_f16_f32_add_ext_fma_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v3, v4 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v1, v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_f16_f32_add_ext_fma_mul_rhs: @@ -181,72 +163,56 @@ ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v14, v0, v4 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v12, v1, v5 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v15, v2, v6 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v13, v3, v7 -; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v14, v8 -; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v12, v9 -; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v15, v10 -; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v13, v11 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_fmac_f32_e32 v14, v0, v4 -; GFX10-NEXT: v_fmac_f32_e32 v12, v1, v5 -; GFX10-NEXT: v_fmac_f32_e32 v15, v2, v6 -; GFX10-NEXT: v_fmac_f32_e32 v13, v3, v7 -; GFX10-NEXT: v_add_f32_e32 v0, v14, v8 -; GFX10-NEXT: v_add_f32_e32 v1, v12, v9 -; GFX10-NEXT: v_add_f32_e32 v2, v15, v10 -; GFX10-NEXT: v_add_f32_e32 v3, v13, v11 +; GFX10-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] 
op_sel_hi:[0,0,1] +; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v14, v0, v4 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v12, v1, v5 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v15, v2, v6 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v13, v3, v7 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v14, v8 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v12, v9 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v15, v10 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v13, v11 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v12, v12 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v14, v0, v4 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v12, v1, v5 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v15, v2, v6 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v13, v3, v7 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v14, v8 -; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v12, v9 -; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v15, v10 -; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v13, v11 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul <4 x half> %u, %v @@ -339,72 +305,56 @@ ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v14, v4, v8 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v12, v5, v9 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v15, v6, v10 -; GFX9-DENORM-NEXT: v_mac_f32_e32 v13, v7, v11 -; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v12 -; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v15 -; GFX9-DENORM-NEXT: v_add_f32_e32 v3, 
v3, v13 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_fmac_f32_e32 v14, v4, v8 -; GFX10-NEXT: v_fmac_f32_e32 v12, v5, v9 -; GFX10-NEXT: v_fmac_f32_e32 v15, v6, v10 -; GFX10-NEXT: v_fmac_f32_e32 v13, v7, v11 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v12 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v15 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v13 +; GFX10-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] +; GFX10-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 
v13, v13, v15 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v14, v4, v8 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v12, v5, v9 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v15, v6, v10 -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v13, v7, v11 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v12 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v15 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v13 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v14, v12 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v15, v13 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v14, v4, v8 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v12, v5, v9 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v15, v6, v10 -; GFX10-DENORM-NEXT: v_fmac_f32_e32 v13, v7, v11 -; 
GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v14 -; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v12 -; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v15 -; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v13 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1] +; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul <4 x half> %u, %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll @@ -8,16 +8,15 @@ define amdgpu_vs float @test_f16_f32_add_ext_mul(half inreg %x, half inreg %y, float inreg %z) { ; GFX9-FAST-DENORM-LABEL: test_f16_f32_add_ext_mul: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX9-FAST-DENORM-NEXT: v_mad_f32 v0, v0, v1, s2 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v0, s0, v0, v1 op_sel_hi:[1,1,0] ; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-FAST-DENORM-LABEL: test_f16_f32_add_ext_mul: ; GFX10-FAST-DENORM: ; %bb.0: ; %.entry -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v0, v0, v1, s2 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s1, v0 
op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast half %x, %y @@ -29,16 +28,15 @@ define amdgpu_vs float @test_f16_f32_add_ext_mul_rhs(half inreg %x, half inreg %y, float inreg %z) { ; GFX9-FAST-DENORM-LABEL: test_f16_f32_add_ext_mul_rhs: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX9-FAST-DENORM-NEXT: v_mad_f32 v0, v0, v1, s2 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v0, s0, v0, v1 op_sel_hi:[1,1,0] ; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-FAST-DENORM-LABEL: test_f16_f32_add_ext_mul_rhs: ; GFX10-FAST-DENORM: ; %bb.0: ; %.entry -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v0, v0, v1, s2 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s1, v0 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast half %x, %y @@ -70,25 +68,16 @@ ; ; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul: ; GFX10-FAST-DENORM: ; %bb.0: ; %.entry -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s11, s0, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s1, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s3, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s4, 16 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s11 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, s12 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, s3 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v6, s13 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, s4 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v8, s14 -; GFX10-FAST-DENORM-NEXT: 
v_cvt_f32_f16_e32 v9, s5 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v0, v0, v5, s6 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v1, v1, v6, s7 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v2, v2, v7, s8 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v3, v3, v8, s9 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v4, v4, v9, s10 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast <5 x half> %x, %y @@ -122,30 +111,18 @@ ; ; GFX10-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs: ; GFX10-FAST-DENORM: ; %bb.0: ; %.entry -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s12, s0, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s13, s1, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s14, s2, 16 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-FAST-DENORM-NEXT: s_lshr_b32 s2, s5, 16 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s12 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, s13 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, s14 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v6, s3 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, s0 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v8, s4 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v9, s1 -; GFX10-FAST-DENORM-NEXT: 
v_cvt_f32_f16_e32 v10, s5 -; GFX10-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v11, s2 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v0, v0, v6, s6 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v1, v1, v7, s7 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v2, v2, v8, s8 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v3, v3, v9, s9 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v4, v4, v10, s10 -; GFX10-FAST-DENORM-NEXT: v_fma_f32 v5, v5, v11, s11 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-FAST-DENORM-NEXT: v_mov_b32_e32 v5, s11 +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v0, s0, s3, v0 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v1, s0, s3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v2, s1, s4, v2 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v3, s1, s4, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v4, s2, s5, v4 op_sel_hi:[1,1,0] +; GFX10-FAST-DENORM-NEXT: v_fma_mix_f32 v5, s2, s5, v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-FAST-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast <6 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll @@ -6,16 +6,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_ext_mul(half %x, half %y, float %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_mul: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, -v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; 
GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %x, %y @@ -28,16 +24,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_ext_mul_rhs(float %x, half %y, half %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-DENORM-NEXT: v_mad_f32 v0, -v1, v2, v0 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX10-DENORM-NEXT: v_fma_f32 v0, -v1, v2, v0 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast half %y, %z @@ -64,18 +56,12 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, v8, v0, -v4 -; GFX10-DENORM-NEXT: 
v_fma_f32 v1, v9, v1, -v5 -; GFX10-DENORM-NEXT: v_fma_f32 v2, v10, v2, -v6 -; GFX10-DENORM-NEXT: v_fma_f32 v3, v11, v3, -v7 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v4, v0, v2, -v4 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v0, v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v1, v3, -v6 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v1, v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %x, %y @@ -102,18 +88,10 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, -v8, v10, v0 -; GFX10-DENORM-NEXT: v_fma_f32 v1, -v4, v6, v1 -; GFX10-DENORM-NEXT: v_fma_f32 v2, -v9, v11, v2 -; GFX10-DENORM-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v4, v6, v0 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, -v4, v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, -v5, v7, v2 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, -v5, v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul fast <4 x half> %y, %z diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll @@ -6,16 +6,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul(half %x, half %y, float %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, -v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %x, %y @@ -29,16 +25,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_neg_ext_mul(half %x, half %y, float %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, -v2 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %x, %y @@ -53,16 +45,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul2(float %x, half %y, 
half %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul2: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; GFX9-DENORM-NEXT: v_mad_f32 v0, -v1, v2, v0 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_ext_neg_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; GFX10-DENORM-NEXT: v_fma_f32 v0, -v1, v2, v0 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %y, %z @@ -76,16 +64,12 @@ define amdgpu_vs float @test_f16_to_f32_sub_neg_ext_mul2(float %x, half %y, half %z) { ; GFX9-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul2: ; GFX9-DENORM: ; %bb.0: ; %entry -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; GFX9-DENORM-NEXT: v_mad_f32 v0, -v1, v2, v0 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_to_f32_sub_neg_ext_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; GFX10-DENORM-NEXT: v_fma_f32 v0, -v1, v2, v0 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v1, -v2, v0 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast half %y, %z @@ -113,20 +97,13 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: 
v_cvt_f32_f16_e32 v10, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, v8, v0, -v4 -; GFX10-DENORM-NEXT: v_fma_f32 v1, v9, v1, -v5 -; GFX10-DENORM-NEXT: v_fma_f32 v2, v10, v2, -v6 -; GFX10-DENORM-NEXT: v_fma_f32 v3, v11, v3, -v7 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v8, 0x80008000, v2 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v9, 0x80008000, v3 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v0, -v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v1, -v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v8, -v4 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v1, v9, -v6 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %x, %y @@ -154,20 +131,13 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; 
GFX10-DENORM-NEXT: v_fma_f32 v0, v8, v0, -v4 -; GFX10-DENORM-NEXT: v_fma_f32 v1, v9, v1, -v5 -; GFX10-DENORM-NEXT: v_fma_f32 v2, v10, v2, -v6 -; GFX10-DENORM-NEXT: v_fma_f32 v3, v11, v3, -v7 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v8, 0x80008000, v2 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v9, 0x80008000, v3 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v0, -v2, -v5 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v1, -v3, -v7 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v8, -v4 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v1, v9, -v6 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %x, %y @@ -196,20 +166,12 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, -v8, v10, v0 -; GFX10-DENORM-NEXT: v_fma_f32 v1, -v4, v6, v1 -; GFX10-DENORM-NEXT: v_fma_f32 v2, -v9, v11, v2 -; GFX10-DENORM-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v8, 0x80008000, v6 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v9, 0x80008000, v7 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, -v4, -v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, 
-v5, -v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v4, v8, v0 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, -v5, v9, v2 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %y, %z @@ -237,20 +199,12 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_fma_f32 v0, -v8, v10, v0 -; GFX10-DENORM-NEXT: v_fma_f32 v1, -v4, v6, v1 -; GFX10-DENORM-NEXT: v_fma_f32 v2, -v9, v11, v2 -; GFX10-DENORM-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v8, 0x80008000, v6 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v9, 0x80008000, v7 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, -v4, -v6, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, -v5, -v7, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, -v4, v8, v0 op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, -v5, v9, v2 op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog entry: %a = fmul fast <4 x half> %y, %z diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -1,7 
+1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=VI %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s + +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 { ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: @@ -10,23 +14,45 @@ ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: 
v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -45,25 +71,51 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; 
VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: v_mov_b32_e32 v0, 1.0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, 1.0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: s_movk_i32 s4, 0x3c00 +; GISEL-VI-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: 
v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_e32 v0, s4, v0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -81,25 +133,49 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: v_mov_b32_e32 v0, v3 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 
v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GISEL-CI-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -111,30 +187,62 @@ } define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 { -; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: -; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; 
GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -147,30 +255,62 @@ } define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 { -; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; SDAG-GFX9: ; %bb.0: +; 
SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -190,23 +330,45 @@ ; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, 
v1, v2 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 
to float @@ -225,23 +387,53 @@ ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, 
v2 clamp +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -263,30 +455,67 @@ ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; VI-NEXT: flat_store_short v[0:1], v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: s_mov_b32 s7, 
0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp -; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-VI-NEXT: flat_store_short v[0:1], v0 +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) +; SDAG-VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-CI-NEXT: s_mov_b32 s6, -1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; SDAG-CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: flat_store_short v[0:1], v0 +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) +; GISEL-VI-NEXT: v_max_f16_e64 v0, v0, v0 clamp +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: s_setpc_b64 
s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: s_mov_b32 s6, -1 +; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) +; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -308,3 +537,6 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CI: {{.*}} +; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -1,22 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX906 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX900 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s + +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s define half @mixlo_simple(float %src0, float %src1, float %src2) #0 { -; GFX906-LABEL: mixlo_simple: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v0, 
v0, v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] -; ; GFX900-LABEL: mixlo_simple: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; +; GFX906-LABEL: mixlo_simple: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: mixlo_simple: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -24,31 +29,38 @@ ; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: mixlo_simple: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: mixlo_simple: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: mixlo_simple: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) %cvt.result = fptrunc float %result to half ret half %cvt.result } define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { -; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX906-NEXT: s_setpc_b64 s[30:31] -; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, 
v1, v2 op_sel_hi:[1,1,1] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; +; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -59,13 +71,23 @@ ; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -75,18 +97,18 @@ } define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 { -; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; GFX906-NEXT: s_setpc_b64 s[30:31] -; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; +; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -96,13 +118,22 @@ ; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -111,18 +142,18 @@ } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 { -; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX906-NEXT: s_setpc_b64 
s[30:31] -; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp ; GFX900-NEXT: s_setpc_b64 s[30:31] ; +; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX906-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -132,13 +163,30 @@ ; VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; 
GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -149,13 +197,6 @@ } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 { -; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX906-NEXT: s_setpc_b64 s[30:31] -; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -163,6 +204,13 @@ ; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; +; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -172,13 +220,22 @@ ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: 
v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -192,6 +249,14 @@ ; operation only clobbers relevant lane. define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX900-LABEL: v_mad_mix_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; ; GFX906-LABEL: v_mad_mix_v2f32: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -200,52 +265,77 @@ ; GFX906-NEXT: v_mov_b32_e32 v0, v3 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v2f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v2f32: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SDAG-VI-NEXT: v_or_b32_e32 v0, v1, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v2f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v5, v3, v4 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_v2f32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: 
s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v2f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-VI-LABEL: v_mad_mix_v2f32: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; 
GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -255,76 +345,138 @@ } define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { -; GFX906-LABEL: v_mad_mix_v3f32: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_mov_b32_e32 v0, v3 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v3f32: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v3f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v3f32: +; SDAG-GFX906: ; %bb.0: +; 
SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v3f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; VI-NEXT: v_mac_f32_e32 v8, v6, v7 -; VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; VI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; VI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v3f32: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7 +; 
SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v3f32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v8, v2, v5 +; SDAG-CI-NEXT: v_mac_f32_e32 v7, v1, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v0, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v3f32: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; 
GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v3f32: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v3f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_mac_f32_e32 v7, v1, v4 -; CI-NEXT: v_mac_f32_e32 v6, v0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; CI-NEXT: v_mac_f32_e32 v8, v2, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-VI-LABEL: v_mad_mix_v3f32: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v3f32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GISEL-CI-NEXT: v_mac_f32_e32 v6, v0, v3 +; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> %src2.ext = fpext <3 x half> %src2 to <3 x float> @@ -334,95 +486,172 @@ } define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { 
-; GFX906-LABEL: v_mad_mix_v4f32: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_mov_b32_e32 v0, v7 -; GFX906-NEXT: v_mov_b32_e32 v1, v6 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v4f32: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v7 +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v6 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v4f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v4f32: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v1, v3, v5 
op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v7 +; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v6 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v4f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; VI-NEXT: v_mac_f32_e32 v10, v6, v8 -; VI-NEXT: v_mac_f32_e32 v11, v7, v9 -; VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; VI-NEXT: v_cvt_f16_f32_e32 v3, v4 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v4f32: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9 +; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8 +; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 +; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v4f32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, 
v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v11, v3, v7 +; SDAG-CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; SDAG-CI-NEXT: v_mac_f32_e32 v9, v1, v5 +; SDAG-CI-NEXT: v_mac_f32_e32 v8, v0, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v4f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_mac_f32_e32 v10, v2, v6 -; CI-NEXT: v_mac_f32_e32 v9, v1, v5 -; CI-NEXT: v_mac_f32_e32 v8, v0, v4 -; CI-NEXT: v_mac_f32_e32 v11, v3, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; CI-NEXT: 
v_cvt_f16_f32_e32 v3, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX900-LABEL: v_mad_mix_v4f32: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v4f32: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, v7 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v4f32: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v9, v3 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v10, v6, v8 +; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v4f32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; GISEL-CI-NEXT: v_mac_f32_e32 v11, 
v3, v7 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> %src2.ext = fpext <4 x half> %src2 to <4 x float> @@ -434,6 +663,14 @@ ; FIXME (DAG): Fold clamp define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; ; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -442,52 +679,91 @@ ; GFX906-NEXT: v_mov_b32_e32 v0, v3 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; 
SDAG-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v2 clamp +; SDAG-VI-NEXT: v_or_b32_e32 v0, v1, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v5, v3, v4 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e64 v1, v2 clamp -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: 
s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -499,82 +775,167 @@ } ; FIXME (DAG): Should be packed into 2 registers per argument? 
+; FIXME (GIsel): V_PK_MAX clamp could be folded into mixlo define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { -; GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_pack_b32_f16 v1, v1, 0 -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX906-NEXT: v_mov_b32_e32 v0, v3 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0 +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_pack_b32_f16 v1, v1, 0 -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, 
v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0 +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; VI-NEXT: v_mac_f32_e32 v8, v6, v7 -; VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp -; VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7 +; SDAG-VI-NEXT: 
v_mac_f32_e32 v4, v0, v2 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp +; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v6, v0, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v7, v1, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v8, v2, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, 
v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mac_f32_e32 v8, v2, v5 -; CI-NEXT: v_mac_f32_e32 v6, v0, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; CI-NEXT: v_mac_f32_e32 v7, v1, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v2, v0 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v7 -; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7 +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v4 clamp +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v8 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v2, v5 clamp +; GISEL-VI-NEXT: v_mov_b32_e32 v3, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_bfe_u32 v1, v2, 0, 16 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GISEL-CI-NEXT: v_mac_f32_e32 v6, v0, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v2, v3, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, 1.0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v3 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v3 +; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> %src2.ext = fpext <3 x half> %src2 to <3 x float> @@ -586,6 +947,17 @@ } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { +; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; ; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -597,84 +969,154 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9 +; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8 +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp +; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 +; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1 +; SDAG-VI-NEXT: s_setpc_b64 
s[30:31] ; -; VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; VI-NEXT: v_mac_f32_e32 v10, v6, v8 -; VI-NEXT: v_mac_f32_e32 v11, v7, v9 -; VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_sdwa v1, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp -; VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v8, v0, v4 +; SDAG-CI-NEXT: v_mac_f32_e32 v9, v1, v5 +; SDAG-CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; SDAG-CI-NEXT: v_mac_f32_e32 v11, v3, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v9, v3 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: 
v_cvt_f32_f16_e32 v11, v5 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v10, v6, v8 +; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v4 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v10 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v2, v11 clamp +; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mac_f32_e32 v11, v3, v7 -; CI-NEXT: v_mac_f32_e32 v8, v0, v4 -; 
CI-NEXT: v_cvt_f16_f32_e32 v0, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mac_f32_e32 v10, v2, v6 -; CI-NEXT: v_cvt_f32_f16_e64 v3, v0 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v10 -; CI-NEXT: v_mac_f32_e32 v9, v1, v5 -; CI-NEXT: v_cvt_f32_f16_e64 v2, v0 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GISEL-CI-NEXT: v_mac_f32_e32 v8, v0, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v9, v1, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5 +; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5 +; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> %src2.ext = fpext <4 x half> %src2 to <4 x float> @@ -685,61 +1127,140 @@ ret <4 x half> %clamp } +; FIXME (GISel): Packed Vectors handling isn't great for now, so we don't end up with +; a build_vector to select the mixhi. Issue is more specifically with how insert_vector_elt is being +; legalized (bitwise ops instead of shuffle/build_vector for instance). 
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_mov_b32_e32 v0, v3 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v5, v3, v4 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e64 v1, v2 clamp -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v2 clamp +; SDAG-VI-NEXT: v_or_b32_e32 v0, v1, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SDAG-CI-NEXT: 
v_cvt_f16_f32_e32 v1, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_max_f16_e64 v0, v3, v3 clamp +; GISEL-GFX900-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff0000 +; GISEL-GFX900-NEXT: v_and_or_b32 v0, v4, v1, v0 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v4, v3 +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_max_f16_e64 v0, v3, v3 clamp +; GISEL-GFX906-NEXT: v_bfe_u32 v0, 
v0, 0, 16 +; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff0000 +; GISEL-GFX906-NEXT: v_and_or_b32 v0, v4, v1, v0 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_f16_e64 v1, v0, v0 clamp +; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16 
+; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 +; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -753,60 +1274,139 @@ } define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_mov_b32_e32 v0, v3 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] 
clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v5, v3, v4 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: 
v_cvt_f16_f32_e32 v1, v2 +; SDAG-VI-NEXT: v_or_b32_e32 v0, v1, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, 16 +; GISEL-GFX900-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff +; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v4, 
v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, 16 +; GISEL-GFX906-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mac_f32_e32 v4, v0, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; CI-NEXT: v_mac_f32_e32 v5, v1, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-VI-NEXT: 
v_mov_b32_e32 v2, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_f16_sdwa v1, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 +; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-CI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -822,64 +1422,117 @@ ; FIXME (DAG): Should be able to use mixlo/mixhi define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX906-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX900-NEXT: v_pack_b32_f16 v0, v0, v1 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX900-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; 
SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX906-NEXT: v_pack_b32_f16 v0, v0, v1 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v2f32_clamp_precvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mad_f32 v3, v3, v4, v5 clamp -; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp -; VI-NEXT: v_cvt_f16_f32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_precvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mad_f32 v3, v3, v4, v5 clamp +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_precvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v2f32_clamp_precvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v3 +; 
GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v1, v0 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v1, v0 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_precvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mad_f32 v3, v3, v4, v5 clamp +; GISEL-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: 
v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -893,82 +1546,150 @@ ; FIXME (DAG): Handling undef 4th component define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { -; GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX906-NEXT: v_pack_b32_f16 v0, v0, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-GFX900-NEXT: v_pack_b32_f16 v0, v0, v2 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v3, v0, 
v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX900-NEXT: v_pack_b32_f16 v0, v0, v2 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-GFX906-NEXT: v_pack_b32_f16 v0, v0, v2 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v3f32_clamp_precvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; VI-NEXT: v_mad_f32 v6, v6, v7, v8 clamp -; VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp -; VI-NEXT: v_cvt_f16_f32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_precvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_mad_f32 v6, v6, v7, v8 clamp +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v3f32_clamp_precvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v3, v6 clamp +; SDAG-CI-NEXT: v_mad_f32 v1, v1, v4, v7 clamp +; SDAG-CI-NEXT: v_mad_f32 v2, v2, v5, v8 clamp +; 
SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v3f32_clamp_precvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mad_f32 v1, v1, v4, v7 clamp -; CI-NEXT: v_mad_f32 v2, v2, v5, v8 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mad_f32 v0, v0, v3, v6 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-GFX900-NEXT: 
v_pack_b32_f16 v0, v2, v0 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v2, v0 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; GISEL-VI-NEXT: v_mad_f32 v6, v6, v7, v8 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v3, v6 clamp +; GISEL-CI-NEXT: v_mad_f32 v1, v1, v4, v7 clamp +; GISEL-CI-NEXT: v_mad_f32 v2, v2, v5, v8 clamp +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> %src2.ext = fpext <3 x half> %src2 to <3 x float> @@ -980,103 +1701,188 @@ } define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX906-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX906-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX906-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mix_f32 
v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-GFX900-NEXT: v_pack_b32_f16 v0, v0, v3 +; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, v2 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX900-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX900-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX900-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-GFX906-NEXT: 
v_pack_b32_f16 v0, v0, v3 +; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, v2 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_v4f32_clamp_precvt: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; VI-NEXT: v_mad_f32 v6, v6, v8, v10 clamp -; VI-NEXT: v_mad_f32 v7, v7, v9, v11 clamp -; VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp -; VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp -; VI-NEXT: v_cvt_f16_f32_sdwa v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_sdwa v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_precvt: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_mad_f32 v7, v7, v9, v10 clamp +; SDAG-VI-NEXT: v_mad_f32 v6, v6, v8, v11 clamp +; SDAG-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v3 +; SDAG-VI-NEXT: v_or_b32_e32 v1, v1, v2 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-CI-NEXT: 
v_cvt_f32_f16_e32 v8, v8 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp +; SDAG-CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp +; SDAG-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp +; SDAG-CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v3, v0 +; GISEL-GFX900-NEXT: v_pack_b32_f16 v1, v2, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; 
GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v3, v0 +; GISEL-GFX906-NEXT: v_pack_b32_f16 v1, v2, v1 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_precvt: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v9, v3 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mad_f32 v6, v6, v8, v10 clamp +; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; GISEL-VI-NEXT: v_mad_f32 v2, v7, v9, v11 clamp +; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v4f32_clamp_precvt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp -; CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp -; CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp -; CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: s_setpc_b64 s[30:31] +; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: 
v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp +; GISEL-CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp +; GISEL-CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp +; GISEL-CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> %src2.ext = fpext <4 x half> %src2 to <4 x float> @@ -1114,3 +1920,5 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -1,8 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CIVI,VI %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CIVI,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s + +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: @@ -26,11 +31,20 @@ ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -109,11 +123,20 @@ ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v1, v3, v5 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v1, v3, v5 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.hi = extractelement <2 x half> %src0, i32 1 %src1.hi = extractelement <2 x half> %src1, i32 1 %src2.hi = extractelement <2 x half> %src2, i32 1 @@ -125,54 +148,96 @@ } define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX900-LABEL: v_mad_mix_v2f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: 
v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: v_mad_mix_v2f32: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_mov_b32_e32 v1, v3 -; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_mad_mix_v2f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; VI-NEXT: v_mac_f32_e32 v1, v3, v5 -; VI-NEXT: v_mac_f32_e32 v0, v4, v6 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_v2f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; CI-NEXT: v_mac_f32_e32 v3, v1, v5 -; CI-NEXT: v_mov_b32_e32 v1, v3 -; CI-NEXT: v_mac_f32_e32 v0, v4, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v2f32: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] 
op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_v2f32: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_v2f32: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v1, v3, v5 +; SDAG-VI-NEXT: v_mac_f32_e32 v0, v4, v6 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SDAG-CI-NEXT: v_mac_f32_e32 v3, v1, v5 +; SDAG-CI-NEXT: v_mov_b32_e32 v1, v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v4, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: 
v_mad_mix_v2f32: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v2f32: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v2f32: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mac_f32_e32 v0, v3, v5 +; GISEL-VI-NEXT: v_mac_f32_e32 v1, v4, v6 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v6, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v1, v7, v3 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x 
half> %src2 to <2 x float> @@ -210,22 +275,42 @@ ; VI-NEXT: v_mov_b32_e32 v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_v2f32_shuffle: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; CI-NEXT: v_mad_f32 v0, v4, v2, v1 -; CI-NEXT: v_mac_f32_e32 v1, v5, v3 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_v2f32_shuffle: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SDAG-CI-NEXT: v_mad_f32 v0, v4, v2, v1 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v5, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 +; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GISEL-CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GISEL-CI-NEXT: v_or_b32_e32 v1, v1, v4 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; 
GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GISEL-CI-NEXT: v_mad_f32 v0, v4, v0, v1 +; GISEL-CI-NEXT: v_mac_f32_e32 v1, v5, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.shuf = shufflevector <2 x half> %src0, <2 x half> undef, <2 x i32> %src1.shuf = shufflevector <2 x half> %src1, <2 x half> undef, <2 x i32> %src2.shuf = shufflevector <2 x half> %src2, <2 x half> undef, <2 x i32> @@ -249,20 +334,38 @@ ; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, -v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e64 v3, -v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: 
v_mad_mix_f32_negf16lo_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v3, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -293,11 +396,20 @@ ; VI-NEXT: v_mad_f32 v0, |v0|, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, |v0|, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -328,11 +440,20 @@ ; VI-NEXT: v_mad_f32 v0, -|v0|, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, -|v0|, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, -|v0|, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; 
GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mad_f32 v0, -|v0|, v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -363,11 +484,19 @@ ; VI-NEXT: v_mad_f32 v0, v0, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -395,11 +524,19 @@ ; VI-NEXT: v_mad_f32 v0, v0, v1, -v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, -v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, 
v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.neg = fneg float %src2 @@ -428,11 +565,19 @@ ; VI-NEXT: v_mad_f32 v0, v0, v1, |v2| ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, |v2| -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, |v2| +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, |v2| +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.abs = call float @llvm.fabs.f32(float %src2) @@ -461,11 +606,19 @@ ; VI-NEXT: v_mad_f32 v0, v0, v1, -|v2| ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half 
%src0 to float %src1.ext = fpext half %src1 to float %src2.abs = call float @llvm.fabs.f32(float %src2) @@ -479,19 +632,19 @@ ; inline immediate. define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { -; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_mov_b32 s4, 1.0 -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s4, 1.0 -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: s_mov_b32 s4, 1.0 +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: s_mov_b32 s4, 1.0 +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: ; VI: ; %bb.0: @@ -501,11 +654,33 @@ ; VI-NEXT: v_mad_f32 v0, v0, v1, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; GISEL-GFX900: ; %bb.0: +; 
GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mov_b32_e32 v2, 1.0 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_mov_b32_e32 v2, 1.0 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0) @@ -513,19 +688,19 @@ } define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 { -; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_mov_b32 s4, 0.15915494 -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s4, 0.15915494 -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: s_mov_b32 s4, 0.15915494 +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: 
v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: s_mov_b32 s4, 0.15915494 +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: ; VI: ; %bb.0: @@ -535,11 +710,34 @@ ; VI-NEXT: v_mad_f32 v0, v0, v1, 0.15915494 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mov_b32_e32 v2, 0.15915494 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_mov_b32_e32 v2, 0.15915494 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x3e22f983 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float 
%src0.ext, float %src1.ext, float 0x3FC45F3060000000) @@ -553,33 +751,65 @@ ; f32 1/2pi = 0x3e22f983 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 { -; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_mov_b32 s4, 0x3e230000 -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s4, 0x3e230000 -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: s_mov_b32 s4, 0x3e230000 +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: s_mov_b32 s4, 0x3e230000 +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mov_b32_e32 v2, 0x3e230000 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_mov_b32_e32 v2, 0x3e230000 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x3e230000 +; GISEL-VI-NEXT: v_mac_f32_e32 v0, v2, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x3e230000 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2 = fpext half 0xH3118 to float @@ -589,33 +819,65 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { -; GFX900-LABEL: 
v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_mov_b32 s4, 0x367c0000 -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s4, 0x367c0000 -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: s_mov_b32 s4, 0x367c0000 +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: s_mov_b32 s4, 0x367c0000 +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; 
SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mov_b32_e32 v2, 0x367c0000 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_mov_b32_e32 v2, 0x367c0000 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x367c0000 +; GISEL-VI-NEXT: v_mac_f32_e32 v0, v2, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x367c0000 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v2, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2 = fpext half 0xH003F to float @@ -624,49 +886,89 @@ } define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 { -; GFX900-LABEL: v_mad_mix_v2f32_f32imm1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_mov_b32 s4, 1.0 -; GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, 
v1, s4 op_sel_hi:[1,1,0] -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: v_mad_mix_v2f32_f32imm1: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s4, 1.0 -; GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX906-NEXT: v_mov_b32_e32 v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_mad_mix_v2f32_f32imm1: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mad_f32 v0, v0, v3, 1.0 -; VI-NEXT: v_mad_f32 v1, v2, v1, 1.0 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_v2f32_f32imm1: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mad_f32 v0, v0, v2, 1.0 -; CI-NEXT: v_mad_f32 v1, v1, v3, 1.0 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: s_mov_b32 s4, 1.0 +; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_v2f32_f32imm1: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; SDAG-GFX906-NEXT: s_mov_b32 s4, 1.0 +; SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_v2f32_f32imm1: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v3, 1.0 +; SDAG-VI-NEXT: v_mad_f32 v1, v2, v1, 1.0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_f32imm1: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v2, 1.0 +; SDAG-CI-NEXT: v_mad_f32 v1, v1, v3, 1.0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_f32imm1: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: s_mov_b32 s4, 1.0 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v2f32_f32imm1: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: s_mov_b32 s4, 1.0 +; GISEL-GFX906-NEXT: 
v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v2f32_f32imm1: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mad_f32 v0, v2, v0, 1.0 +; GISEL-VI-NEXT: v_mad_f32 v1, v3, v1, 1.0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_f32imm1: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, 1.0 +; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, 1.0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> ) @@ -674,51 +976,93 @@ } define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { -; GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_mov_b32 s4, 0x3e230000 -; GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s4, 0x3e230000 -; GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX906-NEXT: v_mov_b32_e32 v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mov_b32_e32 v1, 0x3e230000 -; VI-NEXT: v_madak_f32 v0, v0, v3, 0x3e230000 -; VI-NEXT: v_mac_f32_e32 v1, v2, v4 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; CI-NEXT: v_mov_b32_e32 v1, 0x3e230000 -; CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e230000 -; CI-NEXT: v_mac_f32_e32 v1, v4, v3 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: s_mov_b32 s4, 0x3e230000 +; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: s_mov_b32 s4, 0x3e230000 +; 
SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0x3e230000 +; SDAG-VI-NEXT: v_madak_f32 v0, v0, v3, 0x3e230000 +; SDAG-VI-NEXT: v_mac_f32_e32 v1, v2, v4 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_mov_b32_e32 v1, 0x3e230000 +; SDAG-CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e230000 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v4, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: s_mov_b32 s4, 0x3e230000 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; GISEL-GFX906: ; %bb.0: +; 
GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: s_mov_b32 s4, 0x3e230000 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: s_mov_b32 s4, 0x3e230000 +; GISEL-VI-NEXT: v_mad_f32 v0, v2, v0, s4 +; GISEL-VI-NEXT: v_mad_f32 v1, v3, v1, s4 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: s_mov_b32 s4, 0x3e230000 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, s4 +; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, s4 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2 = fpext <2 x half> to <2 x float> @@ -727,50 +1071,91 @@ } define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { -; GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_mov_b32 s4, 0.15915494 -; GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX900-NEXT: v_mov_b32_e32 v1, v2 
-; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s4, 0.15915494 -; GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] -; GFX906-NEXT: v_mov_b32_e32 v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_mad_mix_v2f32_f32imminv2pi: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mad_f32 v0, v0, v3, 0.15915494 -; VI-NEXT: v_mad_f32 v1, v2, v1, 0.15915494 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_v2f32_f32imminv2pi: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; CI-NEXT: v_mov_b32_e32 v1, 0x3e22f983 -; CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e22f983 -; CI-NEXT: v_mac_f32_e32 v1, v4, v3 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: s_mov_b32 s4, 0.15915494 +; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: s_mov_b32 s4, 0.15915494 +; SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_mad_f32 v0, v0, v3, 0.15915494 +; SDAG-VI-NEXT: v_mad_f32 v1, v2, v1, 0.15915494 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-CI-NEXT: v_mov_b32_e32 v1, 0x3e22f983 +; SDAG-CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e22f983 +; SDAG-CI-NEXT: v_mac_f32_e32 v1, v4, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: s_mov_b32 s4, 0.15915494 +; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; 
GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: s_mov_b32 s4, 0.15915494 +; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v2 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mad_f32 v0, v2, v0, 0.15915494 +; GISEL-VI-NEXT: v_mad_f32 v1, v3, v1, 0.15915494 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: s_mov_b32 s4, 0x3e22f983 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v2, s4 +; GISEL-CI-NEXT: v_mad_f32 v1, v1, v3, s4 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2 = fpext <2 x half> to <2 x float> @@ -800,11 +1185,20 @@ ; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v1, v3, v5 clamp -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 
v0, v1, v3, v5 clamp +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.hi = extractelement <2 x half> %src0, i32 1 %src1.hi = extractelement <2 x half> %src1, i32 1 %src2.hi = extractelement <2 x half> %src2, i32 1 @@ -830,11 +1224,17 @@ ; GFX906-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; CIVI-LABEL: no_mix_simple: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CIVI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: no_mix_simple: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mad_f32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: no_mix_simple: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) ret float %result } @@ -852,11 +1252,17 @@ ; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; CIVI-LABEL: no_mix_simple_fabs: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2 -; CIVI-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: no_mix_simple_fabs: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: no_mix_simple_fabs: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.fabs = call float @llvm.fabs.f32(float %src0) %result = call float 
@llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2) ret float %result @@ -892,11 +1298,20 @@ ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -928,11 +1343,19 @@ ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_fma_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float 
@llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -970,12 +1393,22 @@ ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -1012,12 +1445,21 @@ ; VI-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v0, v0, v1 -; CI-NEXT: v_add_f32_e32 v0, v0, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; 
GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %mul = fmul float %src0.ext, %src1.ext @@ -1047,11 +1489,20 @@ ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -1081,11 +1532,19 @@ ; VI-NEXT: v_mad_f32 v0, v0, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; 
GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %mul = fmul contract float %src0.ext, %src1.ext @@ -1106,21 +1565,39 @@ ; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mad_f32 v0, -v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e64 v3, -v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v3, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %src0 = extractelement <2 x half> %src0.arg.bc, i32 0 %src0.neg = fneg half %src0 @@ -1160,13 +1637,23 @@ ; VI-NEXT: v_mad_f32 v0, |v0|, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 %src0.neg = fneg half %src0 @@ -1200,13 +1687,23 @@ ; VI-NEXT: v_mac_f32_e32 v0, v3, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-CI-LABEL: 
v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v3, |v0| +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 %src0.abs = call half @llvm.fabs.f16(half %src0) @@ -1230,22 +1727,43 @@ ; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mad_f32 v0, -v0, v1, v2 
+; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %fneg = fneg <2 x half> %src0.arg.bc %src0 = extractelement <2 x half> %fneg, i32 1 @@ -1269,22 +1787,43 @@ ; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v3, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; VI-NEXT: v_mac_f32_e32 v0, v3, v1 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: -; CI: ; %bb.0: 
-; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc) %src0 = extractelement <2 x half> 
%fabs, i32 1 @@ -1308,22 +1847,43 @@ ; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; CI-NEXT: v_mad_f32 v0, v0, v1, v2 -; CI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-VI-NEXT: 
s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc) %fneg.fabs = fneg <2 x half> %fabs