diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -153,6 +153,10 @@
     GIComplexOperandMatcher,
     GIComplexPatternEquiv;
 
+def gi_vop3_mad_mix_mods :
+    GIComplexOperandMatcher<untyped, "selectVOP3PMadMixMods">,
+    GIComplexPatternEquiv<VOP3PMadMixMods>;
+
 // Separate load nodes are defined to glue m0 initialization in
 // SelectionDAG. The GISel selector can just insert m0 initialization
 // directly before selecting a glue-less load, so hide this
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -97,7 +97,9 @@
   bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
   bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const;
   bool selectG_EXTRACT(MachineInstr &I) const;
+  bool selectG_FMA(MachineInstr &I) const;
   bool selectG_MERGE_VALUES(MachineInstr &I) const;
+  bool selectG_BUILD_VECTOR(MachineInstr &I) const;
   bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
   bool selectG_BUILD_VECTOR_TRUNC(MachineInstr &I) const;
   bool selectG_PTR_ADD(MachineInstr &I) const;
@@ -297,6 +299,10 @@
   ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const;
   ComplexRendererFns selectSMRDBufferSgprImm(MachineOperand &Root) const;
 
+  std::pair<Register, unsigned> selectVOP3PMadMixModsImpl(MachineOperand &Root,
+                                                          bool &Matched) const;
+  ComplexRendererFns selectVOP3PMadMixMods(MachineOperand &Root) const;
+
   void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                         int OpIdx = -1) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -522,15 +522,83 @@
   return true;
 }
 
+bool AMDGPUInstructionSelector::selectG_FMA(MachineInstr &I) const {
+  assert(I.getOpcode() == AMDGPU::G_FMA || I.getOpcode() == AMDGPU::G_FMAD);
+
+  // Try to manually select MAD_MIX/FMA_MIX.
+  Register Dst = I.getOperand(0).getReg();
+  LLT ResultTy = MRI->getType(Dst);
+  bool IsFMA = I.getOpcode() == AMDGPU::G_FMA;
+  if (ResultTy != LLT::scalar(32) ||
+      (!Subtarget->hasMadMixInsts() && !Subtarget->hasFmaMixInsts()) ||
+      ((IsFMA && Subtarget->hasMadMixInsts()) ||
+       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
+    return selectImpl(I, *CoverageInfo);
+  }
+
+  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
+  // using the conversion from f16.
+  bool MatchedSrc0, MatchedSrc1, MatchedSrc2;
+  auto [Src0, Src0Mods] =
+      selectVOP3PMadMixModsImpl(I.getOperand(1), MatchedSrc0);
+  auto [Src1, Src1Mods] =
+      selectVOP3PMadMixModsImpl(I.getOperand(2), MatchedSrc1);
+  auto [Src2, Src2Mods] =
+      selectVOP3PMadMixModsImpl(I.getOperand(3), MatchedSrc2);
+
+#ifndef NDEBUG
+  const SIMachineFunctionInfo *MFI =
+      I.getMF()->getInfo<SIMachineFunctionInfo>();
+  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
+  assert((IsFMA || !Mode.allFP32Denormals()) &&
+         "fmad selected with denormals enabled");
+#endif
+
+  // TODO: We can select this with f32 denormals enabled if all the sources are
+  // converted from f16 (in which case fmad isn't legal).
+  if (!MatchedSrc0 && !MatchedSrc1 && !MatchedSrc2) {
+    return selectImpl(I, *CoverageInfo);
+  }
+
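+  // Illustrative example (editorial note, not part of the patch): a case
+  // that reaches the mix selection below is
+  //   %6:vgpr(s32) = G_FPEXT %0:vgpr(s16)
+  //   %9:vgpr(s32) = G_FMA %6:vgpr, %7:vgpr, %8:vgpr
+  // Here %0 becomes a mix source with op_sel_hi set (convert from f16),
+  // while a plain f32 source keeps op_sel_hi clear and is used unconverted.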
+  const unsigned OpC = IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32;
+  MachineInstr *MixInst =
+      BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpC), Dst)
+          .addImm(Src0Mods)
+          .addReg(Src0)
+          .addImm(Src1Mods)
+          .addReg(Src1)
+          .addImm(Src2Mods)
+          .addReg(Src2)
+          .addImm(0)
+          .addImm(0)
+          .addImm(0);
+
+  if (!constrainSelectedInstRegOperands(*MixInst, TII, TRI, RBI)) {
+    return false;
+  }
+
+  I.removeFromParent();
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR) {
+    if (selectImpl(MI, *CoverageInfo))
+      return true;
+  }
+
   MachineBasicBlock *BB = MI.getParent();
   Register DstReg = MI.getOperand(0).getReg();
   LLT DstTy = MRI->getType(DstReg);
   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
   const unsigned SrcSize = SrcTy.getSizeInBits();
+
+  // We already tried to select G_BUILD_VECTOR before.
   if (SrcSize < 32)
-    return selectImpl(MI, *CoverageInfo);
+    return MI.getOpcode() == AMDGPU::G_BUILD_VECTOR
+               ? false
+               : selectImpl(MI, *CoverageInfo);
 
   const DebugLoc &DL = MI.getDebugLoc();
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
@@ -3504,6 +3572,9 @@
     return selectG_FABS(I);
   case TargetOpcode::G_EXTRACT:
     return selectG_EXTRACT(I);
+  case TargetOpcode::G_FMA:
+  case TargetOpcode::G_FMAD:
+    return selectG_FMA(I);
   case TargetOpcode::G_MERGE_VALUES:
   case TargetOpcode::G_BUILD_VECTOR:
   case TargetOpcode::G_CONCAT_VECTORS:
@@ -3620,6 +3691,18 @@
     MI = getDefIgnoringCopies(Src, *MRI);
   }
 
+  // TODO: Should be a combine instead
+  if (MI && MI->getOpcode() == AMDGPU::G_FSUB) {
+    MachineInstr *LHS = getDefIgnoringCopies(MI->getOperand(1).getReg(), *MRI);
+
+    if (LHS->getOpcode() == AMDGPU::G_FCONSTANT &&
+        LHS->getOperand(1).getFPImm()->isZeroValue()) {
+      Src = MI->getOperand(2).getReg();
+      Mods |= SISrcMods::NEG;
+      MI = getDefIgnoringCopies(Src, *MRI);
+    }
+  }
+
   if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
     Src = MI->getOperand(1).getReg();
     Mods |= SISrcMods::ABS;
@@ -4960,6 +5043,149 @@
           [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
 }
 
+// Variant of stripBitCast that returns the instruction instead of a
+// MachineOperand.
+static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
+  if (MI->getOpcode() == AMDGPU::G_BITCAST)
+    return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
+  return MI;
+}
+
+// Figure out if this is really an extract of the high 16 bits of a dword;
+// returns nullptr if it isn't.
+static MachineInstr *isExtractHiElt(MachineInstr *Inst,
+                                    MachineRegisterInfo &MRI) {
+  Inst = stripBitCast(Inst, MRI);
+
+  if (Inst->getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT) {
+    MachineOperand &InOp = Inst->getOperand(2);
+    if (InOp.isImm()) {
+      if (InOp.getImm() != 1)
+        return nullptr;
+      return getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
+    }
+  }
+
+  if (Inst->getOpcode() != AMDGPU::G_TRUNC &&
+      Inst->getOpcode() != AMDGPU::G_FPTRUNC)
+    return nullptr;
+
+  MachineInstr *TruncOp =
+      getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
+  TruncOp = stripBitCast(TruncOp, MRI);
+
+  // G_LSHR x, (G_CONSTANT i32 16)
+  if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
+    MachineInstr *SrlAmount =
+        getDefIgnoringCopies(TruncOp->getOperand(2).getReg(), MRI);
+    if (SrlAmount->getOpcode() == AMDGPU::G_CONSTANT &&
+        SrlAmount->getOperand(1).getCImm()->getZExtValue() == 16) {
+      MachineInstr *SrlOp =
+          getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
+      return stripBitCast(SrlOp, MRI);
+    }
+  }
+
+  // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
+  // 1, 0 swaps the low/high 16 bits.
+  // 1, 1 sets the high 16 bits to be the same as the low 16.
+  // In any case, it selects the high elts.
+  if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
+    assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
+           LLT::fixed_vector(2, 16));
+
+    ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
+    assert(Mask.size() == 2);
+
+    if (Mask[0] == 1 && Mask[1] <= 1) {
+      MachineInstr *LHS =
+          getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
+      return stripBitCast(LHS, MRI);
+    }
+  }
+
+  return nullptr;
+}
+
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
+                                                     bool &Matched) const {
+  Matched = false;
+
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+
+  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
+  if (MI->getOpcode() == AMDGPU::G_FPEXT) {
+    MachineOperand *MO = &MI->getOperand(1);
+    Src = MO->getReg();
+    MI = getDefIgnoringCopies(Src, *MRI);
+
+    // FIXME: add assert back?
+    // assert(MO->getValueType() == MVT::f16);
+
+    // See through bitcasts.
+    // FIXME: Would be nice to use stripBitCast here.
+    if (MI->getOpcode() == AMDGPU::G_BITCAST) {
+      MO = &MI->getOperand(1);
+      Src = MO->getReg();
+      MI = getDefIgnoringCopies(Src, *MRI);
+    }
+
+    const auto CheckAbsNeg = [&]() {
+      // Be careful about folding modifiers if we already have an abs. fneg is
+      // applied last, so we don't want to apply an earlier fneg.
+      if ((Mods & SISrcMods::ABS) == 0) {
+        unsigned ModsTmp;
+        std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
+        MI = getDefIgnoringCopies(Src, *MRI);
+
+        if ((ModsTmp & SISrcMods::NEG) != 0)
+          Mods ^= SISrcMods::NEG;
+
+        if ((ModsTmp & SISrcMods::ABS) != 0)
+          Mods |= SISrcMods::ABS;
+      }
+    };
+
+    CheckAbsNeg();
+
+    // op_sel/op_sel_hi decide the source type and source.
+    // If the source's op_sel_hi is set, it indicates to do a conversion from
+    // fp16. If the source's op_sel is set, it picks the high half of the
+    // source register.
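+    // Illustrative (editorial note; encoding assumed from the tests below):
+    // for v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+    //   src0 = f16 taken from the high half of v1, converted to f32
+    //   src1 = f16 taken from the low half of v2, converted to f32
+    //   src2 = v3 used directly as f32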
+
+    Mods |= SISrcMods::OP_SEL_1;
+
+    if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
+      Mods |= SISrcMods::OP_SEL_0;
+      MI = ExtractHiEltMI;
+      MO = &MI->getOperand(0);
+      Src = MO->getReg();
+
+      CheckAbsNeg();
+    }
+
+    Matched = true;
+  }
+
+  return {Src, Mods};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
+  Register Src;
+  unsigned Mods;
+  bool Matched;
+  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+  }};
+}
+
 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -725,10 +725,9 @@
   if (ST.hasVOP3PInsts()) {
     MinNumMaxNum.customFor(FPTypesPK16)
-      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
-      .clampMaxNumElements(0, S16, 2)
-      .clampScalar(0, S16, S64)
-      .scalarize(0);
+        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+        .clampScalar(0, S16, S64)
+        .scalarize(0);
   } else if (ST.has16BitInsts()) {
     MinNumMaxNum.customFor(FPTypes16)
       .clampScalar(0, S16, S64)
@@ -1432,7 +1431,7 @@
   unsigned IdxTypeIdx = 2;
 
   getActionDefinitionsBuilder(Op)
-    .customIf([=](const LegalityQuery &Query) {
+      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
@@ -1442,33 +1441,36 @@
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
-    .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
-                   scalarOrEltNarrowerThan(VecTypeIdx, 32)),
-               bitcastToVectorElement32(VecTypeIdx))
-    //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
-    .bitcastIf(
-      all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
-      [=](const LegalityQuery &Query) {
-        // For > 64-bit element types, try to turn this into a 64-bit
-        // element vector since we may be able to do better indexing
-        // if this is scalar. If not, fall back to 32.
-        const LLT EltTy = Query.Types[EltTypeIdx];
-        const LLT VecTy = Query.Types[VecTypeIdx];
-        const unsigned DstEltSize = EltTy.getSizeInBits();
-        const unsigned VecSize = VecTy.getSizeInBits();
-
-        const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
-        return std::make_pair(
-          VecTypeIdx,
-          LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
-      })
-    .clampScalar(EltTypeIdx, S32, S64)
-    .clampScalar(VecTypeIdx, S32, S64)
-    .clampScalar(IdxTypeIdx, S32, S32)
-    .clampMaxNumElements(VecTypeIdx, S32, 32)
-    // TODO: Clamp elements for 64-bit vectors?
-    // It should only be necessary with variable indexes.
-    // As a last resort, lower to the stack
-    .lower();
+      // TODO: remove bitcastToVectorElement32
+      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
+                     scalarOrEltNarrowerThan(VecTypeIdx, 16)),
+                 bitcastToVectorElement32(VecTypeIdx))
+      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
+      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
+                     scalarOrEltWiderThan(VecTypeIdx, 64)),
+                 [=](const LegalityQuery &Query) {
+                   // For > 64-bit element types, try to turn this into a
+                   // 64-bit element vector since we may be able to do better
+                   // indexing if this is scalar. If not, fall back to 32.
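+                   // Editorial example (not in the original patch):
+                   // <2 x s128> (256 bits) becomes <4 x s64>, while
+                   // <2 x s96> (192 bits) becomes <6 x s32> because 96 is
+                   // not a multiple of 64.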
+                   const LLT EltTy = Query.Types[EltTypeIdx];
+                   const LLT VecTy = Query.Types[VecTypeIdx];
+                   const unsigned DstEltSize = EltTy.getSizeInBits();
+                   const unsigned VecSize = VecTy.getSizeInBits();
+
+                   const unsigned TargetEltSize =
+                       DstEltSize % 64 == 0 ? 64 : 32;
+                   return std::make_pair(
+                       VecTypeIdx, LLT::fixed_vector(VecSize / TargetEltSize,
+                                                     TargetEltSize));
+                 })
+      .clampScalar(EltTypeIdx, S16, S64)
+      .clampScalar(VecTypeIdx, S32, S64)
+      .clampScalar(IdxTypeIdx, S32, S32)
+      .clampMaxNumElements(VecTypeIdx, S32, 32)
+      // TODO: Clamp elements for 64-bit vectors?
+      // It should only be necessary with variable indexes.
+      // As a last resort, lower to the stack
+      .lower();
   }
 
   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
@@ -1524,15 +1526,15 @@
   if (ST.hasScalarPackInsts()) {
     BuildVector
-      // FIXME: Should probably widen s1 vectors straight to s32
-      .minScalarOrElt(0, S16)
-      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
-      .minScalar(1, S32);
+        // FIXME: Should probably widen s1 vectors straight to s32
+        .minScalarOrElt(0, S16)
+        // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
+        .minScalar(1, S16);
 
     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
       .legalFor({V2S16, S32})
      .lower();
-    BuildVector.minScalarOrElt(0, S32);
+    BuildVector.minScalarOrElt(0, S16);
   } else {
     BuildVector.customFor({V2S16, S16});
     BuildVector.minScalarOrElt(0, S32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2614,69 +2614,6 @@
     break;
   }
-  case AMDGPU::G_BUILD_VECTOR:
-  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
-    Register DstReg = MI.getOperand(0).getReg();
-    LLT DstTy = MRI.getType(DstReg);
-    if (DstTy != LLT::fixed_vector(2, 16))
-      break;
-
-    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
-    substituteSimpleCopyRegs(OpdMapper, 1);
-    substituteSimpleCopyRegs(OpdMapper, 2);
-
-    const RegisterBank *DstBank =
-      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
-    if (DstBank == &AMDGPU::SGPRRegBank)
-      break; // Can use S_PACK_* instructions.
-
-    MachineIRBuilder B(MI);
-
-    Register Lo = MI.getOperand(1).getReg();
-    Register Hi = MI.getOperand(2).getReg();
-    const LLT S32 = LLT::scalar(32);
-
-    const RegisterBank *BankLo =
-      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
-    const RegisterBank *BankHi =
-      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
-
-    Register ZextLo;
-    Register ShiftHi;
-
-    if (Opc == AMDGPU::G_BUILD_VECTOR) {
-      ZextLo = B.buildZExt(S32, Lo).getReg(0);
-      MRI.setRegBank(ZextLo, *BankLo);
-
-      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
-      MRI.setRegBank(ZextHi, *BankHi);
-
-      auto ShiftAmt = B.buildConstant(S32, 16);
-      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
-
-      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
-      MRI.setRegBank(ShiftHi, *BankHi);
-    } else {
-      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
-      MRI.setRegBank(MaskLo, *BankLo);
-
-      auto ShiftAmt = B.buildConstant(S32, 16);
-      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
-
-      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
-      MRI.setRegBank(ShiftHi, *BankHi);
-
-      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
-      MRI.setRegBank(ZextLo, *BankLo);
-    }
-
-    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
-    MRI.setRegBank(Or.getReg(0), *DstBank);
-
-    B.buildBitcast(DstReg, Or);
-    MI.eraseFromParent();
-    return;
-  }
   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1746,6 +1746,15 @@
                 (V_LSHLREV_B32_e64 (i32 16), $src1))
 >;
 
+/*
+foreach ext = [sext, zext, anyext] in {
+  def : GCNPat <
+    (i32 (shl (i32 (ext i16:$src)), (i32 16))),
+    (V_LSHLREV_B32_e64 (i32 16), $src)
+  >;
+}
+*/
+
 def : GCNPat <
   (fcopysign f64:$src0, f16:$src1),
   (REG_SEQUENCE SReg_64,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -168,7 +168,7 @@
                    $src1_modifiers, $src1,
                    $src2_modifiers, $src2,
                    DSTCLAMP.NONE,
-                   $elt0))
+                   VGPR_32:$elt0))
 >;
 
 def : GCNPat <
@@ -181,7 +181,7 @@
                    $src1_modifiers, $src1,
                    $src2_modifiers, $src2,
                    DSTCLAMP.ENABLE,
-                   $elt0))
+                   VGPR_32:$elt0))
 >;
 
 def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix-hi.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix-hi.ll
@@ -0,0 +1,176 @@
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  %vec.result = insertelement <2 x half> undef, half %cvt.result, i32 1
+  ret <2 x half> %vec.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  %vec.result = insertelement <2 x half> <half 1.0, half undef>, half %cvt.result, i32 1
+  ret <2 x half> %vec.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  %vec = insertelement <2 x half> undef, half %lo, i32 0
+  %vec.result = insertelement <2 x half> %vec, half %cvt.result, i32 1
+  ret <2 x half> %vec.result
+}
+
+; FIXME: should be v_lshlrev_b32_e32 v0, 16, v0
+
+; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 16
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64
+define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  %bc = bitcast half %cvt.result to i16
+  %ext = zext i16 %bc to i32
+  %shr = shl i32 %ext, 16
+  ret i32 %shr
+}
+
+; FIXME: should be v_lshlrev_b32_e32 v0, 16, v0
+
+; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 16
+; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT: s_setpc_b64
+define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  %bc = bitcast half %cvt.result to i16
+  %ext = sext i16 %bc to i32
+  %shr = shl i32 %ext, 16
+  ret i32 %shr
+}
+
+; FIXME: Cannot select the BUILD_VECTOR with this yet; it seems to have
+; counted on the scalarization in RegBankInfo to do it.
+
+; bb.1 (%ir-block.0):
+;   liveins: $vgpr0, $vgpr1, $vgpr2
+;   %3:vgpr(s32) = COPY $vgpr0
+;   %0:vgpr(s16) = G_TRUNC %3:vgpr(s32)
+;   %4:vgpr(s32) = COPY $vgpr1
+;   %1:vgpr(s16) = G_TRUNC %4:vgpr(s32)
+;   %5:vgpr(s32) = COPY $vgpr2
+;   %2:vgpr(s16) = G_TRUNC %5:vgpr(s32)
+;   %6:vgpr(s32) = G_FPEXT %0:vgpr(s16)
+;   %7:vgpr(s32) = G_FPEXT %1:vgpr(s16)
+;   %8:vgpr(s32) = G_FPEXT %2:vgpr(s16)
+;   %9:vgpr(s32) = G_FMA %6:vgpr, %7:vgpr, %8:vgpr
+;   %19:vgpr(s32) = G_FCANONICALIZE %9:vgpr
+;   %13:vgpr(s32) = G_AMDGPU_CLAMP %19:vgpr
+;   %14:vgpr(s16) = G_FPTRUNC %13:vgpr(s32)
+;   %18:sgpr(s16) = G_IMPLICIT_DEF
+;   %15:vgpr(<2 x s16>) = G_BUILD_VECTOR %18:sgpr(s16), %14:vgpr(s16)
+;   $vgpr0 = COPY %15:vgpr(<2 x s16>)
+;   SI_RETURN implicit $vgpr0
+
+; (GCN-LABEL): {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
+; (GCN): s_waitcnt
+; (GFX9-NEXT): v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
+; (GFX9-NEXT): v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; (GFX9-NEXT): s_setpc_b64
+; define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 {
+;   %src0.ext = fpext half %src0 to float
+;   %src1.ext = fpext half %src1 to float
+;   %src2.ext = fpext half %src2 to float
+;   %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+;   %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+;   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+;   %cvt.result = fptrunc float %clamp to half
+;   %vec.result = insertelement <2 x half> undef, half %cvt.result, i32 1
+;   ret <2 x half> %vec.result
+; }
+
+; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
+  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+  %vec.result = insertelement <2 x half> undef, half %clamp, i32 1
+  ret <2 x half> %vec.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
+; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  store volatile half %cvt.result, half addrspace(1)* undef
+  %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
+  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+  %vec.result = insertelement <2 x half> undef, half %clamp, i32 1
+  ret <2 x half> %vec.result
+}
+
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix-lo.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix-lo.ll
@@ -0,0 +1,383 @@
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX906 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
+
+; GCN-LABEL: mixlo_simple:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2{{$}}
+; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2{{$}}
+; GFX9-NEXT: s_setpc_b64
+
+; CIVI: v_mac_f32_e32
+; CIVI: v_cvt_f16_f32_e32
+define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
+  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+  %cvt.result = fptrunc float %result to half
+  ret half %cvt.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f16lo:
+; GFX900: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
+; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
+; CI: v_mac_f32
+; CIVI: v_cvt_f16_f32
+define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to half
+  ret half %cvt.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
+; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
+; GFX9-NEXT: s_setpc_b64
+
+; CIVI: v_mac_f32
+define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  %cvt.result = fptrunc float %result to half
+  ret half %cvt.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
+; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
+; GFX9-NEXT: s_setpc_b64
+
+; CIVI: v_mac_f32_e32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]$}}
+define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  %cvt.result = fptrunc float %result to half
+  %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
+  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+  ret half %clamp
+}
+
+; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64
+
+; FIXME: Should be using v_mad + clamp but v_mac isn't folded in GISel due to
+; different rules in `isCanonicalized`.
+; CIVI: v_mac_f32_e32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}
+define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  %cvt.result = fptrunc float %clamp to half
+  ret half %cvt.result
+}
+
+; FIXME: Should be able to avoid the extra register because the first
+; operation only clobbers the relevant lane.
+; GCN-LABEL: {{^}}v_mad_mix_v2f32:
+; GCN: s_waitcnt
+
+; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
+; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]{{$}}
+
+; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
+; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]{{$}}
+
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x half>
+  ret <2 x half> %cvt.result
+}
+
+; FIXME: Uses an extra register (v6) but DAGISel doesn't.
+
+; GCN-LABEL: {{^}}v_mad_mix_v3f32:
+; GCN: s_waitcnt
+
+; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+
+; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+
+; GFX9-NEXT: v_mov_b32_e32 v0, v6
+; GFX9-NEXT: s_setpc_b64
+define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+  %src0.ext = fpext <3 x half> %src0 to <3 x float>
+  %src1.ext = fpext <3 x half> %src1 to <3 x float>
+  %src2.ext = fpext <3 x half> %src2 to <3 x float>
+  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+  %cvt.result = fptrunc <3 x float> %result to <3 x half>
+  ret <3 x half> %cvt.result
+}
+
+; FIXME: Uses an extra register (v6) but DAGISel doesn't.
+
+; GCN-LABEL: {{^}}v_mad_mix_v4f32:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+
+; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+
+; GFX9-NEXT: v_mov_b32_e32 v0, v6
+; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: s_setpc_b64
+define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+  %src0.ext = fpext <4 x half> %src0 to <4 x float>
+  %src1.ext = fpext <4 x half> %src1 to <4 x float>
+  %src2.ext = fpext <4 x half> %src2 to <4 x float>
+  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+  %cvt.result = fptrunc <4 x float> %result to <4 x half>
+  ret <4 x half> %cvt.result
+}
+
+; FIXME: Fold clamp
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt:
+; GFX900: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
+; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp{{$}}
+
+; GFX906: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
+; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp{{$}}
+
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x half>
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %cvt.result, <2 x half> zeroinitializer)
+  %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
+  ret <2 x half> %clamp
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
+; GCN: s_waitcnt
+; GFX900-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX900-DAG: v_mad_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+
+; GFX906-DAG: v_fma_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX906-DAG: v_fma_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-DAG: v_fma_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+
+; GFX9: v_mov_b32_e32 v0, v{{[0-9]+}}
+; GFX9-NEXT: s_setpc_b64
+define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+  %src0.ext = fpext <3 x half> %src0 to <3 x float>
+  %src1.ext = fpext <3 x half> %src1 to <3 x float>
+  %src2.ext = fpext <3 x half> %src2 to <3 x float>
+  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+  %cvt.result = fptrunc <3 x float> %result to <3 x half>
+  %max = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %cvt.result, <3 x half> zeroinitializer)
+  %clamp = call <3 x half> @llvm.minnum.v3f16(<3 x half> %max, <3 x half> <half 1.0, half 1.0, half 1.0>)
+  ret <3 x half> %clamp
+}
+
+; FIXME: Uses an extra register (v6) but DAGISel doesn't.
+
+; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+
+; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+
+; GFX9-NEXT: v_mov_b32_e32 v0, v6
+; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: s_setpc_b64
+define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+  %src0.ext = fpext <4 x half> %src0 to <4 x float>
+  %src1.ext = fpext <4 x half> %src1 to <4 x float>
+  %src2.ext = fpext <4 x half> %src2 to <4 x float>
+  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+  %cvt.result = fptrunc <4 x float> %result to <4 x half>
+  %max = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %cvt.result, <4 x half> zeroinitializer)
+  %clamp = call <4 x half> @llvm.minnum.v4f16(<4 x half> %max, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>)
+  ret <4 x half> %clamp
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_lo:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+
+; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+
+; GFX9-NOT: v_mov_b32_sdwa
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x half>
+  %cvt.lo = extractelement <2 x half> %cvt.result, i32 0
+  %max.lo = call half @llvm.maxnum.f16(half %cvt.lo, half 0.0)
+  %clamp.lo = call half @llvm.minnum.f16(half %max.lo, half 1.0)
+  %insert = insertelement <2 x half> %cvt.result, half %clamp.lo, i32 0
+  ret <2 x half> %insert
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_hi:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+
+; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x half>
+  %cvt.hi = extractelement <2 x half> %cvt.result, i32 1
+  %max.hi = call half @llvm.maxnum.f16(half %cvt.hi, half 0.0)
+  %clamp.hi = call half @llvm.minnum.f16(half %max.hi, half 1.0)
+  %insert = insertelement <2 x half> %cvt.result, half %clamp.hi, i32 1
+  ret <2 x half> %insert
+}
+
+; FIXME: Should be able to use mixlo/mixhi
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:
+; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+
+; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+
+; GFX9: v_cvt_f16_f32_e32 v1, v3
+; GFX9: v_cvt_f16_f32_e32 v0, v0
+; GFX9: v_pack_b32_f16 v0, v1, v0
+; GFX9: s_setpc_b64
+define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
+  %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
+  %cvt.result = fptrunc <2 x float> %clamp to <2 x half>
+  ret <2 x half> %cvt.result
+}
+
+; FIXME: Handling undef 4th component
+
+; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
+; GCN: s_waitcnt
+; GFX900: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+
+; GFX906: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64
+define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+  %src0.ext = fpext <3 x half> %src0 to <3 x float>
+  %src1.ext = fpext <3 x half> %src1 to <3 x float>
+  %src2.ext = fpext <3 x half> %src2 to <3 x float>
+  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+  %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
+  %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
+  %cvt.result = fptrunc <3 x float> %clamp to <3 x half>
+  ret <3 x half> %cvt.result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt:
+; GFX900: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX900: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX900: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+
+; GFX906: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX906: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX906: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+
+; GFX9: v_cvt_f16_f32
+; GFX9: v_cvt_f16_f32
+; GFX9: v_cvt_f16_f32
+; GFX9: v_cvt_f16_f32
+define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+  %src0.ext = fpext <4 x half> %src0 to <4 x float>
+  %src1.ext = fpext <4 x half> %src1 to <4 x float>
+  %src2.ext = fpext <4 x half> %src2 to <4 x float>
+  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+  %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer)
+  %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  %cvt.result = fptrunc <4 x float> %clamp to <4 x half>
+  ret <4 x half> %cvt.result
+}
+
+declare half @llvm.minnum.f16(half, half) #1
+declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
+declare <3 x half> @llvm.minnum.v3f16(<3 x half>, <3 x half>) #1
+declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>) #1
+
+declare half @llvm.maxnum.f16(half, half) #1
+declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
+declare <3 x half> @llvm.maxnum.v3f16(<3 x half>, <3 x half>) #1
+declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>) #1
+
+declare float @llvm.minnum.f32(float, float) #1
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1
+
+declare float @llvm.maxnum.f32(float, float) #1
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mad-mix.ll
@@ -0,0 +1,593 @@
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900,GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX906,GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo:
+; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c]
+; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c]
+; VI: v_mac_f32
+
+; FIXME: Should be v_mad?
+; CI: v_mac_f32
+define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding
+; CIVI: v_mac_f32
+define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
+  %src0.hi = lshr i32 %src0, 16
+  %src1.hi = lshr i32 %src1, 16
+  %src2.hi = lshr i32 %src2, 16
+  %src0.i16 = trunc i32 %src0.hi to i16
+  %src1.i16 = trunc i32 %src1.hi to i16
+  %src2.i16 = trunc i32 %src2.hi to i16
+  %src0.fp16 = bitcast i16 %src0.i16 to half
+  %src1.fp16 = bitcast i16 %src1.i16 to half
+  %src2.fp16 = bitcast i16 %src2.i16 to half
+  %src0.ext = fpext half %src0.fp16 to float
+  %src1.ext = fpext half %src1.fp16 to float
+  %src2.ext = fpext half %src2.fp16 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding
+; VI: v_mac_f32
+
+; FIXME: Should be v_mad?
+; CI: v_mac_f32
+define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.hi = extractelement <2 x half> %src0, i32 1
+  %src1.hi = extractelement <2 x half> %src1, i32 1
+  %src2.hi = extractelement <2 x half> %src2, i32 1
+  %src0.ext = fpext half %src0.hi to float
+  %src1.ext = fpext half %src1.hi to float
+  %src2.ext = fpext half %src2.hi to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; FIXME: op_sel is on the first mix for DAG, here it's on the second. Same thing or not?
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32:
+; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+
+; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_mov_b32_e32 v0, v3
+
+; CIVI: v_mac_f32
+define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2.ext = fpext <2 x half> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  ret <2 x float> %result
+}
+
+; FIXME: The G_LSHR + G_SHUFFLE_VECTOR (1,0) pair isn't folded out.
+; G_LSHR is picked up by isExtractHiElt, but in combination with
+; G_SHUFFLE_VECTOR it should cancel out. Add a combine for this?
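+; Editorial illustration (assumed generic MIR shapes, not from this patch):
+;   %s:_(<2 x s16>) = G_SHUFFLE_VECTOR %x, %undef, shufflemask(1, 0)
+; swaps the two halves, so a following "extract the high half"
+;   %t:_(s16) = G_TRUNC(G_LSHR(G_BITCAST %s), 16)
+; is really the low half of %x; the pair could fold to op_sel 0 on %x.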
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle:
+; GCN: s_waitcnt
+; GFX900: v_alignbit_b32 v3, v0, v0, 16
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mov_b32_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_mad_mix_f32 v1, v3, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: s_setpc_b64
+
+; GFX906: v_alignbit_b32 v3, v0, v0, 16
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_mov_b32_sdwa v2, v2 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX906-NEXT: v_fma_mix_f32 v1, v3, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: s_setpc_b64
+
+; CIVI: v_mac_f32
+define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.shuf = shufflevector <2 x half> %src0, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %src1.shuf = shufflevector <2 x half> %src1, <2 x half> undef, <2 x i32> <i32 0, i32 1>
+  %src2.shuf = shufflevector <2 x half> %src2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+  %src0.ext = fpext <2 x half> %src0.shuf to <2 x float>
+  %src1.ext = fpext <2 x half> %src1.shuf to <2 x float>
+  %src2.ext = fpext <2 x half> %src2.shuf to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  ret <2 x float> %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_negf16lo_f16lo_f16lo:
+; GFX900: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
+; GFX900-NEXT: s_setpc_b64
+
+; GFX906: s_waitcnt
+; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
+; GFX906-NEXT: s_setpc_b64
+
+; FIXME: Should be using v_mad
+; CIVI: v_mac_f32_e32
+define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %src0.ext.neg = fneg float %src0.ext
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_absf16lo_f16lo_f16lo:
+; GFX900: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX906: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+
+; CIVI: v_mad_f32
+define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
+; GFX900: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT: s_setpc_b64
+
+; GFX906: s_waitcnt
+; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT: s_setpc_b64
+
+; CIVI: v_mad_f32
+define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+  %src0.ext.neg.abs = fneg float %src0.ext.abs
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg.abs, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
+; GFX9-NEXT: s_setpc_b64
+
+; CIVI: v_mad_f32
+define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_negf32:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding
+; GFX9-NEXT: s_setpc_b64
+
+; CIVI: v_mad_f32
+define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.neg = fneg float %src2
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_absf32:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding
+; GFX9-NEXT: s_setpc_b64
+
+; CIVI: v_mad_f32
+define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.abs = call float @llvm.fabs.f32(float %src2)
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.abs)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_negabsf32:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding
+; GFX9-NEXT: s_setpc_b64
+
+; CIVI: v_mad_f32
+define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.abs = call float @llvm.fabs.f32(float %src2)
+  %src2.neg.abs = fneg float %src2.abs
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg.abs)
+  ret float %result
+}
+
+; TODO: Fold inline immediates. Need to be careful because it is an
+; f16 inline immediate that may be converted to f32, not an actual f32
+; inline immediate.
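+; Editorial worked example, using the values from the checks below:
+; fpext(half 1.0 = 0x3C00) is exactly float 1.0, so 1.0 could still use the
+; f32 inline immediate, but fpext(half 1/(2*pi) = 0xH3118) = 0x3e230000, which
+; differs from the true f32 1/(2*pi) pattern 0x3e22f983 and so has no inline
+; immediate encoding.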
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imm1:
+; GCN: s_waitcnt
+; GFX9: v_mov_b32_e32 [[VREG:v[0-9]+]], 1.0
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, [[VREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, [[VREG]] op_sel_hi:[1,1,0] ; encoding
+
+; CIVI: v_mad_f32 v0, v0, v1, 1.0
+; GCN-NEXT: s_setpc_b64
+define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
+; GCN: s_waitcnt
+; GFX9: v_mov_b32_e32 [[VREG:v[0-9]+]], 0.15915494
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[VREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[VREG]] op_sel_hi:[1,1,0] ; encoding
+; VI: v_mad_f32 v0, v0, v1, 0.15915494
+define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 0x3FC45F3060000000)
+  ret float %result
+}
+
+; Attempt to break inline immediate folding. If the operand is
+; interpreted as f32, the inline immediate is really the f16 inline
+; imm value converted to f32.
+; fpext f16 1/2pi = 0x3e230000
+; f32 1/2pi = 0x3e22f983
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
+; GFX9: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x3e230000
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[VREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[VREG]] op_sel_hi:[1,1,0] ; encoding
+
+; FIXME: Should be using v_madak_f32?
+; CIVI: v_mov_b32_e32 v0, 0x3e230000
+; CIVI-NEXT: v_mac_f32_e32 v0, v2, v1
+define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2 = fpext half 0xH3118 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
+; GFX9: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x367c0000
+; GFX900: v_mad_mix_f32 v0, v0, v1, [[VREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, [[VREG]] op_sel_hi:[1,1,0] ; encoding
+
+; FIXME: Should be using v_madak_f32
+; CIVI: v_mov_b32_e32 v0, 0x367c0000
+; CIVI-NEXT: v_mac_f32_e32 v0, v2, v1
+define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2 = fpext half 0xH003F to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  ret float %result
+}
+
+; FIXME: op_sel is on the first mix for DAG, here it's on the second. Same thing or not?
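+; Editorial note: for a <2 x float> result the two mixes write separate
+; registers, so putting op_sel:[1,1,0] on the first mix (DAG) or on the
+; second (GISel) should only swap which temporary holds the high elements.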
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1:
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 1.0
+; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v1, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v0, v2
+
+; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v1, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v0, v2
+define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 1.0, float 1.0>)
+  ret <2 x float> %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi:
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x3e230000
+
+; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v1, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v0, v2
+
+; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v1, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v0, v2
+define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2 = fpext <2 x half> <half 0xH3118, half 0xH3118> to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2)
+  ret <2 x float> %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi:
+; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0.15915494
+
+; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v1, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v0, v2
+
+; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v1, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v0, v2
+define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
+  %src0.ext = fpext <2 x half> %src0 to <2 x float>
+  %src1.ext = fpext <2 x half> %src1 to <2 x float>
+  %src2 = fpext <2 x half> <half 0xH3118, half 0xH3118> to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 0x3FC45F3060000000, float 0x3FC45F3060000000>)
+  ret <2 x float> %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding
+
+; FIXME: Should be using v_mad
+; CIVI: v_mac_f32_e32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}
+; CIVI-NEXT: v_mul_f32_e64 v{{[0-9]}}, 1.0, v{{[0-9]}} clamp
+define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+  %src0.hi = extractelement <2 x half> %src0, i32 1
+  %src1.hi = extractelement <2 x half> %src1, i32 1
+  %src2.hi = extractelement <2 x half> %src2, i32 1
+  %src0.ext = fpext half %src0.hi to float
+  %src1.ext = fpext half %src1.hi to float
+  %src2.ext = fpext half %src2.hi to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  ret float %clamp
+}
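The clamp test above exercises two orthogonal pieces of matching: every source is the high half of a packed register (op_sel:[1,1,1]), and the max(x, 0.0)/min(x, 1.0) pair folds into the VOP3 clamp bit. A hedged sketch of the high-half recognition, in the spirit of the "extract of the high 16-bits" helper this patch adds (the helper name and exact structure here are illustrative, not the patch's actual code):

// Illustrative helper: recognize (G_TRUNC (G_LSHR %x, 16)) as a read of the
// high 16 bits of %x, which becomes op_sel on a mad_mix/fma_mix source.
// getDefIgnoringCopies and getIConstantVRegValWithLookThrough come from
// llvm/CodeGen/GlobalISel/Utils.h.
static bool isHiHalfExtract(MachineInstr &MI, MachineRegisterInfo &MRI,
                            Register &SrcOut) {
  if (MI.getOpcode() != AMDGPU::G_TRUNC)
    return false;
  MachineInstr *Shift = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  if (!Shift || Shift->getOpcode() != AMDGPU::G_LSHR)
    return false;
  auto ShiftAmt =
      getIConstantVRegValWithLookThrough(Shift->getOperand(2).getReg(), MRI);
  if (!ShiftAmt || ShiftAmt->Value != 16)
    return false;
  SrcOut = Shift->getOperand(1).getReg();
  return true;
}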
+
+; GCN-LABEL: no_mix_simple:
+; GCN: s_waitcnt
+; GCN-NEXT: v_{{mad|fma}}_f32 v0, v0, v1, v2
+; GCN-NEXT: s_setpc_b64
+define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
+  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+  ret float %result
+}
+
+; GCN-LABEL: no_mix_simple_fabs:
+; GCN: s_waitcnt
+; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2
+; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2
+; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2
+; GCN-NEXT: s_setpc_b64
+define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
+  %src0.fabs = call float @llvm.fabs.f32(float %src0)
+  %result = call float @llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2)
+  ret float %result
+}
+
+; FIXME: Should be able to select in this case.
+; All sources are converted from f16, so it doesn't matter that
+; v_mad_mix_f32 flushes.
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
+; GFX900: v_cvt_f32_f16
+; GFX900: v_cvt_f32_f16
+; GFX900: v_cvt_f32_f16
+; GFX900: v_fma_f32
+define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %src1, half %src2) #1 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_denormals:
+; GFX900: v_cvt_f32_f16
+; GFX900: v_cvt_f32_f16
+; GFX900: v_fma_f32
+
+; GFX906-NOT: v_cvt_f32_f16
+; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, float %src2) #1 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
+; GFX9: v_cvt_f32_f16
+; GFX9: v_cvt_f32_f16
+; GFX9: v_cvt_f32_f16
+; GFX9: v_mul_f32
+; GFX9: v_add_f32
+define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, half %src2) #1 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %mul = fmul float %src0.ext, %src1.ext
+  %result = fadd float %mul, %src2.ext
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
+; GFX9: v_cvt_f32_f16
+; GFX9: v_cvt_f32_f16
+; GFX9: v_mul_f32
+; GFX9: v_add_f32
+define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, float %src2) #1 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %mul = fmul float %src0.ext, %src1.ext
+  %result = fadd float %mul, %src2
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
+; GFX9-NEXT: s_setpc_b64
+define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, half %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %mul = fmul contract float %src0.ext, %src1.ext
+  %result = fadd contract float %mul, %src2.ext
+  ret float %result
+}
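The denormal tests above correspond to the TODO in selectG_FMA: when every source is an f16-to-f32 conversion, the f32 denormal mode is irrelevant, since no f32 input can carry a denormal for the mix instruction to flush. A hypothetical relaxation of the bail-out, sketched against the local names selectG_FMA already uses (this is not part of the patch):

// Hypothetical sketch only: selectG_FMA could keep using V_MAD_MIX_F32 with
// f32 denormals enabled when all three operands matched an f16 conversion,
// because only a genuine f32 input can be flushed by the instruction.
bool AllSrcsFromF16 = MatchedSrc0 && MatchedSrc1 && MatchedSrc2;
if (!IsFMA && Mode.allFP32Denormals() && !AllSrcsFromF16)
  return selectImpl(I, *CoverageInfo); // mix would flush a real f32 input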
+; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
+; GCN: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding
+; GFX9-NEXT: s_setpc_b64
+define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, float %src2) #0 {
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %mul = fmul contract float %src0.ext, %src1.ext
+  %result = fadd contract float %mul, %src2
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
+; GFX9: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
+; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding
+; GFX9-NEXT: s_setpc_b64
+
+; FIXME: Should be v_mad?
+; CIVI: v_mac_f32_e32
+define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
+  %src0 = extractelement <2 x half> %src0.arg.bc, i32 0
+  %src0.neg = fsub half -0.0, %src0
+  %src0.ext = fpext half %src0.neg to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+; %src0.ext.neg = fsub float -0.0, %src0.ext
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; Make sure we don't fold the pre-cvt fneg if we already have a fabs.
+; GCN-LABEL: {{^}}v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
+; GFX900: s_waitcnt
+define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
+  %src0 = extractelement <2 x half> %src0.arg.bc, i32 1
+  %src0.neg = fsub half -0.0, %src0
+  %src0.ext = fpext half %src0.neg to float
+  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; GFX9: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX9-NEXT: s_setpc_b64
+define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
+  %src0 = extractelement <2 x half> %src0.arg.bc, i32 1
+  %src0.abs = call half @llvm.fabs.f16(half %src0)
+  %src0.ext = fpext half %src0.abs to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
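The two tests above pin down the ordering rule for source modifiers: in the VOP3 encoding, fabs is applied before fneg, so a pre-conversion fneg must not be folded once a fabs is already attached, while a pre-conversion fabs can still be folded. A hedged sketch of that guard, in the style of this patch's modifier matching (the surrounding MI/Src/Mods locals are assumptions; this is not the patch's literal code):

// Illustrative sketch: fold an f16-level fneg (a G_FNEG feeding the
// G_FPEXT) into the source modifiers, but never once ABS is set, because
// hardware computes neg(abs(src)); folding would yield -|x| where the IR
// asked for |-x|.
if ((Mods & SISrcMods::ABS) == 0 && MI->getOpcode() == AMDGPU::G_FNEG) {
  Src = MI->getOperand(1).getReg();
  Mods ^= SISrcMods::NEG; // XOR so nested fnegs cancel pairwise
  MI = getDefIgnoringCopies(Src, *MRI);
}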
+
+; FIXME: Should be -v0 and without s_pack/v_pack
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; GFX9: s_waitcnt
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0x8000, 0x8000
+; GFX9-NEXT: v_pk_add_f16 v0, s4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX9-NEXT: s_setpc_b64
+define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
+  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %src0.arg.bc
+  %src0 = extractelement <2 x half> %fneg, i32 1
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; GFX9: s_waitcnt
+; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX9-NEXT: s_setpc_b64
+define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc)
+  %src0 = extractelement <2 x half> %fabs, i32 1
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+; FIXME: Should be -|v0| without the pack/pk/and.
+
+; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; GFX9: s_waitcnt
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0x8000, 0x8000
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX9-NEXT: v_pk_add_f16 v0, s4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX9-NEXT: s_setpc_b64
+define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc)
+  %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+  %src0 = extractelement <2 x half> %fneg.fabs, i32 1
+  %src0.ext = fpext half %src0 to float
+  %src1.ext = fpext half %src1 to float
+  %src2.ext = fpext half %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+declare half @llvm.fabs.f16(half) #2
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2
+declare float @llvm.fabs.f32(float) #2
+declare float @llvm.minnum.f32(float, float) #2
+declare float @llvm.maxnum.f32(float, float) #2
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #2
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
+attributes #2 = { nounwind readnone speculatable }
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -2522,6 +2522,12 @@
       return true;
     return false;
   }
+
+  /// Report the maximum number of temporary operands needed by the predicate
+  /// matcher.
+  unsigned countRendererFns() const override {
+    return InsnMatcher->countRendererFns();
+  }
 };
 
 void InstructionMatcher::optimize() {
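The GlobalISelEmitter change above matters for operands matched through complex patterns such as gi_vop3_mad_mix_mods: a single matched operand can expand to several rendered MI operands, and the emitter sizes its temporary-renderer storage from countRendererFns(). A sketch of the shape such a renderer takes in AMDGPUInstructionSelector (Src/Mods stand in for whatever the matcher computed; this is the general ComplexRendererFns idiom, not the patch's literal code):

// Sketch: one matched mix operand renders two MI operands, the source
// register and its modifier immediate, so the renderer count is two here.
return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },  // src
    [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src_mods
}};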