Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -321,6 +321,7 @@
     SDValue visitFADDForFMACombine(SDNode *N);
     SDValue visitFSUBForFMACombine(SDNode *N);
+    SDValue visitFMULForFMACombine(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
     SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
@@ -619,7 +620,7 @@
   assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
   const SDNodeFlags *Flags = Op.getNode()->getFlags();
-  
+
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Unknown code");
   case ISD::ConstantFP: {
@@ -7481,25 +7482,23 @@
   SDLoc SL(N);
 
   const TargetOptions &Options = DAG.getTarget().Options;
-  bool UnsafeFPMath = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
-                       Options.UnsafeFPMath);
+  bool AllowFusion =
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
 
   // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations &&
-                  TLI.isOperationLegal(ISD::FMAD, VT));
+  bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
 
   // Floating-point multiply-add without intermediate rounding.
-  bool HasFMA = ((!LegalOperations ||
-                  TLI.isOperationLegalOrCustom(ISD::FMA, VT)) &&
-                 TLI.isFMAFasterThanFMulAndFAdd(VT) &&
-                 UnsafeFPMath);
+  bool HasFMA =
+      AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
     return SDValue();
 
   // Always prefer FMAD to FMA for precision.
-  unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
   bool LookThroughFPExt = TLI.isFPExtFree(VT);
 
@@ -7527,7 +7526,7 @@
   }
 
   // Look through FP_EXTEND nodes to do more combining.
-  if (UnsafeFPMath && LookThroughFPExt) {
+  if (AllowFusion && LookThroughFPExt) {
     // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
     if (N0.getOpcode() == ISD::FP_EXTEND) {
       SDValue N00 = N0.getOperand(0);
@@ -7553,7 +7552,7 @@
   }
 
   // More folding opportunities when target permits.
-  if ((UnsafeFPMath || HasFMAD) && Aggressive) {
+  if ((AllowFusion || HasFMAD) && Aggressive) {
     // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
     if (N0.getOpcode() == PreferredFusedOpcode &&
         N0.getOperand(2).getOpcode() == ISD::FMUL) {
@@ -7576,7 +7575,7 @@
                                      N0));
     }
 
-    if (UnsafeFPMath && LookThroughFPExt) {
+    if (AllowFusion && LookThroughFPExt) {
       // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
       //   -> (fma x, y, (fma (fpext u), (fpext v), z))
       auto FoldFAddFMAFPExtFMul = [&] (
@@ -7666,25 +7665,23 @@
   SDLoc SL(N);
 
   const TargetOptions &Options = DAG.getTarget().Options;
-  bool UnsafeFPMath = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
-                       Options.UnsafeFPMath);
+  bool AllowFusion =
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
 
   // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations &&
-                  TLI.isOperationLegal(ISD::FMAD, VT));
+  bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
 
   // Floating-point multiply-add without intermediate rounding.
-  bool HasFMA = ((!LegalOperations ||
-                  TLI.isOperationLegalOrCustom(ISD::FMA, VT)) &&
-                 TLI.isFMAFasterThanFMulAndFAdd(VT) &&
-                 UnsafeFPMath);
+  bool HasFMA =
+      AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
     return SDValue();
 
   // Always prefer FMAD to FMA for precision.
-  unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
   bool LookThroughFPExt = TLI.isFPExtFree(VT);
 
@@ -7717,7 +7714,7 @@
   }
 
   // Look through FP_EXTEND nodes to do more combining.
-  if (UnsafeFPMath && LookThroughFPExt) {
+  if (AllowFusion && LookThroughFPExt) {
     // fold (fsub (fpext (fmul x, y)), z)
     //   -> (fma (fpext x), (fpext y), (fneg z))
     if (N0.getOpcode() == ISD::FP_EXTEND) {
@@ -7793,7 +7790,7 @@
   }
 
   // More folding opportunities when target permits.
-  if ((UnsafeFPMath || HasFMAD) && Aggressive) {
+  if ((AllowFusion || HasFMAD) && Aggressive) {
    // fold (fsub (fma x, y, (fmul u, v)), z)
     //   -> (fma x, y (fma u, v, (fneg z)))
     if (N0.getOpcode() == PreferredFusedOpcode &&
@@ -7823,7 +7820,7 @@
                                      N21, N0));
     }
 
-    if (UnsafeFPMath && LookThroughFPExt) {
+    if (AllowFusion && LookThroughFPExt) {
       // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
       //   -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
       if (N0.getOpcode() == PreferredFusedOpcode) {
@@ -7924,6 +7921,88 @@
   return SDValue();
 }
 
+/// Try to perform FMA combining on a given FMUL node.
+SDValue DAGCombiner::visitFMULForFMACombine(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc SL(N);
+
+  assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
+
+  const TargetOptions &Options = DAG.getTarget().Options;
+  bool AllowFusion =
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+
+  // Floating-point multiply-add with intermediate rounding.
+  bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+
+  // Floating-point multiply-add without intermediate rounding.
+  bool HasFMA =
+      AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
+
+  // No valid opcode, do not combine.
+  if (!HasFMAD && !HasFMA)
+    return SDValue();
+
+  // Always prefer FMAD to FMA for precision.
+  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+
+  // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
+  // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
+  auto FuseFADD = [&](SDValue X, SDValue Y) {
+    if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
+      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
+      if (XC1 && XC1->isExactlyValue(+1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+      if (XC1 && XC1->isExactlyValue(-1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                           DAG.getNode(ISD::FNEG, SL, VT, Y));
+    }
+    return SDValue();
+  };
+
+  if (SDValue FMA = FuseFADD(N0, N1))
+    return FMA;
+  if (SDValue FMA = FuseFADD(N1, N0))
+    return FMA;
+
+  // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
+  // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
+  // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
+  // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
+  auto FuseFSUB = [&](SDValue X, SDValue Y) {
+    if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
+      auto XC0 = isConstOrConstSplatFP(X.getOperand(0));
+      if (XC0 && XC0->isExactlyValue(+1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+                           Y);
+      if (XC0 && XC0->isExactlyValue(-1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+                           DAG.getNode(ISD::FNEG, SL, VT, Y));
+
+      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
+      if (XC1 && XC1->isExactlyValue(+1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                           DAG.getNode(ISD::FNEG, SL, VT, Y));
+      if (XC1 && XC1->isExactlyValue(-1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+    }
+    return SDValue();
+  };
+
+  if (SDValue FMA = FuseFSUB(N0, N1))
+    return FMA;
+  if (SDValue FMA = FuseFSUB(N1, N0))
+    return FMA;
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8231,6 +8310,12 @@
     }
   }
 
+  // FMUL -> FMA combines:
+  if (SDValue Fused = visitFMULForFMACombine(N)) {
+    AddToWorklist(Fused.getNode());
+    return Fused;
+  }
+
   return SDValue();
 }
 
Index: test/CodeGen/AMDGPU/fma-combine.ll
===================================================================
--- test/CodeGen/AMDGPU/fma-combine.ll
+++ test/CodeGen/AMDGPU/fma-combine.ll
@@ -364,5 +364,205 @@
   ret void
 }
 
+;
+; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
+;
+
+; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
+; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]]
+define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
+                                      float addrspace(1)* %in1,
+                                      float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %a = fadd float %x, 1.0
+  %m = fmul float %a, %y
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
+; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY:v[0-9]]], [[VX:v[0-9]]]
+define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
+                                      float addrspace(1)* %in1,
+                                      float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %a = fadd float %x, 1.0
+  %m = fmul float %y, %a
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
+; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
+                                         float addrspace(1)* %in1,
+                                         float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %a = fadd float %x, -1.0
+  %m = fmul float %a, %y
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
+; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
+                                         float addrspace(1)* %in1,
+                                         float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %a = fadd float %x, -1.0
+  %m = fmul float %y, %a
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
+; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
+define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
+                                      float addrspace(1)* %in1,
+                                      float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %s = fsub float 1.0, %x
+  %m = fmul float %s, %y
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
+; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], [[VY]]
+define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
+                                      float addrspace(1)* %in1,
+                                      float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %s = fsub float 1.0, %x
+  %m = fmul float %y, %s
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
+; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
+                                         float addrspace(1)* %in1,
+                                         float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %s = fsub float -1.0, %x
+  %m = fmul float %s, %y
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
+; SI: v_mad_f32 [[VX:v[0-9]]], -[[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
+                                         float addrspace(1)* %in1,
+                                         float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %s = fsub float -1.0, %x
+  %m = fmul float %y, %s
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
+; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
+                                      float addrspace(1)* %in1,
+                                      float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %s = fsub float %x, 1.0
+  %m = fmul float %s, %y
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
+; SI: v_mad_f32 [[VX:v[0-9]]], [[VX]], [[VY:v[0-9]]], -[[VY]]
+define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
+                                      float addrspace(1)* %in1,
+                                      float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %s = fsub float %x, 1.0
+  %m = fmul float %y, %s
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
+; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
+define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
+                                         float addrspace(1)* %in1,
+                                         float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %s = fsub float %x, -1.0
+  %m = fmul float %s, %y
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
+; SI: v_mac_f32_e32 [[VY:v[0-9]]], [[VY]], [[VX:v[0-9]]]
+define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
+                                         float addrspace(1)* %in1,
+                                         float addrspace(1)* %in2) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %s = fsub float %x, -1.0
+  %m = fmul float %y, %s
+  store float %m, float addrspace(1)* %out
+  ret void
+}
+
+;
+; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
+;
+
+; FUNC-LABEL: {{^}}test_f32_interp:
+; SI: v_mad_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
+; SI: v_mac_f32_e32 [[VR]], [[VT]], [[VX:v[0-9]]]
+define void @test_f32_interp(float addrspace(1)* %out,
+                             float addrspace(1)* %in1,
+                             float addrspace(1)* %in2,
+                             float addrspace(1)* %in3) {
+  %x = load float, float addrspace(1)* %in1
+  %y = load float, float addrspace(1)* %in2
+  %t = load float, float addrspace(1)* %in3
+  %t1 = fsub float 1.0, %t
+  %tx = fmul float %x, %t
+  %ty = fmul float %y, %t1
+  %r = fadd float %tx, %ty
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_f64_interp:
+; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
+; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
+define void @test_f64_interp(double addrspace(1)* %out,
+                             double addrspace(1)* %in1,
+                             double addrspace(1)* %in2,
+                             double addrspace(1)* %in3) {
+  %x = load double, double addrspace(1)* %in1
+  %y = load double, double addrspace(1)* %in2
+  %t = load double, double addrspace(1)* %in3
+  %t1 = fsub double 1.0, %t
+  %tx = fmul double %x, %t
+  %ty = fmul double %y, %t1
+  %r = fadd double %tx, %ty
+  store double %r, double addrspace(1)* %out
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
@@ -4,7 +4,7 @@
 declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
 
 ; FUNC-LABEL: {{^}}test_lrp:
-; SI: v_sub_f32
+; SI: v_mad_f32
 ; SI: v_mac_f32_e32
 define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
   %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
Index: test/CodeGen/X86/fma_patterns.ll
===================================================================
--- test/CodeGen/X86/fma_patterns.ll
+++ test/CodeGen/X86/fma_patterns.ll
@@ -2,6 +2,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
 
+;
+; Patterns (+ fneg variants): add(mul(x,y),z), sub(mul(x,y),z)
+;
+
 define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
 ; CHECK-LABEL: test_x86_fmadd_ps:
 ; CHECK: # BB#0:
@@ -264,3 +268,304 @@
   ret <4 x float> %res
 }
 
+;
+; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
+;
+
+define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_add_x_one_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_add_x_one_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %m = fmul <4 x float> %a, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_add_x_one:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_add_x_one:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %m = fmul <4 x float> %y, %a
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_add_x_negone_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_add_x_negone_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %a, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_add_x_negone:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_add_x_negone:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %y, %a
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_one_x_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_one_x_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  %m = fmul <4 x float> %s, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_one_x:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_one_x:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_negone_x_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_negone_x_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
+  %m = fmul <4 x float> %s, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_negone_x:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_negone_x:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_x_one_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_x_one_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %m = fmul <4 x float> %s, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_x_one:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_x_one:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_x_negone_y:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_x_negone_y:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %s, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_x_negone:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_x_negone:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
+;
+; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
+;
+
+define float @test_f32_interp(float %x, float %y, float %t) {
+; CHECK-LABEL: test_f32_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vfmadd213ss %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_f32_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT: vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %t1 = fsub float 1.0, %t
+  %tx = fmul float %x, %t
+  %ty = fmul float %y, %t1
+  %r = fadd float %tx, %ty
+  ret float %r
+}
+
+define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
+; CHECK-LABEL: test_v4f32_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ps %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddps %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT: vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %t1 = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
+  %tx = fmul <4 x float> %x, %t
+  %ty = fmul <4 x float> %y, %t1
+  %r = fadd <4 x float> %tx, %ty
+  ret <4 x float> %r
+}
+
+define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
+; CHECK-LABEL: test_v8f32_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213ps %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vfmadd213ps %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v8f32_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddps %ymm1, %ymm1, %ymm2, %ymm1
+; CHECK_FMA4-NEXT: vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK_FMA4-NEXT: retq
+  %t1 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
+  %tx = fmul <8 x float> %x, %t
+  %ty = fmul <8 x float> %y, %t1
+  %r = fadd <8 x float> %tx, %ty
+  ret <8 x float> %r
+}
+
+define double @test_f64_interp(double %x, double %y, double %t) {
+; CHECK-LABEL: test_f64_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vfmadd213sd %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_f64_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT: vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %t1 = fsub double 1.0, %t
+  %tx = fmul double %x, %t
+  %ty = fmul double %y, %t1
+  %r = fadd double %tx, %ty
+  ret double %r
+}
+
+define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
+; CHECK-LABEL: test_v2f64_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213pd %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vfmadd213pd %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v2f64_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddpd %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT: vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %t1 = fsub <2 x double> <double 1.0, double 1.0>, %t
+  %tx = fmul <2 x double> %x, %t
+  %ty = fmul <2 x double> %y, %t1
+  %r = fadd <2 x double> %tx, %ty
+  ret <2 x double> %r
+}
+
+define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
+; CHECK-LABEL: test_v4f64_interp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfnmadd213pd %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vfmadd213pd %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f64_interp:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddpd %ymm1, %ymm1, %ymm2, %ymm1
+; CHECK_FMA4-NEXT: vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK_FMA4-NEXT: retq
+  %t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
+  %tx = fmul <4 x double> %x, %t
+  %ty = fmul <4 x double> %y, %t1
+  %r = fadd <4 x double> %tx, %ty
+  ret <4 x double> %r
+}