Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -320,6 +320,7 @@
 
     SDValue visitFADDForFMACombine(SDNode *N);
     SDValue visitFSUBForFMACombine(SDNode *N);
+    SDValue visitFMULForFMACombine(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
     SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
@@ -7923,6 +7924,109 @@
   return SDValue();
 }
 
+/// Try to perform FMA combining on a given FMUL node.
+///
+/// Looks for an FMUL in which one operand is an FADD or FSUB that has a +1.0
+/// or -1.0 operand (as recognized by isConstOrConstSplatFP) and rewrites the
+/// pair as a single fused multiply-add node, for example:
+///   (fmul (fadd x, +1.0), y) -> (fma x, y, y)
+/// Both operand orders of the FMUL are tried.
+///
+/// \param N the node to combine; asserted to be an ISD::FMUL.
+/// \returns the fused node, or an empty SDValue when no fused opcode is usable
+/// for the value type or when no pattern matches.
+///
+/// NOTE(review): with infinities these folds change more than rounding. For
+/// x == 0 and y == inf, (x + 1.0) * y is inf but fma(x, y, y) is NaN. Confirm
+/// whether a no-infs requirement is needed before fusing.
+SDValue DAGCombiner::visitFMULForFMACombine(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc SL(N);
+
+  assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
+
+  const TargetOptions &Options = DAG.getTarget().Options;
+  // Despite its name, this flag is also set when only fast FP-op fusion is
+  // requested, not just when unsafe FP math is enabled.
+  bool UnsafeFPMath =
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+
+  // Floating-point multiply-add with intermediate rounding.
+  bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+
+  // Floating-point multiply-add without intermediate rounding.
+  bool HasFMA =
+      ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)) &&
+       TLI.isFMAFasterThanFMulAndFAdd(VT) && UnsafeFPMath);
+
+  // No valid opcode, do not combine.
+  if (!HasFMAD && !HasFMA)
+    return SDValue();
+
+  // Always prefer FMAD to FMA for precision.
+  unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+  // When the target enables aggressive fusion, an FADD/FSUB operand is folded
+  // even if it has other users; otherwise it must have a single use.
+  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+
+  // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
+  // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
+  // Only operand 1 of the FADD is inspected for the constant (presumably
+  // constants are canonicalized to the right-hand side -- TODO confirm).
+  auto FuseFADD = [&](SDValue X, SDValue Y) {
+    if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
+      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
+      if (XC1 && XC1->isExactlyValue(+1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+      if (XC1 && XC1->isExactlyValue(-1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                           DAG.getNode(ISD::FNEG, SL, VT, Y));
+    }
+    return SDValue();
+  };
+
+  if (SDValue FMA = FuseFADD(N0, N1))
+    return FMA;
+  if (SDValue FMA = FuseFADD(N1, N0))
+    return FMA;
+
+  // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
+  // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
+  // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
+  // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
+  // FSUB is not commutative, so the constant is looked for on both sides.
+  auto FuseFSUB = [&](SDValue X, SDValue Y) {
+    if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
+      auto XC0 = isConstOrConstSplatFP(X.getOperand(0));
+      if (XC0 && XC0->isExactlyValue(+1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+                           Y);
+      if (XC0 && XC0->isExactlyValue(-1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+                           DAG.getNode(ISD::FNEG, SL, VT, Y));
+
+      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
+      if (XC1 && XC1->isExactlyValue(+1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                           DAG.getNode(ISD::FNEG, SL, VT, Y));
+      if (XC1 && XC1->isExactlyValue(-1.0))
+        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+    }
+    return SDValue();
+  };
+
+  if (SDValue FMA = FuseFSUB(N0, N1))
+    return FMA;
+  if (SDValue FMA = FuseFSUB(N1, N0))
+    return FMA;
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8230,6 +8334,12 @@
     }
   }
 
+  // FMUL -> FMA combines:
+  if (SDValue Fused = visitFMULForFMACombine(N)) {
+    AddToWorklist(Fused.getNode());
+    return Fused;
+  }
+
   return SDValue();
 }
 
Index: test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
@@ -4,7 +4,7 @@
 declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
 
 ; FUNC-LABEL: {{^}}test_lrp:
-; SI: v_sub_f32
+; SI: v_mad_f32
 ; SI: v_mac_f32_e32
 define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
   %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
Index: test/CodeGen/X86/fma_patterns.ll
===================================================================
--- test/CodeGen/X86/fma_patterns.ll
+++ test/CodeGen/X86/fma_patterns.ll
@@ -2,6 +2,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
 
+;
+; Patterns (+ fneg variants): add(mul(x,y),z), sub(mul(x,y),z)
+;
+
 define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
 ; CHECK-LABEL: test_x86_fmadd_ps:
 ; CHECK:       # BB#0:
@@ -264,3 +268,304 @@
   ret <4 x float> %res
 }
 
+;
+; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
+;
+
+define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_add_x_one_y:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_add_x_one_y:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %m = fmul <4 x float> %a, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_add_x_one:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_add_x_one:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %m = fmul <4 x float> %y, %a
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_add_x_negone_y:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_add_x_negone_y:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %a, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_add_x_negone:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_add_x_negone:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %y, %a
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_one_x_y:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_one_x_y:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  %m = fmul <4 x float> %s, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_one_x:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_one_x:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_negone_x_y:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_negone_x_y:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
+  %m = fmul <4 x float> %s, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_negone_x:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_negone_x:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_x_one_y:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_x_one_y:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %m = fmul <4 x float> %s, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_x_one:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmsub213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_x_one:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_sub_x_negone_y:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_sub_x_negone_y:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %s, %y
+  ret <4 x float> %m
+}
+
+define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: test_v4f32_mul_y_sub_x_negone:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfmadd213ps %xmm1, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_mul_y_sub_x_negone:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
+;
+; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
+;
+
+define float @test_f32_interp(float %x, float %y, float %t) {
+; CHECK-LABEL: test_f32_interp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213ss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vfmadd213ss %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_f32_interp:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT:    vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %t1 = fsub float 1.0, %t
+  %tx = fmul float %x, %t
+  %ty = fmul float %y, %t1
+  %r = fadd float %tx, %ty
+  ret float %r
+}
+
+define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
+; CHECK-LABEL: test_v4f32_interp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213ps %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vfmadd213ps %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_interp:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT:    vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %t1 = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
+  %tx = fmul <4 x float> %x, %t
+  %ty = fmul <4 x float> %y, %t1
+  %r = fadd <4 x float> %tx, %ty
+  ret <4 x float> %r
+}
+
+define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
+; CHECK-LABEL: test_v8f32_interp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213ps %ymm1, %ymm2, %ymm1
+; CHECK-NEXT:    vfmadd213ps %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v8f32_interp:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmaddps %ymm1, %ymm1, %ymm2, %ymm1
+; CHECK_FMA4-NEXT:    vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK_FMA4-NEXT:    retq
+  %t1 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
+  %tx = fmul <8 x float> %x, %t
+  %ty = fmul <8 x float> %y, %t1
+  %r = fadd <8 x float> %tx, %ty
+  ret <8 x float> %r
+}
+
+define double @test_f64_interp(double %x, double %y, double %t) {
+; CHECK-LABEL: test_f64_interp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213sd %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vfmadd213sd %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_f64_interp:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT:    vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %t1 = fsub double 1.0, %t
+  %tx = fmul double %x, %t
+  %ty = fmul double %y, %t1
+  %r = fadd double %tx, %ty
+  ret double %r
+}
+
+define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
+; CHECK-LABEL: test_v2f64_interp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213pd %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vfmadd213pd %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v2f64_interp:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmaddpd %xmm1, %xmm1, %xmm2, %xmm1
+; CHECK_FMA4-NEXT:    vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
+; CHECK_FMA4-NEXT:    retq
+  %t1 = fsub <2 x double> <double 1.0, double 1.0>, %t
+  %tx = fmul <2 x double> %x, %t
+  %ty = fmul <2 x double> %y, %t1
+  %r = fadd <2 x double> %tx, %ty
+  ret <2 x double> %r
+}
+
+define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
+; CHECK-LABEL: test_v4f64_interp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfnmadd213pd %ymm1, %ymm2, %ymm1
+; CHECK-NEXT:    vfmadd213pd %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    retq
+;
+; CHECK_FMA4-LABEL: test_v4f64_interp:
+; CHECK_FMA4:       # BB#0:
+; CHECK_FMA4-NEXT:    vfnmaddpd %ymm1, %ymm1, %ymm2, %ymm1
+; CHECK_FMA4-NEXT:    vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK_FMA4-NEXT:    retq
+  %t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
+  %tx = fmul <4 x double> %x, %t
+  %ty = fmul <4 x double> %y, %t1
+  %r = fadd <4 x double> %tx, %ty
+  ret <4 x double> %r
+}