diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2626,6 +2626,17 @@
     return false;
   }

+  /// Return true if a*b +/- c*d should be folded as fma(c, d, a*b) or
+  /// fma(-c, d, a*b); otherwise it is folded as fma(a, b, c*d) or
+  /// fma(a, b, -(c*d)). The two forms can give different results because
+  /// each of them rounds a different one of the two products before fusing.
+  virtual bool shouldFMAFoldSecondFMul(const SDNode *N) const {
+    assert(N->getOpcode() == ISD::FADD || N->getOpcode() == ISD::FSUB);
+    assert(N->getOperand(0).getOpcode() == ISD::FMUL &&
+           N->getOperand(1).getOpcode() == ISD::FMUL);
+    return false;
+  }
+
   /// Returns true if the FADD or FSUB node passed could legally be combined with
   /// an fmul to form an ISD::FMAD.
   virtual bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11679,11 +11679,17 @@
     return AllowFusionGlobally || isContractable(N.getNode());
   };
   // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
-  // prefer to fold the multiply with fewer uses.
-  if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
-    if (N0.getNode()->use_size() > N1.getNode()->use_size())
-      std::swap(N0, N1);
-  }
+  // prefer to fold the multiply with fewer uses, or the second one when the target requests it.
+  bool IsFMulAddFMul =
+      Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1);
+  bool FoldSecondFMul =
+      IsFMulAddFMul &&
+      ((N0.getNode()->use_size() > N1.getNode()->use_size()) ||
+       ((N0.getNode()->use_size() == N1.getNode()->use_size()) &&
+        TLI.shouldFMAFoldSecondFMul(N)));
+
+  if (FoldSecondFMul)
+    std::swap(N0, N1);

   // fold (fadd (fmul x, y), z) -> (fma x, y, z)
   if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
@@ -11898,19 +11904,50 @@
   };

   // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
-    return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                       N0.getOperand(0), N0.getOperand(1),
-                       DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
-  }
+  auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
+    if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
+      return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
+                         XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z),
+                         Flags);
+    }
+    return SDValue();
+  };

   // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
   // Note: Commutes FSUB operands.
-  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
-    return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                       DAG.getNode(ISD::FNEG, SL, VT,
-                                   N1.getOperand(0)),
-                       N1.getOperand(1), N0, Flags);
+  auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
+    if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
+      return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                         DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
+                         YZ.getOperand(1), X, Flags);
+    }
+    return SDValue();
+  };
+
+  // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
+  // prefer to fold the multiply with fewer uses, or the second one when the target requests it.
+  bool IsFMulSubFMul =
+      Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1);
+  bool FoldSecondFMul =
+      IsFMulSubFMul &&
+      ((N0.getNode()->use_size() > N1.getNode()->use_size()) ||
+       ((N0.getNode()->use_size() == N1.getNode()->use_size()) &&
+        TLI.shouldFMAFoldSecondFMul(N)));
+
+  if (FoldSecondFMul) {
+    // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+    if (SDValue V = tryToFoldXSubYZ(N0, N1))
+      return V;
+    // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+    if (SDValue V = tryToFoldXYSubZ(N0, N1))
+      return V;
+  } else {
+    // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+    if (SDValue V = tryToFoldXYSubZ(N0, N1))
+      return V;
+    // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+    if (SDValue V = tryToFoldXSubYZ(N0, N1))
+      return V;
   }

   // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -911,6 +911,8 @@
     bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                     EVT VT) const override;

+    bool shouldFMAFoldSecondFMul(const SDNode *N) const override;
+
     const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;

     // Should we expand the build vector with shuffles?
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -124,6 +124,11 @@
 static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
 cl::desc("use absolute jump tables on ppc"), cl::Hidden);

+static cl::opt<bool> FMAEvaFirstFMul(
+    "ppc-fma-eva-first-fmul",
+    cl::desc("fold a*b+/-c*d as fma(c, d, a*b) or fma(-c, d, a*b)"),
+    cl::Hidden);
+
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");
@@ -1468,6 +1473,13 @@
   return true;
 }

+bool PPCTargetLowering::shouldFMAFoldSecondFMul(const SDNode *N) const {
+  assert(N->getOpcode() == ISD::FADD || N->getOpcode() == ISD::FSUB);
+  assert(N->getOperand(0).getOpcode() == ISD::FMUL &&
+         N->getOperand(1).getOpcode() == ISD::FMUL);
+  return FMAEvaFirstFMul;
+}
+
 //===----------------------------------------------------------------------===//
 // Node matching predicates, for use by the tblgen matching code.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/PowerPC/fma-precision.ll b/llvm/test/CodeGen/PowerPC/fma-precision.ll
--- a/llvm/test/CodeGen/PowerPC/fma-precision.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-precision.ll
@@ -1,15 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-linux-gnu | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mcpu=pwr9 --ppc-fma-eva-first-fmul -mtriple=powerpc64le-linux-gnu | \
+; RUN:   FileCheck %s -check-prefix=CHECK-EVA-FIRST-OP

 ; Verify that the fold of a*b-c*d respect the uses of a*b
 define double @fsub1(double %a, double %b, double %c, double %d) {
 ; CHECK-LABEL: fsub1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xsmuldp 3, 4, 3
 ; CHECK-NEXT:    xsmuldp 0, 2, 1
-; CHECK-NEXT:    xsmsubadp 3, 2, 1
-; CHECK-NEXT:    xsmuldp 1, 0, 3
+; CHECK-NEXT:    fmr 1, 0
+; CHECK-NEXT:    xsnmsubadp 1, 4, 3
+; CHECK-NEXT:    xsmuldp 1, 0, 1
 ; CHECK-NEXT:    blr
+;
+; CHECK-EVA-FIRST-OP-LABEL: fsub1:
+; CHECK-EVA-FIRST-OP:       # %bb.0: # %entry
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 0, 2, 1
+; CHECK-EVA-FIRST-OP-NEXT:    fmr 1, 0
+; CHECK-EVA-FIRST-OP-NEXT:    xsnmsubadp 1, 4, 3
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 1, 0, 1
+; CHECK-EVA-FIRST-OP-NEXT:    blr
 entry:
   %mul = fmul fast double %b, %a
   %mul1 = fmul fast double %d, %c
@@ -27,6 +37,14 @@
 ; CHECK-NEXT:    xsmsubadp 3, 2, 1
 ; CHECK-NEXT:    xsmuldp 1, 0, 3
 ; CHECK-NEXT:    blr
+;
+; CHECK-EVA-FIRST-OP-LABEL: fsub2:
+; CHECK-EVA-FIRST-OP:       # %bb.0: # %entry
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 0, 4, 3
+; CHECK-EVA-FIRST-OP-NEXT:    fmr 3, 0
+; CHECK-EVA-FIRST-OP-NEXT:    xsmsubadp 3, 2, 1
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 1, 0, 3
+; CHECK-EVA-FIRST-OP-NEXT:    blr
 entry:
   %mul = fmul fast double %b, %a
   %mul1 = fmul fast double %d, %c
@@ -43,6 +61,12 @@
 ; CHECK-NEXT:    xsmsubadp 0, 2, 1
 ; CHECK-NEXT:    fmr 1, 0
 ; CHECK-NEXT:    blr
+;
+; CHECK-EVA-FIRST-OP-LABEL: fsub3:
+; CHECK-EVA-FIRST-OP:       # %bb.0: # %entry
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 1, 2, 1
+; CHECK-EVA-FIRST-OP-NEXT:    xsnmsubadp 1, 4, 3
+; CHECK-EVA-FIRST-OP-NEXT:    blr
 entry:
   %mul = fmul fast double %b, %a
   %mul1 = fmul fast double %d, %c
@@ -59,6 +83,14 @@
 ; CHECK-NEXT:    xsmaddadp 1, 4, 3
 ; CHECK-NEXT:    xsmuldp 1, 0, 1
 ; CHECK-NEXT:    blr
+;
+; CHECK-EVA-FIRST-OP-LABEL: fadd1:
+; CHECK-EVA-FIRST-OP:       # %bb.0: # %entry
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 0, 2, 1
+; CHECK-EVA-FIRST-OP-NEXT:    fmr 1, 0
+; CHECK-EVA-FIRST-OP-NEXT:    xsmaddadp 1, 4, 3
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 1, 0, 1
+; CHECK-EVA-FIRST-OP-NEXT:    blr
 entry:
   %mul = fmul fast double %b, %a
   %mul1 = fmul fast double %d, %c
@@ -76,6 +108,14 @@
 ; CHECK-NEXT:    xsmaddadp 3, 2, 1
 ; CHECK-NEXT:    xsmuldp 1, 0, 3
 ; CHECK-NEXT:    blr
+;
+; CHECK-EVA-FIRST-OP-LABEL: fadd2:
+; CHECK-EVA-FIRST-OP:       # %bb.0: # %entry
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 0, 4, 3
+; CHECK-EVA-FIRST-OP-NEXT:    fmr 3, 0
+; CHECK-EVA-FIRST-OP-NEXT:    xsmaddadp 3, 2, 1
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 1, 0, 3
+; CHECK-EVA-FIRST-OP-NEXT:    blr
 entry:
   %mul = fmul fast double %b, %a
   %mul1 = fmul fast double %d, %c
@@ -91,6 +131,13 @@
 ; CHECK-NEXT:    xsmuldp 1, 2, 1
 ; CHECK-NEXT:    xsmaddadp 1, 4, 3
 ; CHECK-NEXT:    blr
+;
+; CHECK-EVA-FIRST-OP-LABEL: fadd3:
+; CHECK-EVA-FIRST-OP:       # %bb.0: # %entry
+; CHECK-EVA-FIRST-OP-NEXT:    xsmuldp 0, 4, 3
+; CHECK-EVA-FIRST-OP-NEXT:    xsmaddadp 0, 2, 1
+; CHECK-EVA-FIRST-OP-NEXT:    fmr 1, 0
+; CHECK-EVA-FIRST-OP-NEXT:    blr
 entry:
   %mul = fmul fast double %b, %a
   %mul1 = fmul fast double %d, %c
diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll
--- a/llvm/test/CodeGen/PowerPC/recipest.ll
+++ b/llvm/test/CodeGen/PowerPC/recipest.ll
@@ -177,12 +177,11 @@
 ; CHECK-NEXT:    fmuls 1, 1, 0
 ; CHECK-NEXT:    fmadds 1, 1, 0, 4
 ; CHECK-NEXT:    fmuls 0, 0, 5
-; CHECK-NEXT:    fres 5, 2
+; CHECK-NEXT:    fmuls 0, 0, 1
+; CHECK-NEXT:    fres 1, 2
 ; CHECK-NEXT:    fmuls 4, 0, 1
-; CHECK-NEXT:    fmuls 4, 4, 5
-; CHECK-NEXT:    fmuls 2, 2, 4
-; CHECK-NEXT:    fmsubs 0, 0, 1, 2
-; CHECK-NEXT:    fmadds 0, 5, 0, 4
+; CHECK-NEXT:    fnmsubs 0, 2, 4, 0
+; CHECK-NEXT:    fmadds 0, 1, 0, 4
 ; CHECK-NEXT:    fmuls 1, 3, 0
 ; CHECK-NEXT:    blr
   %x = call fast float @llvm.sqrt.f32(float %a)
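
Reviewer note, not part of the patch: the hook's doc comment says the two FMA shapes can give different results, and a minimal standalone C++ sketch of that effect on a*b - c*d follows. The input values are arbitrary, chosen so that both products round to 1.0; compile without -ffast-math (and with C++17 for the hex-float literals) so the explicit rounding is preserved.

// Illustration only, not part of the patch. Shows why fma(a, b, -(c*d))
// ("fold the first fmul") and fma(-c, d, a*b) ("fold the second fmul") can
// disagree for a*b - c*d: each shape rounds a different one of the two
// products before the fused operation.
#include <cmath>
#include <cstdio>

int main() {
  double a = 1.0 + 0x1p-30, b = 1.0 - 0x1p-30; // a*b == 1 - 2^-60 exactly
  double c = 1.0 + 0x1p-29, d = 1.0 - 0x1p-29; // c*d == 1 - 2^-58 exactly

  // Fold the first fmul: a*b stays exact inside the fma, while the
  // separate multiply rounds c*d (to 1.0).
  double FoldFirst = std::fma(a, b, -(c * d)); // -0x1p-60

  // Fold the second fmul: c*d stays exact inside the fma, while the
  // separate multiply rounds a*b (to 1.0).
  double FoldSecond = std::fma(-c, d, a * b); // 0x1p-58

  std::printf("fold first fmul:  %a\nfold second fmul: %a\n", FoldFirst,
              FoldSecond);
  return 0;
}

With these inputs the two contractions disagree even in sign (the exact value is 3*2^-60, so folding the first fmul gets the sign wrong here), which is why the choice is left to the target via shouldFMAFoldSecondFMul and, on PowerPC, gated behind the hidden -ppc-fma-eva-first-fmul llc option exercised by the RUN line above.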