Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -446,6 +446,7 @@ setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); @@ -6902,6 +6903,73 @@ return SDValue(); } +// As FMUL is much faster than FDIV, we combine multiple FDIVs with the same +// divisor to a reciprocal and multiple FMULs with the reciprocal. +// E.g. ( a / D; b / D; c / D; ) +// => +// ( recip = 1.0 / D; a * recip; b * recip; c * recip ) +// Notice that one shortcoming is the critical patch increases from "one FDIV" +// to "one FDIV + one FMUL", which may cause regressions on some benchmarks. To +// reduce regressions, we only do such combine when there are more than two +// FDIVs. +// If one of the FDIV is reciprocal, we reuse it directly. +// E.g. ( recip = 1.0 / D; c = a / D; ) +// => +// ( recip = 1.0 / D; c = a * recip; ) +static SDValue performFDIVCombine(SDNode *N, SelectionDAG &DAG) { + // Only do such combine when unsafe fp math is enabled. + if (!DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + SDValue Dividend = N->getOperand(0); + SDValue Divisor = N->getOperand(1); + EVT VT = N->getValueType(0); + + SDValue FPOne = DAG.getConstantFP(1.0, VT); // floating point 1.0 + // Skip if current Node is a reciprocal. + if (Dividend == FPOne) + return SDValue(); + + SDValue Reciprocal = SDValue(); + SmallVector Users; + // Collect all non-reciprocal users of Divisor. Also find if there is already + // a reciprocal of Divisor. If so, we can reuse the reciprocal instead of + // creating a new one. 
+ for (SDNode::use_iterator UI = Divisor.getNode()->use_begin(), + UE = Divisor.getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = UI.getUse().getUser(); + if (User->getOpcode() == ISD::FDIV && User->getOperand(1) == Divisor) { + if (User->getOperand(0) == FPOne) + Reciprocal = SDValue(User, 0); + else + Users.push_back(User); + } + } + + if (Reciprocal == SDValue()) { + // Skip if there is less than three FDIVs. + // FIXME: Different subtargets may behave differently. This can be + // controlled depending on subtargets. + if (Users.size() < 3) + return SDValue(); + // Create a reciprocal of Divisor if there is no such reciprocal. + Reciprocal = DAG.getNode(ISD::FDIV, SDLoc(N), VT, FPOne, Divisor); + } else if (Users.size() == 0) { + // Skip if there is no other users except the reciprocal. + return SDValue(); + } + + // Dividend / Divisor => Dividend * Reciprocal + for (auto I = Users.begin(), E = Users.end(); I != E; ++I) { + SDValue NewNode = + DAG.getNode(ISD::FMUL, SDLoc(*I), VT, (*I)->getOperand(0), Reciprocal); + DAG.ReplaceAllUsesWith(*I, NewNode.getNode()); + } + + return SDValue(); +} + static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to @@ -8516,6 +8584,8 @@ return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMulCombine(N, DAG, DCI, Subtarget); + case ISD::FDIV: + return performFDIVCombine(N, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG); Index: test/CodeGen/AArch64/fdiv-combine.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/fdiv-combine.ll @@ -0,0 +1,145 @@ +; RUN: llc -march=aarch64 < %s | FileCheck %s + +; Following test cases check: +; a / D; b / D; c / D; +; => +; recip = 1.0 / D; a * recip; b * recip; c * recip; +define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 { +; CHECK-LABEL: 
three_fdiv_float: +; CHECK: fdiv +; CHECK-NEXT-NOT: fdiv +; CHECK: fmul +; CHECK: fmul +; CHECK: fmul + %div = fdiv float %a, %D + %div1 = fdiv float %b, %D + %div2 = fdiv float %c, %D + tail call void @foo_3f(float %div, float %div1, float %div2) + ret void +} + +define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 { +; CHECK-LABEL: three_fdiv_double: +; CHECK: fdiv +; CHECK-NEXT-NOT: fdiv +; CHECK: fmul +; CHECK: fmul +; CHECK: fmul + %div = fdiv double %a, %D + %div1 = fdiv double %b, %D + %div2 = fdiv double %c, %D + tail call void @foo_3d(double %div, double %div1, double %div2) + ret void +} + +define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; CHECK-LABEL: three_fdiv_4xfloat: +; CHECK: fdiv +; CHECK-NEXT-NOT: fdiv +; CHECK: fmul +; CHECK: fmul +; CHECK: fmul + %div = fdiv <4 x float> %a, %D + %div1 = fdiv <4 x float> %b, %D + %div2 = fdiv <4 x float> %c, %D + tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2) + ret void +} + +define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 { +; CHECK-LABEL: three_fdiv_2xdouble: +; CHECK: fdiv +; CHECK-NEXT-NOT: fdiv +; CHECK: fmul +; CHECK: fmul +; CHECK: fmul + %div = fdiv <2 x double> %a, %D + %div1 = fdiv <2 x double> %b, %D + %div2 = fdiv <2 x double> %c, %D + tail call void @foo_3_2xd(<2 x double> %div, <2 x double> %div1, <2 x double> %div2) + ret void +} + +; Following test cases check we never combine two FDIVs if neither of them +; calculates a reciprocal. 
+define void @two_fdiv_float(float %D, float %a, float %b) #0 {
+; CHECK-LABEL: two_fdiv_float:
+; CHECK: fdiv
+; CHECK: fdiv
+; CHECK-NOT: fmul
+  %div = fdiv float %a, %D
+  %div1 = fdiv float %b, %D
+  tail call void @foo_2f(float %div, float %div1)
+  ret void
+}
+
+define void @two_fdiv_double(double %D, double %a, double %b) #0 {
+; CHECK-LABEL: two_fdiv_double:
+; CHECK: fdiv
+; CHECK: fdiv
+; CHECK-NOT: fmul
+  %div = fdiv double %a, %D
+  %div1 = fdiv double %b, %D
+  tail call void @foo_2d(double %div, double %div1)
+  ret void
+}
+
+; Following test cases check
+;   recip = 1.0 / D; c = a / D;
+;                =>
+;   recip = 1.0 / D; c = a * recip;
+define void @recip_fdiv_float(float %D, float %a) #0 {
+; CHECK-LABEL: recip_fdiv_float:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+  %div = fdiv float 1.000000e+00, %D
+  %div1 = fdiv float %a, %D
+  tail call void @foo_2f(float %div, float %div1)
+  ret void
+}
+
+define void @recip_fdiv_double(double %D, double %a) #0 {
+; CHECK-LABEL: recip_fdiv_double:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+  %div = fdiv double 1.000000e+00, %D
+  %div1 = fdiv double %a, %D
+  tail call void @foo_2d(double %div, double %div1)
+  ret void
+}
+
+define void @recip_fdiv_4xfloat(<4 x float> %D, <4 x float> %a) #0 {
+; CHECK-LABEL: recip_fdiv_4xfloat:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %D
+  %div1 = fdiv <4 x float> %a, %D
+  tail call void @foo_2_4xf(<4 x float> %div, <4 x float> %div1)
+  ret void
+}
+
+define void @recip_fdiv_2xdouble(<2 x double> %D, <2 x double> %a) #0 {
+; CHECK-LABEL: recip_fdiv_2xdouble:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+  %div = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %D
+  %div1 = fdiv <2 x double> %a, %D
+  tail call void @foo_2_2xd(<2 x double> %div, <2 x double> %div1)
+  ret void
+}
+
+declare void @foo_3f(float, float, float)
+declare void @foo_3d(double, double, double)
+declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>)
+declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)
+declare void @foo_2f(float, float)
+declare void @foo_2d(double, double)
+declare void @foo_2_4xf(<4 x float>, <4 x float>)
+declare void @foo_2_2xd(<2 x double>, <2 x double>)
+
+attributes #0 = { "unsafe-fp-math"="true" }
+