Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -446,6 +446,7 @@
   setTargetDAGCombine(ISD::STORE);
 
   setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::FDIV);
 
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::VSELECT);
@@ -6902,6 +6903,73 @@
   return SDValue();
 }
 
+// As FMUL is much faster than FDIV, we combine multiple FDIVs with the same
+// divisor to a reciprocal and multiple FMULs with the reciprocal.
+// E.g.  ( a / D; b / D; c / D; )
+//           =>
+//       ( recip = 1.0 / D; a * recip; b * recip; c * recip )
+// Notice that one shortcoming is the critical path increases from "one FDIV"
+// to "one FDIV + one FMUL", which may cause regressions on some benchmarks. To
+// reduce regressions, we only do such combine when there are more than two
+// FDIVs.
+// If one of the FDIVs is already a reciprocal, we reuse it directly.
+// E.g.  ( recip = 1.0 / D; c = a / D; )
+//           =>
+//       ( recip = 1.0 / D; c = a * recip; )
+static SDValue performFDIVCombine(SDNode *N, SelectionDAG &DAG) {
+  // Only do such combine when unsafe fp math is enabled.
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  SDValue Dividend = N->getOperand(0);
+  SDValue Divisor = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  SDValue FPOne = DAG.getConstantFP(1.0, VT); // floating point 1.0
+  // Skip if current Node is a reciprocal.
+  if (Dividend == FPOne)
+    return SDValue();
+
+  SDValue Reciprocal;
+  SmallVector<SDNode *, 4> Users;
+  // Collect the non-reciprocal FDIVs whose divisor is Divisor (N is always one
+  // of them) and note an existing reciprocal of Divisor so it can be reused.
+  // Only uses as operand 1 count, so "D / D" is recorded once, not twice.
+  for (SDNode::use_iterator UI = Divisor.getNode()->use_begin(),
+                            UE = Divisor.getNode()->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::FDIV || UI.getOperandNo() != 1 ||
+        User->getOperand(1) != Divisor)
+      continue;
+    if (User->getOperand(0) == FPOne)
+      Reciprocal = SDValue(User, 0);
+    else
+      Users.push_back(User);
+  }
+
+  if (!Reciprocal.getNode()) {
+    // Skip if there are fewer than three FDIVs.
+    // FIXME: Different subtargets may behave differently. This can be
+    // controlled depending on subtargets.
+    if (Users.size() < 3)
+      return SDValue();
+    Reciprocal = DAG.getNode(ISD::FDIV, SDLoc(N), VT, FPOne, Divisor);
+  }
+
+  // Dividend / Divisor => Dividend * Reciprocal for every FDIV other than N.
+  for (SDNode *Div : Users) {
+    if (Div == N)
+      continue;
+    SDValue NewNode =
+        DAG.getNode(ISD::FMUL, SDLoc(Div), VT, Div->getOperand(0), Reciprocal);
+    DAG.ReplaceAllUsesWith(Div, NewNode.getNode());
+  }
+
+  // N's replacement is returned so the DAG combiner performs that substitution.
+  return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), Reciprocal);
+}
+
 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                          SelectionDAG &DAG) {
   // Take advantage of vector comparisons producing 0 or -1 in each lane to
@@ -8516,6 +8584,8 @@
     return performXorCombine(N, DAG, DCI, Subtarget);
   case ISD::MUL:
     return performMulCombine(N, DAG, DCI, Subtarget);
+  case ISD::FDIV:
+      return performFDIVCombine(N, DAG);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
     return performIntToFpCombine(N, DAG);
Index: test/CodeGen/AArch64/fdiv-combine.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/fdiv-combine.ll
@@ -0,0 +1,145 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+; Following test cases check:
+;   a / D; b / D; c / D;
+;                =>
+;   recip = 1.0 / D; a * recip; b * recip; c * recip;
+define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
+; CHECK-LABEL: three_fdiv_float:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv float %a, %D
+  %div1 = fdiv float %b, %D
+  %div2 = fdiv float %c, %D
+  tail call void @foo_3f(float %div, float %div1, float %div2)
+  ret void
+}
+
+define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
+; CHECK-LABEL: three_fdiv_double:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv double %a, %D
+  %div1 = fdiv double %b, %D
+  %div2 = fdiv double %c, %D
+  tail call void @foo_3d(double %div, double %div1, double %div2)
+  ret void
+}
+
+define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+; CHECK-LABEL: three_fdiv_4xfloat:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv <4 x float> %a, %D
+  %div1 = fdiv <4 x float> %b, %D
+  %div2 = fdiv <4 x float> %c, %D
+  tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2)
+  ret void
+}
+
+define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+; CHECK-LABEL: three_fdiv_2xdouble:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+; CHECK: fmul
+; CHECK: fmul
+  %div = fdiv <2 x double> %a, %D
+  %div1 = fdiv <2 x double> %b, %D
+  %div2 = fdiv <2 x double> %c, %D
+  tail call void @foo_3_2xd(<2 x double> %div, <2 x double> %div1, <2 x double> %div2)
+  ret void
+}
+
+; Following test cases check we never combine two FDIVs if neither of them
+; calculates a reciprocal.
+define void @two_fdiv_float(float %D, float %a, float %b) #0 {
+; CHECK-LABEL: two_fdiv_float:
+; CHECK: fdiv
+; CHECK: fdiv
+; CHECK-NOT: fmul
+  %div = fdiv float %a, %D
+  %div1 = fdiv float %b, %D
+  tail call void @foo_2f(float %div, float %div1)
+  ret void
+}
+
+define void @two_fdiv_double(double %D, double %a, double %b) #0 {
+; CHECK-LABEL: two_fdiv_double:
+; CHECK: fdiv
+; CHECK: fdiv
+; CHECK-NOT: fmul
+  %div = fdiv double %a, %D
+  %div1 = fdiv double %b, %D
+  tail call void @foo_2d(double %div, double %div1)
+  ret void
+}
+
+; Following test cases check
+;   recip = 1.0 / D; c = a / D;
+;            =>
+;   recip = 1.0 / D; c = a * recip;
+define void @recip_fdiv_float(float %D, float %a) #0 {
+; CHECK-LABEL: recip_fdiv_float:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+  %div = fdiv float 1.000000e+00, %D
+  %div1 = fdiv float %a, %D
+  tail call void @foo_2f(float %div, float %div1)
+  ret void
+}
+
+define void @recip_fdiv_double(double %D, double %a) #0 {
+; CHECK-LABEL: recip_fdiv_double:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+  %div = fdiv double 1.000000e+00, %D
+  %div1 = fdiv double %a, %D
+  tail call void @foo_2d(double %div, double %div1)
+  ret void
+}
+
+define void @recip_fdiv_4xfloat(<4 x float> %D, <4 x float> %a) #0 {
+; CHECK-LABEL: recip_fdiv_4xfloat:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %D
+  %div1 = fdiv <4 x float> %a, %D
+  tail call void @foo_2_4xf(<4 x float> %div, <4 x float> %div1)
+  ret void
+}
+
+define void @recip_fdiv_2xdouble(<2 x double> %D, <2 x double> %a) #0 {
+; CHECK-LABEL: recip_fdiv_2xdouble:
+; CHECK: fdiv
+; CHECK-NOT: fdiv
+; CHECK: fmul
+  %div = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %D
+  %div1 = fdiv <2 x double> %a, %D
+  tail call void @foo_2_2xd(<2 x double> %div, <2 x double> %div1)
+  ret void
+}
+
+declare void @foo_3f(float, float, float)
+declare void @foo_3d(double, double, double)
+declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>)
+declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)
+declare void @foo_2f(float, float)
+declare void @foo_2d(double, double)
+declare void @foo_2_4xf(<4 x float>, <4 x float>)
+declare void @foo_2_2xd(<2 x double>, <2 x double>)
+; All test functions use #0; the combine under test is gated on unsafe fp math.
+attributes #0 = { "unsafe-fp-math"="true" }
+