Index: include/llvm/MC/MCSchedule.h
===================================================================
--- include/llvm/MC/MCSchedule.h
+++ include/llvm/MC/MCSchedule.h
@@ -180,6 +180,12 @@
   unsigned MispredictPenalty;
   static const unsigned DefaultMispredictPenalty = 10;
 
+  // FdivLatency is the expected latency of a single-precision fdiv instruction.
+  //
+  // Most microarchitectures have a high latency for floating-point division.
+  unsigned FdivLatency;
+  static const unsigned DefaultFdivLatency = 14;
+
   bool PostRAScheduler; // default value is false
 
   bool CompleteModel;
Index: include/llvm/Target/TargetSchedule.td
===================================================================
--- include/llvm/Target/TargetSchedule.td
+++ include/llvm/Target/TargetSchedule.td
@@ -84,6 +84,7 @@
   int LoadLatency = -1; // Cycles for loads to access the cache.
   int HighLatency = -1; // Approximation of cycles for "high latency" ops.
   int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
+  int FdivLatency = -1; // Single-precision fdiv latency.
 
   // Per-cycle resources tables.
   ProcessorItineraries Itineraries = NoItineraries;
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -121,6 +121,7 @@
     const TargetMachine *TM;
     const TargetLowering *TLI;
     const TargetTransformInfo *TTI;
+    const TargetSubtargetInfo *STI;
     const TargetLibraryInfo *TLInfo;
 
     /// As we scan instructions optimizing them, this is the next instruction
@@ -214,8 +215,10 @@
   PromotedInsts.clear();
   ModifiedDT = false;
-  if (TM)
-    TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+  if (TM) {
+    STI = TM->getSubtargetImpl(F);
+    TLI = STI->getTargetLowering();
+  }
   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   OptSize = F.optForSize();
 
@@ -4500,6 +4503,7 @@
 
 /// Returns true if a SelectInst should be turned into an explicit branch.
 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
+                                                const TargetSubtargetInfo *STI,
                                                 SelectInst *SI) {
   // FIXME: This should use the same heuristics as IfConversion to determine
   // whether a select is better represented as a branch. This requires that
@@ -4507,11 +4511,31 @@
   // case currently.
 
   CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+  if (!Cmp)
+    return false;
+
+  Value *CmpOp0 = Cmp->getOperand(0);
+  Value *CmpOp1 = Cmp->getOperand(1);
+
+  // Emit "cmov on compare with an expensive operand" as a branch to avoid
+  // stalls on executing expensive instructions like division.
+  auto IsExpensiveCostInst = [&](Value *V) -> bool {
+    auto *I = dyn_cast<Instruction>(V);
+    if (I && I->getOpcode() == Instruction::FDiv &&
+        STI->getSchedModel().FdivLatency >
+            STI->getSchedModel().MispredictPenalty)
+      return true;
+
+    return false;
+  };
+
+  if (IsExpensiveCostInst(CmpOp0) || IsExpensiveCostInst(CmpOp1))
+    return true;
 
   // If a branch is predictable, an out-of-order CPU can avoid blocking on its
   // comparison condition. If the compare has more than one use, there's
   // probably another cmov or setcc around, so it's not worth emitting a branch.
-  if (!Cmp || !Cmp->hasOneUse())
+  if (!Cmp->hasOneUse())
     return false;
 
   // If either operand of the select is expensive and only needed on one side
@@ -4530,7 +4554,7 @@
   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
 
   // Can we convert the 'select' to CF ?
-  if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
+  if (DisableSelectToBranch || OptSize || !TLI || !STI || VectorCond)
     return false;
 
   TargetLowering::SelectSupportKind SelectKind;
@@ -4546,7 +4570,7 @@
     // We have efficient codegen support for the select instruction.
     // Check if it is profitable to keep this 'select'.
     if (!TLI->isPredictableSelectExpensive() ||
-        !isFormingBranchFromSelectProfitable(TTI, SI))
+        !isFormingBranchFromSelectProfitable(TTI, STI, SI))
       return false;
   }
 
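To make the new heuristic concrete, here is a hand-written LLVM IR sketch of the transform it enables (illustrative only; the function names are hypothetical, and the "after" shape is inferred from CodeGenPrepare's select.false/select.end convention, which the new test below also relies on). When the target reports FdivLatency > MispredictPenalty, a select guarded by a compare on an fdiv result is split into a branch, so a correctly predicted branch can proceed instead of a cmov stalling on the divide:

; Before CodeGenPrepare: the compare consumes a long-latency fdiv result.
define float @sel_on_fdiv(float %a, float %b) {
entry:
  %div = fdiv float %a, %b
  %cmp = fcmp ogt float %div, %b
  %sel = select i1 %cmp, float %div, float 8.000000e+00
  ret float %sel
}

; Expected shape after CodeGenPrepare (exact output may differ).
define float @sel_on_fdiv_branched(float %a, float %b) {
entry:
  %div = fdiv float %a, %b
  %cmp = fcmp ogt float %div, %b
  br i1 %cmp, label %select.end, label %select.false

select.false:                                     ; preds = %entry
  br label %select.end

select.end:                                       ; preds = %select.false, %entry
  %sel = phi float [ %div, %entry ], [ 8.000000e+00, %select.false ]
  ret float %sel
}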
Index: lib/MC/MCSchedule.cpp
===================================================================
--- lib/MC/MCSchedule.cpp
+++ lib/MC/MCSchedule.cpp
@@ -24,6 +24,7 @@
     DefaultLoadLatency,
     DefaultHighLatency,
     DefaultMispredictPenalty,
+    DefaultFdivLatency,
     false,
     true,
     0,
Index: lib/Target/AArch64/AArch64SchedA57.td
===================================================================
--- lib/Target/AArch64/AArch64SchedA57.td
+++ lib/Target/AArch64/AArch64SchedA57.td
@@ -26,6 +26,7 @@
   let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
   let LoadLatency = 4;         // Optimistic load latency
   let MispredictPenalty = 14;  // Fetch + Decode/Rename/Dispatch + Branch
+  let FdivLatency = 18;        // Single-precision fdiv latency
 
   // Enable partial & runtime unrolling. The magic number is chosen based on
   // experiments and benchmarking data.
Index: test/CodeGen/X86/machine-combiner.ll
===================================================================
--- test/CodeGen/X86/machine-combiner.ll
+++ test/CodeGen/X86/machine-combiner.ll
@@ -363,18 +363,18 @@
 define float @reassociate_mins_single(float %x0, float %x1, float %x2, float %x3) {
 ; SSE-LABEL: reassociate_mins_single:
 ; SSE:       # BB#0:
-; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    mulss %xmm1, %xmm0
 ; SSE-NEXT:    minss %xmm3, %xmm2
 ; SSE-NEXT:    minss %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: reassociate_mins_single:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vminss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-  %t0 = fdiv float %x0, %x1
+  %t0 = fmul float %x0, %x1
   %cmp1 = fcmp olt float %x2, %t0
   %sel1 = select i1 %cmp1, float %x2, float %t0
   %cmp2 = fcmp olt float %x3, %sel1
@@ -387,18 +387,18 @@
 define float @reassociate_maxs_single(float %x0, float %x1, float %x2, float %x3) {
 ; SSE-LABEL: reassociate_maxs_single:
 ; SSE:       # BB#0:
-; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    mulss %xmm1, %xmm0
 ; SSE-NEXT:    maxss %xmm3, %xmm2
 ; SSE-NEXT:    maxss %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: reassociate_maxs_single:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmaxss %xmm3, %xmm2, %xmm1
 ; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-  %t0 = fdiv float %x0, %x1
+  %t0 = fmul float %x0, %x1
   %cmp1 = fcmp ogt float %x2, %t0
   %sel1 = select i1 %cmp1, float %x2, float %t0
   %cmp2 = fcmp ogt float %x3, %sel1
@@ -411,18 +411,18 @@
 define double @reassociate_mins_double(double %x0, double %x1, double %x2, double %x3) {
 ; SSE-LABEL: reassociate_mins_double:
 ; SSE:       # BB#0:
-; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    minsd %xmm3, %xmm2
 ; SSE-NEXT:    minsd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: reassociate_mins_double:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vminsd %xmm3, %xmm2, %xmm1
 ; AVX-NEXT:    vminsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-  %t0 = fdiv double %x0, %x1
+  %t0 = fmul double %x0, %x1
   %cmp1 = fcmp olt double %x2, %t0
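The machine-combiner.ll updates above are fallout rather than new coverage: those tests exercise reassociation of min/max patterns, and their selects compared against an fdiv result, which is exactly the shape the new heuristic now splits into a branch before instruction selection can match minss/maxss. Switching the tests to fmul keeps the selects (and thus the reassociation being tested) intact. A reduced LLVM IR sketch of the pattern in question (hypothetical, not taken verbatim from the test file):

; With fdiv feeding the compare, CodeGenPrepare would now form a branch and
; no minss is matched; with fmul the select survives to instruction
; selection and minss is emitted.
define float @min_pattern(float %x0, float %x1, float %x2) {
  %t0 = fmul float %x0, %x1
  %cmp = fcmp olt float %x2, %t0
  %sel = select i1 %cmp, float %x2, float %t0
  ret float %sel
}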
   %sel1 = select i1 %cmp1, double %x2, double %t0
   %cmp2 = fcmp olt double %x3, %sel1
@@ -435,18 +435,18 @@
 define double @reassociate_maxs_double(double %x0, double %x1, double %x2, double %x3) {
 ; SSE-LABEL: reassociate_maxs_double:
 ; SSE:       # BB#0:
-; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    maxsd %xmm3, %xmm2
 ; SSE-NEXT:    maxsd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: reassociate_maxs_double:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmaxsd %xmm3, %xmm2, %xmm1
 ; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-  %t0 = fdiv double %x0, %x1
+  %t0 = fmul double %x0, %x1
   %cmp1 = fcmp ogt double %x2, %t0
   %sel1 = select i1 %cmp1, double %x2, double %t0
   %cmp2 = fcmp ogt double %x3, %sel1
Index: test/Transforms/CodeGenPrepare/X86/select.ll
===================================================================
--- test/Transforms/CodeGenPrepare/X86/select.ll
+++ test/Transforms/CodeGenPrepare/X86/select.ll
@@ -134,3 +134,18 @@
 ; CHECK: %sel = select i1 %cmp, i32 %div1, i32 %div2
 }
 
+; Nothing to sink here, but this gets converted to a branch because the
+; cmp's operand is an expensive instruction like division, and branching
+; avoids stalling an out-of-order CPU on the slow fdiv.
+
+define float @fdiv_do_transform(float %a, float %b) {
+entry:
+  %div = fdiv float %a, %b
+  %cmp = fcmp ogt float %div, %b
+  %sel = select i1 %cmp, float %div, float 8.0
+  ret float %sel
+
+; CHECK-LABEL: @fdiv_do_transform(
+; CHECK: br i1 %cmp, label %select.end, label %select.false
+}
+
Index: utils/TableGen/SubtargetEmitter.cpp
===================================================================
--- utils/TableGen/SubtargetEmitter.cpp
+++ utils/TableGen/SubtargetEmitter.cpp
@@ -1157,6 +1157,7 @@
   EmitProcessorProp(OS, PM.ModelDef, "LoadLatency", ',');
   EmitProcessorProp(OS, PM.ModelDef, "HighLatency", ',');
   EmitProcessorProp(OS, PM.ModelDef, "MispredictPenalty", ',');
+  EmitProcessorProp(OS, PM.ModelDef, "FdivLatency", ',');
   OS << "  " << (bool)(PM.ModelDef ?
                          PM.ModelDef->getValueAsBit("PostRAScheduler") : 0)
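A final note on coverage: the new select.ll test only checks the positive case. A hypothetical companion test (not part of this patch; the expected behavior is assumed from the existing heuristics, which keep a select whose compare operand is cheap) could pin down the negative side:

; Hypothetical negative test: no branch should be formed, because the
; compare operand is a cheap fmul rather than a long-latency fdiv.
define float @fmul_no_transform(float %a, float %b) {
entry:
  %mul = fmul float %a, %b
  %cmp = fcmp ogt float %mul, %b
  %sel = select i1 %cmp, float %mul, float 8.0
  ret float %sel

; CHECK-LABEL: @fmul_no_transform(
; CHECK: %sel = select i1 %cmp, float %mul
}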