Index: include/llvm/CodeGen/AsmPrinter.h
===================================================================
--- include/llvm/CodeGen/AsmPrinter.h
+++ include/llvm/CodeGen/AsmPrinter.h
@@ -23,9 +23,11 @@
 #include "llvm/CodeGen/DwarfStringPoolEntry.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cstdint>
 #include <memory>
 #include <utility>
@@ -112,6 +114,11 @@
   typedef std::pair<const GlobalVariable *, unsigned> GOTEquivUsePair;
   MapVector<const MCSymbol *, GOTEquivUsePair> GlobalGOTEquivs;
 
+  /// The target schedule model.
+  TargetSchedModel SchedModel;
+  /// Whether to print [latency:throughput] comments in the output .S file.
+  bool enablePrintSchedInfo = false;
+
 private:
   MCSymbol *CurrentFnBegin = nullptr;
   MCSymbol *CurrentFnEnd = nullptr;
Index: include/llvm/CodeGen/TargetSchedule.h
===================================================================
--- include/llvm/CodeGen/TargetSchedule.h
+++ include/llvm/CodeGen/TargetSchedule.h
@@ -181,6 +181,11 @@
   /// This is typically one cycle.
   unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefIdx,
                                 const MachineInstr *DepMI) const;
+  /// \brief Compute the reciprocal throughput of the given instruction.
+  /// \p UseDefaultThroughput is meant to select a default (table) value
+  /// instead of a computed one; the implementation does not act on it yet.
+  double computeInstrRThroughput(const MachineInstr *MI,
+                                   bool UseDefaultThroughput = true) const;
 };
 
 } // end namespace llvm
Index: include/llvm/MC/MCTargetOptions.h
===================================================================
--- include/llvm/MC/MCTargetOptions.h
+++ include/llvm/MC/MCTargetOptions.h
@@ -47,6 +47,7 @@
   bool ShowMCEncoding : 1;
   bool ShowMCInst : 1;
   bool AsmVerbose : 1;
+  bool PrintSchedule : 1;
 
   /// Preserve Comments in Assembly.
   bool PreserveAsmComments : 1;
@@ -80,6 +81,7 @@
           ARE_EQUAL(MCIncrementalLinkerCompatible) &&
           ARE_EQUAL(MCPIECopyRelocations) &&
           ARE_EQUAL(ShowMCEncoding) &&
+          ARE_EQUAL(PrintSchedule) &&
           ARE_EQUAL(ShowMCInst) &&
           ARE_EQUAL(AsmVerbose) &&
           ARE_EQUAL(DwarfVersion) &&
Index: include/llvm/Target/TargetSubtargetInfo.h
===================================================================
--- include/llvm/Target/TargetSubtargetInfo.h
+++ include/llvm/Target/TargetSubtargetInfo.h
@@ -143,6 +143,9 @@
   /// TargetLowering preference). It does not yet disable the postRA scheduler.
   virtual bool enableMachineScheduler() const;
 
+  /// \brief Print [latency:throughput] comments in the output .S file.
+  virtual bool enablePrintSchedInfo() const { return false; }
+
   /// \brief True if the machine scheduler should disable the TLI preference
   /// for preRA scheduling with the source level scheduler.
   virtual bool enableMachineSchedDefaultSched() const { return true; }
Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp
===================================================================
--- lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -46,6 +46,7 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
@@ -719,7 +720,8 @@
 }
 
 /// emitComments - Pretty-print comments for instructions.
-static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+static void emitComments(AsmPrinter *AP, const MachineInstr &MI,
+                         raw_ostream &CommentOS) {
   const MachineFunction *MF = MI.getParent()->getParent();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
@@ -752,6 +754,19 @@
   // Check for spill-induced copies
   if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
     CommentOS << " Reload Reuse\n";
+
+  if (AP->enablePrintSchedInfo) {
+    if (!(MI.isPseudo() || MI.isTerminator())) {
+      auto Latency = AP->SchedModel.computeInstrLatency(&MI);
+      auto RThroughput = AP->SchedModel.computeInstrRThroughput(&MI);
+      if (Latency > 0 && RThroughput != std::numeric_limits<double>::infinity())
+        CommentOS << "[" << Latency << format(":%2.2f",  RThroughput) << "]\n";
+      else if (Latency > 0)
+        CommentOS << "[" << Latency << ":?]\n";
+      else if (RThroughput != std::numeric_limits<double>::infinity())
+        CommentOS << "[?:" << RThroughput << "]\n";
+    }
+  }
 }
 
 /// emitImplicitDef - This method emits the specified machine instruction
@@ -965,7 +980,7 @@
       }
 
       if (isVerbose())
-        emitComments(MI, OutStreamer->GetCommentOS());
+        emitComments(this, MI, OutStreamer->GetCommentOS());
 
       switch (MI.getOpcode()) {
       case TargetOpcode::CFI_INSTRUCTION:
@@ -1380,8 +1395,15 @@
   }
 
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
-  if (isVerbose())
+  if (isVerbose()) {
     LI = &getAnalysis<MachineLoopInfo>();
+  }
+  if (TM.Options.MCOptions.PrintSchedule &&
+        MF.getSubtarget().enablePrintSchedInfo()) {
+    enablePrintSchedInfo = true;
+    const TargetSubtargetInfo &STI = MF.getSubtarget();
+    SchedModel.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
+  }
 }
 
 namespace {
Index: lib/CodeGen/TargetSchedule.cpp
===================================================================
--- lib/CodeGen/TargetSchedule.cpp
+++ lib/CodeGen/TargetSchedule.cpp
@@ -308,3 +308,48 @@
   }
   return 0;
 }
+
+/// Reciprocal throughput of \p MI: the largest Cycles / NumUnits ratio over
+/// the itinerary stages (if itineraries exist) or else over the write
+/// proc-resource entries of its scheduling class. Returns +infinity when the
+/// subtarget has neither model, or when a proc-resource entry has 0 cycles.
+/// NOTE(review): if nothing constrains the instruction (no stages/entries, or
+/// only zero-cycle stages) the result is 1 / infinity == 0.0, not +infinity.
+double
+TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI,
+                                          bool UseDefaultRThroughput) const {
+  // TODO: honour UseDefaultRThroughput with a table of default values, in the
+  // spirit of the ones published on http://www.agner.org/
+  (void)UseDefaultRThroughput;
+  const double Unknown = std::numeric_limits<double>::infinity();
+  const bool HasItineraries = hasInstrItineraries();
+  if (!HasItineraries && !hasInstrSchedModel())
+    return Unknown;
+
+  // Smallest units-per-cycle rate seen so far; it is inverted on return.
+  double Rate = Unknown;
+  if (HasItineraries) {
+    const unsigned ClassIdx = MI->getDesc().getSchedClass();
+    const auto *Itins = getInstrItineraries();
+    const InstrStage *Stage = Itins->beginStage(ClassIdx);
+    const InstrStage *const StageEnd = Itins->endStage(ClassIdx);
+    for (; Stage != StageEnd; ++Stage)
+      if (unsigned StageCycles = Stage->getCycles()) // skip zero-cycle stages
+        Rate = std::min(Rate,
+                        countPopulation(Stage->getUnits()) * 1.0 / StageCycles);
+    return 1 / Rate;
+  }
+
+  // Only the per-operand scheduling model is left at this point.
+  const auto *Desc = resolveSchedClass(MI);
+  const MCWriteProcResEntry *Entry = STI->getWriteProcResBegin(Desc);
+  const MCWriteProcResEntry *const EntryEnd = STI->getWriteProcResEnd(Desc);
+  for (; Entry != EntryEnd; ++Entry) {
+    if (Entry->Cycles == 0)
+      return Unknown;
+    const unsigned Units =
+        SchedModel.getProcResource(Entry->ProcResourceIdx)->NumUnits;
+    Rate = std::min(Rate, Units * 1.0 / Entry->Cycles);
+  }
+  return 1 / Rate;
+}
Index: lib/MC/MCTargetOptions.cpp
===================================================================
--- lib/MC/MCTargetOptions.cpp
+++ lib/MC/MCTargetOptions.cpp
@@ -18,6 +18,7 @@
       MCSaveTempLabels(false), MCUseDwarfDirectory(false),
       MCIncrementalLinkerCompatible(false), MCPIECopyRelocations(false),
       ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false),
+      PrintSchedule(false),
       PreserveAsmComments(true) {}
 
 StringRef MCTargetOptions::getABIName() const {
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -624,6 +624,8 @@
   /// Enable the MachineScheduler pass for all X86 subtargets.
   bool enableMachineScheduler() const override { return true; }
 
+  bool enablePrintSchedInfo() const override { return true; }
+
   bool enableEarlyIfConversion() const override;
 
   /// Return the instruction itineraries based on the subtarget selection.
Index: test/CodeGen/X86/recip-fastmath2.ll
===================================================================
--- test/CodeGen/X86/recip-fastmath2.ll
+++ test/CodeGen/X86/recip-fastmath2.ll
@@ -1,63 +1,124 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell    | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl        | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx        | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell    | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl        | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx        | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
 
 ; It's the extra tests coverage for recip as discussed on D26855.
 
+define float @f32_no_estimate_2(float %x) #0 {
+; SSE-LABEL: f32_no_estimate_2:
+; SSE:       # BB#0:
+; SSE-NEXT:    movss {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    divss %xmm0, %xmm1 # [10:?]
+; SSE-NEXT:    movaps %xmm1, %xmm0 # [1:?]
+; SSE-NEXT:    retq
+;
+; AVX-RECIP-LABEL: f32_no_estimate_2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:?]
+; AVX-RECIP-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [10:?]
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: f32_no_estimate_2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:?]
+; FMA-RECIP-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [10:?]
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: f32_no_estimate_2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [5:1.00]
+; BTVER2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [19:19.00]
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: f32_no_estimate_2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:0.50]
+; SANDY-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [12:1.00]
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: f32_no_estimate_2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:0.50]
+; HASWELL-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [12:1.00]
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: f32_no_estimate_2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [12:1.00]
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_no_estimate_2:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:0.50]
+; AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [12:1.00]
+; AVX512-NEXT:    retq
+  %div = fdiv fast float 1234.0, %x
+  ret float %div
+}
+
 define float @f32_no_step_2(float %x) #3 {
 ; SSE-LABEL: f32_no_step_2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpss %xmm0, %xmm0
-; SSE-NEXT:    mulss {{.*}}(%rip), %xmm0
+; SSE-NEXT:    rcpss %xmm0, %xmm0 # [1:?]
+; SSE-NEXT:    mulss {{.*}}(%rip), %xmm0 # [4:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: f32_no_step_2:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: f32_no_step_2:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_no_step_2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_no_step_2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_no_step_2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # [5:1.00]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_no_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; AVX512-LABEL: f32_no_step_2:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0 # [1:0.00]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; AVX512-NEXT:    retq
   %div = fdiv fast float 1234.0, %x
   ret float %div
@@ -66,174 +127,264 @@
 define float @f32_one_step_2(float %x) #1 {
 ; SSE-LABEL: f32_one_step_2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpss %xmm0, %xmm2
-; SSE-NEXT:    mulss %xmm2, %xmm0
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    subss %xmm0, %xmm1
-; SSE-NEXT:    mulss %xmm2, %xmm1
-; SSE-NEXT:    addss %xmm2, %xmm1
-; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    rcpss %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    mulss %xmm2, %xmm0 # [1:?]
+; SSE-NEXT:    movss {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    subss %xmm0, %xmm1 # [1:?]
+; SSE-NEXT:    mulss %xmm2, %xmm1 # [1:?]
+; SSE-NEXT:    addss %xmm2, %xmm1 # [1:?]
+; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    movaps %xmm1, %xmm0 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: f32_one_step_2:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:?]
+; AVX-RECIP-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: f32_one_step_2:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:?]
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_one_step_2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [5:1.00]
+; BTVER2-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [2:1.00]
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_one_step_2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:0.50]
+; SANDY-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_one_step_2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:0.00]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; AVX512-LABEL: f32_one_step_2:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # [1:0.00]
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:0.00]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:0.00]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; AVX512-NEXT:    retq
   %div = fdiv fast float 3456.0, %x
   ret float %div
 }
 
+define float @f32_no_estimate_2_divs(float %x) #0 {
+; SSE-LABEL: f32_no_estimate_2_divs:
+; SSE:       # BB#0:
+; SSE-NEXT:    movss {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    divss %xmm0, %xmm1 # [10:?]
+; SSE-NEXT:    movss {{.*}}(%rip), %xmm0 # [4:?]
+; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    mulss %xmm1, %xmm0 # [1:?]
+; SSE-NEXT:    mulss %xmm1, %xmm0 # [1:?]
+; SSE-NEXT:    retq
+;
+; AVX-RECIP-LABEL: f32_no_estimate_2_divs:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:?]
+; AVX-RECIP-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [10:?]
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:?]
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: f32_no_estimate_2_divs:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:?]
+; FMA-RECIP-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [10:?]
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:?]
+; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: f32_no_estimate_2_divs:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [5:1.00]
+; BTVER2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [19:19.00]
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [7:1.00]
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: f32_no_estimate_2_divs:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:0.50]
+; SANDY-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [12:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:1.00]
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:1.00]
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: f32_no_estimate_2_divs:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:0.50]
+; HASWELL-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [12:1.00]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: f32_no_estimate_2_divs:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [12:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_no_estimate_2_divs:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vmovss {{.*}}(%rip), %xmm1 # [4:0.50]
+; AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # [12:1.00]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
+; AVX512-NEXT:    retq
+  %div = fdiv fast float 3456.0, %x
+  %div2 = fdiv fast float %div, %x
+  ret float %div2
+}
+
 define float @f32_one_step_2_divs(float %x) #1 {
 ; SSE-LABEL: f32_one_step_2_divs:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpss %xmm0, %xmm1
-; SSE-NEXT:    mulss %xmm1, %xmm0
-; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT:    subss %xmm0, %xmm2
-; SSE-NEXT:    mulss %xmm1, %xmm2
-; SSE-NEXT:    addss %xmm1, %xmm2
-; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    mulss %xmm2, %xmm0
-; SSE-NEXT:    mulss %xmm2, %xmm0
+; SSE-NEXT:    rcpss %xmm0, %xmm1 # [1:?]
+; SSE-NEXT:    mulss %xmm1, %xmm0 # [1:?]
+; SSE-NEXT:    movss {{.*}}(%rip), %xmm2 # [4:?]
+; SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    subss %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    mulss %xmm1, %xmm2 # [1:?]
+; SSE-NEXT:    addss %xmm1, %xmm2 # [1:?]
+; SSE-NEXT:    movss {{.*}}(%rip), %xmm0 # [4:?]
+; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    mulss %xmm2, %xmm0 # [1:?]
+; SSE-NEXT:    mulss %xmm2, %xmm0 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: f32_one_step_2_divs:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:?]
+; AVX-RECIP-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:?]
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [1:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: f32_one_step_2_divs:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:?]
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [4:?]
+; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [1:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_one_step_2_divs:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [5:1.00]
+; BTVER2-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [2:1.00]
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [7:1.00]
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [2:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_one_step_2_divs:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:0.50]
+; SANDY-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:1.00]
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:0.00]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; AVX512-LABEL: f32_one_step_2_divs:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # [1:0.00]
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # [4:0.00]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # [1:0.00]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
 ; AVX512-NEXT:    retq
   %div = fdiv fast float 3456.0, %x
   %div2 = fdiv fast float %div, %x
@@ -243,210 +394,283 @@
 define float @f32_two_step_2(float %x) #2 {
 ; SSE-LABEL: f32_two_step_2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpss %xmm0, %xmm2
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    mulss %xmm2, %xmm3
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    movaps %xmm1, %xmm4
-; SSE-NEXT:    subss %xmm3, %xmm4
-; SSE-NEXT:    mulss %xmm2, %xmm4
-; SSE-NEXT:    addss %xmm2, %xmm4
-; SSE-NEXT:    mulss %xmm4, %xmm0
-; SSE-NEXT:    subss %xmm0, %xmm1
-; SSE-NEXT:    mulss %xmm4, %xmm1
-; SSE-NEXT:    addss %xmm4, %xmm1
-; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    rcpss %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    movaps %xmm0, %xmm3 # [1:?]
+; SSE-NEXT:    mulss %xmm2, %xmm3 # [1:?]
+; SSE-NEXT:    movss {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movaps %xmm1, %xmm4 # [1:?]
+; SSE-NEXT:    subss %xmm3, %xmm4 # [1:?]
+; SSE-NEXT:    mulss %xmm2, %xmm4 # [1:?]
+; SSE-NEXT:    addss %xmm2, %xmm4 # [1:?]
+; SSE-NEXT:    mulss %xmm4, %xmm0 # [1:?]
+; SSE-NEXT:    subss %xmm0, %xmm1 # [1:?]
+; SSE-NEXT:    mulss %xmm4, %xmm1 # [1:?]
+; SSE-NEXT:    addss %xmm4, %xmm1 # [1:?]
+; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    movaps %xmm1, %xmm0 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: f32_two_step_2:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # [1:?]
+; AVX-RECIP-NEXT:    vmovss {{.*}}(%rip), %xmm3 # [4:?]
+; AVX-RECIP-NEXT:    # xmm3 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # [1:?]
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # [1:?]
+; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: f32_two_step_2:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; FMA-RECIP-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [1:?]
+; FMA-RECIP-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:?]
+; FMA-RECIP-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:?]
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_two_step_2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    vmovss {{.*}}(%rip), %xmm3 # [5:1.00]
+; BTVER2-NEXT:    # xmm3 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [2:1.00]
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # [2:1.00]
+; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # [3:1.00]
+; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # [2:1.00]
+; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # [3:1.00]
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_two_step_2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # [5:1.00]
+; SANDY-NEXT:    vmovss {{.*}}(%rip), %xmm3 # [4:0.50]
+; SANDY-NEXT:    # xmm3 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # [3:1.00]
+; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # [5:1.00]
+; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # [3:1.00]
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_two_step_2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:0.50]
+; HASWELL-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:0.00]
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*}}(%rip), %xmm3 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    # xmm3 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; AVX512-LABEL: f32_two_step_2:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512-NEXT:    vmovaps %xmm1, %xmm3
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # [1:0.00]
+; AVX512-NEXT:    vmovss {{.*}}(%rip), %xmm2 # [4:0.50]
+; AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovaps %xmm1, %xmm3 # [1:1.00]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # [1:0.00]
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # [1:0.00]
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # [1:0.00]
+; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # [1:0.00]
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; AVX512-NEXT:    retq
   %div = fdiv fast float 6789.0, %x
   ret float %div
 }
 
+define <4 x float> @v4f32_no_estimate2(<4 x float> %x) #0 {
+; SSE-LABEL: v4f32_no_estimate2:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SSE-NEXT:    divps %xmm0, %xmm1 # [10:?]
+; SSE-NEXT:    movaps %xmm1, %xmm0 # [1:?]
+; SSE-NEXT:    retq
+;
+; AVX-RECIP-LABEL: v4f32_no_estimate2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vmovaps {{.*}}(%rip), %xmm1 # [4:?]
+; AVX-RECIP-NEXT:    # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; AVX-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # [10:?]
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v4f32_no_estimate2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vmovaps {{.*}}(%rip), %xmm1 # [4:?]
+; FMA-RECIP-NEXT:    # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; FMA-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # [10:?]
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v4f32_no_estimate2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm1 # [5:1.00]
+; BTVER2-NEXT:    # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # [19:19.00]
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v4f32_no_estimate2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vmovaps {{.*}}(%rip), %xmm1 # [4:0.50]
+; SANDY-NEXT:    # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # [12:1.00]
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v4f32_no_estimate2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vmovaps {{.*}}(%rip), %xmm1 # [4:0.50]
+; HASWELL-NEXT:    # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # [12:1.00]
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_no_estimate2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vmovaps {{.*}}(%rip), %xmm1 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # [12:1.00]
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v4f32_no_estimate2:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vmovaps {{.*}}(%rip), %xmm1 # [4:0.50]
+; AVX512-NEXT:    # xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # [12:1.00]
+; AVX512-NEXT:    retq
+  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
+  ret <4 x float> %div
+}
+
 define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; SSE-LABEL: v4f32_one_step2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    subps %xmm0, %xmm1
-; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    addps %xmm2, %xmm1
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    rcpps %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    mulps %xmm2, %xmm0 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    # xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    subps %xmm0, %xmm1 # [1:?]
+; SSE-NEXT:    mulps %xmm2, %xmm1 # [1:?]
+; SSE-NEXT:    addps %xmm2, %xmm1 # [1:?]
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    movaps %xmm1, %xmm0 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v4f32_one_step2:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmovaps {{.*}}(%rip), %xmm2 # [4:?]
+; AVX-RECIP-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v4f32_one_step2:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 # [4:?]
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_one_step2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm2 # [5:1.00]
+; BTVER2-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # [2:1.00]
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_one_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vmovaps {{.*}}(%rip), %xmm2 # [4:0.50]
+; SANDY-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_one_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_one_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:0.00]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # [1:0.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # [4:0.00]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
@@ -455,101 +679,106 @@
 define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; SSE-LABEL: v4f32_one_step_2_divs:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm0, %xmm1
-; SSE-NEXT:    mulps %xmm1, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    subps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm1, %xmm2
-; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    rcpps %xmm0, %xmm1 # [1:?]
+; SSE-NEXT:    mulps %xmm1, %xmm0 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm2 # [4:?]
+; SSE-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    subps %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    mulps %xmm1, %xmm2 # [1:?]
+; SSE-NEXT:    addps %xmm1, %xmm2 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm0 # [4:?]
+; SSE-NEXT:    # xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SSE-NEXT:    mulps %xmm2, %xmm0 # [1:?]
+; SSE-NEXT:    mulps %xmm2, %xmm0 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmovaps {{.*}}(%rip), %xmm2 # [4:?]
+; AVX-RECIP-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # [4:?]
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [1:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0 # [4:?]
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # [4:?]
+; FMA-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [1:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_one_step_2_divs:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm2 # [5:1.00]
+; BTVER2-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # [2:1.00]
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # [7:1.00]
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [2:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_one_step_2_divs:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vmovaps {{.*}}(%rip), %xmm2 # [4:0.50]
+; SANDY-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:1.00]
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:0.50]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_one_step_2_divs:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # [1:0.00]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:0.50]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_one_step_2_divs:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # [1:0.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # [4:0.00]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # [1:0.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # [9:0.50]
+; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:0.50]
 ; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   %div2 = fdiv fast <4 x float> %div, %x
@@ -559,230 +788,304 @@
 define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SSE-LABEL: v4f32_two_step2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm0, %xmm2
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    movaps %xmm1, %xmm4
-; SSE-NEXT:    subps %xmm3, %xmm4
-; SSE-NEXT:    mulps %xmm2, %xmm4
-; SSE-NEXT:    addps %xmm2, %xmm4
-; SSE-NEXT:    mulps %xmm4, %xmm0
-; SSE-NEXT:    subps %xmm0, %xmm1
-; SSE-NEXT:    mulps %xmm4, %xmm1
-; SSE-NEXT:    addps %xmm4, %xmm1
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    rcpps %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    movaps %xmm0, %xmm3 # [1:?]
+; SSE-NEXT:    mulps %xmm2, %xmm3 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    # xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    movaps %xmm1, %xmm4 # [1:?]
+; SSE-NEXT:    subps %xmm3, %xmm4 # [1:?]
+; SSE-NEXT:    mulps %xmm2, %xmm4 # [1:?]
+; SSE-NEXT:    addps %xmm2, %xmm4 # [1:?]
+; SSE-NEXT:    mulps %xmm4, %xmm0 # [1:?]
+; SSE-NEXT:    subps %xmm0, %xmm1 # [1:?]
+; SSE-NEXT:    mulps %xmm4, %xmm1 # [1:?]
+; SSE-NEXT:    addps %xmm4, %xmm1 # [1:?]
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    movaps %xmm1, %xmm0 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v4f32_two_step2:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # [1:?]
+; AVX-RECIP-NEXT:    vmovaps {{.*}}(%rip), %xmm3 # [4:?]
+; AVX-RECIP-NEXT:    # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # [1:?]
+; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v4f32_two_step2:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; FMA-RECIP-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1 # [1:?]
+; FMA-RECIP-NEXT:    vmovaps {{.*}}(%rip), %xmm2 # [4:?]
+; FMA-RECIP-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:?]
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [4:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_two_step2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm3 # [5:1.00]
+; BTVER2-NEXT:    # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # [2:1.00]
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # [2:1.00]
+; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # [3:1.00]
+; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # [2:1.00]
+; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # [3:1.00]
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [2:1.00]
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [7:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_two_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # [5:1.00]
+; SANDY-NEXT:    vmovaps {{.*}}(%rip), %xmm3 # [4:0.50]
+; SANDY-NEXT:    # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # [3:1.00]
+; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # [5:1.00]
+; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # [3:1.00]
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:1.00]
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_two_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:0.00]
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:0.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # [5:0.50]
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_two_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %xmm0, %xmm1
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; KNL-NEXT:    vmovaps %xmm1, %xmm3
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    vrcpps %xmm0, %xmm1 # [5:1.00]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; KNL-NEXT:    vmovaps %xmm1, %xmm3 # [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:0.00]
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:0.00]
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:0.00]
+; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:0.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v4f32_two_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; SKX-NEXT:    vmovaps %xmm1, %xmm3
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # [1:0.00]
+; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2 # [4:0.50]
+; SKX-NEXT:    vmovaps %xmm1, %xmm3 # [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # [1:0.00]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # [1:0.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # [1:0.00]
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # [1:0.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # [9:0.50]
 ; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
 
+define <8 x float> @v8f32_no_estimate2(<8 x float> %x) #0 {
+; SSE-LABEL: v8f32_no_estimate2:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm2 # [4:?]
+; SSE-NEXT:    # xmm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SSE-NEXT:    divps %xmm0, %xmm2 # [10:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm3 # [4:?]
+; SSE-NEXT:    # xmm3 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SSE-NEXT:    divps %xmm1, %xmm3 # [10:?]
+; SSE-NEXT:    movaps %xmm2, %xmm0 # [1:?]
+; SSE-NEXT:    movaps %xmm3, %xmm1 # [1:?]
+; SSE-NEXT:    retq
+;
+; AVX-RECIP-LABEL: v8f32_no_estimate2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vmovaps {{.*}}(%rip), %ymm1 # [4:?]
+; AVX-RECIP-NEXT:    # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # [10:?]
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_no_estimate2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vmovaps {{.*}}(%rip), %ymm1 # [4:?]
+; FMA-RECIP-NEXT:    # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # [10:?]
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_no_estimate2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm1 # [5:1.00]
+; BTVER2-NEXT:    # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # [19:19.00]
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_no_estimate2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vmovaps {{.*}}(%rip), %ymm1 # [4:0.50]
+; SANDY-NEXT:    # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # [12:1.00]
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_no_estimate2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vmovaps {{.*}}(%rip), %ymm1 # [4:0.50]
+; HASWELL-NEXT:    # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # [19:2.00]
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_no_estimate2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vmovaps {{.*}}(%rip), %ymm1 # [4:0.50]
+; HASWELL-NO-FMA-NEXT:    # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # [19:2.00]
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v8f32_no_estimate2:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vmovaps {{.*}}(%rip), %ymm1 # [4:0.50]
+; AVX512-NEXT:    # ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # [19:2.00]
+; AVX512-NEXT:    retq
+  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
+  ret <8 x float> %div
+}
+
 define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; SSE-LABEL: v8f32_one_step2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm1, %xmm4
-; SSE-NEXT:    mulps %xmm4, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm4, %xmm3
-; SSE-NEXT:    addps %xmm4, %xmm3
-; SSE-NEXT:    rcpps %xmm0, %xmm1
-; SSE-NEXT:    mulps %xmm1, %xmm0
-; SSE-NEXT:    subps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm1, %xmm2
-; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    rcpps %xmm1, %xmm4 # [1:?]
+; SSE-NEXT:    mulps %xmm4, %xmm1 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm2 # [4:?]
+; SSE-NEXT:    # xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    movaps %xmm2, %xmm3 # [1:?]
+; SSE-NEXT:    subps %xmm1, %xmm3 # [1:?]
+; SSE-NEXT:    mulps %xmm4, %xmm3 # [1:?]
+; SSE-NEXT:    addps %xmm4, %xmm3 # [1:?]
+; SSE-NEXT:    rcpps %xmm0, %xmm1 # [1:?]
+; SSE-NEXT:    mulps %xmm1, %xmm0 # [1:?]
+; SSE-NEXT:    subps %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    mulps %xmm1, %xmm2 # [1:?]
+; SSE-NEXT:    addps %xmm1, %xmm2 # [1:?]
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2 # [4:?]
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3 # [4:?]
+; SSE-NEXT:    movaps %xmm2, %xmm0 # [1:?]
+; SSE-NEXT:    movaps %xmm3, %xmm1 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v8f32_one_step2:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmovaps {{.*}}(%rip), %ymm2 # [4:?]
+; AVX-RECIP-NEXT:    # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_one_step2:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
-; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 # [4:?]
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_one_step2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm2 # [5:1.00]
+; BTVER2-NEXT:    # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # [2:1.00]
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [2:1.00]
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [2:1.00]
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_one_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # [5:1.00]
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [5:1.00]
+; SANDY-NEXT:    vmovaps {{.*}}(%rip), %ymm2 # [4:0.50]
+; SANDY-NEXT:    # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # [3:1.00]
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_one_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_one_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:0.00]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # [1:0.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # [4:0.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
@@ -791,110 +1094,116 @@
 define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; SSE-LABEL: v8f32_one_step_2_divs:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subps %xmm0, %xmm4
-; SSE-NEXT:    mulps %xmm2, %xmm4
-; SSE-NEXT:    addps %xmm2, %xmm4
-; SSE-NEXT:    rcpps %xmm1, %xmm0
-; SSE-NEXT:    mulps %xmm0, %xmm1
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm0, %xmm3
-; SSE-NEXT:    addps %xmm0, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; SSE-NEXT:    mulps %xmm3, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; SSE-NEXT:    mulps %xmm4, %xmm0
-; SSE-NEXT:    mulps %xmm4, %xmm0
-; SSE-NEXT:    mulps %xmm3, %xmm1
+; SSE-NEXT:    rcpps %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    mulps %xmm2, %xmm0 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm3 # [4:?]
+; SSE-NEXT:    # xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    movaps %xmm3, %xmm4 # [1:?]
+; SSE-NEXT:    subps %xmm0, %xmm4 # [1:?]
+; SSE-NEXT:    mulps %xmm2, %xmm4 # [1:?]
+; SSE-NEXT:    addps %xmm2, %xmm4 # [1:?]
+; SSE-NEXT:    rcpps %xmm1, %xmm0 # [1:?]
+; SSE-NEXT:    mulps %xmm0, %xmm1 # [1:?]
+; SSE-NEXT:    subps %xmm1, %xmm3 # [1:?]
+; SSE-NEXT:    mulps %xmm0, %xmm3 # [1:?]
+; SSE-NEXT:    addps %xmm0, %xmm3 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm1 # [4:?]
+; SSE-NEXT:    # xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SSE-NEXT:    mulps %xmm3, %xmm1 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm0 # [4:?]
+; SSE-NEXT:    # xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SSE-NEXT:    mulps %xmm4, %xmm0 # [1:?]
+; SSE-NEXT:    mulps %xmm4, %xmm0 # [1:?]
+; SSE-NEXT:    mulps %xmm3, %xmm1 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmovaps {{.*}}(%rip), %ymm2 # [4:?]
+; AVX-RECIP-NEXT:    # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # [4:?]
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [1:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
-; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0 # [4:?]
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # [4:?]
+; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [1:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_one_step_2_divs:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm2 # [5:1.00]
+; BTVER2-NEXT:    # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # [2:1.00]
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [2:1.00]
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [2:1.00]
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # [7:1.00]
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [2:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_one_step_2_divs:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # [5:1.00]
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [5:1.00]
+; SANDY-NEXT:    vmovaps {{.*}}(%rip), %ymm2 # [4:0.50]
+; SANDY-NEXT:    # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # [3:1.00]
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00]
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00]
+; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_one_step_2_divs:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # [1:0.00]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00]
+; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_one_step_2_divs:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # [1:0.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # [4:0.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # [1:0.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # [9:1.00]
+; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
 ; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   %div2 = fdiv fast <8 x float> %div, %x
@@ -904,142 +1213,147 @@
 define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SSE-LABEL: v8f32_two_step2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    rcpps %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm1, %xmm4
-; SSE-NEXT:    mulps %xmm3, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    movaps %xmm0, %xmm5
-; SSE-NEXT:    subps %xmm4, %xmm5
-; SSE-NEXT:    mulps %xmm3, %xmm5
-; SSE-NEXT:    addps %xmm3, %xmm5
-; SSE-NEXT:    mulps %xmm5, %xmm1
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm5, %xmm3
-; SSE-NEXT:    addps %xmm5, %xmm3
-; SSE-NEXT:    rcpps %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm2, %xmm4
-; SSE-NEXT:    mulps %xmm1, %xmm4
-; SSE-NEXT:    movaps %xmm0, %xmm5
-; SSE-NEXT:    subps %xmm4, %xmm5
-; SSE-NEXT:    mulps %xmm1, %xmm5
-; SSE-NEXT:    addps %xmm1, %xmm5
-; SSE-NEXT:    mulps %xmm5, %xmm2
-; SSE-NEXT:    subps %xmm2, %xmm0
-; SSE-NEXT:    mulps %xmm5, %xmm0
-; SSE-NEXT:    addps %xmm5, %xmm0
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
-; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    movaps %xmm0, %xmm2 # [1:?]
+; SSE-NEXT:    rcpps %xmm1, %xmm3 # [1:?]
+; SSE-NEXT:    movaps %xmm1, %xmm4 # [1:?]
+; SSE-NEXT:    mulps %xmm3, %xmm4 # [1:?]
+; SSE-NEXT:    movaps {{.*}}(%rip), %xmm0 # [4:?]
+; SSE-NEXT:    # xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    movaps %xmm0, %xmm5 # [1:?]
+; SSE-NEXT:    subps %xmm4, %xmm5 # [1:?]
+; SSE-NEXT:    mulps %xmm3, %xmm5 # [1:?]
+; SSE-NEXT:    addps %xmm3, %xmm5 # [1:?]
+; SSE-NEXT:    mulps %xmm5, %xmm1 # [1:?]
+; SSE-NEXT:    movaps %xmm0, %xmm3 # [1:?]
+; SSE-NEXT:    subps %xmm1, %xmm3 # [1:?]
+; SSE-NEXT:    mulps %xmm5, %xmm3 # [1:?]
+; SSE-NEXT:    addps %xmm5, %xmm3 # [1:?]
+; SSE-NEXT:    rcpps %xmm2, %xmm1 # [1:?]
+; SSE-NEXT:    movaps %xmm2, %xmm4 # [1:?]
+; SSE-NEXT:    mulps %xmm1, %xmm4 # [1:?]
+; SSE-NEXT:    movaps %xmm0, %xmm5 # [1:?]
+; SSE-NEXT:    subps %xmm4, %xmm5 # [1:?]
+; SSE-NEXT:    mulps %xmm1, %xmm5 # [1:?]
+; SSE-NEXT:    addps %xmm1, %xmm5 # [1:?]
+; SSE-NEXT:    mulps %xmm5, %xmm2 # [1:?]
+; SSE-NEXT:    subps %xmm2, %xmm0 # [1:?]
+; SSE-NEXT:    mulps %xmm5, %xmm0 # [1:?]
+; SSE-NEXT:    addps %xmm5, %xmm0 # [1:?]
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0 # [4:?]
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3 # [4:?]
+; SSE-NEXT:    movaps %xmm3, %xmm1 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v8f32_two_step2:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # [1:?]
+; AVX-RECIP-NEXT:    vmovaps {{.*}}(%rip), %ymm3 # [4:?]
+; AVX-RECIP-NEXT:    # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # [1:?]
+; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_two_step2:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; FMA-RECIP-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1 # [1:?]
+; FMA-RECIP-NEXT:    vmovaps {{.*}}(%rip), %ymm2 # [4:?]
+; FMA-RECIP-NEXT:    # ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:?]
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:?]
+; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:?]
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_two_step2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm3 # [5:1.00]
+; BTVER2-NEXT:    # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # [2:1.00]
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # [2:1.00]
+; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # [3:1.00]
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # [2:1.00]
+; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # [3:1.00]
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [2:1.00]
+; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [2:1.00]
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_two_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # [5:1.00]
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # [5:1.00]
+; SANDY-NEXT:    vmovaps {{.*}}(%rip), %ymm3 # [4:0.50]
+; SANDY-NEXT:    # ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # [3:1.00]
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # [5:1.00]
+; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # [3:1.00]
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [5:1.00]
+; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # [3:1.00]
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_two_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NEXT:    vmovaps %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # [1:1.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:0.00]
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:0.00]
+; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:0.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # [5:1.00]
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # [3:1.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_two_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm1
-; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; KNL-NEXT:    vmovaps %ymm1, %ymm3
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vrcpps %ymm0, %ymm1 # [7:2.00]
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; KNL-NEXT:    vmovaps %ymm1, %ymm3 # [1:1.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:0.00]
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:0.00]
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:0.00]
+; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:0.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_two_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; SKX-NEXT:    vmovaps %ymm1, %ymm3
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # [1:0.00]
+; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2 # [5:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm3 # [1:1.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # [1:0.00]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # [1:0.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # [1:0.00]
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # [1:0.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
@@ -1048,48 +1362,48 @@
 define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
 ; SSE-LABEL: v8f32_no_step:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm0, %xmm0
-; SSE-NEXT:    rcpps %xmm1, %xmm1
+; SSE-NEXT:    rcpps %xmm0, %xmm0 # [1:?]
+; SSE-NEXT:    rcpps %xmm1, %xmm1 # [1:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v8f32_no_step:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0 # [1:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_no_step:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0 # [1:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_no_step:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm0
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # [2:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_no_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm0
+; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # [5:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_no_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # [7:2.00]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # [7:2.00]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_no_step:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm0
+; KNL-NEXT:    vrcpps %ymm0, %ymm0 # [7:2.00]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_no_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm0 # [1:0.00]
 ; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
@@ -1098,58 +1412,58 @@
 define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
 ; SSE-LABEL: v8f32_no_step2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm1, %xmm1
-; SSE-NEXT:    rcpps %xmm0, %xmm0
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
+; SSE-NEXT:    rcpps %xmm1, %xmm1 # [1:?]
+; SSE-NEXT:    rcpps %xmm0, %xmm0 # [1:?]
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0 # [4:?]
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1 # [4:?]
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v8f32_no_step2:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0 # [1:?]
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:?]
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_no_step2:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0 # [1:?]
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [4:?]
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_no_step2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm0
-; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # [2:1.00]
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [7:1.00]
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_no_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # [5:1.00]
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_no_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # [7:2.00]
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # [7:2.00]
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_no_step2:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vrcpps %ymm0, %ymm0
-; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vrcpps %ymm0, %ymm0 # [7:2.00]
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: v8f32_no_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm0 # [1:0.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # [9:1.00]
 ; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
Index: test/CodeGen/X86/recip-pic.ll
===================================================================
--- test/CodeGen/X86/recip-pic.ll
+++ test/CodeGen/X86/recip-pic.ll
@@ -1,27 +1,110 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu  -enable-unsafe-fp-math -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=slm -relocation-model=pic -print-schedule | FileCheck %s --check-prefix=CHECK
 
-define fastcc float @foo(float %x) unnamed_addr #0 {
-; CHECK-LABEL: foo:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    calll .L0$pb
+define float @f32_no_estimate_2(float %x) #0 {
+; CHECK-LABEL: f32_no_estimate_2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax # [1:1.00]
 ; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    calll .L0$pb # [1:0.00]
+; CHECK-NEXT:  .Lcfi1:
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
 ; CHECK-NEXT:  .L0$pb:
 ; CHECK-NEXT:    popl %eax
-; CHECK-NEXT:  .Lcfi1:
+; CHECK-NEXT:  .Lcfi2:
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
-; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:  .Ltmp0: # [1:0.50]
 ; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
-; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    divss %xmm0, %xmm1
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    movss %xmm1, (%eax)
+; CHECK-NEXT:    movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm0 # [3:1.00]
+; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss {{[0-9]+}}(%esp), %xmm0 # [37:34.00]
+; CHECK-NEXT:    movss %xmm0, (%eax) # [1:1.00]
+; CHECK-NEXT:    movss %xmm0, (%esp) # [1:1.00]
+; CHECK-NEXT:    flds (%esp) # [3:1.00]
+; CHECK-NEXT:    popl %eax # [3:1.00]
 ; CHECK-NEXT:    retl
-entry:
   %div = fdiv fast float 3.0, %x
   store float %div, float* undef, align 4
   ret float %div
 }
 
+define float @f32_one_step(float %x) #1 {
+; CHECK-LABEL: f32_one_step:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax # [1:1.00]
+; CHECK-NEXT:  .Lcfi3:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # [3:1.00]
+; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    calll .L1$pb # [1:0.00]
+; CHECK-NEXT:  .Lcfi4:
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:  .L1$pb:
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:  .Lcfi5:
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-NEXT:  .Ltmp1: # [1:0.50]
+; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L1$pb), %eax
+; CHECK-NEXT:    movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm2 # [3:1.00]
+; CHECK-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    rcpss %xmm0, %xmm1 # [5:1.00]
+; CHECK-NEXT:    mulss %xmm1, %xmm0 # [5:2.00]
+; CHECK-NEXT:    subss %xmm0, %xmm2 # [3:1.00]
+; CHECK-NEXT:    mulss %xmm1, %xmm2 # [5:2.00]
+; CHECK-NEXT:    addss %xmm1, %xmm2 # [3:1.00]
+; CHECK-NEXT:    mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm2 # [8:2.00]
+; CHECK-NEXT:    movss %xmm2, (%eax) # [1:1.00]
+; CHECK-NEXT:    movss %xmm2, (%esp) # [1:1.00]
+; CHECK-NEXT:    flds (%esp) # [3:1.00]
+; CHECK-NEXT:    popl %eax # [3:1.00]
+; CHECK-NEXT:    retl
+  %div = fdiv fast float 3.0, %x
+  store float %div, float* undef, align 4
+  ret float %div
+}
+
+define float @f32_two_steps(float %x) #2 {
+; CHECK-LABEL: f32_two_steps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax # [1:1.00]
+; CHECK-NEXT:  .Lcfi6:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # [3:1.00]
+; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    calll .L2$pb # [1:0.00]
+; CHECK-NEXT:  .Lcfi7:
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:  .L2$pb:
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:  .Lcfi8:
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-NEXT:  .Ltmp2: # [1:0.50]
+; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L2$pb), %eax
+; CHECK-NEXT:    movss {{\.LCPI.*}}@GOTOFF(%eax), %xmm3 # [3:1.00]
+; CHECK-NEXT:    # xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    rcpss %xmm0, %xmm1 # [5:1.00]
+; CHECK-NEXT:    movaps %xmm0, %xmm2 # [1:1.00]
+; CHECK-NEXT:    movaps %xmm3, %xmm4 # [1:1.00]
+; CHECK-NEXT:    mulss %xmm1, %xmm2 # [5:2.00]
+; CHECK-NEXT:    subss %xmm2, %xmm4 # [3:1.00]
+; CHECK-NEXT:    mulss %xmm1, %xmm4 # [5:2.00]
+; CHECK-NEXT:    addss %xmm1, %xmm4 # [3:1.00]
+; CHECK-NEXT:    mulss %xmm4, %xmm0 # [5:2.00]
+; CHECK-NEXT:    subss %xmm0, %xmm3 # [3:1.00]
+; CHECK-NEXT:    mulss %xmm4, %xmm3 # [5:2.00]
+; CHECK-NEXT:    addss %xmm4, %xmm3 # [3:1.00]
+; CHECK-NEXT:    mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm3 # [8:2.00]
+; CHECK-NEXT:    movss %xmm3, (%eax) # [1:1.00]
+; CHECK-NEXT:    movss %xmm3, (%esp) # [1:1.00]
+; CHECK-NEXT:    flds (%esp) # [3:1.00]
+; CHECK-NEXT:    popl %eax # [3:1.00]
+; CHECK-NEXT:    retl
+  %div = fdiv fast float 3.0, %x
+  store float %div, float* undef, align 4
+  ret float %div
+}
 
+attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
+attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
+attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
Index: tools/llc/llc.cpp
===================================================================
--- tools/llc/llc.cpp
+++ tools/llc/llc.cpp
@@ -99,6 +99,10 @@
 static cl::opt<bool> ShowMCEncoding("show-mc-encoding", cl::Hidden,
                                     cl::desc("Show encoding in .s output"));
 
+static cl::opt<bool>
+    PrintSchedule("print-schedule", cl::Hidden, cl::init(false),
+                  cl::desc("Print [latency:throughput] in .s output"));
+
 static cl::opt<bool> EnableDwarfDirectory(
     "enable-dwarf-directory", cl::Hidden,
     cl::desc("Use .file directives with an explicit directory."));
@@ -446,6 +450,7 @@
   TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
   Options.DisableIntegratedAS = NoIntegratedAssembler;
   Options.MCOptions.ShowMCEncoding = ShowMCEncoding;
+  Options.MCOptions.PrintSchedule = PrintSchedule;
   Options.MCOptions.MCUseDwarfDirectory = EnableDwarfDirectory;
   Options.MCOptions.AsmVerbose = AsmVerbose;
   Options.MCOptions.PreserveAsmComments = PreserveComments;