Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -71,7 +71,10 @@
   FMLSv2f32_OP2,
   FMLSv2f64_OP2,
   FMLSv4i32_indexed_OP2,
-  FMLSv4f32_OP2
+  FMLSv4f32_OP2,
+
+  // The FDIV-to-reciprocal-estimate pattern matched by the X86 machine
+  // combiner.
+  Div2RecipEst
 };
 
 } // end namespace llvm
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -286,7 +286,8 @@
   /// based on the function's attributes. If the operation is not overridden by
   /// the function's attributes, "Unspecified" is returned and target defaults
   /// are expected to be used for instruction selection.
-  int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF) const;
+  virtual int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF,
+                                         bool forDAG = true) const;
 
   /// Return the refinement step count for a square root of the given type based
   /// on the function's attributes. If the operation is not overridden by
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -155,9 +155,16 @@
         assert(DefInstr &&
                "There must be a definition for a new virtual register");
         DepthOp = InstrDepth[II->second];
-        LatencyOp = TSchedModel.computeOperandLatency(
-            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
-            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
+        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
+        assert((DefIdx >= 0 || UseIdx >= 0) && "Invalid register usage");
+        if (DefIdx < 0 || UseIdx < 0)
+          // Without both def and use operand indexes we cannot query the
+          // scheduling model, so fall back to the default def latency.
+          LatencyOp = TII->defaultDefLatency(SchedModel, *DefInstr);
+        else
+          LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
+                                                        InstrPtr, UseIdx);
       } else {
         MachineInstr *DefInstr = getOperandDef(MO);
         if (DefInstr) {
@@ -267,8 +274,12 @@
   // dependency cycles) in the critical path to proceed with the transform.
   // Being conservative also protects against inaccuracies in the underlying
   // machine trace metrics and CPU models.
-  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
+  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
+    DEBUG(dbgs() << "It MustReduceDepth ");
+    DEBUG(NewRootDepth < RootDepth ? dbgs() << "and it does it\n"
+                                   : dbgs() << "but it does NOT do it\n");
     return NewRootDepth < RootDepth;
+  }
 
   // A more flexible cost calculation for the critical path includes the slack
   // of the original code sequence. This may allow the transform to proceed
@@ -282,16 +293,18 @@
 
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
 
+  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
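+  // The transform is considered profitable when the new critical path through
+  // the root (depth plus latency) does not exceed the original one padded
+  // with the root's slack.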
+
   DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
         dbgs() << " RootLatency: " << RootLatency << "\n";
         dbgs() << " RootSlack: " << RootSlack << "\n";
-        dbgs() << " NewRootDepth + NewRootLatency = "
-               << NewRootDepth + NewRootLatency << "\n";
-        dbgs() << " RootDepth + RootLatency + RootSlack = "
-               << RootDepth + RootLatency + RootSlack << "\n";);
-
-  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
-  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
+        dbgs() << " NewRootDepth + NewRootLatency = " << NewCycleCount << "\n";
+        dbgs() << " RootDepth + RootLatency + RootSlack = " << OldCycleCount
+               << "\n";);
+  DEBUG(NewCycleCount <= OldCycleCount
+            ? dbgs() << "It improves PathLen\n"
+            : dbgs() << "It does NOT improve PathLen\n");
 
   return NewCycleCount <= OldCycleCount;
 }
@@ -340,6 +353,9 @@
   DEBUG(dbgs() << "RESOURCE DATA: \n";
         dbgs() << " resource len before: " << ResLenBeforeCombine
                << " after: " << ResLenAfterCombine << "\n";);
+  DEBUG(ResLenAfterCombine <= ResLenBeforeCombine
+            ? dbgs() << "It preserves ResourceLen\n"
+            : dbgs() << "It does NOT preserve ResourceLen\n");
 
   return ResLenAfterCombine <= ResLenBeforeCombine;
 }
Index: lib/CodeGen/MachineTraceMetrics.cpp
===================================================================
--- lib/CodeGen/MachineTraceMetrics.cpp
+++ lib/CodeGen/MachineTraceMetrics.cpp
@@ -518,6 +518,7 @@
 }
 
 /// Invalidate traces through BadMBB.
+// TODO: Refactor this code; invalidating traces this way is a significant
+// compile-time cost.
 void
 MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
   SmallVector<const MachineBasicBlock*, 16> WorkList;
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -2078,8 +2078,9 @@
   return getOpEnabled(true, VT, getRecipEstimateForFunc(MF));
 }
 
-int TargetLoweringBase::getRecipEstimateDivEnabled(EVT VT,
-                                                   MachineFunction &MF) const {
+int TargetLoweringBase::getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF,
+                                                   bool forDAG) const {
   return getOpEnabled(false, VT, getRecipEstimateForFunc(MF));
 }
 
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1285,6 +1285,14 @@
                             int &RefinementSteps, bool &UseOneConstNR,
                             bool Reciprocal) const override;
 
+    /// Return a ReciprocalEstimate enum value for a division of the given
+    /// type based on the function's attributes. If the operation is not
+    /// overridden by the function's attributes, "Unspecified" is returned and
+    /// target defaults are expected to be used for instruction selection.
+    int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF,
+                                   bool forDAG) const override;
+
     /// Use rcp* to speed up fdiv calculations.
     SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                              int &RefinementSteps) const override;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -16475,6 +16475,17 @@
   return SDValue();
 }
 
+/// Return a ReciprocalEstimate enum value for a division of the given type
+/// based on the function's attributes. If the operation is not overridden by
+/// the function's attributes, "Unspecified" is returned and target defaults
+/// are expected to be used for instruction selection.
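+///
+/// When queried on behalf of the DAGCombiner (forDAG == true), the estimate
+/// is reported as Disabled on pre-AVX512 targets so that the fdiv survives
+/// instruction selection and the machine combiner, which can weigh the
+/// expansion against the scheduling model, decides instead.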
+int X86TargetLowering::getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF,
+                                                  bool forDAG) const {
+  if (!Subtarget.hasAVX512() && forDAG)
+    return ReciprocalEstimate::Disabled;
+  return TargetLoweringBase::getRecipEstimateDivEnabled(VT, MF);
+}
+
 /// The minimum architected relative accuracy is 2^-12. We need one
 /// Newton-Raphson step to have a good float result (24 bits of precision).
 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -501,6 +501,21 @@
     return true;
   }
 
+  /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence.
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in \p Root. All potential patterns are
+  /// listed in the \p Patterns array.
+  bool getMachineCombinerPatterns(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;
+
   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
 
   bool hasReassociableOperands(const MachineInstr &Inst,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -10102,6 +10102,383 @@
   }
 }
 
+// The dividend may itself be the constant 1.0; in that case we should not
+// materialize an additional 1.0 constant for the reciprocal refinement but
+// reuse the dividend register instead. Look through the dividend's definition
+// and report whether it loads a (splat) constant that is exactly 1.0.
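+//
+// For example (a sketch; operand details elided), the f32 expansion of
+// "1.0f / x" loads its dividend from the constant pool:
+//   %vreg1 = VMOVSSrm %RIP, 1, %noreg, <cp#0 float 1.0>, %noreg
+//   %vreg2 = VDIVSSrr %vreg1, %vreg0
+// so isDividendExactlyOne(MF, %vreg1) returns true and the refinement below
+// can use %vreg1 directly as its 1.0 operand.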
+static bool isDividendExactlyOne(MachineFunction &MF, unsigned DividendReg) {
+  if (MachineInstr *MI = MF.getRegInfo().getUniqueVRegDef(DividendReg)) {
+    const auto &Constants = MF.getConstantPool()->getConstants();
+    for (auto &MO : MI->operands()) {
+      if (MO.isCPI()) {
+        // We have a Constant Pool Index operand in this instruction
+        // FIXME: should we deal with other types of operand like Immediate?
+        auto ConstantEntry = Constants[MO.getIndex()];
+        // FIXME: what should we do with MachineConstantPoolEntry?
+        if (!ConstantEntry.isMachineConstantPoolEntry()) {
+          if (auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal)) {
+            if (C->getType()->isVectorTy()) {
+              if (!(C = C->getSplatValue()))
+                return false;
+            }
+            if (auto *CFP = dyn_cast<ConstantFP>(C))
+              return CFP->isExactlyValue(1.0);
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+static EVT getFDivEVT(MachineInstr &Root) {
+  // FIXME: should we support other kinds of DIV?
+  switch (Root.getOpcode()) {
+  default:
+    break;
+  case X86::DIVSSrr:  // f32
+  case X86::VDIVSSrr: // f32
+    return MVT::f32;
+  case X86::DIVPSrr:  // v4f32
+  case X86::VDIVPSrr: // v4f32
+    return MVT::v4f32;
+  case X86::VDIVPSYrr: // v8f32
+    return MVT::v8f32;
+  }
+  return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
+
+/// genReciprocalDiv - Generates A = B * 1/C instead of A = B/C
+/// (at the moment we support float types only: f32, v4f32 and v8f32)
+/// TODO: Should we support double types for the latest CPUs?
+///
+/// To get more precision we refine the estimate with Newton-Raphson steps:
+///
+/// X[0] = reciprocal (C);
+/// X[i+1] = X[i] + X[i] * (1 - C * X[i]); every iteration increases precision
+///
+/// In theory if we know that X[0] is accurate to N bits, the result of
+/// iteration k will be accurate to almost 2^k*N bits. For x86 it means:
+/// X[0] = 11 bits
+/// X[1] = 22 bits
+/// X[2] = 44 bits
+/// etc.
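+///
+/// Note that X[i] + X[i] * (1 - C * X[i]) == X[i] * (2 - C * X[i]), the
+/// classic Newton-Raphson update for f(X) = 1/X - C; the additive form maps
+/// directly onto the sub/mul/add (or FNMADD/FMADD) sequences emitted below.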
+///
+/// The final division result is then computed as A = B * X.
+/// Example (-x86-asm-syntax=intel): instead of
+///
+///   vmovss  xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero
+///   vdivss  xmm0, xmm1, xmm0
+///
+/// we're generating
+///
+///   vmovss  xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero
+///   vrcpss  xmm2, xmm0, xmm0
+///   vmulss  xmm0, xmm0, xmm2
+///   vsubss  xmm0, xmm1, xmm0
+///   vmulss  xmm0, xmm0, xmm2
+///   vaddss  xmm0, xmm0, xmm2
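+///
+/// One way to exercise this path (a sketch; exact flags may vary by clang
+/// version): compile an f32 division with "clang -O2 -mrecip=divf:1", which
+/// enables the scalar-float reciprocal estimate with one refinement step.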
+
+#define FMA_INDEX 7
+
+static void genReciprocalDiv(MachineInstr &Root,
+                             SmallVectorImpl<MachineInstr *> &InsInstrs,
+                             DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                             ArrayRef<int> Instrs, Type *Ty) {
+
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetLowering *TLI = Subtarget.getTargetLowering();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+  int Iterations = TLI->getDivRefinementSteps(getFDivEVT(Root), MF);
+
+  bool HasFMA = Subtarget.hasFMA();
+  assert((!HasFMA || Instrs.size() > FMA_INDEX) && "FMA opcodes are missing");
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned DividendReg = Root.getOperand(1).getReg();
+  bool DividendIsExactlyOne = isDividendExactlyOne(MF, DividendReg);
+  unsigned DividerReg = Root.getOperand(2).getReg();
+  bool DividerIsKill = Root.getOperand(2).isKill();
+
+  if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+    MRI.constrainRegClass(ResultReg, RC);
+  if (TargetRegisterInfo::isVirtualRegister(DividendReg))
+    MRI.constrainRegClass(DividendReg, RC);
+  if (TargetRegisterInfo::isVirtualRegister(DividerReg))
+    MRI.constrainRegClass(DividerReg, RC);
+
+  if (Iterations < 0) // All values >= 0 mean Iterations was set explicitly;
+    Iterations = 1;   // otherwise use the default of one refinement step.
+
+  // The numbered bullets below (0, 1, 2, ...) are indexes into the input
+  // Instrs array; genAlternativeCodeSequence defines the opcode each index
+  // holds.
+  // 0: rcp
+  // The initial estimate X[0] is the reciprocal of C.
+  MachineInstrBuilder RcpMI;
+  // If DivIsRcp is true, the division result is just the rcp estimate and
+  // needs no further refinement.
+  bool DivIsRcp = DividendIsExactlyOne && !Iterations;
+
+  unsigned RcpVReg = 0;
+  if (!DivIsRcp) {
+    // Refinement is needed, so the estimate is written to a fresh vreg first.
+    RcpVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(RcpVReg, 0));
+  }
+  if (Instrs[0] == X86::VRCPSSr)
+    RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]),
+                    DivIsRcp ? ResultReg : RcpVReg)
+                .addReg(DividerReg, getKillRegState(DividerIsKill))
+                .addReg(DividerReg, getKillRegState(DividerIsKill));
+  else
+    RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]),
+                    DivIsRcp ? ResultReg : RcpVReg)
+                .addReg(DividerReg, getKillRegState(DividerIsKill));
+  InsInstrs.push_back(RcpMI);
+
+  unsigned LoadVReg = 0;
+  if (!DividendIsExactlyOne && Iterations) {
+    // 2: load (mov)
+    // We need the constant 1.0 to be able to compute (1 - C * X[i]).
+    // x86-32 PIC requires a PIC base register for constant pools.
+    unsigned PICBase = 0;
+    if (MF.getTarget().isPositionIndependent()) {
+      if (Subtarget.is64Bit())
+        PICBase = X86::RIP;
+      else if (Subtarget.is32Bit())
+        PICBase = X86::EIP;
+      else
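+        // FIXME: Bailing out here leaves the rcp instruction already pushed
+        // to InsInstrs; the caller assumes a complete alternative sequence.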
+        return;
+    }
+    // Create a constant-pool entry holding 1.0 (a splat for vector types).
+    MachineConstantPool &MCP = *MF.getConstantPool();
+    auto *CFP = ConstantFP::get(Ty, 1.0);
+    // Align the entry to the full type width: the vector forms below use
+    // aligned loads (e.g. VMOVAPSrm), which require it.
+    unsigned CPI =
+        MCP.getConstantPoolIndex(CFP, Ty->getPrimitiveSizeInBits() / 8);
+    LoadVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(LoadVReg, 0));
+
+    MachineInstrBuilder LoadMI;
+    auto &MIDesc = TII->get(Instrs[2]);
+    if (MIDesc.getNumOperands() == 6)
+      LoadMI = BuildMI(MF, Root.getDebugLoc(), MIDesc, LoadVReg)
+                   .addReg(PICBase)
+                   .addImm(1)
+                   .addReg(0)
+                   .addConstantPoolIndex(CPI)
+                   .addReg(0);
+    else
+      LoadMI = BuildMI(MF, Root.getDebugLoc(), MIDesc, LoadVReg)
+                   .addConstantPoolIndex(CPI);
+    InsInstrs.push_back(LoadMI);
+  }
+  unsigned EstVReg = RcpVReg; // X[0] = reciprocal (C);
+
+  for (int i = 0; i < Iterations; i++) {
+    if (HasFMA) {
+      // 7: fnmadd
+      unsigned NFmaVReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg.insert(std::make_pair(NFmaVReg, 0));
+      MachineInstrBuilder NFmaMI =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[FMA_INDEX]), NFmaVReg)
+              .addReg(DividerReg, getKillRegState(DividerIsKill))
+              .addReg(DividendIsExactlyOne ? DividendReg : LoadVReg)
+              .addReg(EstVReg);
+      InsInstrs.push_back(NFmaMI); // 1 - C * X[i]
+      // 8: fmadd
+      MachineInstrBuilder FmaMI;
+      unsigned FmaVReg;
+      if (DividendIsExactlyOne && (i + 1 == Iterations))
+        FmaVReg = ResultReg;
+      else {
+        FmaVReg = MRI.createVirtualRegister(RC);
+        InstrIdxForVirtReg.insert(std::make_pair(FmaVReg, 0));
+      }
+      FmaMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[FMA_INDEX + 1]),
+                      FmaVReg)
+                  .addReg(EstVReg)
+                  .addReg(EstVReg)
+                  .addReg(NFmaVReg);
+      InsInstrs.push_back(FmaMI); // X[i] + X[i] * (1 - C * X[i])
+      EstVReg = FmaVReg;
+    } else {
+      // 1: mul
+      unsigned MulVReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg.insert(std::make_pair(MulVReg, 0));
+      MachineInstrBuilder MulMI =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), MulVReg)
+              .addReg(DividerReg, getKillRegState(DividerIsKill))
+              .addReg(EstVReg);
+      InsInstrs.push_back(MulMI); // C * X[i]
+
+      // 3: sub
+      unsigned SubVReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg.insert(std::make_pair(SubVReg, 0));
+      MachineInstrBuilder SubMI =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[3]), SubVReg)
+              .addReg(DividendIsExactlyOne ? DividendReg : LoadVReg)
+              .addReg(MulVReg);
+      InsInstrs.push_back(SubMI); // 1 - C * X[i]
+
+      // 4: mul2
+      unsigned Mul2VReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg.insert(std::make_pair(Mul2VReg, 0));
+      MachineInstrBuilder Mul2MI =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[4]), Mul2VReg)
+              .addReg(SubVReg)
+              .addReg(EstVReg);
+      InsInstrs.push_back(Mul2MI); // X[i] * (1 - C * X[i])
+
+      // 5: add
+      MachineInstrBuilder AddMI;
+      unsigned AddVReg;
+      if (DividendIsExactlyOne && (i + 1 == Iterations))
+        AddVReg = ResultReg;
+      else {
+        AddVReg = MRI.createVirtualRegister(RC);
+        InstrIdxForVirtReg.insert(std::make_pair(AddVReg, 0));
+      }
+      AddMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]), AddVReg)
+                  .addReg(Mul2VReg)
+                  .addReg(EstVReg);
+      InsInstrs.push_back(AddMI); // X[i] + X[i] * (1 - C * X[i])
+      EstVReg = AddVReg;
+    }
+  }
+  if (!DividendIsExactlyOne) {
+    // 6: result mul
+    // The final multiplication B * 1/C
+    MachineInstrBuilder ResultMulMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[6]), ResultReg)
+            .addReg(DividendReg)
+            .addReg(EstVReg);
+    InsInstrs.push_back(ResultMulMI);
+  }
+  return;
+}
+
+/// When getMachineCombinerPatterns() finds potential patterns,
+/// this function generates the instructions that could replace the
+/// original code sequence.
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineFunction &MF = *MBB.getParent();
+
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::Div2RecipEst:
+    switch (Root.getOpcode()) {
+    default:
+      return;
+    case X86::VDIVSSrr: // f32
+      genReciprocalDiv(Root, InsInstrs, InstrIdxForVirtReg,
+                       {X86::VRCPSSr, X86::VMULSSrr, X86::VMOVSSrm,
+                        X86::VSUBSSrr, X86::VMULSSrr, X86::VADDSSrr,
+                        X86::VMULSSrr, X86::VFNMADD132SSr,
+                        X86::VFMADD132SSr}, // FMA support at FMA_INDEX
+                       Type::getFloatTy(MF.getFunction()->getContext()));
+      break;
+    case X86::VDIVPSrr: // v4f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::VRCPPSr, X86::VMULPSrr, X86::VMOVAPSrm, X86::VSUBPSrr,
+           X86::VMULPSrr, X86::VADDPSrr, X86::VMULPSrr, X86::VFNMADD132PSr,
+           X86::VFMADD132PSr}, // FMA support at FMA_INDEX
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4));
+      break;
+    case X86::VDIVPSYrr: // v8f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::VRCPPSYr, X86::VMULPSYrr, X86::VMOVAPSYrm, X86::VSUBPSYrr,
+           X86::VMULPSYrr, X86::VADDPSYrr, X86::VMULPSYrr, X86::VFNMADD132PSYr,
+           X86::VFMADD132PSYr}, // FMA support at FMA_INDEX
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8));
+      break;
+    case X86::DIVSSrr: // f32
+      genReciprocalDiv(Root, InsInstrs, InstrIdxForVirtReg,
+                       {X86::RCPSSr, X86::MULSSrr, X86::MOVSSrm, X86::SUBSSrr,
+                        X86::MULSSrr, X86::ADDSSrr, X86::MULSSrr},
+                       Type::getFloatTy(MF.getFunction()->getContext()));
+      break;
+    case X86::DIVPSrr: // v4f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::RCPPSr, X86::MULPSrr, X86::MOVAPSrm, X86::SUBPSrr, X86::MULPSrr,
+           X86::ADDPSrr, X86::MULPSrr},
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4));
+      break;
+    }
+    break;
+  }
+  DEBUG(dbgs() << "\nAlternate sequence for " << MF.getName() << "\n";
+        for (unsigned i = 0; i < InsInstrs.size(); i++) {
+          dbgs() << i << ": ";
+          InsInstrs[i]->print(dbgs(), false, MF.getSubtarget().getInstrInfo());
+        });
+  DelInstrs.push_back(&Root); // Record FDiv for deletion
+}
+
+/// Find fdiv instructions that can be replaced by a reciprocal estimate plus
+/// Newton-Raphson refinement.
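+///
+/// The pattern only fires when the reciprocal estimate is enabled for this
+/// type, e.g. via the "reciprocal-estimates" function attribute (set by
+/// clang's -mrecip), or implicitly under unsafe-fp-math.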
+static bool getFDIVPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  switch (Root.getOpcode()) {
+  default:
+    return false;
+  // TODO: should we support other kinds of instructions?
+  case X86::VDIVSSrr:  // f32
+  case X86::VDIVPSrr:  // v4f32
+  case X86::VDIVPSYrr: // v8f32
+  case X86::DIVSSrr:   // f32
+  case X86::DIVPSrr:   // v4f32
+    break;
+  }
+  auto *MF = Root.getParent()->getParent();
+  auto TLI = MF->getSubtarget().getTargetLowering();
+  EVT VT = getFDivEVT(Root);
+  if (VT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
+  switch (TLI->getRecipEstimateDivEnabled(VT, *MF, /*forDAG*/ false)) {
+  case TargetLoweringBase::ReciprocalEstimate::Disabled:
+    return false;
+  case TargetLoweringBase::ReciprocalEstimate::Unspecified:
+    if (MF->getTarget().Options.UnsafeFPMath)
+      break;
+    return false;
+  }
+
+  Patterns.push_back(MachineCombinerPattern::Div2RecipEst);
+  return true;
+}
+
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are listed in
+/// the \p Patterns vector. Patterns should be sorted in priority order since
+/// the pattern evaluator stops checking as soon as it finds a faster sequence.
+bool X86InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // FDIV patterns
+  if (getFDIVPatterns(Root, Patterns))
+    return true;
+  // TODO: Add FSQRT patterns once the reciprocal implementation is complete.
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
 /// This is an architecture-specific helper function of reassociateOps.
 /// Set special operand attributes for new instructions after reassociation.
 void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
Index: test/CodeGen/X86/recip-fastmath.ll
===================================================================
--- test/CodeGen/X86/recip-fastmath.ll
+++ test/CodeGen/X86/recip-fastmath.ll
@@ -37,9 +37,9 @@
 define float @f32_one_step(float %x) #1 {
 ; SSE-LABEL: f32_one_step:
 ; SSE:       # BB#0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    rcpss %xmm0, %xmm2
 ; SSE-NEXT:    mulss %xmm2, %xmm0
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    subss %xmm0, %xmm1
 ; SSE-NEXT:    mulss %xmm2, %xmm1
 ; SSE-NEXT:    addss %xmm2, %xmm1
@@ -48,12 +48,12 @@
 ;
 ; AVX-RECIP-LABEL: f32_one_step:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: f32_one_step:
@@ -65,22 +65,18 @@
 ;
 ; BTVER2-LABEL: f32_one_step:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm2
+; BTVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_one_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_one_step:
@@ -92,12 +88,8 @@
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; AVX512-LABEL: f32_one_step:
@@ -113,10 +105,10 @@
 define float @f32_two_step(float %x) #2 {
 ; SSE-LABEL: f32_two_step:
 ; SSE:       # BB#0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    rcpss %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    mulss %xmm2, %xmm3
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subss %xmm3, %xmm4
 ; SSE-NEXT:    mulss %xmm2, %xmm4
@@ -130,80 +122,64 @@
 ;
 ; AVX-RECIP-LABEL: f32_two_step:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm3
+; AVX-RECIP-NEXT:    vsubss %xmm3, %xmm1, %xmm3
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm3, %xmm3
+; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm3, %xmm2
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: f32_two_step:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm2
+; FMA-RECIP-NEXT:    vmovaps %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm2, %xmm2, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd132ss %xmm3, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_two_step:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm2
+; BTVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm3
+; BTVER2-NEXT:    vsubss %xmm3, %xmm1, %xmm3
+; BTVER2-NEXT:    vmulss %xmm2, %xmm3, %xmm3
+; BTVER2-NEXT:    vaddss %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_two_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_two_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm2
+; HASWELL-NEXT:    vmovaps %xmm0, %xmm3
+; HASWELL-NEXT:    vfnmadd132ss %xmm2, %xmm1, %xmm3
+; HASWELL-NEXT:    vfmadd132ss %xmm2, %xmm2, %xmm3
+; HASWELL-NEXT:    vfnmadd132ss %xmm3, %xmm1, %xmm0
 ; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; AVX512-LABEL: f32_two_step:
@@ -276,9 +252,9 @@
 define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; SSE-LABEL: v4f32_one_step:
 ; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    subps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    addps %xmm2, %xmm1
@@ -287,12 +263,12 @@
 ;
 ; AVX-RECIP-LABEL: v4f32_one_step:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v4f32_one_step:
@@ -304,40 +280,33 @@
 ;
 ; BTVER2-LABEL: v4f32_one_step:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm2
+; BTVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_one_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm2
+; HASWELL-NEXT:    vfnmadd231ps %xmm2, %xmm0, %xmm1
+; HASWELL-NEXT:    vfmadd132ps %xmm2, %xmm2, %xmm1
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_one_step:
@@ -361,10 +330,10 @@
 define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; SSE-LABEL: v4f32_two_step:
 ; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subps %xmm3, %xmm4
 ; SSE-NEXT:    mulps %xmm2, %xmm4
@@ -378,80 +347,64 @@
 ;
 ; AVX-RECIP-LABEL: v4f32_two_step:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm3
+; AVX-RECIP-NEXT:    vsubps %xmm3, %xmm1, %xmm3
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm3, %xmm3
+; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm3, %xmm2
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v4f32_two_step:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm2
+; FMA-RECIP-NEXT:    vmovaps %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm2, %xmm2, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd132ps %xmm3, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_two_step:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm2
+; BTVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm3
+; BTVER2-NEXT:    vsubps %xmm3, %xmm1, %xmm3
+; BTVER2-NEXT:    vmulps %xmm2, %xmm3, %xmm3
+; BTVER2-NEXT:    vaddps %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_two_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_two_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm2
+; HASWELL-NEXT:    vmovaps %xmm0, %xmm3
+; HASWELL-NEXT:    vfnmadd132ps %xmm2, %xmm1, %xmm3
+; HASWELL-NEXT:    vfmadd132ps %xmm2, %xmm2, %xmm3
+; HASWELL-NEXT:    vfnmadd132ps %xmm3, %xmm1, %xmm0
 ; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_two_step:
@@ -538,9 +491,9 @@
 define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; SSE-LABEL: v8f32_one_step:
 ; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    rcpps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm4, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    movaps %xmm2, %xmm3
 ; SSE-NEXT:    subps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm4, %xmm3
@@ -556,12 +509,12 @@
 ;
 ; AVX-RECIP-LABEL: v8f32_one_step:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_one_step:
@@ -573,40 +526,33 @@
 ;
 ; BTVER2-LABEL: v8f32_one_step:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_one_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NEXT:    vfnmadd231ps %ymm2, %ymm0, %ymm1
+; HASWELL-NEXT:    vfmadd132ps %ymm2, %ymm2, %ymm1
+; HASWELL-NEXT:    vmovaps %ymm1, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_one_step:
@@ -631,10 +577,10 @@
 ; SSE-LABEL: v8f32_two_step:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    movaps %xmm1, %xmm2
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    rcpps %xmm0, %xmm3
 ; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm3, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    movaps %xmm1, %xmm5
 ; SSE-NEXT:    subps %xmm4, %xmm5
 ; SSE-NEXT:    mulps %xmm3, %xmm5
@@ -660,80 +606,64 @@
 ;
 ; AVX-RECIP-LABEL: v8f32_two_step:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
+; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm1, %ymm3
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm3, %ymm3
+; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm3, %ymm2
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_two_step:
 ; FMA-RECIP:       # BB#0:
-; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
+; FMA-RECIP-NEXT:    vmovaps %ymm0, %ymm3
+; FMA-RECIP-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm3
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm2, %ymm2, %ymm3
+; FMA-RECIP-NEXT:    vfnmadd132ps %ymm3, %ymm1, %ymm0
 ; FMA-RECIP-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_two_step:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3
+; BTVER2-NEXT:    vsubps %ymm3, %ymm1, %ymm3
+; BTVER2-NEXT:    vmulps %ymm2, %ymm3, %ymm3
+; BTVER2-NEXT:    vaddps %ymm2, %ymm3, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_two_step:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_two_step:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NEXT:    vmovaps %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vmovaps %ymm0, %ymm3
+; HASWELL-NEXT:    vfnmadd132ps %ymm2, %ymm1, %ymm3
+; HASWELL-NEXT:    vfmadd132ps %ymm2, %ymm2, %ymm3
+; HASWELL-NEXT:    vfnmadd132ps %ymm3, %ymm1, %ymm0
 ; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_two_step:
Index: test/CodeGen/X86/recip-fastmath2.ll
===================================================================
--- test/CodeGen/X86/recip-fastmath2.ll
+++ test/CodeGen/X86/recip-fastmath2.ll
@@ -38,8 +38,8 @@
 ;
 ; SANDY-LABEL: f32_no_step_2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_no_step_2:
@@ -67,8 +67,8 @@
 ; SSE-LABEL: f32_one_step_2:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    rcpss %xmm0, %xmm2
-; SSE-NEXT:    mulss %xmm2, %xmm0
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    mulss %xmm2, %xmm0
 ; SSE-NEXT:    subss %xmm0, %xmm1
 ; SSE-NEXT:    mulss %xmm2, %xmm1
 ; SSE-NEXT:    addss %xmm2, %xmm1
@@ -79,20 +79,21 @@
 ; AVX-RECIP-LABEL: f32_one_step_2:
 ; AVX-RECIP:       # BB#0:
 ; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: f32_one_step_2:
 ; FMA-RECIP:       # BB#0:
 ; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm2
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_one_step_2:
@@ -101,39 +102,27 @@
 ; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
 ; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_one_step_2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_one_step_2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; AVX512-LABEL: f32_one_step_2:
@@ -150,25 +139,25 @@
 define float @f32_one_step_2_divs(float %x) #1 {
 ; SSE-LABEL: f32_one_step_2_divs:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpss %xmm0, %xmm1
-; SSE-NEXT:    mulss %xmm1, %xmm0
-; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT:    subss %xmm0, %xmm2
-; SSE-NEXT:    mulss %xmm1, %xmm2
-; SSE-NEXT:    addss %xmm1, %xmm2
-; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    mulss %xmm2, %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    rcpss %xmm0, %xmm2
 ; SSE-NEXT:    mulss %xmm2, %xmm0
+; SSE-NEXT:    subss %xmm0, %xmm1
+; SSE-NEXT:    mulss %xmm2, %xmm1
+; SSE-NEXT:    addss %xmm2, %xmm1
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    mulss %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: f32_one_step_2_divs:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    retq
@@ -184,24 +173,20 @@
 ;
 ; BTVER2-LABEL: f32_one_step_2_divs:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm2
+; BTVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 ; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
 ; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_one_step_2_divs:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
@@ -217,12 +202,8 @@
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
@@ -244,9 +225,9 @@
 ; SSE-LABEL: f32_two_step_2:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    rcpss %xmm0, %xmm2
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    mulss %xmm2, %xmm3
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subss %xmm3, %xmm4
 ; SSE-NEXT:    mulss %xmm2, %xmm4
@@ -262,15 +243,15 @@
 ; AVX-RECIP-LABEL: f32_two_step_2:
 ; AVX-RECIP:       # BB#0:
 ; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm3
+; AVX-RECIP-NEXT:    vsubss %xmm3, %xmm2, %xmm3
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm3, %xmm3
+; AVX-RECIP-NEXT:    vaddss %xmm1, %xmm3, %xmm1
 ; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
@@ -278,69 +259,45 @@
 ; FMA-RECIP:       # BB#0:
 ; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
 ; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vmovaps %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd132ss %xmm1, %xmm2, %xmm3
 ; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vfnmadd132ss %xmm3, %xmm2, %xmm0
 ; FMA-RECIP-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: f32_two_step_2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm3
+; BTVER2-NEXT:    vsubss %xmm3, %xmm2, %xmm3
+; BTVER2-NEXT:    vmulss %xmm1, %xmm3, %xmm3
+; BTVER2-NEXT:    vaddss %xmm1, %xmm3, %xmm1
 ; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: f32_two_step_2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: f32_two_step_2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_two_step_2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; AVX512-LABEL: f32_two_step_2:
@@ -362,8 +319,8 @@
 ; SSE-LABEL: v4f32_one_step2:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    subps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    addps %xmm2, %xmm1
@@ -374,20 +331,21 @@
 ; AVX-RECIP-LABEL: v4f32_one_step2:
 ; AVX-RECIP:       # BB#0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v4f32_one_step2:
 ; FMA-RECIP:       # BB#0:
 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vfnmadd231ps %xmm1, %xmm0, %xmm2
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm2
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm2, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_one_step2:
@@ -396,40 +354,27 @@
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_one_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_one_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_one_step2:
@@ -455,25 +400,25 @@
 define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; SSE-LABEL: v4f32_one_step_2_divs:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm0, %xmm1
-; SSE-NEXT:    mulps %xmm1, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    subps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm1, %xmm2
-; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    subps %xmm0, %xmm1
+; SSE-NEXT:    mulps %xmm2, %xmm1
+; SSE-NEXT:    addps %xmm2, %xmm1
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SSE-NEXT:    mulps %xmm1, %xmm0
+; SSE-NEXT:    mulps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    retq
@@ -489,46 +434,38 @@
 ;
 ; BTVER2-LABEL: v4f32_one_step_2_divs:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm2
+; BTVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps %xmm2, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
 ; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_one_step_2_divs:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm2
+; HASWELL-NEXT:    vfnmadd231ps %xmm2, %xmm0, %xmm1
+; HASWELL-NEXT:    vfmadd132ps %xmm2, %xmm2, %xmm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm1, %xmm0
+; HASWELL-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
@@ -560,9 +497,9 @@
 ; SSE-LABEL: v4f32_two_step2:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subps %xmm3, %xmm4
 ; SSE-NEXT:    mulps %xmm2, %xmm4
@@ -578,15 +515,15 @@
 ; AVX-RECIP-LABEL: v4f32_two_step2:
 ; AVX-RECIP:       # BB#0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm3
+; AVX-RECIP-NEXT:    vsubps %xmm3, %xmm2, %xmm3
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm3, %xmm3
+; AVX-RECIP-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
@@ -594,69 +531,45 @@
 ; FMA-RECIP:       # BB#0:
 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vmovaps %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm3
 ; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vfnmadd132ps %xmm3, %xmm2, %xmm0
 ; FMA-RECIP-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v4f32_two_step2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
-; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm3
+; BTVER2-NEXT:    vsubps %xmm3, %xmm2, %xmm3
+; BTVER2-NEXT:    vmulps %xmm1, %xmm3, %xmm3
+; BTVER2-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v4f32_two_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %xmm0, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v4f32_two_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
-; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_two_step2:
@@ -689,41 +602,42 @@
 define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; SSE-LABEL: v8f32_one_step2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm1, %xmm4
-; SSE-NEXT:    mulps %xmm4, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    rcpps %xmm0, %xmm4
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    mulps %xmm4, %xmm0
 ; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    subps %xmm1, %xmm3
+; SSE-NEXT:    subps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm4, %xmm3
 ; SSE-NEXT:    addps %xmm4, %xmm3
-; SSE-NEXT:    rcpps %xmm0, %xmm1
-; SSE-NEXT:    mulps %xmm1, %xmm0
-; SSE-NEXT:    subps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm1, %xmm2
-; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    rcpps %xmm1, %xmm0
+; SSE-NEXT:    mulps %xmm0, %xmm1
+; SSE-NEXT:    subps %xmm1, %xmm2
+; SSE-NEXT:    mulps %xmm0, %xmm2
+; SSE-NEXT:    addps %xmm0, %xmm2
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
+; SSE-NEXT:    movaps %xmm3, %xmm0
+; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v8f32_one_step2:
 ; AVX-RECIP:       # BB#0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_one_step2:
 ; FMA-RECIP:       # BB#0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
-; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vfnmadd231ps %ymm1, %ymm0, %ymm2
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm2
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm2, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_one_step2:
@@ -732,40 +646,27 @@
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_one_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_one_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_one_step2:
@@ -791,34 +692,34 @@
 define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; SSE-LABEL: v8f32_one_step_2_divs:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subps %xmm0, %xmm4
-; SSE-NEXT:    mulps %xmm2, %xmm4
-; SSE-NEXT:    addps %xmm2, %xmm4
-; SSE-NEXT:    rcpps %xmm1, %xmm0
-; SSE-NEXT:    mulps %xmm0, %xmm1
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm0, %xmm3
-; SSE-NEXT:    addps %xmm0, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    rcpps %xmm1, %xmm3
 ; SSE-NEXT:    mulps %xmm3, %xmm1
+; SSE-NEXT:    movaps %xmm2, %xmm4
+; SSE-NEXT:    subps %xmm1, %xmm4
+; SSE-NEXT:    mulps %xmm3, %xmm4
+; SSE-NEXT:    addps %xmm3, %xmm4
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SSE-NEXT:    mulps %xmm4, %xmm1
+; SSE-NEXT:    rcpps %xmm0, %xmm3
+; SSE-NEXT:    mulps %xmm3, %xmm0
+; SSE-NEXT:    subps %xmm0, %xmm2
+; SSE-NEXT:    mulps %xmm3, %xmm2
+; SSE-NEXT:    addps %xmm3, %xmm2
 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; SSE-NEXT:    mulps %xmm4, %xmm0
-; SSE-NEXT:    mulps %xmm4, %xmm0
-; SSE-NEXT:    mulps %xmm3, %xmm1
+; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    mulps %xmm4, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
 ; AVX-RECIP:       # BB#0:
-; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    retq
@@ -834,46 +735,38 @@
 ;
 ; BTVER2-LABEL: v8f32_one_step_2_divs:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
+; BTVER2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
 ; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_one_step_2_divs:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_one_step_2_divs:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
-; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
+; HASWELL-NEXT:    vfnmadd231ps %ymm2, %ymm0, %ymm1
+; HASWELL-NEXT:    vfmadd132ps %ymm2, %ymm2, %ymm1
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm0
+; HASWELL-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
@@ -904,48 +797,48 @@
 define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SSE-LABEL: v8f32_two_step2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    rcpps %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm1, %xmm4
+; SSE-NEXT:    movaps %xmm1, %xmm2
+; SSE-NEXT:    rcpps %xmm0, %xmm3
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm3, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SSE-NEXT:    movaps %xmm0, %xmm5
+; SSE-NEXT:    movaps %xmm1, %xmm5
 ; SSE-NEXT:    subps %xmm4, %xmm5
 ; SSE-NEXT:    mulps %xmm3, %xmm5
 ; SSE-NEXT:    addps %xmm3, %xmm5
-; SSE-NEXT:    mulps %xmm5, %xmm1
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    subps %xmm1, %xmm3
+; SSE-NEXT:    mulps %xmm5, %xmm0
+; SSE-NEXT:    movaps %xmm1, %xmm3
+; SSE-NEXT:    subps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm5, %xmm3
 ; SSE-NEXT:    addps %xmm5, %xmm3
-; SSE-NEXT:    rcpps %xmm2, %xmm1
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
+; SSE-NEXT:    rcpps %xmm2, %xmm0
 ; SSE-NEXT:    movaps %xmm2, %xmm4
-; SSE-NEXT:    mulps %xmm1, %xmm4
-; SSE-NEXT:    movaps %xmm0, %xmm5
+; SSE-NEXT:    mulps %xmm0, %xmm4
+; SSE-NEXT:    movaps %xmm1, %xmm5
 ; SSE-NEXT:    subps %xmm4, %xmm5
-; SSE-NEXT:    mulps %xmm1, %xmm5
-; SSE-NEXT:    addps %xmm1, %xmm5
+; SSE-NEXT:    mulps %xmm0, %xmm5
+; SSE-NEXT:    addps %xmm0, %xmm5
 ; SSE-NEXT:    mulps %xmm5, %xmm2
-; SSE-NEXT:    subps %xmm2, %xmm0
-; SSE-NEXT:    mulps %xmm5, %xmm0
-; SSE-NEXT:    addps %xmm5, %xmm0
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
-; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    subps %xmm2, %xmm1
+; SSE-NEXT:    mulps %xmm5, %xmm1
+; SSE-NEXT:    addps %xmm5, %xmm1
+; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
+; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v8f32_two_step2:
 ; AVX-RECIP:       # BB#0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm3
+; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm2, %ymm3
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm3, %ymm3
+; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm3, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX-RECIP-NEXT:    retq
 ;
@@ -953,69 +846,45 @@
 ; FMA-RECIP:       # BB#0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; FMA-RECIP-NEXT:    vmovaps %ymm0, %ymm3
+; FMA-RECIP-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; FMA-RECIP-NEXT:    vfnmadd132ps %ymm3, %ymm2, %ymm0
 ; FMA-RECIP-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
 ; BTVER2-LABEL: v8f32_two_step2:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
-; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm3
+; BTVER2-NEXT:    vsubps %ymm3, %ymm2, %ymm3
+; BTVER2-NEXT:    vmulps %ymm1, %ymm3, %ymm3
+; BTVER2-NEXT:    vaddps %ymm1, %ymm3, %ymm1
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; BTVER2-NEXT:    retq
 ;
 ; SANDY-LABEL: v8f32_two_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_two_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
-; HASWELL-NEXT:    vmovaps %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_two_step2:
@@ -1098,9 +967,9 @@
 define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
 ; SSE-LABEL: v8f32_no_step2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    rcpps %xmm1, %xmm1
 ; SSE-NEXT:    rcpps %xmm0, %xmm0
 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    rcpps %xmm1, %xmm1
 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -1124,20 +993,20 @@
 ;
 ; SANDY-LABEL: v8f32_no_step2:
 ; SANDY:       # BB#0:
-; SANDY-NEXT:    vrcpps %ymm0, %ymm0
-; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; SANDY-NEXT:    retq
 ;
 ; HASWELL-LABEL: v8f32_no_step2:
 ; HASWELL:       # BB#0:
-; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
-; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NEXT:    retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
 ; HASWELL-NO-FMA:       # BB#0:
-; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_no_step2:
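
The FMA-RECIP and HASWELL expectations above contract the same refinement step into two fused operations: vfnmadd forms 1 - x*r0 directly, and vfmadd folds the final multiply-add. A hedged C++ sketch of that shape (std::fma from <cmath>; the name is invented, not LLVM source):

  #include <cmath>

  // FMA form of one refinement step, matching the
  // vfnmadd231ps/vfmadd132ps pairs in the checks above.
  float recipOneStepFMA(float x, float r0) {
    float e = std::fma(-x, r0, 1.0f); // vfnmadd: 1 - x*r0 in one op
    return std::fma(r0, e, r0);       // vfmadd:  r0 + r0*e
  }
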
Index: test/CodeGen/X86/recip-pic.ll
===================================================================
--- test/CodeGen/X86/recip-pic.ll
+++ test/CodeGen/X86/recip-pic.ll
@@ -4,6 +4,8 @@
 define fastcc float @foo(float %x) unnamed_addr #0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    rcpss %xmm0, %xmm2
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    calll .L0$pb
 ; CHECK-NEXT:  .Lcfi0:
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
@@ -13,8 +15,11 @@
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
 ; CHECK-NEXT:  .Ltmp0:
 ; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
-; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    divss %xmm0, %xmm1
+; CHECK-NEXT:    mulss %xmm2, %xmm0
+; CHECK-NEXT:    subss %xmm0, %xmm1
+; CHECK-NEXT:    mulss %xmm2, %xmm1
+; CHECK-NEXT:    addss %xmm2, %xmm1
+; CHECK-NEXT:    mulss {{\.LCPI.*}}@GOTOFF(%eax), %xmm1
 ; CHECK-NEXT:    movaps %xmm1, %xmm0
 ; CHECK-NEXT:    movss %xmm1, (%eax)
 ; CHECK-NEXT:    retl
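
The recip-pic.ll expectations above are the same one-step refinement in 32-bit PIC mode, with the constants reached @GOTOFF through %eax rather than rip-relative. On accuracy: writing the estimate as r0 = (1 - e)/x, one step gives r1 = r0*(2 - x*r0) = (1 - e^2)/x, so each round squares the relative error; that is why the *_two_step tests chain two identical rounds. A sketch under that assumption (invented name, not LLVM source):

  // Two chained refinement steps, as in the *_two_step2 checks;
  // the 1.0 constant is loaded once and reused by both rounds.
  float recipTwoStep(float x, float r0) {
    float r1 = r0 + r0 * (1.0f - x * r0); // first round
    return r1 + r1 * (1.0f - x * r1);     // second round
  }
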