Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -182,6 +182,8 @@
                                    "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                    "INC and DEC instructions are slower than ADD and SUB">;
+def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
+                            "Use RSQRT* to optimize square root calculations">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -347,7 +349,8 @@
                      [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                       FeatureBMI, FeatureF16C, FeatureMOVBE,
-                      FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
+                      FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD,
+                      FeatureUseSqrtEst]>;
 
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1014,6 +1014,10 @@
 
     /// Convert a comparison if required by the subtarget.
     SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+
+    /// Use rsqrt* to speed up sqrt calculations.
+    SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                            unsigned &RefinementSteps) const override;
   };
 
   namespace X86 {
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -14335,6 +14335,59 @@
   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
 }
 
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+/// Override the default NR algorithm because the 2-constant implementation
+/// runs faster on Intel SandyBridge and AMD Jaguar (btver2). It has one
+/// less FP instruction in exchange for an extra constant load that should
+/// be folded into an FP op.
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // sig digs in the divisor and/or sqrt operand.
+  if (!Subtarget->useSqrtEst())
+    return SDValue();
+
+  EVT VT = Operand.getValueType();
+  
+  // SSE1 has rsqrtss and rsqrtps.
+  // TODO: Add support for AVX (v8f32) and AVX512 (v16f32).
+  // TODO: Is it ever worthwhile to use an estimate for f64?
+  if (Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) {
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal sqrt, we need to find the zero of the function:
+    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+    //     =>
+    //   X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
+    SDLoc DL(Operand);
+    SDValue MinusThree = DCI.DAG.getConstantFP(-3.0, VT);
+    SDValue MinusHalf = DCI.DAG.getConstantFP(-0.5, VT);
+    
+    SDValue Est = DCI.DAG.getNode(X86ISD::FRSQRT, DL, VT, Operand);
+    SDValue HalfEst = DCI.DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf);
+    DCI.AddToWorklist(HalfEst.getNode());
+    
+    Est = DCI.DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
+    DCI.AddToWorklist(Est.getNode());
+    
+    Est = DCI.DAG.getNode(ISD::FMUL, DL, VT, Est, Operand);
+    DCI.AddToWorklist(Est.getNode());
+      
+    Est = DCI.DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree);
+    DCI.AddToWorklist(Est.getNode());
+    
+    Est = DCI.DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst);
+    DCI.AddToWorklist(Est.getNode());
+ 
+    RefinementSteps = 0;
+    return Est;
+  }
+  return SDValue();
+}
+
 static bool isAllOnes(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   return C && C->isAllOnesValue();
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -192,6 +192,11 @@
   /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
+  /// Use the RSQRT* instructions to optimize square root calculations.
+  /// For this to be profitable, the cost of FSQRT and FDIV must be
+  /// substantially higher than normal FP ops like FADD and FMUL.
+  bool UseSqrtEst;
+
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -369,6 +374,7 @@
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
+  bool useSqrtEst() const { return UseSqrtEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -278,6 +278,7 @@
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
+  UseSqrtEst = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
Index: test/CodeGen/X86/sqrt-fastmath.ll
===================================================================
--- test/CodeGen/X86/sqrt-fastmath.ll
+++ test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 
 ; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from
 ; #include <math.h>
@@ -52,9 +53,59 @@
   ret x86_fp80 %call
 }
 
-; Function Attrs: nounwind readnone
 declare x86_fp80 @__sqrtl_finite(x86_fp80) #1
 
+; If the target's sqrtss and divss instructions are substantially
+; slower than rsqrtss with a Newton-Raphson refinement, we should
+; generate the estimate sequence.
+define float @reciprocal_square_root(float %x) #0 {
+  %sqrt = tail call float @llvm.sqrt.f32(float %x)
+  %div = fdiv fast float 1.0, %sqrt
+  ret float %div
+
+; CHECK-LABEL: reciprocal_square_root:
+; CHECK: sqrtss
+; CHECK-NEXT: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root:
+; BTVER2: vrsqrtss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vaddss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: retq
+}
+
+declare float @llvm.sqrt.f32(float) #1
+
+; If the target's sqrtps and divps instructions are substantially
+; slower than rsqrtps with a Newton-Raphson refinement, we should
+; generate the estimate sequence.
+define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
+  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
+  ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_square_root_v4f32:
+; CHECK: sqrtps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root_v4f32:
+; BTVER2: vrsqrtps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: retq
+}
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
+
+
 attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }