Index: llvm/trunk/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h
+++ llvm/trunk/include/llvm/Target/TargetLowering.h
@@ -243,9 +243,10 @@
     return true;
   }
 
-  /// Return true if sqrt(x) is as cheap or cheaper than 1 / rsqrt(x)
-  bool isFsqrtCheap() const {
-    return FsqrtIsCheap;
+  /// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X).
+  virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const {
+    // Default behavior is to replace SQRT(X) with X*RSQRT(X).
+    return false;
   }
 
   /// Returns true if target has indicated at least one type should be bypassed.
@@ -1381,10 +1382,6 @@
   /// control.
   void setJumpIsExpensive(bool isExpensive = true);
 
-  /// Tells the code generator that fsqrt is cheap, and should not be replaced
-  /// with an alternative sequence of instructions.
-  void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; }
-
   /// Tells the code generator that this target supports floating point
   /// exceptions and cares about preserving floating point exception behavior.
   void setHasFloatingPointExceptions(bool FPExceptions = true) {
@@ -1910,9 +1907,6 @@
   /// combined with "shift" to BitExtract instructions.
   bool HasExtractBitsInsn;
 
-  // Don't expand fsqrt with an approximation based on the inverse sqrt.
-  bool FsqrtIsCheap;
-
   /// Tells the code generator to bypass slow divide or remainder
   /// instructions. For example, BypassSlowDivWidths[32,8] tells the code
   /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8907,14 +8907,18 @@
 }
 
 SDValue DAGCombiner::visitFSQRT(SDNode *N) {
-  if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap())
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  if (TLI.isFsqrtCheap(N0, DAG))
    return SDValue();
 
   // TODO: FSQRT nodes should have flags that propagate to the created nodes.
   // For now, create a Flags object for use with all unsafe math transforms.
   SDNodeFlags Flags;
   Flags.setUnsafeAlgebra(true);
-  return buildSqrtEstimate(N->getOperand(0), &Flags);
+  return buildSqrtEstimate(N0, &Flags);
 }
 
 /// copysign(x, fp_extend(y)) -> copysign(x, y)
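For reference, the rewrite that buildSqrtEstimate performs corresponds to the following scalar math. This is an illustrative, self-contained C++ sketch, not the code the combiner emits (the combiner builds SDNodes); the helper name sqrt_via_rsqrt is invented here:

    #include <xmmintrin.h> // SSE intrinsics: _mm_set_ss, _mm_rsqrt_ss, _mm_cvtss_f32
    #include <cstdio>

    // sqrt(x) rewritten as x * rsqrt(x): start from the hardware RSQRTSS
    // estimate (roughly 2^-12 relative accuracy) and refine it with one
    // Newton-Raphson step to reach roughly single-precision accuracy.
    static float sqrt_via_rsqrt(float x) {
      float r = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x))); // estimate of 1/sqrt(x)
      r = r * (1.5f - 0.5f * x * r * r);                    // one NR iteration
      return x * r;                                         // x * rsqrt(x) ~= sqrt(x)
    }

    int main() {
      std::printf("%f\n", sqrt_via_rsqrt(2.0f)); // prints ~1.414214
    }

Note that the rewrite is only valid under relaxed FP semantics: for x == 0 the estimate is +inf and x * rsqrt(x) produces 0 * inf = NaN instead of 0, which is why visitFSQRT bails out unless UnsafeFPMath is set.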
Index: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
@@ -807,7 +807,6 @@
   SelectIsExpensive = false;
   HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
-  FsqrtIsCheap = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
   MaskAndBranchFoldingIsLegal = false;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -166,6 +166,9 @@
 
   const char* getTargetNodeName(unsigned Opcode) const override;
 
+  bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+    return true;
+  }
   SDValue getRsqrtEstimate(SDValue Operand,
                            DAGCombinerInfo &DCI,
                            unsigned &RefinementSteps,
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -446,8 +446,6 @@
   setSelectIsExpensive(false);
   PredictableSelectIsExpensive = false;
 
-  setFsqrtIsCheap(true);
-
   // We want to find all load dependencies for long chains of stores to enable
   // merging into very wide vectors. The problem is with vectors with > 4
   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -249,6 +249,19 @@
 def FeatureFastPartialYMMWrite
     : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
                        "true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar that probably means that the code has some kind of
+// dependency and we should care more about reducing the latency.
+def FeatureFastScalarFSQRT
+    : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+                       "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+    : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+                       "true", "Vector SQRT is fast (disable Newton-Raphson)">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -442,7 +455,8 @@
   FeaturePCLMUL,
   FeatureXSAVE,
   FeatureXSAVEOPT,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureFastScalarFSQRT
 ]>;
 
 class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
[A second X86.td hunk was garbled in extraction; only fragments of its trailing context (" ]>;" and " // FIXME: define SKL model") survive. Together with the skylake RUN line in the new test below, this indicates it adds FeatureFastVectorFSQRT to the Skylake feature list.]
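The latency/throughput reasoning in the X86.td comment above can be made concrete with two hypothetical loops (the function names are invented for illustration). A dependent chain is bound by the latency of each square root, while independent iterations vectorize and are bound by throughput:

    #include <cmath>

    // Latency-bound: each sqrt depends on the previous result, so the
    // latency of a single scalar SQRT dominates. This is the situation
    // fast-scalar-fsqrt describes.
    float dependent_chain(float x, int n) {
      for (int i = 0; i < n; ++i)
        x = std::sqrt(x + 1.0f);
      return x;
    }

    // Throughput-bound: iterations are independent, so the loop can be
    // vectorized and sustained vector SQRT throughput dominates. This is
    // the situation fast-vector-fsqrt describes.
    void independent_sqrts(float *out, const float *in, int n) {
      for (int i = 0; i < n; ++i)
        out[i] = std::sqrt(in[i]);
    }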
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -1219,6 +1219,9 @@
     /// Convert a comparison if required by the subtarget.
     SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
 
+    /// Check if replacement of SQRT with RSQRT should be disabled.
+    bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
     /// Use rsqrt* to speed up sqrt calculations.
     SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
                              unsigned &RefinementSteps,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -15081,6 +15081,19 @@
   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
 }
 
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // We never want to use both SQRT and RSQRT instructions for the same input.
+  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+    return false;
+
+  if (VT.isVector())
+    return Subtarget.hasFastVectorFSQRT();
+  return Subtarget.hasFastScalarFSQRT();
+}
+
 /// The minimum architected relative accuracy is 2^-12. We need one
 /// Newton-Raphson step to have a good float result (24 bits of precision).
 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
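The getNodeIfExists() guard above covers inputs whose reciprocal square root is also computed: if an X86ISD::FRSQRT node for the operand already exists, taking the estimate path lets both values come from one RSQRTSS/RSQRTPS rather than issuing SQRT and RSQRT for the same input. A hypothetical source pattern (function name invented; whether the FRSQRT node exists at combine time depends on visitation order):

    #include <cmath>

    // Both sqrt(x) and 1/sqrt(x) of the same input: under unsafe FP math the
    // division below is turned into the rsqrt estimate sequence, and with the
    // guard, sqrt(x) is then built as x * rsqrt(x) so the single estimate can
    // be shared instead of also issuing a SQRTSS.
    void root_and_reciprocal(float x, float &root, float &inv_root) {
      root = std::sqrt(x);
      inv_root = 1.0f / std::sqrt(x);
    }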
Index: llvm/trunk/lib/Target/X86/X86Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h
@@ -199,6 +199,14 @@
   /// of a YMM register without clearing the upper part.
   bool HasFastPartialYMMWrite;
 
+  /// True if hardware SQRTSS instruction is at least as fast (latency) as
+  /// RSQRTSS followed by a Newton-Raphson iteration.
+  bool HasFastScalarFSQRT;
+
+  /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+  /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+  bool HasFastVectorFSQRT;
+
   /// True if 8-bit divisions are significantly faster than
   /// 32-bit divisions and should be used when possible.
   bool HasSlowDivide32;
@@ -434,6 +442,8 @@
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
   bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+  bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+  bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }
Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp
@@ -282,6 +282,8 @@
   HasCmpxchg16b = false;
   UseLeaForSP = false;
   HasFastPartialYMMWrite = false;
+  HasFastScalarFSQRT = false;
+  HasFastVectorFSQRT = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
   PadShortFunctions = false;
Index: llvm/trunk/test/CodeGen/X86/sqrt-fastmath-tune.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sqrt-fastmath-tune.ll
+++ llvm/trunk/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC
+
+declare float @llvm.sqrt.f32(float) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
+
+define float @foo_x1(float %f) #0 {
+; SCALAR-EST-LABEL: foo_x1:
+; SCALAR-EST: # BB#0:
+; SCALAR-EST-NEXT: rsqrtss %xmm0
+; SCALAR-EST: retq
+;
+; SCALAR-ACC-LABEL: foo_x1:
+; SCALAR-ACC: # BB#0:
+; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}}
+; SCALAR-ACC-NEXT: retq
+  %call = tail call float @llvm.sqrt.f32(float %f) #1
+  ret float %call
+}
+
+define <4 x float> @foo_x4(<4 x float> %f) #0 {
+; VECTOR-EST-LABEL: foo_x4:
+; VECTOR-EST: # BB#0:
+; VECTOR-EST-NEXT: rsqrtps %xmm0
+; VECTOR-EST: retq
+;
+; VECTOR-ACC-LABEL: foo_x4:
+; VECTOR-ACC: # BB#0:
+; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}}
+; VECTOR-ACC-NEXT: retq
+  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1
+  ret <4 x float> %call
+}
+
+define <8 x float> @foo_x8(<8 x float> %f) #0 {
+; VECTOR-EST-LABEL: foo_x8:
+; VECTOR-EST: # BB#0:
+; VECTOR-EST-NEXT: rsqrtps
+; VECTOR-EST: retq
+;
+; VECTOR-ACC-LABEL: foo_x8:
+; VECTOR-ACC: # BB#0:
+; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}}
+; VECTOR-ACC-NOT: rsqrt
+; VECTOR-ACC: retq
+  %call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1
+  ret <8 x float> %call
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }