Index: include/llvm/CodeGen/CommandFlags.h
===================================================================
--- include/llvm/CodeGen/CommandFlags.h
+++ include/llvm/CodeGen/CommandFlags.h
@@ -24,6 +24,7 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRecip.h"
 #include <string>
 using namespace llvm;
 
@@ -152,6 +153,12 @@
                "Only fuse FP ops when the result won't be effected."),
     clEnumValEnd));
 
+cl::list<std::string>
+ReciprocalOps("recip",
+  cl::CommaSeparated,
+  cl::desc("Choose reciprocal operation types and parameters."),
+  cl::value_desc("all,none,default,divf,vec-sqrtd,vec-divd:0,sqrtf:9..."));
+
 cl::opt<bool>
 DontPlaceZerosInBSS("nozero-initialized-in-bss",
               cl::desc("Don't place zero-initialized symbols into bss section"),
@@ -231,6 +238,7 @@
   Options.LessPreciseFPMADOption = EnableFPMAD;
   Options.NoFramePointerElim = DisableFPElim;
   Options.AllowFPOpFusion = FuseFPOps;
+  Options.Reciprocals = ReciprocalOps;
   Options.UnsafeFPMath = EnableUnsafeFPMath;
   Options.NoInfsFPMath = EnableNoInfsFPMath;
   Options.NoNaNsFPMath = EnableNoNaNsFPMath;
Index: include/llvm/Target/TargetOptions.h
===================================================================
--- include/llvm/Target/TargetOptions.h
+++ include/llvm/Target/TargetOptions.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TARGET_TARGETOPTIONS_H
 #define LLVM_TARGET_TARGETOPTIONS_H
 
+#include "llvm/Target/TargetRecip.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include <string>
 
@@ -72,7 +73,8 @@
           CompressDebugSections(false), FunctionSections(false),
           DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
           TrapFuncName(), FloatABIType(FloatABI::Default),
-          AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single),
+          AllowFPOpFusion(FPOpFusion::Standard), Reciprocals(),
+          JTType(JumpTable::Single),
           ThreadModel(ThreadModel::POSIX) {}
 
     /// PrintMachineCode - This flag is enabled when the -print-machineinstrs
@@ -211,6 +213,9 @@
     /// the value of this option.
     FPOpFusion::FPOpFusionMode AllowFPOpFusion;
 
+    /// This class encapsulates options for reciprocal-estimate code generation.
+    TargetRecip Reciprocals;
+
     /// JTType - This flag specifies the type of jump-instruction table to
     /// create for functions that have the jumptable attribute.
     JumpTable::JumpTableType JTType;
@@ -245,6 +250,7 @@
                     ARE_EQUAL(TrapFuncName) &&
                     ARE_EQUAL(FloatABIType) &&
                     ARE_EQUAL(AllowFPOpFusion) &&
+                    ARE_EQUAL(Reciprocals) &&
                     ARE_EQUAL(JTType) &&
                     ARE_EQUAL(ThreadModel) &&
                     ARE_EQUAL(MCOptions);
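
Note for reviewers: the cl::list above feeds TargetOptions::Reciprocals through
TargetRecip's converting constructor. A minimal sketch of the same plumbing for
a standalone driver follows (the function name is hypothetical and not part of
this patch):

    #include "llvm/Target/TargetOptions.h" // pulls in TargetRecip.h per this patch
    #include <string>
    #include <vector>

    // Forward parsed -recip values into TargetOptions. TargetRecip's
    // vector-of-string constructor accepts values such as "divf",
    // "vec-divf:2", or a lone "all", "none", or "default".
    static llvm::TargetOptions makeOptions(const std::vector<std::string> &RecipArgs) {
      llvm::TargetOptions Options;
      Options.Reciprocals = llvm::TargetRecip(RecipArgs);
      return Options;
    }
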
Index: include/llvm/Target/TargetRecip.h
===================================================================
--- include/llvm/Target/TargetRecip.h
+++ include/llvm/Target/TargetRecip.h
@@ -0,0 +1,74 @@
+//===--------------------- llvm/Target/TargetRecip.h ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is used to customize machine-specific reciprocal estimate code
+// generation in a target-independent way.
+// If a target does not support operations in this specification, then code
+// generation will default to using supported operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_TARGETRECIP_H
+#define LLVM_TARGET_TARGETRECIP_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+enum RecipOps {
+  RO_DivF = 0,  // division, float, scalar
+  RO_VecDivF,   // division, float, vector
+  RO_DivD,      // division, double, scalar
+  RO_VecDivD,   // division, double, vector
+  RO_SqrtF,     // square root, float, scalar
+  RO_VecSqrtF,  // square root, float, vector
+  RO_SqrtD,     // square root, double, scalar
+  RO_VecSqrtD,  // square root, double, vector
+  RO_All,
+  RO_NUM_RECIP_OPS = RO_All
+};
+
+class TargetRecip {
+public:
+  TargetRecip();
+
+  /// Initialize all or part of the operations from command-line options or
+  /// encoded strings.
+  TargetRecip(const std::vector<std::string> &Args);
+
+  virtual ~TargetRecip();
+
+  /// Set whether a particular reciprocal operation is enabled and how many
+  /// refinement steps are needed when using it. Use the 'RO_All' value to
+  /// set the enablement and refinement steps for all operations.
+  void setDefaults(RecipOps Op, bool Enable, unsigned RefSteps);
+
+  /// Return true if the reciprocal operation has been enabled by default or
+  /// from the command-line. Return false if the operation has been disabled
+  /// by default or from the command-line.
+  bool isEnabled(RecipOps Op) const;
+
+  /// Return the number of iterations necessary to refine the result of a
+  /// machine instruction for the given reciprocal operation.
+  unsigned getRefinementSteps(RecipOps Op) const;
+
+  bool operator==(const TargetRecip &Other) const;
+
+private:
+  int8_t Enabled[RO_NUM_RECIP_OPS];
+  int8_t RefinementSteps[RO_NUM_RECIP_OPS];
+
+  bool ParseGlobalParams(const std::string &Arg);
+  void ParseIndividualParams(const std::string &Arg);
+};
+
+} // End llvm namespace
+
+#endif
Index: lib/Target/CMakeLists.txt
===================================================================
--- lib/Target/CMakeLists.txt
+++ lib/Target/CMakeLists.txt
@@ -6,6 +6,7 @@
   TargetLoweringObjectFile.cpp
   TargetMachine.cpp
   TargetMachineC.cpp
+  TargetRecip.cpp
   TargetSubtargetInfo.cpp
 
   ADDITIONAL_HEADER_DIRS
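
To make the header's contract concrete, here is a small self-contained sketch
(example values only; nothing here is a shipped default) showing that explicit
command-line settings survive a later setDefaults() call:

    #include "llvm/Target/TargetRecip.h"
    #include <cassert>
    #include <string>
    #include <vector>

    void recipPrecedenceExample() {
      // As if the user passed -recip=divf:2.
      std::vector<std::string> Args = {"divf:2"};
      llvm::TargetRecip Recips(Args);

      // A target then applies its defaults; setDefaults() only fills slots
      // the command line left uninitialized, so the explicit divf:2 wins.
      Recips.setDefaults(llvm::RO_All, /*Enable=*/false, /*RefSteps=*/1);

      assert(Recips.isEnabled(llvm::RO_DivF));
      assert(Recips.getRefinementSteps(llvm::RO_DivF) == 2);
      assert(!Recips.isEnabled(llvm::RO_VecDivF)); // came from the default
    }
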
Index: lib/Target/TargetRecip.cpp
===================================================================
--- lib/Target/TargetRecip.cpp
+++ lib/Target/TargetRecip.cpp
@@ -0,0 +1,204 @@
+//===-------------------------- TargetRecip.cpp ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is used to customize machine-specific reciprocal estimate code
+// generation in a target-independent way.
+// If a target does not support operations in this specification, then code
+// generation will default to using supported operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRecip.h"
+#include <cassert>
+
+using namespace llvm;
+
+// These must be in the same order as the corresponding enum values.
+const char *ArgStrings[] = {
+  "divf",
+  "vec-divf",
+  "divd",
+  "vec-divd",
+  "sqrtf",
+  "vec-sqrtf",
+  "sqrtd",
+  "vec-sqrtd"
+};
+
+// The uninitialized state is needed for the enablement bits and refinement
+// steps because custom settings may arrive via the command-line before target
+// defaults are set.
+enum {
+  Uninitialized = -1
+};
+
+TargetRecip::TargetRecip() {
+  for (int i = 0; i != RO_NUM_RECIP_OPS; ++i) {
+    Enabled[i] = Uninitialized;
+    RefinementSteps[i] = Uninitialized;
+  }
+}
+
+static bool ParseRefinementStep(const std::string &In, size_t &Position,
+                                uint8_t &Value) {
+  const char REF_STEP_TOKEN = ':';
+  Position = In.find(REF_STEP_TOKEN);
+  if (Position == std::string::npos)
+    return false;
+
+  std::string RefStepString = In.substr(Position + 1);
+  // Allow exactly one numeric character for the additional refinement
+  // step parameter.
+  if (RefStepString.length() == 1) {
+    char RefStepChar = RefStepString[0];
+    if (RefStepChar >= '0' && RefStepChar <= '9') {
+      Value = RefStepChar - '0';
+      return true;
+    }
+  }
+  report_fatal_error("Invalid refinement step for -recip.");
+}
+
+bool TargetRecip::ParseGlobalParams(const std::string &Arg) {
+  bool Enable;
+  bool UseDefaults;
+  if (Arg.find("all") == 0) {
+    UseDefaults = false;
+    Enable = true;
+  } else if (Arg.find("none") == 0) {
+    UseDefaults = false;
+    Enable = false;
+  } else if (Arg.find("default") == 0) {
+    UseDefaults = true;
+  } else {
+    // Any other string is invalid or an individual setting.
+    return false;
+  }
+
+  // All enable values will be initialized to target defaults if 'default' was
+  // specified.
+  if (!UseDefaults)
+    for (int i = 0; i != RO_NUM_RECIP_OPS; ++i)
+      Enabled[i] = Enable;
+
+  size_t RefPos;
+  uint8_t RefSteps;
+  if (ParseRefinementStep(Arg, RefPos, RefSteps)) {
+    // Custom refinement count was specified with all, none, or default.
+    for (int i = 0; i != RO_NUM_RECIP_OPS; ++i)
+      RefinementSteps[i] = RefSteps;
+  }
+  return true;
+}
+
+void TargetRecip::ParseIndividualParams(const std::string &Arg) {
+  std::string ArgSub = Arg;
+
+  // Each reciprocal type may be enabled ('+') or disabled ('-') individually.
+  bool IsEnabled;
+  if (Arg[0] == '+') {
+    ArgSub = Arg.substr(1);
+    IsEnabled = true;
+  } else if (Arg[0] == '-') {
+    ArgSub = Arg.substr(1);
+    IsEnabled = false;
+  } else {
+    // If no plus or minus, default to plus.
+    IsEnabled = true;
+  }
+
+  // Look for an optional setting of the number of refinement steps needed
+  // for this type of reciprocal operation.
+  size_t RefPos;
+  uint8_t RefSteps;
+  std::string RefStepString;
+  if (ParseRefinementStep(ArgSub, RefPos, RefSteps)) {
+    // Split the string for further processing.
+    RefStepString = ArgSub.substr(RefPos + 1);
+    ArgSub = ArgSub.substr(0, RefPos);
+  }
+
+  // Find the reciprocal operation corresponding to this string value.
+  RecipOps Op = RO_NUM_RECIP_OPS;
+  for (int i = 0; i != RO_NUM_RECIP_OPS; ++i) {
+    if (ArgSub == ArgStrings[i]) {
+      Op = (RecipOps)i;
+      break;
+    }
+  }
+
+  if (Op == RO_NUM_RECIP_OPS)
+    report_fatal_error("Invalid option for -recip.");
+
+  // Set whether this operation is being enabled or disabled, and optionally
+  // set the number of refinement steps for this operation.
+  Enabled[Op] = IsEnabled;
+  if (!RefStepString.empty()) {
+    RefinementSteps[Op] = RefSteps;
+  }
+}
+
+TargetRecip::TargetRecip(const std::vector<std::string> &Args) :
+  TargetRecip() {
+  unsigned NumArgs = Args.size();
+
+  // Check if "all", "default", or "none" was specified.
+  if (NumArgs == 1 && ParseGlobalParams(Args[0]))
+    return;
+
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    std::string Value = Args[i];
+    if (Value.empty())
+      report_fatal_error("Empty option string for -recip.");
+    ParseIndividualParams(Value);
+  }
+}
+
+bool TargetRecip::isEnabled(RecipOps Op) const {
+  if (Op == RO_NUM_RECIP_OPS) return false;
+  assert(Enabled[Op] != Uninitialized && "Enabled setting was not initialized");
+  return Enabled[Op];
+}
+
+unsigned TargetRecip::getRefinementSteps(RecipOps Op) const {
+  if (Op == RO_NUM_RECIP_OPS) return 0;
+  assert(RefinementSteps[Op] != Uninitialized &&
+         "Refinement step setting was not initialized");
+  return RefinementSteps[Op];
+}
+
+void TargetRecip::setDefaults(RecipOps Op, bool Enable, unsigned RefSteps) {
+  if (Op == RO_All) {
+    for (int i = 0; i != RO_NUM_RECIP_OPS; ++i) {
+      if (Enabled[i] == Uninitialized)
+        Enabled[i] = Enable;
+      if (RefinementSteps[i] == Uninitialized)
+        RefinementSteps[i] = RefSteps;
+    }
+  } else {
+    if (Enabled[Op] == Uninitialized)
+      Enabled[Op] = Enable;
+    if (RefinementSteps[Op] == Uninitialized)
+      RefinementSteps[Op] = RefSteps;
+  }
+}
+
+bool TargetRecip::operator==(const TargetRecip &Other) const {
+  for (int i = 0; i != RO_NUM_RECIP_OPS; ++i) {
+    if (RefinementSteps[i] != Other.RefinementSteps[i])
+      return false;
+    if (Enabled[i] != Other.Enabled[i])
+      return false;
+  }
+  return true;
+}
+
+TargetRecip::~TargetRecip() {
+}
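
A few worked examples of the grammar the parser above accepts may help review
(hypothetical checks, not committed tests). Note that "all", "none", and
"default" are recognized only when they are the sole value; a mix such as
"all:0,-vec-divf" is rejected by ParseIndividualParams:

    #include "llvm/Target/TargetRecip.h"
    #include <cassert>
    #include <string>
    #include <vector>

    void recipParseExamples() {
      // A lone "none" disables every operation.
      llvm::TargetRecip None(std::vector<std::string>{"none"});
      assert(!None.isEnabled(llvm::RO_SqrtD));

      // "all:0" enables every operation with zero refinement steps.
      llvm::TargetRecip All(std::vector<std::string>{"all:0"});
      assert(All.isEnabled(llvm::RO_VecDivD));
      assert(All.getRefinementSteps(llvm::RO_VecDivD) == 0);

      // A '-' prefix disables one operation; a bare name enables it.
      llvm::TargetRecip Mixed(std::vector<std::string>{"sqrtf", "-vec-sqrtf"});
      assert(Mixed.isEnabled(llvm::RO_SqrtF));
      assert(!Mixed.isEnabled(llvm::RO_VecSqrtF));
    }
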
Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -188,10 +188,6 @@
                                      "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                      "INC and DEC instructions are slower than ADD and SUB">;
-def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
-                          "Use RSQRT* to optimize square root calculations">;
-def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
-                          "true", "Use RCP* to optimize division calculations">;
 def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                                      "Use software floating point features.">;
 
@@ -444,7 +440,7 @@
                                        FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                                        FeatureBMI, FeatureF16C, FeatureMOVBE,
                                        FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
-                                       FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
+                                       FeatureSlowSHLD]>;
 
 // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -67,12 +67,6 @@
                  "rather than promotion."),
         cl::Hidden);
 
-static cl::opt<int> ReciprocalEstimateRefinementSteps(
-    "x86-recip-refinement-steps", cl::init(1),
-    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
-             "result of the hardware reciprocal estimate instruction."),
-    cl::NotHidden);
-
 // Forward declarations.
 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
@@ -12874,29 +12868,31 @@
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps,
                                             bool &UseOneConstNR) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor and/or sqrt operand.
-  if (!Subtarget->useSqrtEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
+  RecipOps RecipOp;
 
-  // SSE1 has rsqrtss and rsqrtps.
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   // instructions: convert to single, rsqrtss, convert back to double, refine
   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = 1;
-    UseOneConstNR = false;
-    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = RO_SqrtF;
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = RO_VecSqrtF;
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  UseOneConstNR = false;
+  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
 }
 
 /// The minimum architected relative accuracy is 2^-12. We need one
@@ -12904,15 +12900,9 @@
 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor.
-  if (!Subtarget->useReciprocalEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
-
+  RecipOps RecipOp;
+
   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // reciprocal estimate with refinement on x86 prior to FMA requires
   // 15 instructions: convert to single, rcpss, convert back to double, refine
   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = ReciprocalEstimateRefinementSteps;
-    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = RO_DivF;
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = RO_VecDivF;
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
 }
 
 /// If we have at least two divisions that use the same divisor, convert to
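
For reviewers unfamiliar with the refinement counts being plumbed through here:
each Newton-Raphson step for a reciprocal computes x' = x * (2 - d * x), which
roughly doubles the number of correct bits, so the ~12-bit rcpss estimate needs
one step to approach full f32 accuracy. A plain scalar model for illustration
only (the real sequence is emitted as SDNodes by the DAG combiner, not by this
function):

    // Refine an initial hardware estimate of 1/d, e.g. the result of rcpss.
    float refineRecip(float d, float Est, unsigned RefinementSteps) {
      float X = Est;
      for (unsigned i = 0; i != RefinementSteps; ++i)
        X = X * (2.0f - d * X); // one Newton-Raphson iteration
      return X;
    }
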
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -190,16 +190,6 @@
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
-  /// Use the RSQRT* instructions to optimize square root calculations.
-  /// For this to be profitable, the cost of FSQRT and FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseSqrtEst;
-
-  /// Use the RCP* instructions to optimize FP division calculations.
-  /// For this to be profitable, the cost of FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseReciprocalEst;
-
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -377,8 +367,6 @@
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
-  bool useSqrtEst() const { return UseSqrtEst; }
-  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -273,8 +273,6 @@
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
-  UseSqrtEst = false;
-  UseReciprocalEst = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -109,6 +109,7 @@
   if (Subtarget.isTargetWin64())
     this->Options.TrapUnreachable = true;
 
+  this->Options.Reciprocals.setDefaults(RO_All, false, 1);
   initAsmInfo();
 }
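
The X86 default above (everything off, one refinement step) replaces the
removed bdver2 subtarget features, so AMD Piledriver loses its implicit opt-in
and users must now pass -recip explicitly. A target that wants estimates on by
default could set that up in its TargetMachine constructor; a hypothetical
sketch (not part of this patch), relying on the rule that explicit -recip
values are filled in first and therefore win:

    #include "llvm/Target/TargetOptions.h"

    void configureRecipDefaults(llvm::TargetOptions &Options) {
      // Specific ops must be set before RO_All, because setDefaults() never
      // overwrites a slot that is already initialized (e.g. by -recip).
      Options.Reciprocals.setDefaults(llvm::RO_SqrtF, true, 1);
      Options.Reciprocals.setDefaults(llvm::RO_VecSqrtF, true, 1);
      Options.Reciprocals.setDefaults(llvm::RO_All, false, 1);
    }
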
Index: test/CodeGen/X86/recip-fastmath.ll
===================================================================
--- test/CodeGen/X86/recip-fastmath.ll
+++ test/CodeGen/X86/recip-fastmath.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE
 
 ; If the target's divss/divps instructions are substantially
 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
Index: test/CodeGen/X86/sqrt-fastmath.ll
===================================================================
--- test/CodeGen/X86/sqrt-fastmath.ll
+++ test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE
 
 declare double @__sqrt_finite(double) #0
 declare float @__sqrtf_finite(float) #0