Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -606,24 +606,22 @@ return true; } -// Perform RCP optimizations: +// lowerUsingRcp: // -// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with -// denormals flushed. +// 1/x -> rcp(x) when fdiv is allowed to be re-associated or rcp is accurate. // -// a/b -> a*rcp(b) when fast unsafe rcp is legal. -static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal, - IRBuilder<> Builder, MDNode *FPMath, Module *Mod, - bool HasDenormals, bool NeedHighAccuracy) { +// a/b -> a*rcp(b) when fdiv is allowed to be re-associated. +static Value *lowerUsingRcp(Value *Num, Value *Den, bool CanReassociateFDiv, + bool RcpIsAccurate, IRBuilder<> Builder, + MDNode *FPMath, Module *Mod) { - Type *Ty = Den->getType(); - if (!FastUnsafeRcpLegal && Ty->isFloatTy() && - (HasDenormals || NeedHighAccuracy)) + if (!CanReassociateFDiv && !RcpIsAccurate) return nullptr; + Type *Ty = Den->getType(); Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty); if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) { - if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) { + if (CanReassociateFDiv || RcpIsAccurate) { if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. @@ -648,7 +646,7 @@ } } - if (FastUnsafeRcpLegal) { + if (CanReassociateFDiv) { // Turn into multiply by the reciprocal. 
// x / y -> x * (1.0 / y) Value *Recip = Builder.CreateCall(Decl, { Den }); @@ -657,40 +655,54 @@ return nullptr; } -static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal, - bool HasDenormals) { - const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); - if (!CNum) - return HasDenormals; +// lowerUsingFDivFast: +// +// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. +// +// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. +// +// NOTE: lowerUsingRcp should be tried first because rcp is the preference. +static Value *lowerUsingFDivFast(Value *Num, Value *Den, float ReqdAccuracy, + bool HasDenormals, IRBuilder<> Builder, + MDNode *FPMath, Module *Mod) { + // fdiv.fast can achieve 2.5 ULP accuracy. + if (ReqdAccuracy < 2.5f) + return nullptr; - if (FastUnsafeRcpLegal) - return true; + // Only have fdiv.fast for f32. + Type *Ty = Den->getType(); + if (!Ty->isFloatTy()) + return nullptr; - bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); + bool NumIsOne = false; + if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) { + if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0)) + NumIsOne = true; + } - // Reciprocal f32 is handled separately without denormals. - return HasDenormals ^ IsOne; -} + // fdiv.fast does not support denormals, but 1.0/x is always fine to use. + if (HasDenormals && !NumIsOne) + return nullptr; + Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); + return Builder.CreateCall(Decl, { Num, Den }, "", FPMath); +} // Optimizations is performed based on fpmath, fast math flags as wells as // denormals to lower fdiv using either rcp or fdiv.fast. // -// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on -// unsafe-fp-math, fast math flags, denormals and fpmath -// accuracy request. +// Use rcp: +// 1/x -> rcp(x) when fdiv is allowed to be re-associated or rcp is +// sufficiently accurate. 
// -// RCP Optimizations: -// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with -// denormals flushed. -// a/b -> a*rcp(b) when fast unsafe rcp is legal. +// a/b -> a*rcp(b) when fdiv is allowed to be re-associated. // // Use fdiv.fast: -// a/b -> fdiv.fast(a, b) when RCP optimization is not performed and -// fpmath >= 2.5ULP with denormals flushed. +// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. +// +// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. // -// 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and -// fpmath >= 2.5ULP with denormals. +// Using rcp is the preference. bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Type *Ty = FDiv.getType()->getScalarType(); @@ -701,18 +713,16 @@ const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); - const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f; + const float ReqdAccuracy = FPOp->getFPAccuracy(); FastMathFlags FMF = FPOp->getFastMathFlags(); - // Determine whether it is ok to use rcp based on unsafe-fp-math, - // fast math flags, denormals and accuracy request. - const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() || - (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy) || FMF.approxFunc())); + const bool CanReassociateFDiv = HasUnsafeFPMath || FMF.allowReciprocal(); - // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used. - const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy && - !FastUnsafeRcpLegal; + // rcp_f16 is accurate for !fpmath >= 1.0ulp. + // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. + // rcp_f64 is never accurate. 
+ const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) || + (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f); IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(FMF); @@ -730,31 +740,24 @@ for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { Value *NumEltI = Builder.CreateExtractElement(Num, I); Value *DenEltI = Builder.CreateExtractElement(Den, I); - Value *NewElt = nullptr; - if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal, - HasFP32Denormals)) { - Function *Decl = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath); - } - if (!NewElt) // Try rcp. - NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder, - FPMath, Mod, HasFP32Denormals, NeedHighAccuracy); - if (!NewElt) + // Try rcp first. + Value *NewElt = lowerUsingRcp(NumEltI, DenEltI, CanReassociateFDiv, + RcpIsAccurate, Builder, FPMath, Mod); + if (!NewElt) // Try fdiv.fast. + NewElt = lowerUsingFDivFast(NumEltI, DenEltI, ReqdAccuracy, + HasFP32Denormals, Builder, FPMath, Mod); + if (!NewElt) // Keep the original. NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath); NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } - } else { // Scalar. - if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal, - HasFP32Denormals)) { - Function *Decl = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath); - } - if (!NewFDiv) { // Try rcp. - NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath, - Mod, HasFP32Denormals, NeedHighAccuracy); + } else { // Scalar FDiv. + // Try rcp first. + NewFDiv = lowerUsingRcp(Num, Den, CanReassociateFDiv, RcpIsAccurate, + Builder, FPMath, Mod); + if (!NewFDiv) { // Try fdiv.fast. 
+ NewFDiv = lowerUsingFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals, + Builder, FPMath, Mod); } } Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7470,19 +7470,12 @@ EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath || - (Flags.hasAllowReciprocal() && - ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) || - VT == MVT::f16 || - Flags.hasApproximateFuncs())); - - // Do rcp optimization only when fast unsafe rcp is legal here. - // NOTE: We already performed RCP optimization to insert intrinsics in - // AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to - // rcp optimization. - // However, there are cases like FREM, which is expended into a sequence - // of instructions including FDIV, which may expose new opportunities. - if (!FastUnsafeRcpLegal) + bool CanReassociateFDiv = DAG.getTarget().Options.UnsafeFPMath || + Flags.hasAllowReciprocal(); + + // Without !fpmath accuracy information, we can't do more because we don't + // know exactly whether rcp is accurate enough to meet !fpmath requirement. 
+ if (!CanReassociateFDiv) return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -49,7 +49,7 @@ ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} ; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x +; CHECK: %arcp.no.md = call arcp float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x) ; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x) @@ -78,28 +78,6 @@ ret void } -; CHECK-LABEL: @rcp_fdiv_arcp_denormal( -; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0 -; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2 -; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) -; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x) -define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 { - - %arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0 - store volatile float %arcp.low.accuracy, float addrspace(1)* %out - - %arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2 - store volatile float %arcp.high.accuracy, float addrspace(1)* %out - - %arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0 - store volatile float %arcp.low.afn, float addrspace(1)* %out - - %arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2 - store volatile float %arcp.high.afn, float addrspace(1)* %out - - ret void -} - ; CHECK-LABEL: @fdiv_fpmath_vector( ; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x 
float> %a, i64 0 ; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 @@ -173,10 +151,10 @@ ; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out ; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] +; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO0]]) ; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 ; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO1]] +; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO1]]) ; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 ; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out @@ -235,11 +213,12 @@ ; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out ; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]] +; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO0]]) ; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0 ; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 2.000000e+00, %[[ARCP_NO1]] -; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1 +; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_NO1]]) +; CHECK: %[[ARCP_NO_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_NO_FDIV1]] +; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_MUL1]], i64 1 ; CHECK: store volatile <2 x float> %arcp.no.md, <2 x 
float> addrspace(1)* %out ; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0 @@ -331,7 +310,8 @@ ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 ; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b) ; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0 -; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 +; CHECK: %[[RCP_ARCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b) +; CHECK: %arcp.md.25ulp = fmul arcp float %a, %[[RCP_ARCP]], !fpmath !0 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -63,7 +63,7 @@ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv half 1.0, %b.val + %r.val = fdiv half 1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -82,7 +82,28 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b %b.abs = call half @llvm.fabs.f16(half %b.val) - %r.val = fdiv half 1.0, %b.abs + %r.val = fdiv half 1.0, %b.abs, !fpmath !0 + store half %r.val, half addrspace(1)* %gep.r + ret void +} + +; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp. 
+ +; GCN-LABEL: {{^}}reciprocal_f16_rounded: +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} +; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]] +; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]] +; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]] +; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0 +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext + %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext + %b.val = load volatile half, half addrspace(1)* %gep.b + %r.val = fdiv half 1.0, %b.val store half %r.val, half addrspace(1)* %gep.r ret void } @@ -100,7 +121,7 @@ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv arcp half 1.0, %b.val + %r.val = fdiv arcp half 1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -118,7 +139,7 @@ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b - %r.val = fdiv half -1.0, %b.val + %r.val = fdiv half -1.0, %b.val, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -137,7 +158,7 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b %b.sqrt = call half @llvm.sqrt.f16(half %b.val) - %r.val = fdiv half 1.0, %b.sqrt + %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0 store half %r.val, half 
addrspace(1)* %gep.r ret void } @@ -157,7 +178,7 @@ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext %b.val = load volatile half, half addrspace(1)* %gep.b %b.sqrt = call half @llvm.sqrt.f16(half %b.val) - %r.val = fdiv half -1.0, %b.sqrt + %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0 store half %r.val, half addrspace(1)* %gep.r ret void } @@ -249,3 +270,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind "unsafe-fp-math"="true" } + +!0 = !{float 2.500000e+00}