Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1856,4 +1856,10 @@
   [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+// Emit a correctly rounded fp32 divide.
+def int_amdgcn_fdiv_rounded : Intrinsic<
+  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
 }
Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -70,6 +70,7 @@
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
+  bool HasCorrectlyRoundedDivideSqrt = false;
   bool HasFP32Denormals = false;

   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
@@ -542,34 +543,45 @@
 // Insert an intrinsic for fast fdiv for safe math situations where we can
 // reduce precision. Leave fdiv for situations where the generic node is
 // expected to be optimized.
+//
+// Also insert an intrinsic for correctly rounded fdiv when
+// -cl-fp32-correctly-rounded-divide-sqrt is enabled.
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
   Type *Ty = FDiv.getType();

   if (!Ty->getScalarType()->isFloatTy())
     return false;

-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  if (!FPMath)
-    return false;
-
-  const FPMathOperator *FPOp = cast<FPMathOperator>(&FDiv);
-  float ULP = FPOp->getFPAccuracy();
-  if (ULP < 2.5f)
-    return false;
-
+  const FPMathOperator *FPOp = cast<FPMathOperator>(&FDiv);
   FastMathFlags FMF = FPOp->getFastMathFlags();
   bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal();
-
   // With UnsafeDiv node will be optimized to just rcp and mul.
   if (UnsafeDiv)
     return false;

+  bool SafeFast = true;
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  if (!FPMath) {
+    // Insert an intrinsic when -cl-fp32-correctly-rounded-divide-sqrt
+    // is enabled.
+    if (!HasCorrectlyRoundedDivideSqrt)
+      return false;
+    SafeFast = false;
+  } else {
+    float ULP = FPOp->getFPAccuracy();
+    if (ULP < 2.5f)
+      return false;
+  }
+
   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

-  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+  Intrinsic::ID IntrinsicOpc = SafeFast ? Intrinsic::amdgcn_fdiv_fast
+                                        : Intrinsic::amdgcn_fdiv_rounded;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, IntrinsicOpc);

   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);
@@ -586,7 +598,7 @@
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
       Value *NewElt;

-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
+      if (SafeFast && shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
       } else {
         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -595,7 +607,7 @@
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
   } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
+    if (!SafeFast || !shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
   }
@@ -613,6 +625,11 @@
   return Attr.getValueAsString() == "true";
 }

+static bool hasCorrectlyRoundedDivideSqrt(const Function &F) {
+  Attribute Attr = F.getFnAttribute("correctly-rounded-divide-sqrt-fp-math");
+  return Attr.getValueAsString() == "true";
+}
+
 static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder,
                                             Value *LHS, Value *RHS) {
   Type *I32Ty = Builder.getInt32Ty();
@@ -1034,6 +1051,7 @@
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
   HasUnsafeFPMath = hasUnsafeFPMath(F);
+  HasCorrectlyRoundedDivideSqrt = hasCorrectlyRoundedDivideSqrt(F);
   HasFP32Denormals = ST->hasFP32Denormals(F);

   bool MadeChange = false;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -80,6 +80,8 @@
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFDIV_ROUNDED(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFDIV_SAFE(SDValue LHS, SDValue RHS, SelectionDAG &DAG) const;
   SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5910,6 +5910,8 @@
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
+  case Intrinsic::amdgcn_fdiv_rounded:
+    return lowerFDIV_ROUNDED(Op, DAG);
   case Intrinsic::amdgcn_interp_p1_f16: {
     SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
                                     Op.getOperand(5), SDValue());
@@ -7720,13 +7722,9 @@
   return DAG.getTargetConstant(Mode, SL, MVT::i32);
 }

-SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
-  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
-    return FastLowered;
-
-  SDLoc SL(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
+SDValue SITargetLowering::lowerFDIV_SAFE(SDValue LHS, SDValue RHS,
+                                         SelectionDAG &DAG) const {
+  SDLoc SL(RHS);

   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
@@ -7824,6 +7822,21 @@
   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
 }

+SDValue SITargetLowering::lowerFDIV_ROUNDED(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  return lowerFDIV_SAFE(LHS, RHS, DAG);
+}
+
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+    return FastLowered;
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  return lowerFDIV_SAFE(LHS, RHS, DAG);
+}
+
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
   if (DAG.getTarget().Options.UnsafeFPMath)
     return lowerFastUnsafeFDIV(Op, DAG);
Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -213,9 +213,36 @@
   ret void
 }

+; CHECK-LABEL: @fdiv_rounded(
+; CHECK: %no.md = call float @llvm.amdgcn.fdiv.rounded(float %a, float %b)
+define amdgpu_kernel void @fdiv_rounded(float addrspace(1)* %out, float %a, float %b) #3 {
+  %no.md = fdiv float %a, %b
+  store volatile float %no.md, float addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @fdiv_rounded_vector(
+; CHECK: %[[Anomd0:[0-9]+]] = extractelement <2 x float> %a, i64 0
+; CHECK: %[[Bnomd0:[0-9]+]] = extractelement <2 x float> %b, i64 0
+; CHECK: %[[FDIVnomd0:[0-9]+]] = call float @llvm.amdgcn.fdiv.rounded(float %[[Anomd0]], float %[[Bnomd0]])
+; CHECK: %[[INSnomd0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIVnomd0]], i64 0
+; CHECK: %[[Anomd1:[0-9]+]] = extractelement <2 x float> %a, i64 1
+; CHECK: %[[Bnomd1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[FDIVnomd1:[0-9]+]] = call float @llvm.amdgcn.fdiv.rounded(float %[[Anomd1]], float %[[Bnomd1]])
+; CHECK: %no.md = insertelement <2 x float> %[[INSnomd0]], float %[[FDIVnomd1]], i64 1
+define amdgpu_kernel void @fdiv_rounded_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #3 {
+  %no.md = fdiv <2 x float> %a, %b
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
 attributes #0 = { nounwind optnone noinline }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind "target-features"="+fp32-denormals" }
+attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" }

 ; CHECK: !0 = !{float 2.500000e+00}
 ; CHECK: !1 = !{float 5.000000e-01}
Index: llvm/test/CodeGen/AMDGPU/fdiv.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -73,6 +73,68 @@
   ret void
 }

+; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt:
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX10: s_denorm_mode 15
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX10: s_denorm_mode 12
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #3 {
+entry:
+  %fdiv = fdiv float 1.000000e+00, %a
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt:
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; PREGFX10-NOT: s_setreg
+; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; PREGFX10-NOT: s_setreg

+; GFX10-NOT: s_denorm_mode
+; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
+; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
+; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
+; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
+; GFX10-NOT: s_denorm_mode
+
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #4 {
+entry:
+  %fdiv = fdiv float 1.000000e+00, %a
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
 ; GCN: v_cndmask_b32
 ; GCN: v_mul_f32
@@ -287,5 +349,8 @@
 attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" }
 attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
 attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }
+attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
+attributes #4 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" "target-features"="+fp32-denormals,-flat-for-global" }

 !0 = !{float 2.500000e+00}
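For context (not part of the patch): a minimal end-to-end sketch of what the change does. The kernel name @example and the file name example.ll are illustrative; the attribute, intrinsic, and pass names come from the patch and the tests above, and the opt invocation mirrors the usual RUN line style of amdgpu-codegenprepare-fdiv.ll.

; example.ll -- illustrative input. With the attribute set and no !fpmath
; metadata, AMDGPUCodeGenPrepare is expected to rewrite the fdiv below to
;   %q = call float @llvm.amdgcn.fdiv.rounded(float %a, float %b)
; which SIISelLowering then expands to the correctly rounded
; v_div_scale/v_div_fmas/v_div_fixup sequence checked in fdiv.ll.
; Try: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare example.ll
define amdgpu_kernel void @example(float addrspace(1)* %out, float %a, float %b) #0 {
  %q = fdiv float %a, %b
  store float %q, float addrspace(1)* %out
  ret void
}
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" }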