Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -20,6 +20,7 @@
 class AMDGPUSubtarget;
 class AMDGPUTargetMachine;
 class FunctionPass;
+class GCNTargetMachine;
 struct MachineSchedContext;
 class MCAsmInfo;
 class raw_ostream;
@@ -51,7 +52,7 @@
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIDebuggerInsertNopsPass();
 FunctionPass *createSIInsertWaitsPass();
-FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
 
 ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
 
Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,7 +14,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
 
 #include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/CodeGen/Passes.h"
@@ -30,15 +32,28 @@
 namespace {
 
 class AMDGPUCodeGenPrepare : public FunctionPass,
-                             public InstVisitor<AMDGPUCodeGenPrepare> {
+                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+  const GCNTargetMachine *TM;
+  const SISubtarget *ST;
   DivergenceAnalysis *DA;
-  const TargetMachine *TM;
+  Module *Mod;
+  bool HasUnsafeFPMath;
 
 public:
   static char ID;
   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
     FunctionPass(ID),
-    TM(TM) { }
+    TM(static_cast<const GCNTargetMachine *>(TM)),
+    ST(nullptr),
+    DA(nullptr),
+    Mod(nullptr),
+    HasUnsafeFPMath(false) { }
+
+  bool visitFDiv(BinaryOperator &I);
+
+  bool visitInstruction(Instruction &I) {
+    return false;
+  }
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -55,7 +70,92 @@
 
 } // End anonymous namespace
 
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
+  if (!CNum)
+    return false;
+
+  // Reciprocal f32 is handled separately without denormals.
+  return UnsafeDiv && CNum->isExactlyValue(+1.0);
+}
+
+// Insert an intrinsic for fast fdiv for safe math situations where we can
+// reduce precision. Leave fdiv for situations where the generic node is
+// expected to be optimized.
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
+  Type *Ty = FDiv.getType();
+
+  // TODO: Handle half
+  if (!Ty->getScalarType()->isFloatTy())
+    return false;
+
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  if (!FPMath)
+    return false;
+
+  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
+  float ULP = FPOp->getFPAccuracy();
+  if (ULP < 2.5f)
+    return false;
+
+  FastMathFlags FMF = FPOp->getFastMathFlags();
+  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+                                      FMF.allowReciprocal();
+  if (ST->hasFP32Denormals() && !UnsafeDiv)
+    return false;
+
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  Builder.setFastMathFlags(FMF);
+  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
+
+  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
+  Function *Decl
+    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+
+  Value *Num = FDiv.getOperand(0);
+  Value *Den = FDiv.getOperand(1);
+
+  Value *NewFDiv = nullptr;
+
+  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+    NewFDiv = UndefValue::get(VT);
+
+    // FIXME: Doesn't do the right thing for cases where the vector is partially
+    // constant. This works when the scalarizer pass is run first.
+    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
+      Value *NumEltI = Builder.CreateExtractElement(Num, I);
+      Value *DenEltI = Builder.CreateExtractElement(Den, I);
+      Value *NewElt;
+
+      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
+      } else {
+        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+      }
+
+      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
+    }
+  } else {
+    if (!shouldKeepFDivF32(Num, UnsafeDiv))
+      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  }
+
+  if (NewFDiv) {
+    FDiv.replaceAllUsesWith(NewFDiv);
+    NewFDiv->takeName(&FDiv);
+    FDiv.eraseFromParent();
+  }
+
+  return true;
+}
+
+static bool hasUnsafeFPMath(const Function &F) {
+  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
+  return Attr.getValueAsString() == "true";
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+  Mod = &M;
   return false;
 }
 
@@ -63,10 +163,21 @@
   if (!TM || skipFunction(F))
     return false;
 
+  ST = &TM->getSubtarget<SISubtarget>(F);
   DA = &getAnalysis<DivergenceAnalysis>();
-  visit(F);
+  HasUnsafeFPMath = hasUnsafeFPMath(F);
 
-  return true;
+  bool MadeChange = false;
+
+  for (BasicBlock &BB : F) {
+    BasicBlock::iterator Next;
+    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+      Next = std::next(I);
+      MadeChange |= visit(*I);
+    }
+  }
+
+  return MadeChange;
 }
 
 INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
@@ -77,6 +188,6 @@
 
 char AMDGPUCodeGenPrepare::ID = 0;
 
-FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
   return new AMDGPUCodeGenPrepare(TM);
 }
Index: lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -34,13 +34,23 @@
 class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
 public:
   AMDGPUIntrinsicInfo();
+
+  StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
+
   std::string getName(unsigned IntrId, Type **Tys = nullptr,
-                      unsigned numTys = 0) const override;
+                      unsigned NumTys = 0) const override;
+
   unsigned lookupName(const char *Name, unsigned Len) const override;
   bool isOverloaded(unsigned IID) const override;
   Function *getDeclaration(Module *M, unsigned ID,
                            Type **Tys = nullptr,
-                           unsigned numTys = 0) const override;
+                           unsigned NumTys = 0) const override;
+
+  Function *getDeclaration(Module *M, unsigned ID,
+                           ArrayRef<Type *> = None) const;
+
+  FunctionType *getType(LLVMContext &Context, unsigned ID,
+                        ArrayRef<Type*> Tys = None) const;
 };
 
 } // end namespace llvm
Index: lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -29,16 +29,39 @@
 #undef GET_INTRINSIC_NAME_TABLE
 };
 
-std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
-                                         unsigned numTys) const {
-  if (IntrID < Intrinsic::num_intrinsics) {
-    return nullptr;
-  }
+namespace {
+#define GET_INTRINSIC_ATTRIBUTES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+}
+
+StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
+                                       ArrayRef<Type *> Tys) const {
+  if (IntrID < Intrinsic::num_intrinsics)
+    return StringRef();
+
   assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
          "Invalid intrinsic ID");
 
-  std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
-  return Result;
+  return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
+}
+
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+                                         unsigned NumTys) const {
+  return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
+}
+
+FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
+                                           ArrayRef<Type*> Tys) const {
+  // FIXME: Re-use Intrinsic::getType machinery
+  switch (ID) {
+  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+    Type *F32Ty = Type::getFloatTy(Context);
+    return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
+  }
+  default:
+    llvm_unreachable("unhandled intrinsic");
+  }
 }
 
 unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
@@ -69,7 +92,19 @@
 }
 
 Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                              ArrayRef<Type *> Tys) const {
+  FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
+  Function *F
+    = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
+
+  AttributeSet AS = getAttributes(M->getContext(),
+                                  static_cast<AMDGPUIntrinsic::ID>(IntrID));
+  F->setAttributes(AS);
+  return F;
+}
+
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
                                               Type **Tys,
-                                              unsigned numTys) const {
-  llvm_unreachable("Not implemented");
+                                              unsigned NumTys) const {
+  return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
 }
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -290,6 +290,7 @@
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override;
 
+  void addIRPasses() override;
   bool addPreISel() override;
   void addMachineSSAOptimization() override;
   bool addInstSelector() override;
@@ -474,6 +475,13 @@
   addPass(&DeadMachineInstructionElimID);
 }
 
+void GCNPassConfig::addIRPasses() {
+  // TODO: May want to move later or split into an early and late one.
+  addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
+
+  AMDGPUPassConfig::addIRPasses();
+}
+
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
   addPass(createSILowerI1CopiesPass());
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -36,7 +36,8 @@
   SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1716,6 +1716,9 @@
     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                    Op->getVTList(), Ops, VT, MMO);
   }
+  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+    return lowerFDIV_FAST(Op, DAG);
+  }
   case AMDGPUIntrinsic::SI_vs_load_input:
     return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                        Op.getOperand(1),
@@ -2022,7 +2025,8 @@
 
 // Catch division cases where we can use shortcuts with rcp and rsq
 // instructions.
-SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
+SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
+                                              SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -2063,47 +2067,48 @@
   return SDValue();
 }
 
-SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
-  if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
-    return FastLowered;
-
+// Faster 2.5 ULP division that does not support denormals.
+SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
 
-  // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag
-  if (EnableAMDGPUFastFDIV) {
-    // This does not support denormals.
-    SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
 
-    const APFloat K0Val(BitsToFloat(0x6f800000));
-    const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+  const APFloat K0Val(BitsToFloat(0x6f800000));
+  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
 
-    const APFloat K1Val(BitsToFloat(0x2f800000));
-    const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+  const APFloat K1Val(BitsToFloat(0x2f800000));
+  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
 
-    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
 
-    EVT SetCCVT =
-        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+  EVT SetCCVT =
+    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
 
-    SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
 
-    SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
 
-    // TODO: Should this propagate fast-math-flags?
+  // TODO: Should this propagate fast-math-flags?
+  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
 
-    r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+  // rcp does not support denormals.
+  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
 
-    // rcp does not support denormals.
-    SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
 
-    SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
 
-    return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
-  }
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+    return FastLowered;
+
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
 
-  // Generates more precise fpdiv32.
   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
 
   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
@@ -2133,7 +2138,7 @@
 
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
   if (DAG.getTarget().Options.UnsafeFPMath)
-    return LowerFastFDIV(Op, DAG);
+    return lowerFastUnsafeFDIV(Op, DAG);
 
   SDLoc SL(Op);
   SDValue X = Op.getOperand(0);
Index: lib/Target/AMDGPU/SIIntrinsics.td
===================================================================
--- lib/Target/AMDGPU/SIIntrinsics.td
+++ lib/Target/AMDGPU/SIIntrinsics.td
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// SI Intrinsic Definitions
+// Backend internal SI Intrinsic Definitions. User code should not
+// directly use these.
 //
 //===----------------------------------------------------------------------===//
 
@@ -177,6 +178,12 @@
 } // End TargetPrefix = "SI", isTarget = 1
 
 let TargetPrefix = "amdgcn", isTarget = 1 in {
+  // Emit 2.5 ulp, no denormal division. Should only be inserted by
+  // pass based on !fpmath metadata.
+  def int_amdgcn_fdiv_fast : Intrinsic<
+    [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
+  >;
+
   /* Control flow Intrinsics */
 
   def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
+++ test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
@@ -1,8 +1,242 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s
-; RUN: opt -S -amdgpu-codegenprepare < %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
+; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s
 ; Make sure this doesn't crash with no triple
 
-; CHECK-LABEL: @foo(
-define void @foo() {
+; NOOP-LABEL: @noop_fdiv_fpmath(
+; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
+define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
   ret void
 }
+
+; CHECK-LABEL: @fdiv_fpmath(
+; CHECK: %no.md = fdiv float %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
+; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
+; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
+  %no.md = fdiv float %a, %b
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float %a, %b, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %md.1ulp = fdiv float %a, %b, !fpmath !2
+  store volatile float %md.1ulp, float addrspace(1)* %out
+
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
+  %md.3ulp = fdiv float %a, %b, !fpmath !3
+  store volatile float %md.3ulp, float addrspace(1)* %out
+
+  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+  store volatile float %fast.md.25ulp, float addrspace(1)* %out
+
+  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath(
+; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
+; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
+; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
+; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
+; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
+; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
+define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
+  %no.md = fdiv float 1.0, %x
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp float 1.0, %x
+  store volatile float %arcp.no.md, float addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
+  store volatile float %arcp.25ulp, float addrspace(1)* %out
+
+  %fast.no.md = fdiv fast float 1.0, %x
+  store volatile float %fast.no.md, float addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
+  store volatile float %fast.25ulp, float addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @fdiv_fpmath_vector(
+; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
+
+; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
+; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
+; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
+; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
+; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
+; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
+; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
+define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+  %no.md = fdiv <2 x float> %a, %b
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
+  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+  %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
+  store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
+
+  %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
+  store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
+; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
+; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+
+; CHECK: extractelement <2 x float> %x
+; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: extractelement <2 x float> %x
+; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+  %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
+  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
+; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
+; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
+; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
+
+; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
+; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
+; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp
+define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+  %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; FIXME: Should be able to get fdiv for 1.0 component
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp
+define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
+  %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
+
+  %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
+; CHECK: %no.md = fdiv float %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
+; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
+; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+  %no.md = fdiv float %a, %b
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float %a, %b, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %md.1ulp = fdiv float %a, %b, !fpmath !2
+  store volatile float %md.1ulp, float addrspace(1)* %out
+
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
+  %md.3ulp = fdiv float %a, %b, !fpmath !3
+  store volatile float %md.3ulp, float addrspace(1)* %out
+
+  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+  store volatile float %fast.md.25ulp, float addrspace(1)* %out
+
+  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+
+  ret void
+}
+
+attributes #0 = { nounwind optnone noinline }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "target-features"="+fp32-denormals" }
+
+; CHECK: !0 = !{float 2.500000e+00}
+; CHECK: !1 = !{float 5.000000e-01}
+; CHECK: !2 = !{float 1.000000e+00}
+; CHECK: !3 = !{float 3.000000e+00}
+
+!0 = !{float 2.500000e+00}
+!1 = !{float 5.000000e-01}
+!2 = !{float 1.000000e+00}
+!3 = !{float 3.000000e+00}
Index: test/CodeGen/AMDGPU/fdiv.ll
===================================================================
--- test/CodeGen/AMDGPU/fdiv.ll
+++ test/CodeGen/AMDGPU/fdiv.ll
@@ -1,8 +1,4 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; These tests check that fdiv is expanded correctly and also test that the
@@ -15,22 +11,59 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
+; SI: v_div_scale_f32
+; SI-DAG: v_div_scale_f32
 
 ; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_mul_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_div_fmas_f32
+; SI: v_div_fixup_f32
+define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+entry:
+  %fdiv = fdiv float %a, %b
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
+; SI: v_cndmask_b32
+; SI: v_mul_f32
+; SI: v_rcp_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
+define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+entry:
+  %fdiv = fdiv float %a, %b, !fpmath !0
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+; Use correct fdiv
+; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
+; SI: v_fma_f32
+; SI: v_div_fmas_f32
+; SI: v_div_fixup_f32
+define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+entry:
+  %fdiv = fdiv float %a, %b, !fpmath !0
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
 
-; I754-DAG: v_div_scale_f32
-; I754-DAG: v_rcp_f32
-; I754-DAG: v_fma_f32
-; I754-DAG: v_mul_f32
-; I754-DAG: v_fma_f32
-; I754-DAG: v_div_fixup_f32
-define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
+; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
+; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
-  %0 = fdiv float %a, %b
-  store float %0, float addrspace(1)* %out
+  %fdiv = fdiv fast float %a, %b
+  store float %fdiv, float addrspace(1)* %out
   ret void
 }
 
@@ -38,15 +71,14 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) {
+; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
-  %0 = fdiv fast float %a, %b
-  store float %0, float addrspace(1)* %out
+  %fdiv = fdiv fast float %a, %b
+  store float %fdiv, float addrspace(1)* %out
   ret void
 }
 
@@ -54,15 +86,14 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) {
+; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
-  %0 = fdiv arcp float %a, %b
-  store float %0, float addrspace(1)* %out
+  %fdiv = fdiv arcp float %a, %b
+  store float %fdiv, float addrspace(1)* %out
   ret void
 }
 
@@ -72,26 +103,24 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+entry:
+  %fdiv = fdiv <2 x float> %a, %b
+  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
+  ret void
+}
 
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_fixup_f32
-; I754: v_div_fixup_f32
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
+; SI: v_cmp_gt_f32
+; SI: v_cmp_gt_f32
+define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
-  %0 = fdiv <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
+  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
   ret void
 }
 
@@ -101,19 +130,12 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
-  %0 = fdiv fast <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %fdiv = fdiv fast <2 x float> %a, %b
+  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
   ret void
 }
 
@@ -123,19 +145,12 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
-  %0 = fdiv arcp <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %fdiv = fdiv arcp <2 x float> %a, %b
+  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
   ret void
 }
 
@@ -149,37 +164,11 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_fixup_f32
-; I754: v_div_fixup_f32
-; I754: v_div_fixup_f32
-; I754: v_div_fixup_f32
-define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; SI: v_div_fixup_f32
+; SI: v_div_fixup_f32
+; SI: v_div_fixup_f32
+; SI: v_div_fixup_f32
+define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -198,24 +187,11 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -234,24 +210,11 @@
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -259,3 +222,9 @@
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
+
+attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" }
+attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
+attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" }
+
+!0 = !{float 2.500000e+00}
Index: test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+declare float @llvm.amdgcn.fdiv.fast(float, float) #0
+
+; CHECK-LABEL: {{^}}test_fdiv_fast:
+; CHECK: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
+; CHECK: v_mul_f32_e32
+; CHECK: v_rcp_f32_e32
+; CHECK: v_mul_f32_e32
+; CHECK: v_mul_f32_e32
+define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
+  %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }