diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -101,6 +101,8 @@
 
   void visitAtomicRMWInst(AtomicRMWInst &I);
   void visitIntrinsicInst(IntrinsicInst &I);
+
+  bool isScanStrategyIterative();
 };
 
 } // namespace
@@ -133,6 +135,21 @@
       .run(F);
 }
 
+bool AMDGPUAtomicOptimizerImpl::isScanStrategyIterative() {
+  return ScanImpl == ScanOptions::Iterative;
+}
+
+static bool isOpFP(AtomicRMWInst::BinOp Op) {
+  switch (Op) {
+  default:
+    return false;
+  case AtomicRMWInst::FAdd:
+  case AtomicRMWInst::FMax:
+  case AtomicRMWInst::FMin:
+    return true;
+  }
+}
+
 PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
@@ -202,9 +219,17 @@
   case AtomicRMWInst::Min:
   case AtomicRMWInst::UMax:
   case AtomicRMWInst::UMin:
+  case AtomicRMWInst::FAdd:
+  case AtomicRMWInst::FMax:
+  case AtomicRMWInst::FMin:
     break;
   }
 
+  // FP atomics are only supported by the Iterative scan strategy.
+  if (isOpFP(Op) && !isScanStrategyIterative()) {
+    return;
+  }
+
   const unsigned PtrIdx = 0;
   const unsigned ValIdx = 1;
 
@@ -302,6 +327,20 @@
   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
     Op = AtomicRMWInst::UMax;
     break;
+  case Intrinsic::amdgcn_global_atomic_fadd:
+    Op = AtomicRMWInst::FAdd;
+    break;
+  case Intrinsic::amdgcn_global_atomic_fmin:
+    Op = AtomicRMWInst::FMin;
+    break;
+  case Intrinsic::amdgcn_global_atomic_fmax:
+    Op = AtomicRMWInst::FMax;
+    break;
+  }
+
+  // FP atomics are only supported by the Iterative scan strategy.
+  if (isOpFP(Op) && !isScanStrategyIterative()) {
+    return;
   }
 
   const unsigned ValIdx = 0;
@@ -344,6 +383,8 @@
     llvm_unreachable("Unhandled atomic op");
   case AtomicRMWInst::Add:
     return B.CreateBinOp(Instruction::Add, LHS, RHS);
+  case AtomicRMWInst::FAdd:
+    return B.CreateBinOp(Instruction::FAdd, LHS, RHS);
   case AtomicRMWInst::Sub:
     return B.CreateBinOp(Instruction::Sub, LHS, RHS);
   case AtomicRMWInst::And:
@@ -365,6 +406,12 @@
   case AtomicRMWInst::UMin:
     Pred = CmpInst::ICMP_ULT;
     break;
+  case AtomicRMWInst::FMax:
+    Pred = CmpInst::FCMP_UGT;
+    break;
+  case AtomicRMWInst::FMin:
+    Pred = CmpInst::FCMP_ULT;
+    break;
   }
-  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
+  Value *Cond = B.CreateCmp(Pred, LHS, RHS);
   return B.CreateSelect(Cond, LHS, RHS);
@@ -554,11 +601,15 @@
   // Use llvm.cttz instrinsic to find the lowest remaining active lane.
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
-  auto *LaneIdxInt = B.CreateTrunc(FF1, Ty);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  auto *LaneValue =
+  if (Ty->isFloatingPointTy())
+    V = B.CreateBitCast(V, B.getInt32Ty());
+  Value *LaneValue =
       B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+  if (Ty->isFloatingPointTy())
+    LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
@@ -590,6 +641,18 @@
   return {OldValue, NewAccumulator};
 }
 
+static APFloat getIdentityValueForFAtomicOp(AtomicRMWInst::BinOp Op) {
+  switch (Op) {
+  default:
+    llvm_unreachable("Unhandled atomic op");
+  case AtomicRMWInst::FAdd:
+  case AtomicRMWInst::FMax:
+    return APFloat::getSmallest(APFloat::IEEEsingle(), false);
+  case AtomicRMWInst::FMin:
+    return APFloat::getLargest(APFloat::IEEEsingle(), false);
+  }
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -679,17 +742,22 @@
     Mbcnt =
         B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
   }
-  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
+  Mbcnt = B.CreateIntCast(Mbcnt, B.getInt32Ty(), false);
 
-  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
+  Function *F = I.getFunction();
+  LLVMContext &C = F->getContext();
+  Value *Identity;
+  if (Ty->isIntegerTy()) {
+    Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
+  } else if (Ty->isFloatingPointTy()) {
+    Identity = ConstantFP::get(C, getIdentityValueForFAtomicOp(Op));
+  }
 
   Value *ExclScan = nullptr;
   Value *NewV = nullptr;
 
   const bool NeedResult = !I.use_empty();
 
-  Function *F = I.getFunction();
-  LLVMContext &C = F->getContext();
   BasicBlock *ComputeLoop = nullptr;
   BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
@@ -746,13 +814,22 @@
     NewV = buildMul(B, V, Ctpop);
     break;
   }
-
+  case AtomicRMWInst::FAdd: {
+    Value *const Ctpop =
+        B.CreateIntCast(B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot),
+                        B.getInt32Ty(), false);
+    Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+    NewV = B.CreateFMul(V, CtpopFP);
+    break;
+  }
   case AtomicRMWInst::And:
   case AtomicRMWInst::Or:
   case AtomicRMWInst::Max:
   case AtomicRMWInst::Min:
   case AtomicRMWInst::UMax:
   case AtomicRMWInst::UMin:
+  case AtomicRMWInst::FMin:
+  case AtomicRMWInst::FMax:
     // These operations with a uniform value are idempotent: doing the atomic
     // operation multiple times has the same effect as doing it once.
     NewV = V;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 {
+; IR-LABEL: @global_atomic_fadd_uni_value(
+; IR-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; IR-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; IR-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0)
+; IR-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
+; IR-NEXT:    [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]])
+; IR-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; IR-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT:    [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]]
+; IR-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-NEXT:    br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
+; IR:       12:
+; IR-NEXT:    [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
+; IR-NEXT:    br label [[TMP14]]
+; IR:       14:
+; IR-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 {
+; IR-LABEL: @global_atomic_fadd_div_value(
+; IR-NEXT:    [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT:    [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float
+; IR-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; IR-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; IR-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0)
+; IR-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
+; IR-NEXT:    [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    br label [[COMPUTELOOP:%.*]]
+; IR:       8:
+; IR-NEXT:    [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4
+; IR-NEXT:    br label [[TMP10:%.*]]
+; IR:       10:
+; IR-NEXT:    ret void
+; IR:       ComputeLoop:
+; IR-NEXT:    [[ACCUMULATOR:%.*]] = phi float [ 0x36A0000000000000, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ]
+; IR-NEXT:    [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ]
+; IR-NEXT:    [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
+; IR-NEXT:    [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-NEXT:    [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
+; IR-NEXT:    [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
+; IR-NEXT:    [[TMP17:%.*]] = shl i64 1, [[TMP11]]
+; IR-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP17]], -1
+; IR-NEXT:    [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]]
+; IR-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0
+; IR-NEXT:    br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR:       ComputeEnd:
+; IR-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-NEXT:    br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]]
+;
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %divValue = bitcast i32 %id.x to float
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue seq_cst
+  ret void
+}
+
+attributes #0 = {"target-cpu"="gfx906"}