Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -47,12 +47,17 @@
   DominatorTree *DT;
   const GCNSubtarget *ST;
   bool IsPixelShader;
+  bool IsComputeKernel;
 
   Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                         Value *const Identity) const;
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
+  std::pair<Value *, Value *>
+  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+                       Value *const Identity, Value *const Ballot, Value *V,
+                       Instruction &I) const;
 
   void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                       bool ValDivergent) const;
@@ -93,6 +98,7 @@
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+  IsComputeKernel = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
 
   visit(F);
 
@@ -430,6 +436,73 @@
   return V;
 }
 
+// Use the builder to create an exclusive scan and compute the final reduced
+// value using an iterative approach. This provides an alternative to the DPP
+// implementation, which uses WWM for the scan computation. This API iterates
+// over the lanes to read, compute and (conditionally) update the value using
+// the readlane and writelane intrinsics.
+std::pair<Value *, Value *> AMDGPUAtomicOptimizer::buildScanIteratively(
+    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity,
+    Value *const Ballot, Value *V, Instruction &I) const {
+  Type *const Ty = I.getType();
+  const bool NeedResult = !I.use_empty();
+  Module *M = B.GetInsertBlock()->getModule();
+  Function *WriteLaneDecl =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+  Value *ReducedValue = Identity;
+  Value *PartialSumTillPreviousLane = Identity;
+  BasicBlock *CurrentBasicBlock = nullptr;
+  PHINode *DestWrite = nullptr;
+  Value *Scan = V;
+  Value *DestWriteOfPreviousLane = V;
+  Instruction *TerminatorInConditionalWritelaneBlock = nullptr;
+  unsigned WaveFrontSize = ST->isWave32() ? 32 : 64;
+  Type *const BallotTy = Ballot->getType();
+  const unsigned TyBitWidth = DL->getTypeSizeInBits(BallotTy);
+
+  for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; LaneIdx++) {
+    // Iterate over all the lanes of a wavefront to compute the partial sum. If
+    // the lane is not active, select the Identity value in the computation;
+    // otherwise use the value extracted with readlane.
+    Value *Mask =
+        B.CreateShl(B.getIntN(TyBitWidth, 1), B.getIntN(TyBitWidth, LaneIdx));
+    Value *BallotAndMask = B.CreateAnd(Ballot, Mask);
+    Value *IsBitSet = B.CreateICmpEQ(BallotAndMask, Mask);
+    Value *Select =
+        B.CreateSelect(IsBitSet,
+                       B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                         {V, B.getInt32(LaneIdx)}),
+                       Identity);
+    ReducedValue = buildNonAtomicBinOp(B, Op, ReducedValue, Select);
+
+    // Perform the writelane only on the active lanes if the intermediate
+    // scan results are required.
+    if (NeedResult) {
+      CurrentBasicBlock = I.getParent();
+      // Split the current basic block into IfThen so that the writelane is
+      // performed conditionally, on the active lanes only.
+      TerminatorInConditionalWritelaneBlock =
+          SplitBlockAndInsertIfThen(IsBitSet, &I, false, nullptr, DT, nullptr);
+
+      // Write the exclusive scan result (the partial sum up to the previous
+      // lane) into the current lane.
+      B.SetInsertPoint(TerminatorInConditionalWritelaneBlock);
+      Scan = B.CreateCall(WriteLaneDecl, {PartialSumTillPreviousLane,
+                                          B.getInt32(LaneIdx), Scan});
+
+      B.SetInsertPoint(&I);
+      DestWrite = B.CreatePHI(Ty, 2, "DestWrite");
+      DestWrite->addIncoming(DestWriteOfPreviousLane, CurrentBasicBlock);
+      DestWrite->addIncoming(
+          Scan, TerminatorInConditionalWritelaneBlock->getParent());
+      // Values used for the next iteration.
+      DestWriteOfPreviousLane = DestWrite;
+      PartialSumTillPreviousLane = ReducedValue;
+    }
+  }
+  return std::make_pair(Scan, ReducedValue);
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -531,33 +604,39 @@
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
-    // First we need to set all inactive invocations to the identity value, so
-    // that they can correctly contribute to the final result.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
-    const AtomicRMWInst::BinOp ScanOp =
-        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
-    if (!NeedResult && ST->hasPermLaneX16()) {
-      // On GFX10 the permlanex16 instruction helps us build a reduction without
-      // too many readlanes and writelanes, which are generally bad for
-      // performance.
-      NewV = buildReduction(B, ScanOp, NewV, Identity);
+    const AtomicRMWInst::BinOp ScanOp =
+        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+    if (!IsComputeKernel) {
+      // First we need to set all inactive invocations to the identity value,
+      // so that they can correctly contribute to the final result.
+      NewV =
+          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+      if (!NeedResult && ST->hasPermLaneX16()) {
+        // On GFX10 the permlanex16 instruction helps us build a reduction
+        // without too many readlanes and writelanes, which are generally bad
+        // for performance.
+        NewV = buildReduction(B, ScanOp, NewV, Identity);
+      } else {
+        NewV = buildScan(B, ScanOp, NewV, Identity);
+        if (NeedResult)
+          ExclScan = buildShiftRight(B, NewV, Identity);
+        // Read the value from the last lane, which has accumulated the values
+        // of each active lane in the wavefront. This will be our new value
+        // which we will provide to the atomic operation.
+        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+        assert(TyBitWidth == 32);
+        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                 {NewV, LastLaneIdx});
+      }
+      // Finally mark the readlanes in the WWM section.
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
     } else {
-      NewV = buildScan(B, ScanOp, NewV, Identity);
-      if (NeedResult)
-        ExclScan = buildShiftRight(B, NewV, Identity);
-
-      // Read the value from the last lane, which has accumulated the values of
-      // each active lane in the wavefront. This will be our new value which we
-      // will provide to the atomic operation.
-      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-      assert(TyBitWidth == 32);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                               {NewV, LastLaneIdx});
+      // Alternative, iterative implementation of the scan.
+      std::tie(ExclScan, NewV) =
+          buildScanIteratively(B, ScanOp, Identity, Ballot, V, I);
     }
-
-    // Finally mark the readlanes in the WWM section.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
   } else {
     switch (Op) {
     default:
@@ -594,11 +673,19 @@
     }
   }
 
-  // We only want a single lane to enter our new control flow, and we do this
-  // by checking if there are any active lanes below us. Only one lane will
-  // have 0 active lanes below us, so that will be the only one to progress.
-  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
-
+  Value *Cond = nullptr;
+  if (ValDivergent && IsComputeKernel) {
+    // Only the first active lane will enter the new control flow to update the
+    // value.
+    CallInst *const FirstActiveLane =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, Mbcnt);
+    Cond = B.CreateICmpEQ(Mbcnt, FirstActiveLane);
+  } else {
+    // We only want a single lane to enter our new control flow, and we do this
+    // by checking if there are any active lanes below us. Only one lane will
+    // have 0 active lanes below us, so that will be the only one to progress.
+    Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
+  }
   // Store I's original basic block before we split the block.
   BasicBlock *const EntryBB = I.getParent();
 
@@ -660,8 +747,12 @@
   // from the first lane, to get our lane's index into the atomic result.
   Value *LaneOffset = nullptr;
   if (ValDivergent) {
-    LaneOffset =
-        B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+    if (!IsComputeKernel) {
+      LaneOffset =
+          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+    } else {
+      LaneOffset = ExclScan;
+    }
   } else {
     switch (Op) {
     default:
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -270,11 +270,10 @@
                                         cl::init(true), cl::Hidden);
 
 // Enable atomic optimization
-static cl::opt<bool> EnableAtomicOptimizations(
-  "amdgpu-atomic-optimizations",
-  cl::desc("Enable atomic optimizations"),
-  cl::init(false),
-  cl::Hidden);
+static cl::opt<bool>
+    EnableAtomicOptimizations("amdgpu-atomic-optimizations",
+                              cl::desc("Enable atomic optimizations"),
+                              cl::init(true), cl::Hidden);
 
 // Enable Mode register optimization
 static cl::opt<bool> EnableSIModeRegisterPass(
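As a reference for what the unrolled per-lane loop in buildScanIteratively computes, here is a small host-side model. It is a sketch only: it assumes an integer add reduction (identity 0), the helper name iterativeScanModel and its parameters are made up for the example, and it is not the IR the pass emits. Each lane whose bit is set in the ballot receives the exclusive prefix sum of the earlier active lanes, and the final reduced value is the total over all active lanes.

  // Host-side model of the iterative exclusive scan (illustrative sketch only).
  #include <cstdint>
  #include <vector>

  // Models one wavefront: LaneValues holds each lane's input, Ballot marks the
  // active lanes. ExclScan receives the exclusive prefix sum per active lane
  // and Reduced the total over all active lanes (identity 0 for integer add).
  static void iterativeScanModel(const std::vector<uint32_t> &LaneValues,
                                 uint64_t Ballot, unsigned WaveFrontSize,
                                 std::vector<uint32_t> &ExclScan,
                                 uint32_t &Reduced) {
    const uint32_t Identity = 0;
    uint32_t PartialSumTillPreviousLane = Identity;
    Reduced = Identity;
    ExclScan.assign(WaveFrontSize, Identity);
    for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; ++LaneIdx) {
      bool IsBitSet = (Ballot >> LaneIdx) & 1;
      // Inactive lanes contribute the identity, mirroring the select over the
      // readlane result in the pass.
      uint32_t Select = IsBitSet ? LaneValues[LaneIdx] : Identity;
      Reduced += Select;
      if (IsBitSet) {
        // Mirrors the conditional writelane: the active lane is given the sum
        // of all earlier active lanes, then the running sum moves forward.
        ExclScan[LaneIdx] = PartialSumTillPreviousLane;
        PartialSumTillPreviousLane = Reduced;
      }
    }
  }

For example, with Ballot = 0b1011 and LaneValues = {5, 7, 0, 3, ...}, the active lanes 0, 1 and 3 end up with ExclScan values 0, 5 and 12, and Reduced is 15, which corresponds to the value the single elected lane hands to the atomic operation.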