Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -47,12 +47,15 @@
   DominatorTree *DT;
   const GCNSubtarget *ST;
   bool IsPixelShader;
+  bool IsComputeKernel;
 
   Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                         Value *const Identity) const;
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
+  std::pair<Value *, Value *> buildScanIteratively(
+      IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity,
+      Value *const Ballot, Value *V, Instruction &I) const;
   void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                       bool ValDivergent) const;
@@ -93,7 +96,8 @@
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
+  IsComputeKernel = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+
   visit(F);
 
   const bool Changed = !ToReplace.empty();
@@ -430,6 +434,54 @@
   return V;
 }
 
+// Use the builder to create an exclusive scan and compute the final reduced
+// value with an iterative approach. This is an alternative to the DPP
+// implementation, which uses WWM for the scan computation. This routine
+// iterates over the lanes of the wavefront to read, combine and conditionally
+// update the value using the readlane and writelane intrinsics.
+std::pair<Value *, Value *> AMDGPUAtomicOptimizer::buildScanIteratively(
+    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity,
+    Value *const Ballot, Value *V, Instruction &I) const {
+  Type *const Ty = I.getType();
+  const bool NeedResult = !I.use_empty();
+  Module *M = B.GetInsertBlock()->getModule();
+  Function *WriteLaneDecl =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+  Value *ReducedValue = Identity;
+  Value *PartialSumTillPreviousLane = Identity;
+  BasicBlock *CurrentBasicBlock = nullptr;
+  PHINode *DestWrite = nullptr;
+  Value *Scan = V;
+  Value *DestWriteOfPreviousLane = V;
+  Instruction *TerminatorInConditionalWritelaneBlock = nullptr;
+  unsigned WaveFrontSize = ST->isWave32() ? 32 : 64;
+  for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; LaneIdx++) {
+    // Iterate over all the lanes of the wavefront to compute the partial sum.
+    // If the lane is not active, use the Identity value in the computation;
+    // otherwise use the value extracted with readlane.
+    Value *Mask = B.CreateShl(B.getInt64(1), B.getInt64(LaneIdx));
+    Value *BallotAndMask = B.CreateAnd(Ballot, Mask);
+    Value *IsBitSet = B.CreateICmpEQ(BallotAndMask, Mask);
+    Value *Select = B.CreateSelect(
+        IsBitSet,
+        B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                          {V, B.getInt32(LaneIdx)}),
+        Identity);
+    ReducedValue = buildNonAtomicBinOp(B, Op, ReducedValue, Select);
+
+    // Perform the writelane conditionally, on active lanes only, if the
+    // intermediate scan results are required.
+    if (NeedResult) {
+      CurrentBasicBlock = I.getParent();
+      // Split the current basic block into an if-then, so that the writelane
+      // is performed conditionally, on active lanes only.
+      TerminatorInConditionalWritelaneBlock =
+          SplitBlockAndInsertIfThen(IsBitSet, &I, false, nullptr, DT, nullptr);
+
+      // Write the exclusive scan result (the partial sum up to the previous
+      // lane) into the current lane.
+      B.SetInsertPoint(TerminatorInConditionalWritelaneBlock);
+      Scan = B.CreateCall(WriteLaneDecl, {PartialSumTillPreviousLane,
+                                          B.getInt32(LaneIdx), Scan});
+
+      B.SetInsertPoint(&I);
+      DestWrite = B.CreatePHI(Ty, 2, "DestWrite");
+      DestWrite->addIncoming(DestWriteOfPreviousLane, CurrentBasicBlock);
+      DestWrite->addIncoming(Scan,
+                             TerminatorInConditionalWritelaneBlock->getParent());
+      // Values used in the next iteration.
+      DestWriteOfPreviousLane = DestWrite;
+      PartialSumTillPreviousLane = ReducedValue;
+    }
+  }
+  return std::make_pair(Scan, ReducedValue);
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -530,34 +582,38 @@
 
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
-  if (ValDivergent) {
-    // First we need to set all inactive invocations to the identity value, so
-    // that they can correctly contribute to the final result.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
+  if (ValDivergent) {
     const AtomicRMWInst::BinOp ScanOp =
-        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
-    if (!NeedResult && ST->hasPermLaneX16()) {
-      // On GFX10 the permlanex16 instruction helps us build a reduction without
-      // too many readlanes and writelanes, which are generally bad for
-      // performance.
-      NewV = buildReduction(B, ScanOp, NewV, Identity);
+        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+    if (!IsComputeKernel) {
+      // First we need to set all inactive invocations to the identity value, so
+      // that they can correctly contribute to the final result.
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+      const AtomicRMWInst::BinOp ScanOp =
+          Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+      if (!NeedResult && ST->hasPermLaneX16()) {
+        // On GFX10 the permlanex16 instruction helps us build a reduction without
+        // too many readlanes and writelanes, which are generally bad for
+        // performance.
+        NewV = buildReduction(B, ScanOp, NewV, Identity);
+      } else {
+        NewV = buildScan(B, ScanOp, NewV, Identity);
+        if (NeedResult)
+          ExclScan = buildShiftRight(B, NewV, Identity);
+        // Read the value from the last lane, which has accumulated the values of
+        // each active lane in the wavefront. This will be our new value which we
+        // will provide to the atomic operation.
+        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+        assert(TyBitWidth == 32);
+        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                 {NewV, LastLaneIdx});
+      }
+      // Finally mark the readlanes in the WWM section.
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
     } else {
-      NewV = buildScan(B, ScanOp, NewV, Identity);
-      if (NeedResult)
-        ExclScan = buildShiftRight(B, NewV, Identity);
-
-      // Read the value from the last lane, which has accumulated the values of
-      // each active lane in the wavefront. This will be our new value which we
-      // will provide to the atomic operation.
-      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-      assert(TyBitWidth == 32);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                               {NewV, LastLaneIdx});
+      // Alternative implementation of the scan, iterating over the lanes.
+      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, Ballot, V, I);
     }
-
-    // Finally mark the readlanes in the WWM section.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
   } else {
     switch (Op) {
     default:
@@ -594,10 +650,19 @@
     }
   }
 
-  // We only want a single lane to enter our new control flow, and we do this
-  // by checking if there are any active lanes below us. Only one lane will
-  // have 0 active lanes below us, so that will be the only one to progress.
-  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
+
+  Value *Cond = nullptr;
+  if (ValDivergent && IsComputeKernel) {
+    // Only the first active lane will enter the new control flow to update the value.
+    CallInst *const FirstActiveLane =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, Mbcnt);
+    Cond = B.CreateICmpEQ(Mbcnt, FirstActiveLane);
+  } else {
+    // We only want a single lane to enter our new control flow, and we do this
+    // by checking if there are any active lanes below us. Only one lane will
+    // have 0 active lanes below us, so that will be the only one to progress.
+    Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
+  }
 
   // Store I's original basic block before we split the block.
   BasicBlock *const EntryBB = I.getParent();
@@ -660,8 +725,12 @@
   // from the first lane, to get our lane's index into the atomic result.
   Value *LaneOffset = nullptr;
   if (ValDivergent) {
-    LaneOffset =
-        B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+    if (!IsComputeKernel) {
+      LaneOffset =
+          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+    } else {
+      LaneOffset = ExclScan;
+    }
   } else {
     switch (Op) {
     default:
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -273,7 +273,7 @@
 static cl::opt<bool> EnableAtomicOptimizations(
   "amdgpu-atomic-optimizations",
   cl::desc("Enable atomic optimizations"),
-  cl::init(false),
+  cl::init(true),
   cl::Hidden);
 
 // Enable Mode register optimization
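
As an illustration of the lane-by-lane scan introduced by buildScanIteratively above, the following is a minimal host-side sketch, not part of the patch: it assumes an integer add reduction on a 64-wide wave, and the per-lane values and the exec/ballot mask are made up for the example. Each iteration substitutes the identity for inactive lanes, records the running partial sum as the active lane's exclusive-scan result (the conditional writelane), and then folds the lane's value into the reduction.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  constexpr unsigned WaveFrontSize = 64; // ST->isWave32() ? 32 : 64 in the pass
  constexpr int Identity = 0;            // identity for an integer add

  // Hypothetical per-lane inputs and active-lane (exec/ballot) mask.
  std::vector<int> LaneValue(WaveFrontSize, 1);
  const uint64_t Exec = 0x00000000FFFF00FFull;

  std::vector<int> ExclScan(WaveFrontSize, Identity);
  int Reduced = Identity;

  for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; ++LaneIdx) {
    const bool IsActive = (Exec >> LaneIdx) & 1;
    // Models CreateSelect(IsBitSet, readlane(V, LaneIdx), Identity).
    const int V = IsActive ? LaneValue[LaneIdx] : Identity;
    // Models the conditional writelane of PartialSumTillPreviousLane.
    if (IsActive)
      ExclScan[LaneIdx] = Reduced;
    // Models buildNonAtomicBinOp updating ReducedValue.
    Reduced += V;
  }

  std::cout << "reduction = " << Reduced << "\n";
  std::cout << "exclusive scan at lane 16 = " << ExclScan[16] << "\n";
  return 0;
}

The single atomic update is then performed by only one lane (the first active lane in the compute-kernel path above), and each lane adds its exclusive-scan value to the value returned by the atomic to recover its own result.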