Index: lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -30,6 +30,7 @@
 enum DPP_CTRL {
   DPP_ROW_SR1 = 0x111,
   DPP_ROW_SR2 = 0x112,
+  DPP_ROW_SR3 = 0x113,
   DPP_ROW_SR4 = 0x114,
   DPP_ROW_SR8 = 0x118,
   DPP_WF_SR1 = 0x138,
@@ -279,44 +280,44 @@
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
+    Value *const Identity = B.getIntN(TyBitWidth, 0);
+
     // First we need to set all inactive invocations to 0, so that they can
     // correctly contribute to the final result.
-    CallInst *const SetInactive = B.CreateIntrinsic(
-        Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
+    CallInst *const SetInactive =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     setConvergent(SetInactive);
-    NewV = SetInactive;
 
-    const unsigned Iters = 6;
-    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
-                                     DPP_ROW_SR4, DPP_ROW_SR8,
-                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
-    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
+    CallInst *const FirstDPP =
+        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
+                          {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
+                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
+    setConvergent(FirstDPP);
+    NewV = FirstDPP;
+
+    const unsigned Iters = 7;
+    const unsigned DPPCtrl[Iters] = {
+        DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
+        DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
+    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
+    const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
 
     // This loop performs an inclusive scan across the wavefront, with all lanes
     // active (by using the WWM intrinsic).
     for (unsigned Idx = 0; Idx < Iters; Idx++) {
-      CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
-                                              {NewV, B.getInt32(DPPCtrl[Idx]),
-                                               B.getInt32(RowMask[Idx]),
-                                               B.getInt32(0xf), B.getFalse()});
+      Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV;
+      CallInst *const DPP = B.CreateIntrinsic(
+          Intrinsic::amdgcn_update_dpp, Ty,
+          {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
+           B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
       setConvergent(DPP);
 
-      Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
-      NewV = B.CreateBinOp(Op, NewV, WWM);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+      NewV = B.CreateBinOp(Op, NewV, DPP);
     }
 
-    // NewV has returned the inclusive scan of V, but for the lane offset we
-    // require an exclusive scan. We do this by shifting the values from the
-    // entire wavefront right by 1, and by setting the bound_ctrl (last argument
-    // to the intrinsic below) to true, we can guarantee that 0 will be shifted
-    // into the 0'th invocation.
-    CallInst *const DPP =
-        B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
-                          {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
-                           B.getInt32(0xf), B.getTrue()});
-    setConvergent(DPP);
-    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty,
+                             B.CreateBinOp(Op, NewV, SetInactive));
 
     // Read the value from the last lane, which has accumlated the values of
     // each active lane in the wavefront. This will be our new value with which
Index: test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -44,6 +44,14 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -104,6 +112,14 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
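
Note on the new scan sequence (illustrative only, not part of the patch): the wave_shr:1 DPP now runs before the row_shr/row_bcast steps, so the inclusive combine inside the loop over the shifted values yields the exclusive scan directly and the trailing mov_dpp of the old code is no longer needed. The sketch below is a minimal host-side C++ model of that sequence, assuming a 64-lane wavefront, integer add, all lanes active, and an identity of 0. The helper names (dppWaveShr1, dppRowShr, dppRowBcast) are hypothetical, and the DPP semantics (row_mask/bank_mask gating, bound_ctrl off keeping the update_dpp "old" value) are my paraphrase of the GCN ISA rather than anything taken from this patch. It checks the result against a plain exclusive prefix sum.

// Host-side model of the exclusive-scan DPP sequence built by the pass.
// Hypothetical helpers; DPP behavior paraphrased from the GCN ISA.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

using Wave = std::array<uint32_t, 64>;

// row_shr:N with bound_ctrl off: lane i reads lane i-N of its 16-lane row;
// lanes with no valid source, or masked off by row_mask/bank_mask, keep the
// update_dpp "old" operand, which the pass sets to the identity (0).
static Wave dppRowShr(const Wave &Src, unsigned N, unsigned RowMask,
                      unsigned BankMask) {
  Wave Dst;
  for (unsigned I = 0; I < 64; ++I) {
    unsigned Row = I / 16, Bank = (I % 16) / 4;
    bool Enabled = ((RowMask >> Row) & 1) && ((BankMask >> Bank) & 1);
    bool SrcValid = (I % 16) >= N;
    Dst[I] = (Enabled && SrcValid) ? Src[I - N] : 0;
  }
  return Dst;
}

// wave_shr:1: shift the whole wavefront right by one lane, with the identity
// flowing into lane 0.
static Wave dppWaveShr1(const Wave &Src) {
  Wave Dst;
  for (unsigned I = 0; I < 64; ++I)
    Dst[I] = I >= 1 ? Src[I - 1] : 0;
  return Dst;
}

// row_bcast:15 broadcasts the last lane of each row into the following row;
// row_bcast:31 broadcasts lane 31 into the upper 32 lanes. Rows cleared in
// RowMask keep the identity.
static Wave dppRowBcast(const Wave &Src, unsigned Width, unsigned RowMask) {
  Wave Dst;
  for (unsigned I = 0; I < 64; ++I) {
    unsigned Row = I / 16;
    bool Enabled = (RowMask >> Row) & 1;
    bool SrcValid = Width == 15 ? Row >= 1 : I >= 32;
    unsigned SrcLane = Width == 15 ? Row * 16 - 1 : 31;
    Dst[I] = (Enabled && SrcValid) ? Src[SrcLane] : 0;
  }
  return Dst;
}

int main() {
  Wave V;
  for (unsigned I = 0; I < 64; ++I)
    V[I] = I + 1;

  // wave_shr:1 first, so the inclusive combine below becomes an exclusive scan.
  Wave First = dppWaveShr1(V);
  Wave Acc = First;

  // row_shr:{1,2,3,4,8} with the patch's row/bank masks; the first three steps
  // read the shifted value, the rest the accumulator ("Idx < 3 ? FirstDPP : NewV").
  const unsigned Shift[5] = {1, 2, 3, 4, 8};
  const unsigned Bank[5] = {0xf, 0xf, 0xf, 0xe, 0xc};
  for (unsigned Idx = 0; Idx < 5; ++Idx) {
    Wave D = dppRowShr(Idx < 3 ? First : Acc, Shift[Idx], 0xf, Bank[Idx]);
    for (unsigned I = 0; I < 64; ++I)
      Acc[I] += D[I];
  }

  // Cross-row combine: row_bcast:15 (row_mask 0xa), then row_bcast:31 (0xc).
  Wave B15 = dppRowBcast(Acc, 15, 0xa);
  for (unsigned I = 0; I < 64; ++I)
    Acc[I] += B15[I];
  Wave B31 = dppRowBcast(Acc, 31, 0xc);
  for (unsigned I = 0; I < 64; ++I)
    Acc[I] += B31[I];

  // Acc now matches a plain exclusive prefix sum of V.
  uint32_t Expect = 0;
  for (unsigned I = 0; I < 64; ++I) {
    assert(Acc[I] == Expect);
    Expect += V[I];
  }
  std::printf("lane 63: exclusive %u, inclusive %u\n", Acc[63], Acc[63] + V[63]);
  return 0;
}

With inputs 1..64 the assertion passes: lane 63 holds the sum of lanes 0..62, and adding the lane's own value back (the new B.CreateBinOp(Op, NewV, SetInactive) in the patch) gives the inclusive total that the v_readlane_b32 ..., 63 checked in the tests reads.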