Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -376,26 +376,24 @@ CallInst *const SetInactive = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - CallInst *const FirstDPP = + ExclScan = B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty, {Identity, SetInactive, B.getInt32(DPP_WF_SR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); - ExclScan = FirstDPP; - const unsigned Iters = 7; - const unsigned DPPCtrl[Iters] = { - DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4, - DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31}; - const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; - const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf}; + const unsigned Iters = 6; + const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2, + DPP_ROW_SR4, DPP_ROW_SR8, + DPP_ROW_BCAST15, DPP_ROW_BCAST31}; + const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; + const unsigned BankMask[Iters] = {0xf, 0xf, 0xe, 0xc, 0xf, 0xf}; // This loop performs an exclusive scan across the wavefront, with all lanes // active (by using the WWM intrinsic). for (unsigned Idx = 0; Idx < Iters; Idx++) { - Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan; CallInst *const DPP = B.CreateIntrinsic( Intrinsic::amdgcn_update_dpp, Ty, - {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]), + {Identity, ExclScan, B.getInt32(DPPCtrl[Idx]), B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()}); ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP); Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -47,7 +47,6 @@ ; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf @@ -115,7 +114,6 @@ ; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf