diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -48,6 +48,8 @@
   const GCNSubtarget *ST;
   bool IsPixelShader;
 
+  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+                        Value *const Identity) const;
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
@@ -279,6 +281,46 @@
   return B.CreateSelect(Cond, LHS, RHS);
 }
 
+// Use the builder to create a reduction of V across the wavefront, with all
+// lanes active, returning the same result in all lanes.
+Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
+                                             AtomicRMWInst::BinOp Op, Value *V,
+                                             Value *const Identity) const {
+  Type *const Ty = V->getType();
+  Module *M = B.GetInsertBlock()->getModule();
+  Function *UpdateDPP =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+
+  // Reduce within each row of 16 lanes.
+  for (unsigned Idx = 0; Idx < 4; Idx++) {
+    V = buildNonAtomicBinOp(
+        B, Op, V,
+        B.CreateCall(UpdateDPP,
+                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+  }
+
+  // Reduce within each pair of rows (i.e. 32 lanes).
+  assert(ST->hasPermLaneX16());
+  V = buildNonAtomicBinOp(
+      B, Op, V,
+      B.CreateIntrinsic(
+          Intrinsic::amdgcn_permlanex16, {},
+          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
+
+  if (!ST->isWave32()) {
+    // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
+    // combine them with a scalar operation.
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+    Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+    Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+    V = buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+  }
+
+  return V;
+}
+
 // Use the builder to create an inclusive scan of V across the wavefront, with
 // all lanes active.
 Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
@@ -489,16 +531,24 @@
   const AtomicRMWInst::BinOp ScanOp =
       Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
-  NewV = buildScan(B, ScanOp, NewV, Identity);
-  if (NeedResult)
-    ExclScan = buildShiftRight(B, NewV, Identity);
-
-  // Read the value from the last lane, which has accumlated the values of
-  // each active lane in the wavefront. This will be our new value which we
-  // will provide to the atomic operation.
-  Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-  assert(TyBitWidth == 32);
-  NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {NewV, LastLaneIdx});
+  if (!NeedResult && ST->hasPermLaneX16()) {
+    // On GFX10 the permlanex16 instruction helps us build a reduction without
+    // too many readlanes and writelanes, which are generally bad for
+    // performance.
+    NewV = buildReduction(B, ScanOp, NewV, Identity);
+  } else {
+    NewV = buildScan(B, ScanOp, NewV, Identity);
+    if (NeedResult)
+      ExclScan = buildShiftRight(B, NewV, Identity);
+
+    // Read the value from the last lane, which has accumulated the values of
+    // each active lane in the wavefront. This will be our new value which we
+    // will provide to the atomic operation.
+    Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+    assert(TyBitWidth == 32);
+    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                             {NewV, LastLaneIdx});
+  }
 
   // Finally mark the readlanes in the WWM section.
   NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
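Reviewer note: below is a minimal standalone C++ model of the network buildReduction emits, under the assumption that all 64 lanes are active with real data (the pass seeds inactive lanes with the identity before the WWM section, so this holds by construction). The four row_xmask steps form a xor butterfly that leaves every lane of a 16-lane row holding the row total; permlanex16 then folds in the opposite row of each 32-lane pair, modeled here as `lane ^ 16`, which is equivalent at that point because all lanes of a row already agree; on wave64 the two remaining half-wave totals are combined with one scalar op, mirroring the readlane pair above. This is an illustrative sketch, not LLVM code.

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Lane-level model of buildReduction for a wave64 integer add.
static uint32_t reduceWave64(std::array<uint32_t, 64> V) {
  // Rows of 16: row_xmask:1/2/4/8 combines lane i with lane i ^ stride.
  // After the four steps every lane holds the total of its 16-lane row.
  for (unsigned Idx = 0; Idx < 4; ++Idx) {
    std::array<uint32_t, 64> Prev = V;
    for (unsigned Lane = 0; Lane < 64; ++Lane)
      V[Lane] = Prev[Lane] + Prev[Lane ^ (1u << Idx)];
  }
  // permlanex16: read from the opposite row of the 32-lane pair. Any lane of
  // that row would do, since rows are uniform by now; lane ^ 16 is used here.
  std::array<uint32_t, 64> Prev = V;
  for (unsigned Lane = 0; Lane < 64; ++Lane)
    V[Lane] = Prev[Lane] + Prev[Lane ^ 16];
  // Wave64 only: lanes 0..31 and 32..63 hold independent half-wave totals;
  // combine one lane of each, as the readlane/readlane/scalar-op pair does.
  // (On wave32 the previous step already produced the full total.)
  return V[0] + V[32];
}

int main() {
  std::array<uint32_t, 64> V;
  for (unsigned Lane = 0; Lane < 64; ++Lane)
    V[Lane] = Lane;
  assert(reduceWave64(V) == 63 * 64 / 2); // 2016: sum over all 64 lanes
}
```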
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -810,6 +810,9 @@
     return GFX8Insts;
   }
 
+  /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
+  bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
+
   bool hasDPP() const {
     return HasDPP;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -707,8 +707,10 @@
   DPP_UNUSED8_LAST = 0x14F,
   ROW_NEWBCAST_FIRST= 0x150,
   ROW_NEWBCAST_LAST = 0x15F,
+  ROW_SHARE0 = 0x150,
   ROW_SHARE_FIRST = 0x150,
   ROW_SHARE_LAST = 0x15F,
+  ROW_XMASK0 = 0x160,
   ROW_XMASK_FIRST = 0x160,
   ROW_XMASK_LAST = 0x16F,
   DPP_LAST = ROW_XMASK_LAST
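Reviewer note: as a quick cross-check of the new enumerator against the assembly below, the buildReduction loop computes `DPP::ROW_XMASK0 | 1 << Idx`, which should yield exactly the row_xmask:1/2/4/8 controls appearing in the updated checks. A standalone sketch (the 0x160 value is copied from the hunk above):

```cpp
#include <cstdio>

int main() {
  const unsigned ROW_XMASK0 = 0x160; // from SIDefines.h above
  for (unsigned Idx = 0; Idx < 4; ++Idx)
    std::printf("row_xmask:%-2u -> dpp_ctrl 0x%X\n", 1u << Idx,
                ROW_XMASK0 | 1u << Idx);
  // Prints dpp_ctrl 0x161, 0x162, 0x164, 0x168.
  return 0;
}
```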
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -663,23 +663,21 @@
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_not_b64 exec, exec
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s2, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
+; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
 ; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
-; GFX1064-NEXT:    s_mov_b32 s0, s2
+; GFX1064-NEXT:    s_add_i32 s0, s2, s3
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB3_2
@@ -701,26 +699,24 @@
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s0, s1
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB3_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var32@abs32@lo
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_add_u32 v0, v3
+; GFX1032-NEXT:    ds_add_u32 v3, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB3_2:
@@ -1862,23 +1858,21 @@
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_not_b64 exec, exec
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s2, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
+; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
 ; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
-; GFX1064-NEXT:    s_mov_b32 s0, s2
+; GFX1064-NEXT:    s_add_i32 s0, s2, s3
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB10_2
@@ -1900,26 +1894,24 @@
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s1, v1, 31
+; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s0, s1
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB10_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var32@abs32@lo
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_sub_u32 v0, v3
+; GFX1032-NEXT:    ds_sub_u32 v3, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB10_2:
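Reviewer note: to illustrate at the source level when the new path can fire, here is a hypothetical HIP sketch (kernel and variable names are invented for the example). The pass gates on NeedResult, i.e. whether the atomicrmw result has uses, and on hasPermLaneX16; only a dead result makes the reduction-only path legal, since a live result still requires the per-lane exclusive scan.

```cpp
#include <hip/hip_runtime.h>

// Old value discarded: NeedResult is false, so on GFX10 the optimizer can use
// buildReduction for this wavefront of atomics on a uniform LDS address.
__global__ void countMatches(const unsigned *in, unsigned *out) {
  __shared__ unsigned count;
  if (threadIdx.x == 0)
    count = 0;
  __syncthreads();
  if (in[blockIdx.x * blockDim.x + threadIdx.x] != 0)
    atomicAdd(&count, 1u); // result unused
  __syncthreads();
  if (threadIdx.x == 0)
    out[blockIdx.x] = count;
}

// Old value consumed per lane: NeedResult is true, so the pass still builds
// the exclusive scan (buildScan + buildShiftRight) to reconstruct each lane's
// result, and the reduction-only path is skipped.
__global__ void reserveSlots(unsigned *slots, unsigned *cursor) {
  unsigned mySlot = atomicAdd(cursor, 1u);
  slots[blockIdx.x * blockDim.x + threadIdx.x] = mySlot;
}
```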