Change the scan algorithm to use only power-of-two shifts (1, 2, 4, 8,
16, 32) instead of starting off with shifts by 1, 2 and 3 followed by a
3-way ADD, because:
1. It simplifies the compiler a little.
2. It minimizes VGPR pressure because each instruction is now of the form
   vn = vn + vn << c.
3. It is more friendly to the DPP combiner, which currently can't combine
   into an ADD3 instruction.
Because of #2 and #3 the end result is improved from this:
  v_add_u32_dpp v4, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
  v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
  v_mov_b32_dpp v1, v3 row_shr:3 row_mask:0xf bank_mask:0xf
  v_add3_u32 v1, v4, v5, v1
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xe
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xc
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
To this:
  v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xe
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xc
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
  s_nop 1
  v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
I.e. two fewer computational instructions (8 plus 4 nops before versus
6 plus 5 nops after), at the cost of one extra nop into whose slot
something else could be scheduled.
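
For illustration, here is a minimal sketch of the two scan shapes, modelled
with plain C++ arrays rather than actual DPP operations (the names kWave,
fromLowerLane, scanPow2 and scanAdd3 are invented for this example, and the
row/bank masking details are ignored):

  #include <array>
  #include <cstddef>
  #include <cstdint>

  constexpr std::size_t kWave = 64;
  using Wave = std::array<std::uint32_t, kWave>;

  // Value held by the lane `c` positions lower, or the add identity 0 when
  // no such lane exists (a stand-in for the DPP row_shr/row_bcast movement).
  static std::uint32_t fromLowerLane(const Wave &v, std::size_t lane,
                                     std::size_t c) {
    return lane >= c ? v[lane - c] : 0;
  }

  // New shape: power-of-two offsets only; every step has the form
  // v = v + (v shifted by c), so the single running register is updated
  // in place and no extra temporaries are needed.
  Wave scanPow2(Wave v) {
    for (std::size_t c = 1; c < kWave; c *= 2) {
      Wave next;
      for (std::size_t lane = 0; lane < kWave; ++lane)
        next[lane] = v[lane] + fromLowerLane(v, lane, c);
      v = next;
    }
    return v;
  }

  // Old shape: combine offsets 1, 2 and 3 with a 3-way add first (which
  // needs two extra temporaries), then continue with offsets 4, 8, 16, 32.
  Wave scanAdd3(Wave v) {
    Wave next;
    for (std::size_t lane = 0; lane < kWave; ++lane)
      next[lane] = v[lane] + fromLowerLane(v, lane, 1) +
                   fromLowerLane(v, lane, 2) + fromLowerLane(v, lane, 3);
    v = next;
    for (std::size_t c = 4; c < kWave; c *= 2) {
      for (std::size_t lane = 0; lane < kWave; ++lane)
        next[lane] = v[lane] + fromLowerLane(v, lane, c);
      v = next;
    }
    return v;
  }

Both functions compute the same wavefront-wide inclusive prefix sum; only the
shape of the steps differs, which is what affects register pressure and what
the DPP combiner can fold.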