This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Translate s_and/s_andn2 to s_mov in vcc optimisation
ClosedPublic

Authored by critson on Jul 14 2020, 10:49 PM.

Download Raw Diff

Details

Reviewers

rampitec
arsenm

Commits

rG3a1866574834: [AMDGPU] Translate s_and/s_andn2 to s_mov in vcc optimisation

Summary

When SCC is dead but VCC is still required, replace s_and / s_andn2
with an s_mov into VCC when the mask value is 0 or -1.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

critson created this revision. Jul 14 2020, 10:49 PM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 14 2020, 10:49 PM

Herald added subscribers: llvm-commits, kerbowa, hiraditya and 8 others. · View Herald Transcript

Harbormaster failed remote builds in B64290: Diff 278073! Jul 14 2020, 11:37 PM

arsenm added inline comments. Jul 15 2020, 7:19 AM

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
140–153	Can we do this earlier? Removing the SCC def earlier would be more useful

It seems wave32 failure is real.

Add missing test diffs.

critson marked an inline comment as done. Jul 16 2020, 6:54 AM

critson added inline comments.

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
140–153	To the best of my understanding, the earliest point at which this optimisation becomes available is after "Branch Probability Basic Block Placement", which is not much earlier.

Harbormaster completed remote builds in B64506: Diff 278453. Jul 16 2020, 7:15 AM

LGTM

This revision is now accepted and ready to land. Jul 16 2020, 8:28 AM

Closed by commit rG3a1866574834: [AMDGPU] Translate s_and/s_andn2 to s_mov in vcc optimisation (authored by critson). · Explain Why · Jul 16 2020, 7:49 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIPreEmitPeephole.cpp

16 lines

test/

CodeGen/

AMDGPU/

infinite-loop.ll

2 lines

insert-skip-from-vcc.mir

120 lines

wave32.ll

4 lines

Diff 278453

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Show First 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
bool Changed = false;		bool Changed = false;
MachineBasicBlock &MBB = *MI.getParent();		MachineBasicBlock &MBB = *MI.getParent();
const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();		const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
const bool IsWave32 = ST.isWave32();		const bool IsWave32 = ST.isWave32();
const unsigned CondReg = TRI->getVCC();		const unsigned CondReg = TRI->getVCC();
const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;		const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;		const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;		const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
		const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),		MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
E = MBB.rend();		E = MBB.rend();
bool ReadsCond = false;		bool ReadsCond = false;
unsigned Threshold = 5;		unsigned Threshold = 5;
for (++A; A != E; ++A) {		for (++A; A != E; ++A) {
if (!--Threshold)		if (!--Threshold)
return false;		return false;
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	if (Op2.isReg()) {
llvm_unreachable("Op2 must be register or immediate");		llvm_unreachable("Op2 must be register or immediate");
}		}

// Invert mask for s_andn2		// Invert mask for s_andn2
assert(MaskValue == 0 \|\| MaskValue == -1);		assert(MaskValue == 0 \|\| MaskValue == -1);
if (A->getOpcode() == AndN2)		if (A->getOpcode() == AndN2)
MaskValue = ~MaskValue;		MaskValue = ~MaskValue;

if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&		if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
MI.killsRegister(CondReg, TRI))		if (!MI.killsRegister(CondReg, TRI)) {
		// Replace AND with MOV
		if (MaskValue == 0) {
		BuildMI(A->getParent(), A, A->getDebugLoc(), TII->get(Mov), CondReg)
		.addImm(0);
		} else {
		BuildMI(A->getParent(), A, A->getDebugLoc(), TII->get(Mov), CondReg)
		.addReg(ExecReg);
		}
		}
		// Remove AND instruction
A->eraseFromParent();		A->eraseFromParent();
		}
		arsenmUnsubmitted Not Done Reply Inline Actions Can we do this earlier? Removing the SCC def earlier would be more useful arsenm: Can we do this earlier? Removing the SCC def earlier would be more useful
		critsonAuthorUnsubmitted Done Reply Inline Actions To the best of my understanding, the earliest this optimisation becomes available is after "Branch Probability Basic Block Placement". Which is not much earlier. critson: To the best of my understanding, the earliest this optimisation becomes available is after…

bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;		bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
if (SReg == ExecReg) {		if (SReg == ExecReg) {
// EXEC is updated directly		// EXEC is updated directly
if (IsVCCZ) {		if (IsVCCZ) {
MI.eraseFromParent();		MI.eraseFromParent();
return true;		return true;
}		}
▲ Show 20 Lines • Show All 177 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/infinite-loop.ll

	Show First 20 Lines • Show All 152 Lines • ▼ Show 20 Lines
	; SI-NEXT: v_mov_b32_e32 v0, 0x3e7			; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
	; SI-NEXT: s_waitcnt lgkmcnt(0)			; SI-NEXT: s_waitcnt lgkmcnt(0)
	; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0			; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
	; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]			; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
	; SI-NEXT: s_cbranch_execnz BB3_3			; SI-NEXT: s_cbranch_execnz BB3_3
	; SI-NEXT: ; %bb.4: ; %loop.exit.guard			; SI-NEXT: ; %bb.4: ; %loop.exit.guard
	; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1			; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1
	; SI-NEXT: s_or_b64 exec, exec, s[2:3]			; SI-NEXT: s_or_b64 exec, exec, s[2:3]
	; SI-NEXT: s_and_b64 vcc, exec, 0			; SI-NEXT: s_mov_b64 vcc, 0
	; SI-NEXT: s_branch BB3_2			; SI-NEXT: s_branch BB3_2
	; SI-NEXT: BB3_5: ; %UnifiedReturnBlock			; SI-NEXT: BB3_5: ; %UnifiedReturnBlock
	; SI-NEXT: s_endpgm			; SI-NEXT: s_endpgm
	; IR-LABEL: @infinite_loop_nest_ret(			; IR-LABEL: @infinite_loop_nest_ret(
	; IR-NEXT: entry:			; IR-NEXT: entry:
	; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()			; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
	; IR-NEXT: [[COND1:%.*]] = icmp eq i32 [[TMP]], 1			; IR-NEXT: [[COND1:%.*]] = icmp eq i32 [[TMP]], 1
	; IR-NEXT: br i1 [[COND1]], label [[OUTER_LOOP:%.]], label [[UNIFIEDRETURNBLOCK:%.]]			; IR-NEXT: br i1 [[COND1]], label [[OUTER_LOOP:%.]], label [[UNIFIEDRETURNBLOCK:%.]]
	Show All 30 Lines

llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir

Show First 20 Lines • Show All 409 Lines • ▼ Show 20 Lines	bb.1:
S_NOP 0		S_NOP 0

bb.2:		bb.2:
$sgpr0_sgpr1 = S_MOV_B64 -1		$sgpr0_sgpr1 = S_MOV_B64 -1
$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc		$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc		S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
S_ENDPGM 0		S_ENDPGM 0
...		...
		---
		# GCN-LABEL: name: and_0_mov
		# GCN: bb.2:
		# GCN-NOT: S_AND
		# GCN: $vcc = S_MOV_B64 0
		# GCN-NEXT: S_BRANCH %bb.1
		name: and_0_mov
		body: \|
		bb.0:
		S_NOP 0

		bb.1:
		S_NOP 0

		bb.2:
		$sgpr0_sgpr1 = S_MOV_B64 0
		$vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
		S_CBRANCH_VCCZ %bb.1, implicit $vcc
		S_ENDPGM 0
		...
		---
		# GCN-LABEL: name: andn2_m1_mov
		# GCN: bb.2:
		# GCN-NOT: S_ANDN2
		# GCN: $vcc = S_MOV_B64 0
		# GCN-NEXT: S_BRANCH %bb.1
		name: andn2_m1_mov
		body: \|
		bb.0:
		S_NOP 0

		bb.1:
		S_NOP 0

		bb.2:
		$sgpr0_sgpr1 = S_MOV_B64 -1
		$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
		S_CBRANCH_VCCZ %bb.1, implicit $vcc
		S_ENDPGM 0
		...
		---
		# GCN-LABEL: name: and_m1_mov
		# GCN: bb.2:
		# GCN-NOT: S_AND
		# GCN: $vcc = S_MOV_B64 $exec
		# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
		name: and_m1_mov
		body: \|
		bb.0:
		S_NOP 0

		bb.1:
		S_NOP 0

		bb.2:
		$sgpr0_sgpr1 = S_MOV_B64 -1
		$vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
		S_CBRANCH_VCCZ %bb.1, implicit $vcc
		S_ENDPGM 0
		...
		---
		# GCN-LABEL: name: andn2_0_mov
		# GCN: bb.2:
		# GCN-NOT: S_ANDN2
		# GCN: $vcc = S_MOV_B64 $exec
		# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
		name: andn2_0_mov
		body: \|
		bb.0:
		S_NOP 0

		bb.1:
		S_NOP 0

		bb.2:
		$sgpr0_sgpr1 = S_MOV_B64 0
		$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
		S_CBRANCH_VCCZ %bb.1, implicit $vcc
		S_ENDPGM 0
		...
		---
		# GCN-LABEL: name: and_0_scc_req
		# GCN: bb.2:
		# GCN-NOT: S_MOV_
		# GCN: S_AND_
		# GCN-NEXT: S_BRANCH %bb.1
		name: and_0_scc_req
		body: \|
		bb.0:
		S_NOP 0

		bb.1:
		S_NOP 0

		bb.2:
		$sgpr0_sgpr1 = S_MOV_B64 0
		$vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
		S_CBRANCH_VCCZ %bb.1, implicit $vcc
		S_ENDPGM 0
		...
		---
		# GCN-LABEL: name: andn2_m1_scc_req
		# GCN: bb.2:
		# GCN-NOT: S_MOV_
		# GCN: S_ANDN2_
		# GCN-NEXT: S_BRANCH %bb.1
		name: andn2_m1_scc_req
		body: \|
		bb.0:
		S_NOP 0

		bb.1:
		S_NOP 0

		bb.2:
		$sgpr0_sgpr1 = S_MOV_B64 -1
		$vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
		S_CBRANCH_VCCZ %bb.1, implicit $vcc
		S_ENDPGM 0
		...

llvm/test/CodeGen/AMDGPU/wave32.ll

	Show First 20 Lines • Show All 830 Lines • ▼ Show 20 Lines
	define amdgpu_ps void @test_wqm_vote(float %a) {			define amdgpu_ps void @test_wqm_vote(float %a) {
	%c1 = fcmp une float %a, 0.0			%c1 = fcmp une float %a, 0.0
	%c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)			%c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
	call void @llvm.amdgcn.kill(i1 %c2)			call void @llvm.amdgcn.kill(i1 %c2)
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}test_branch_true:			; GCN-LABEL: {{^}}test_branch_true:
	; GFX1032: s_and_b32 vcc_lo, exec_lo, -1			; GFX1032: s_mov_b32 vcc_lo, exec_lo
	; GFX1064: s_and_b64 vcc, exec, -1			; GFX1064: s_mov_b64 vcc, exec
	define amdgpu_kernel void @test_branch_true() #2 {			define amdgpu_kernel void @test_branch_true() #2 {
	entry:			entry:
	br i1 true, label %for.end, label %for.body.lr.ph			br i1 true, label %for.end, label %for.body.lr.ph

	for.body.lr.ph: ; preds = %entry			for.body.lr.ph: ; preds = %entry
	br label %for.body			br label %for.body

	for.body: ; preds = %for.body, %for.body.lr.ph			for.body: ; preds = %for.body, %for.body.lr.ph
	▲ Show 20 Lines • Show All 280 Lines • Show Last 20 Lines