Diff 117363

lib/Target/AMDGPU/SIInsertSkips.cpp

Show First 20 Lines • Show All 126 Lines • ▼ Show 20 Lines	for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();

// When a uniform loop is inside non-uniform control flow, the branch		// When a uniform loop is inside non-uniform control flow, the branch
// leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken		// leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
// when EXEC = 0. We should skip the loop lest it becomes infinite.		// when EXEC = 0. We should skip the loop lest it becomes infinite.
if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ \|\|		if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ \|\|
I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)		I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
return true;		return true;

		// V_READFIRSTLANE/V_READLANE destination register may be used as operand
		// by some SALU instruction. If exec mask is zero vector instruction
		// defining the register that is used by the scalar one is not executed
		rampitecUnsubmitted Not Done Reply Inline Actions The comment is misleading. Scalar instructions are executed even if exec = 0 (contrary to the comment). It is also unclear whether there must be a scalar instruction consuming the result of readlane, since an SGPR can also be an operand of a vector instruction. rampitec: The comment is misleading. Scalar instructions are executed even if exec = 0 (contrary to the…
		// and scalar instruction will operate on undefined data. For
		// V_READFIRSTLANE/V_READLANE we should iterate over its users and avoid
		// predicated execution if one of the users is scalar.
		if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) \|\|
		(I->getOpcode() == AMDGPU::V_READLANE_B32)) {
		const MachineOperand *Def = I->defs().begin();
		unsigned Reg = Def->getReg();
		MachineBasicBlock::const_iterator E = MBB.getFirstTerminator();
		rampitecUnsubmitted Not Done Reply Inline Actions What if a user is actually hoisted out of the block? What if it is a terminator that uses it? In both cases the users would then read undefined data. rampitec: What if a user is actually hoisted out of the block? What if it is a terminator that uses it? In both…
		rampitecUnsubmitted Not Done Reply Inline Actions It is named operand "src0", using getOperand(1) is not desirable. Also VReg is misleading, it reads like "virtual register". rampitec: It is named operand "src0", using getOperand(1) is not desirable. Also VReg is misleading, it…
		MachineBasicBlock::const_iterator II(I);
		while (++II != E) {
		if (TII->isScalarUnit(*II)) {
		rampitecUnsubmitted Not Done Reply Inline Actions Interestingly, a user can also be another v_readlane_b32's lane select operand. The subsequent v_readlane_b32 would be executed even with exec = 0 and would read undefined data. rampitec: Interestingly, a user can also be another v_readlane_b32's lane select operand.
		for (auto U : II->uses()) {
		if (U.isReg() && U.readsReg()) {
		unsigned Use = U.getReg();
		if ((Use == Reg) \|\| TRI->isSubRegister(Use, Reg))
		return true;
		}
		}
		}
		rampitecUnsubmitted Not Done Reply Inline Actions Post increment is broken here. rampitec: Post increment is broken here.
		}
		rampitecUnsubmitted Not Done Reply Inline Actions Please follow the general and surrounding style: brace on the same line with expression. rampitec: Please follow the general and surrounding style: brace on the same line with expression.
		return false;
		}

if (I->isInlineAsm()) {		if (I->isInlineAsm()) {
const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();		const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
const char *AsmStr = I->getOperand(0).getSymbolName();		const char *AsmStr = I->getOperand(0).getSymbolName();

// inlineasm length estimate is number of bytes assuming the longest		// inlineasm length estimate is number of bytes assuming the longest
// instruction.		// instruction.
uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);		uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
NumInstr += MaxAsmSize / MAI->getMaxInstLength();		NumInstr += MaxAsmSize / MAI->getMaxInstLength();
} else {		} else {
++NumInstr;		++NumInstr;
}		}

if (NumInstr >= SkipThreshold)		if (NumInstr >= SkipThreshold)
return true;		return true;
}		}
		rampitecUnsubmitted Not Done Reply Inline Actions That is broken. Remember that the register is physical. Not to mention that a whole-function scan is expensive. rampitec: That is broken. Remember that the register is physical. Not to mention that a whole-function scan is…
}		}

return false;		return false;
}		}

bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {		bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
MachineBasicBlock &MBB = *MI.getParent();		MachineBasicBlock &MBB = *MI.getParent();
MachineFunction *MF = MBB.getParent();		MachineFunction *MF = MBB.getParent();
▲ Show 20 Lines • Show All 192 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/readlane_exec0.mir

This file was added.

				# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-skips \| FileCheck %s
				rampitecUnsubmitted Not Done Reply Inline Actions Please use -check-prefix=GCN and GCN checks instead of CHECK. This seems to be our new convention. Also add -verify-machineinstrs. rampitec: Please use -check-prefix=GCN and GCN checks instead of CHECK. This seems to be our new…

				# CHECK-LABEL: bb.0
				rampitecUnsubmitted Not Done Reply Inline Actions The label in this case needs to be the function's name, so that new functions can be added to the test later. bb.0 then becomes a regular check. rampitec: The label in this case needs to be the function's name, so that new functions can be added to the test later.
				# CHECK: S_CBRANCH_EXECZ %bb.2
				rampitecUnsubmitted Not Done Reply Inline Actions You still need to add GCN-LABEL rampitec: You still need to add GCN-LABEL

				---
				name: readlane_exec0

				body: \|
				bb.0:
				successors: %bb.1, %bb.2
				liveins: %vgpr1_vgpr2:0x00000001, %vgpr2_vgpr3:0x00000003

				%vgpr4 = V_AND_B32_e32 1, %vgpr1, implicit %exec
				V_CMP_EQ_U32_e32 1, killed %vgpr4, implicit-def %vcc, implicit %exec
				%sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed %vcc, implicit-def %exec, implicit-def %scc, implicit %exec
				SI_MASK_BRANCH %bb.2, implicit %exec
				S_BRANCH %bb.1

				bb.1:

				%sgpr10 = V_READFIRSTLANE_B32 %vgpr2, implicit %exec
				%sgpr11 = V_READFIRSTLANE_B32 %vgpr3, implicit %exec
				%sgpr10 = S_LOAD_DWORD_IMM killed %sgpr10_sgpr11, 0, 0
				S_WAITCNT 127
				%vgpr0 = V_XOR_B32_e32 killed %sgpr10, killed %vgpr0, implicit %exec

				bb.2:

				%exec = S_OR_B64 %exec, killed %sgpr0_sgpr1, implicit-def %scc
				...

This is an archive of the discontinued LLVM Phabricator instance.

Avoid predicated execution of the basic blocks containing scalar instructions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 117363

lib/Target/AMDGPU/SIInsertSkips.cpp

test/CodeGen/AMDGPU/readlane_exec0.mir

This is an archive of the discontinued LLVM Phabricator instance.

Avoid predicated execution of the basic blocks containing scalar instructionsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 117363

lib/Target/AMDGPU/SIInsertSkips.cpp

test/CodeGen/AMDGPU/readlane_exec0.mir

Avoid predicated execution of the basic blocks containing scalar instructions
ClosedPublic