This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Don't sometimes allow instructions before lowered si_end_cf
ClosedPublic

Authored by arsenm on Sep 11 2020, 12:33 PM.

Download Raw Diff

Details

Reviewers

rampitec
nhaehnle
alex-t
critson
foad

Summary

Since 6524a7a2b9ca072bd7f7b4355d1230e70c679d2f, lowering si_end_cf would
sometimes not emit the S_OR into exec at the beginning of the block, where it
really has to be. If there is an instruction that defines one of the source
operands, split the block and turn the lowered si_end_cf into a terminator.

This avoids regressions when regalloc fast is switched to inserting
reloads at the beginning of the block, instead of spills at the end of
the block.

In a future change, this should always split the block.

Diff Detail

Event Timeline

arsenm created this revision.Sep 11 2020, 12:33 PM

Herald added a project: Restricted Project. · View Herald TranscriptSep 11 2020, 12:33 PM

Herald added subscribers: kerbowa, hiraditya, t-tye and 5 others. · View Herald Transcript

arsenm requested review of this revision.Sep 11 2020, 12:33 PM

Herald added a subscriber: wdng. · View Herald TranscriptSep 11 2020, 12:33 PM

arsenm added a child revision: D87543: AMDGPU: Always split si_end_cf blocks.Sep 11 2020, 12:35 PM

critson added inline comments.Sep 13 2020, 12:36 AM

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
496	I wonder if we can place splitBlock somewhere that the code can be shared between multiple passes? Feels like we end up needing to do this in a few places.

arsenm added a child revision: D52010: RegAllocFast: Rewrite and improve.Sep 15 2020, 7:04 AM

arsenm added a child revision: D87760: CodeGen: Move split block utility to MachineBasicBlock.Sep 16 2020, 7:03 AM

arsenm added inline comments.

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
496	Moved this in D87760

ping

rampitec accepted this revision.Sep 18 2020, 10:30 AM

This revision is now accepted and ready to land.Sep 18 2020, 10:30 AM

0576f436e577cede25810729aef236ec8c649446

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIInstrInfo.cpp

7 lines

SIInstructions.td

1 line

SILowerControlFlow.cpp

114 lines

SIOptimizeExecMasking.cpp

6 lines

test/

CodeGen/

AMDGPU/

lower-control-flow-other-terminators.mir

6 lines

si-lower-control-flow.mir

328 lines

Diff 291311

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 1,653 Lines • ▼ Show 20 Lines	case AMDGPU::S_XOR_B64_term:
MI.setDesc(get(AMDGPU::S_XOR_B64));		MI.setDesc(get(AMDGPU::S_XOR_B64));
break;		break;

case AMDGPU::S_XOR_B32_term:		case AMDGPU::S_XOR_B32_term:
// This is only a terminator to get the correct spill code placement during		// This is only a terminator to get the correct spill code placement during
// register allocation.		// register allocation.
MI.setDesc(get(AMDGPU::S_XOR_B32));		MI.setDesc(get(AMDGPU::S_XOR_B32));
break;		break;
		case AMDGPU::S_OR_B64_term:
		// This is only a terminator to get the correct spill code placement during
		// register allocation.
		MI.setDesc(get(AMDGPU::S_OR_B64));
		break;
case AMDGPU::S_OR_B32_term:		case AMDGPU::S_OR_B32_term:
// This is only a terminator to get the correct spill code placement during		// This is only a terminator to get the correct spill code placement during
// register allocation.		// register allocation.
MI.setDesc(get(AMDGPU::S_OR_B32));		MI.setDesc(get(AMDGPU::S_OR_B32));
break;		break;

case AMDGPU::S_ANDN2_B64_term:		case AMDGPU::S_ANDN2_B64_term:
// This is only a terminator to get the correct spill code placement during		// This is only a terminator to get the correct spill code placement during
▲ Show 20 Lines • Show All 560 Lines • ▼ Show 20 Lines	bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
// Skip over the instructions that are artificially terminators for special		// Skip over the instructions that are artificially terminators for special
// exec management.		// exec management.
while (I != E && !I->isBranch() && !I->isReturn() &&		while (I != E && !I->isBranch() && !I->isReturn() &&
I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {		I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
switch (I->getOpcode()) {		switch (I->getOpcode()) {
case AMDGPU::SI_MASK_BRANCH:		case AMDGPU::SI_MASK_BRANCH:
case AMDGPU::S_MOV_B64_term:		case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_XOR_B64_term:		case AMDGPU::S_XOR_B64_term:
		case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:		case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_MOV_B32_term:		case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:		case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:		case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:		case AMDGPU::S_ANDN2_B32_term:
break;		break;
case AMDGPU::SI_IF:		case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:		case AMDGPU::SI_ELSE:
▲ Show 20 Lines • Show All 5,030 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 258 Lines • ▼ Show 20 Lines	class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
let UseNamedOperandTable = base_inst.UseNamedOperandTable;		let UseNamedOperandTable = base_inst.UseNamedOperandTable;
let CodeSize = base_inst.CodeSize;		let CodeSize = base_inst.CodeSize;
let SchedRW = base_inst.SchedRW;		let SchedRW = base_inst.SchedRW;
}		}

let WaveSizePredicate = isWave64 in {		let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;		def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;		def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
		def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;		def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
}		}

let WaveSizePredicate = isWave32 in {		let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;		def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;		def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;		def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;		def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
▲ Show 20 Lines • Show All 2,206 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

Show First 20 Lines • Show All 93 Lines • ▼ Show 20 Lines	private:
const TargetRegisterClass *BoolRC = nullptr;		const TargetRegisterClass *BoolRC = nullptr;
bool InsertKillCleanups;		bool InsertKillCleanups;
unsigned AndOpc;		unsigned AndOpc;
unsigned OrOpc;		unsigned OrOpc;
unsigned XorOpc;		unsigned XorOpc;
unsigned MovTermOpc;		unsigned MovTermOpc;
unsigned Andn2TermOpc;		unsigned Andn2TermOpc;
unsigned XorTermrOpc;		unsigned XorTermrOpc;
		unsigned OrTermrOpc;
unsigned OrSaveExecOpc;		unsigned OrSaveExecOpc;
unsigned Exec;		unsigned Exec;

void emitIf(MachineInstr &MI);		void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);		void emitElse(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);		void emitIfBreak(MachineInstr &MI);
void emitLoop(MachineInstr &MI);		void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
		MachineBasicBlock *splitBlock(MachineInstr &MI, MachineBasicBlock *BB,
		LiveIntervals *LIS);
		MachineBasicBlock *emitEndCf(MachineInstr &MI);

void findMaskOperands(MachineInstr &MI, unsigned OpNo,		void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;		SmallVectorImpl<MachineOperand> &Src) const;

void combineMasks(MachineInstr &MI);		void combineMasks(MachineInstr &MI);

bool removeMBBifRedundant(MachineBasicBlock &MBB);		bool removeMBBifRedundant(MachineBasicBlock &MBB);

void process(MachineInstr &MI);		MachineBasicBlock *process(MachineInstr &MI);

// Skip to the next instruction, ignoring debug instructions, and trivial		// Skip to the next instruction, ignoring debug instructions, and trivial
// block boundaries (blocks that have one (typically fallthrough) successor,		// block boundaries (blocks that have one (typically fallthrough) successor,
// and the successor has one predecessor.		// and the successor has one predecessor.
MachineBasicBlock::iterator		MachineBasicBlock::iterator
skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,		skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
MachineBasicBlock::iterator It) const;		MachineBasicBlock::iterator It) const;

▲ Show 20 Lines • Show All 357 Lines • ▼ Show 20 Lines	do {
// If there is one trivial successor, advance to the next block.		// If there is one trivial successor, advance to the next block.
MachineBasicBlock *Succ = *B->succ_begin();		MachineBasicBlock *Succ = *B->succ_begin();

It = Succ->begin();		It = Succ->begin();
B = Succ;		B = Succ;
} while (true);		} while (true);
}		}

void SILowerControlFlow::emitEndCf(MachineInstr &MI) {		MachineBasicBlock *SILowerControlFlow::splitBlock(MachineInstr &MI,
		[Inline comment, critson] I wonder if we can place splitBlock somewhere that the code can be shared between multiple passes? Feels like we end up needing to do this in a few places.
		[Inline reply, arsenm (author), marked done] Moved this in D87760
		MachineBasicBlock *BB,
		LiveIntervals *LIS) {
		MachineBasicBlock::iterator SplitPoint(&MI);
		++SplitPoint;

		if (SplitPoint == BB->end()) {
		// Don't bother with a new block.
		return BB;
		}

		// Make sure we add any physregs we define in the block as liveins to the new
		// block.
		LivePhysRegs LiveRegs(*TRI);
		LiveRegs.addLiveOuts(*BB);
		for (auto I = BB->rbegin(), E = SplitPoint.getReverse(); I != E; ++I)
		LiveRegs.stepBackward(*I);

		MachineFunction *MF = BB->getParent();
		MachineBasicBlock *SplitBB
		= MF->CreateMachineBasicBlock(BB->getBasicBlock());

		MF->insert(++MachineFunction::iterator(BB), SplitBB);
		SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

		SplitBB->transferSuccessorsAndUpdatePHIs(BB);
		BB->addSuccessor(SplitBB);

		addLiveIns(*SplitBB, LiveRegs);

		if (LIS)
		LIS->insertMBBInMaps(SplitBB, &MI);

		return SplitBB;
		}

		MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();		MachineBasicBlock &MBB = *MI.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
Register CFMask = MI.getOperand(0).getReg();
MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
const DebugLoc &DL = MI.getDebugLoc();		const DebugLoc &DL = MI.getDebugLoc();

MachineBasicBlock::iterator InsPt =		MachineBasicBlock::iterator InsPt = MBB.begin();
Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
: MBB.begin();		// If we have instructions that aren't prolog instructions, split the block
MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)		// and emit a terminator instruction. This ensures correct spill placement.
		// FIXME: We should unconditionally split the block here.
		bool NeedBlockSplit = false;
		Register DataReg = MI.getOperand(0).getReg();
		for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
		I != E; ++I) {
		if (I->modifiesRegister(DataReg, TRI)) {
		NeedBlockSplit = true;
		break;
		}
		}

		unsigned Opcode = OrOpc;
		MachineBasicBlock *SplitBB = &MBB;
		if (NeedBlockSplit) {
		SplitBB = splitBlock(MI, &MBB, LIS);
		Opcode = OrTermrOpc;
		InsPt = MI;
		}

		MachineInstr *NewMI =
		BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
.addReg(Exec)		.addReg(Exec)
.add(MI.getOperand(0));		.add(MI.getOperand(0));

LoweredEndCf.insert(NewMI);		LoweredEndCf.insert(NewMI);

// If this ends control flow which contains kills (as flagged in emitIf)		// If this ends control flow which contains kills (as flagged in emitIf)
// then insert an SI_KILL_CLEANUP immediately following the exec mask		// then insert an SI_KILL_CLEANUP immediately following the exec mask
// manipulation. This can be lowered to early termination if appropriate.		// manipulation. This can be lowered to early termination if appropriate.
MachineInstr *CleanUpMI = nullptr;		MachineInstr *CleanUpMI = nullptr;
if (NeedsKillCleanup.count(&MI))		if (NeedsKillCleanup.count(&MI))
CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));		CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));

if (LIS) {		if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);		LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
if (CleanUpMI)		if (CleanUpMI)
LIS->InsertMachineInstrInMaps(*CleanUpMI);		LIS->InsertMachineInstrInMaps(*CleanUpMI);
}		}

MI.eraseFromParent();		MI.eraseFromParent();

if (LIS)		if (LIS)
LIS->handleMove(*NewMI);		LIS->handleMove(*NewMI);
		return SplitBB;
}		}

// Returns replace operands for a logical operation, either single result		// Returns replace operands for a logical operation, either single result
// for exec or two operands if source was another equivalent operation.		// for exec or two operands if source was another equivalent operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,		void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const {		SmallVectorImpl<MachineOperand> &Src) const {
MachineOperand &Op = MI.getOperand(OpNo);		MachineOperand &Op = MI.getOperand(OpNo);
if (!Op.isReg() || !Op.getReg().isVirtual()) {		if (!Op.isReg() || !Op.getReg().isVirtual()) {
▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	if (Def && LoweredIf.count(SavedExec)) {
if (LIS)		if (LIS)
LIS->RemoveMachineInstrFromMaps(*MI);		LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();		MI->eraseFromParent();
removeMBBifRedundant(MBB);		removeMBBifRedundant(MBB);
}		}
}		}
}		}

void SILowerControlFlow::process(MachineInstr &MI) {		MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();		MachineBasicBlock &MBB = *MI.getParent();
MachineBasicBlock::iterator I(MI);		MachineBasicBlock::iterator I(MI);
MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;		MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;

		MachineBasicBlock *SplitBB = &MBB;

switch (MI.getOpcode()) {		switch (MI.getOpcode()) {
case AMDGPU::SI_IF:		case AMDGPU::SI_IF:
emitIf(MI);		emitIf(MI);
break;		break;

case AMDGPU::SI_ELSE:		case AMDGPU::SI_ELSE:
emitElse(MI);		emitElse(MI);
break;		break;

case AMDGPU::SI_IF_BREAK:		case AMDGPU::SI_IF_BREAK:
emitIfBreak(MI);		emitIfBreak(MI);
break;		break;

case AMDGPU::SI_LOOP:		case AMDGPU::SI_LOOP:
emitLoop(MI);		emitLoop(MI);
break;		break;

case AMDGPU::SI_END_CF:		case AMDGPU::SI_END_CF:
emitEndCf(MI);		SplitBB = emitEndCf(MI);
break;		break;

default:		default:
assert(false && "Attempt to process unsupported instruction");		assert(false && "Attempt to process unsupported instruction");
break;		break;
}		}

MachineBasicBlock::iterator Next;		MachineBasicBlock::iterator Next;
for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {		for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);		Next = std::next(I);
MachineInstr &MaskMI = *I;		MachineInstr &MaskMI = *I;
switch (MaskMI.getOpcode()) {		switch (MaskMI.getOpcode()) {
case AMDGPU::S_AND_B64:		case AMDGPU::S_AND_B64:
case AMDGPU::S_OR_B64:		case AMDGPU::S_OR_B64:
case AMDGPU::S_AND_B32:		case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32:		case AMDGPU::S_OR_B32:
// Cleanup bit manipulations on exec mask		// Cleanup bit manipulations on exec mask
combineMasks(MaskMI);		combineMasks(MaskMI);
break;		break;
default:		default:
I = MBB.end();		I = MBB.end();
break;		break;
}		}
}		}

		return SplitBB;
}		}

bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {		bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
bool Redundant = true;		bool Redundant = true;
for (auto &I : MBB.instrs()) {		for (auto &I : MBB.instrs()) {
if (!I.isDebugInstr() && !I.isUnconditionalBranch())		if (!I.isDebugInstr() && !I.isUnconditionalBranch())
Redundant = false;		Redundant = false;
}		}
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {

if (ST.isWave32()) {		if (ST.isWave32()) {
AndOpc = AMDGPU::S_AND_B32;		AndOpc = AMDGPU::S_AND_B32;
OrOpc = AMDGPU::S_OR_B32;		OrOpc = AMDGPU::S_OR_B32;
XorOpc = AMDGPU::S_XOR_B32;		XorOpc = AMDGPU::S_XOR_B32;
MovTermOpc = AMDGPU::S_MOV_B32_term;		MovTermOpc = AMDGPU::S_MOV_B32_term;
Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;		Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
XorTermrOpc = AMDGPU::S_XOR_B32_term;		XorTermrOpc = AMDGPU::S_XOR_B32_term;
		OrTermrOpc = AMDGPU::S_OR_B32_term;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;		OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
Exec = AMDGPU::EXEC_LO;		Exec = AMDGPU::EXEC_LO;
} else {		} else {
AndOpc = AMDGPU::S_AND_B64;		AndOpc = AMDGPU::S_AND_B64;
OrOpc = AMDGPU::S_OR_B64;		OrOpc = AMDGPU::S_OR_B64;
XorOpc = AMDGPU::S_XOR_B64;		XorOpc = AMDGPU::S_XOR_B64;
MovTermOpc = AMDGPU::S_MOV_B64_term;		MovTermOpc = AMDGPU::S_MOV_B64_term;
Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;		Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
XorTermrOpc = AMDGPU::S_XOR_B64_term;		XorTermrOpc = AMDGPU::S_XOR_B64_term;
		OrTermrOpc = AMDGPU::S_OR_B64_term;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;		OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
Exec = AMDGPU::EXEC;		Exec = AMDGPU::EXEC;
}		}

SmallVector<MachineInstr *, 32> Worklist;		SmallVector<MachineInstr *, 32> Worklist;

MachineFunction::iterator NextBB;		MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();		for (MachineFunction::iterator BI = MF.begin();
BI != BE; BI = NextBB) {		BI != MF.end(); BI = NextBB) {
NextBB = std::next(BI);		NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;		MachineBasicBlock *MBB = &*BI;

MachineBasicBlock::iterator I, Next;		MachineBasicBlock::iterator I, E, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {		E = MBB->end();
		for (I = MBB->begin(); I != E; I = Next) {
Next = std::next(I);		Next = std::next(I);
MachineInstr &MI = *I;		MachineInstr &MI = *I;
		MachineBasicBlock *SplitMBB = MBB;

switch (MI.getOpcode()) {		switch (MI.getOpcode()) {
case AMDGPU::SI_IF:		case AMDGPU::SI_IF:
process(MI);		SplitMBB = process(MI);
break;		break;

case AMDGPU::SI_ELSE:		case AMDGPU::SI_ELSE:
case AMDGPU::SI_IF_BREAK:		case AMDGPU::SI_IF_BREAK:
case AMDGPU::SI_LOOP:		case AMDGPU::SI_LOOP:
case AMDGPU::SI_END_CF:		case AMDGPU::SI_END_CF:
// Only build worklist if SI_IF instructions must be processed first.		// Only build worklist if SI_IF instructions must be processed first.
if (InsertKillCleanups)		if (InsertKillCleanups)
Worklist.push_back(&MI);		Worklist.push_back(&MI);
else		else
process(MI);		SplitMBB = process(MI);
break;		break;

default:		default:
break;		break;
}		}

		if (SplitMBB != MBB) {
		MBB = Next->getParent();
		E = MBB->end();
		}
}		}
}		}

for (MachineInstr *MI : Worklist)		for (MachineInstr *MI : Worklist)
process(*MI);		process(*MI);

optimizeEndCf();		optimizeEndCf();

LoweredEndCf.clear();		LoweredEndCf.clear();
LoweredIf.clear();		LoweredIf.clear();
NeedsKillCleanup.clear();		NeedsKillCleanup.clear();

return true;		return true;
}		}

llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp

Show First 20 Lines • Show All 190 Lines • ▼ Show 20 Lines	case AMDGPU::S_XOR_B64_term: {
return true;		return true;
}		}
case AMDGPU::S_XOR_B32_term: {		case AMDGPU::S_XOR_B32_term: {
// This is only a terminator to get the correct spill code placement during		// This is only a terminator to get the correct spill code placement during
// register allocation.		// register allocation.
MI.setDesc(TII.get(AMDGPU::S_XOR_B32));		MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
return true;		return true;
}		}
		case AMDGPU::S_OR_B64_term: {
		// This is only a terminator to get the correct spill code placement during
		// register allocation.
		MI.setDesc(TII.get(AMDGPU::S_OR_B64));
		return true;
		}
case AMDGPU::S_OR_B32_term: {		case AMDGPU::S_OR_B32_term: {
// This is only a terminator to get the correct spill code placement during		// This is only a terminator to get the correct spill code placement during
// register allocation.		// register allocation.
MI.setDesc(TII.get(AMDGPU::S_OR_B32));		MI.setDesc(TII.get(AMDGPU::S_OR_B32));
return true;		return true;
}		}
case AMDGPU::S_ANDN2_B64_term: {		case AMDGPU::S_ANDN2_B64_term: {
// This is only a terminator to get the correct spill code placement during		// This is only a terminator to get the correct spill code placement during
▲ Show 20 Lines • Show All 239 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir

Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines	body: \|
; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec		; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec
; CHECK: S_BRANCH %bb.2		; CHECK: S_BRANCH %bb.2
; CHECK: bb.1:		; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)		; CHECK: successors: %bb.2(0x80000000)
; CHECK: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]]		; CHECK: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]]
; CHECK: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)		; CHECK: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1)
; CHECK: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]]		; CHECK: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]]
; CHECK: bb.2:		; CHECK: bb.2:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)		; CHECK: successors: %bb.3(0x80000000)
; CHECK: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY4]]		; CHECK: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY4]]
; CHECK: $exec = S_OR_B64 $exec, killed [[COPY5]], implicit-def $scc		; CHECK: $exec = S_OR_B64_term $exec, killed [[COPY5]], implicit-def $scc
		; CHECK: bb.3:
		; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: S_SLEEP 1		; CHECK: S_SLEEP 1
; CHECK: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec		; CHECK: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc		; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
; CHECK: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY6]], implicit-def dead $scc		; CHECK: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY6]], implicit-def dead $scc
; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]		; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec		; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec
; CHECK: [[S_MOV_B64_term2:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec		; CHECK: [[S_MOV_B64_term2:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec
; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec		; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec
Show All 27 Lines

llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir

# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py		# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-lower-control-flow -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN %s		# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=liveintervals,si-lower-control-flow,si-lower-control-flow -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN %s

# Check that assert is not triggered		# Check that assert is not triggered

...
---		---
name: si-lower-control-flow		name: si-lower-control-flow
body: \|		body: \|
bb.0:		bb.0:
; GCN-LABEL: name: si-lower-control-flow		; GCN-LABEL: name: si-lower-control-flow
; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5		; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 16, 0		; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 16, 0, 0
; GCN: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 [[S_LOAD_DWORD_IMM]], 255, implicit-def $scc		; GCN: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 [[S_LOAD_DWORD_IMM]], 255, implicit-def $scc
; GCN: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 65535, [[S_AND_B32_]], implicit-def $scc		; GCN: dead %3:sreg_32_xm0 = S_AND_B32 65535, [[S_AND_B32_]], implicit-def $scc
; GCN: S_ENDPGM 0		; GCN: S_ENDPGM 0
%0:sgpr_64 = COPY $sgpr4_sgpr5		%0:sgpr_64 = COPY $sgpr4_sgpr5
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0, 0		%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0, 0
%2:sreg_32_xm0 = S_AND_B32 %1, 255, implicit-def $scc		%2:sreg_32_xm0 = S_AND_B32 %1, 255, implicit-def $scc
%3:sreg_32_xm0 = S_AND_B32 65535, %2, implicit-def $scc		%3:sreg_32_xm0 = S_AND_B32 65535, %2, implicit-def $scc
S_ENDPGM 0		S_ENDPGM 0
...		...

Show All 22 Lines	body: \|

bb.1:		bb.1:
successors: %bb.2		successors: %bb.2

bb.2:		bb.2:
S_ENDPGM 0		S_ENDPGM 0

...		...

		# We need to split the block for SI_END_CF, but it is already the last instruction in the block, so no new block is created.
		---
		name: end_cf_split_block_end
		tracksRegLiveness: true
		body: \|
		; GCN-LABEL: name: end_cf_split_block_end
		; GCN: bb.0:
		; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
		; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
		; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
		; GCN: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
		; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
		; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
		; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc
		; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
		; GCN: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec
		; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
		; GCN: S_BRANCH %bb.2
		; GCN: bb.1:
		; GCN: successors: %bb.2(0x80000000)
		; GCN: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]]
		; GCN: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc
		; GCN: bb.2:
		; GCN: S_ENDPGM 0
		bb.0:
		liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31

		%0:vgpr_32 = COPY killed $vgpr0
		%1:vgpr_32 = COPY killed $vgpr1
		%3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
		%4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
		%5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
		S_BRANCH %bb.2

		bb.1:
		successors: %bb.2

		%6:sreg_64_xexec = COPY %5
		SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec

		bb.2:
		S_ENDPGM 0

		...

		---
		name: end_cf_split_block_physreg_livein
		tracksRegLiveness: true
		body: \|
		; GCN-LABEL: name: end_cf_split_block_physreg_livein
		; GCN: bb.0:
		; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
		; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31, $sgpr4_sgpr5
		; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
		; GCN: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
		; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
		; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
		; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc
		; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
		; GCN: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec
		; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
		; GCN: S_BRANCH %bb.2
		; GCN: bb.1:
		; GCN: successors: %bb.3(0x80000000)
		; GCN: liveins: $vgpr0, $sgpr4_sgpr5
		; GCN: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]]
		; GCN: S_NOP 0
		; GCN: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc
		; GCN: bb.3:
		; GCN: successors: %bb.2(0x80000000)
		; GCN: liveins: $vgpr0, $sgpr4_sgpr5
		; GCN: S_SLEEP 3
		; GCN: S_NOP 0, implicit $vgpr0, implicit $sgpr4_sgpr5
		; GCN: bb.2:
		; GCN: S_ENDPGM 0
		bb.0:
		liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31, $sgpr4_sgpr5

		%0:vgpr_32 = COPY killed $vgpr0
		%1:vgpr_32 = COPY killed $vgpr1
		%3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
		%4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
		%5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
		S_BRANCH %bb.2

		bb.1:
		successors: %bb.2
		liveins: $vgpr0, $sgpr4_sgpr5

		%6:sreg_64_xexec = COPY %5
		S_NOP 0
		SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec
		S_SLEEP 3
		S_NOP 0, implicit $vgpr0, implicit $sgpr4_sgpr5

		bb.2:
		S_ENDPGM 0

		...

		---
		name: end_cf_split_block_physreg_livein_liveout
		tracksRegLiveness: true
		body: \|
		; GCN-LABEL: name: end_cf_split_block_physreg_livein_liveout
		; GCN: bb.0:
		; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
		; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x0000000000000003
		; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
		; GCN: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
		; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
		; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
		; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc
		; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
		; GCN: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec
		; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
		; GCN: S_BRANCH %bb.2
		; GCN: bb.1:
		; GCN: successors: %bb.3(0x80000000)
		; GCN: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x0000000000000003
		; GCN: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]]
		; GCN: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc
		; GCN: bb.3:
		; GCN: successors: %bb.2(0x80000000)
		; GCN: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9
		; GCN: S_SLEEP 3
		; GCN: S_NOP 0
		; GCN: bb.2:
		; GCN: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x0000000000000003
		; GCN: S_ENDPGM 0, implicit $vgpr0, implicit $sgpr4_sgpr5, implicit $sgpr8_sgpr9_sgpr10_sgpr11
		bb.0:
		liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x00000003

		%0:vgpr_32 = COPY killed $vgpr0
		%1:vgpr_32 = COPY killed $vgpr1
		%3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
		%4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
		%5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
		S_BRANCH %bb.2

		bb.1:
		successors: %bb.2
		liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x00000003

		%6:sreg_64_xexec = COPY %5
		SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec
		S_SLEEP 3
		S_NOP 0

		bb.2:
		liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x00000003
		S_ENDPGM 0, implicit $vgpr0, implicit $sgpr4_sgpr5, implicit $sgpr8_sgpr9_sgpr10_sgpr11

		...

		# SI_END_CF in bb.1 consumes %6, which is defined by a COPY earlier in the
		# same block, and is followed by defs of $vgpr3 and $sgpr4_sgpr5 that are
		# live into bb.2. The checks expect the lowered S_OR_B64_term to end bb.1
		# and the trailing physreg defs to land in a new block (bb.3) whose only
		# successor is bb.2.
		---
		name: end_cf_split_block_physreg_liveout
		tracksRegLiveness: true
		body: \|
		; GCN-LABEL: name: end_cf_split_block_physreg_liveout
		; GCN: bb.0:
		; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
		; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
		; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
		; GCN: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
		; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
		; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
		; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc
		; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
		; GCN: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec
		; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
		; GCN: S_BRANCH %bb.2
		; GCN: bb.1:
		; GCN: successors: %bb.3(0x80000000)
		; GCN: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]]
		; GCN: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc
		; GCN: bb.3:
		; GCN: successors: %bb.2(0x80000000)
		; GCN: liveins: $vgpr3
		; GCN: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
		; GCN: $sgpr4_sgpr5 = S_MOV_B64 32
		; GCN: bb.2:
		; GCN: liveins: $vgpr3, $sgpr4_sgpr5
		; GCN: S_ENDPGM 0, implicit $vgpr3, implicit $sgpr4_sgpr5
		bb.0:
		liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31

		%0:vgpr_32 = COPY killed $vgpr0
		%1:vgpr_32 = COPY killed $vgpr1
		%3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
		%4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
		%5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
		S_BRANCH %bb.2

		bb.1:
		successors: %bb.2

		%6:sreg_64_xexec = COPY %5
		SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec
		$vgpr3 = V_MOV_B32_e32 0, implicit $exec
		$sgpr4_sgpr5 = S_MOV_B64 32

		bb.2:
		liveins: $vgpr3, $sgpr4_sgpr5
		S_ENDPGM 0, implicit $vgpr3, implicit $sgpr4_sgpr5

		...

		# $sgpr4_sgpr5 is redefined in bb.1 before SI_END_CF and read after it (as an
		# implicit operand of S_SLEEP), so its value crosses the split point. The
		# checks expect the new block (bb.3) to list $sgpr4_sgpr5 among its liveins.
		---
		name: end_cf_split_block_physreg_live_across_split
		tracksRegLiveness: true
		body: \|
		; GCN-LABEL: name: end_cf_split_block_physreg_live_across_split
		; GCN: bb.0:
		; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
		; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31, $sgpr4_sgpr5
		; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
		; GCN: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
		; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
		; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
		; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc
		; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
		; GCN: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec
		; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
		; GCN: S_BRANCH %bb.2
		; GCN: bb.1:
		; GCN: successors: %bb.3(0x80000000)
		; GCN: liveins: $vgpr0, $sgpr4_sgpr5
		; GCN: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]]
		; GCN: $sgpr4_sgpr5 = S_MOV_B64 32
		; GCN: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc
		; GCN: bb.3:
		; GCN: successors: %bb.2(0x80000000)
		; GCN: liveins: $vgpr0, $sgpr4_sgpr5
		; GCN: S_SLEEP 3, implicit $sgpr4_sgpr5
		; GCN: S_NOP 0
		; GCN: bb.2:
		; GCN: liveins: $vgpr0, $sgpr4_sgpr5
		; GCN: S_ENDPGM 0, implicit $vgpr0, implicit $sgpr4_sgpr5
		bb.0:
		liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31, $sgpr4_sgpr5

		%0:vgpr_32 = COPY killed $vgpr0
		%1:vgpr_32 = COPY killed $vgpr1
		%3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
		%4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
		%5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
		S_BRANCH %bb.2

		bb.1:
		successors: %bb.2
		liveins: $vgpr0, $sgpr4_sgpr5

		%6:sreg_64_xexec = COPY %5
		$sgpr4_sgpr5 = S_MOV_B64 32
		SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec
		S_SLEEP 3, implicit $sgpr4_sgpr5
		S_NOP 0

		bb.2:
		liveins: $vgpr0, $sgpr4_sgpr5
		S_ENDPGM 0, implicit $vgpr0, implicit $sgpr4_sgpr5

		...

		# SI_END_CF is immediately followed by SI_IF in bb.1. The checks expect the
		# block to be split after the lowered S_OR_B64_term and the SI_IF expansion
		# (S_AND_B64 / S_XOR_B64 / S_MOV_B64_term / S_CBRANCH_EXECZ) to appear in the
		# new block (bb.3), i.e. the instruction after the split point is still lowered.
		---
		name: end_cf_split_block_process_next_inst
		tracksRegLiveness: true
		body: \|
		; GCN-LABEL: name: end_cf_split_block_process_next_inst
		; GCN: bb.0:
		; GCN: successors: %bb.1(0x80000000)
		; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
		; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
		; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
		; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
		; GCN: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
		; GCN: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY2]], implicit $exec
		; GCN: dead %5:sreg_64_xexec = S_MOV_B64 0
		; GCN: bb.1:
		; GCN: successors: %bb.3(0x80000000)
		; GCN: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[V_CMP_EQ_U32_e64_]]
		; GCN: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc
		; GCN: bb.3:
		; GCN: successors: %bb.2(0x80000000)
		; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
		; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
		; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY4]], implicit-def dead $scc
		; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
		; GCN: dead %8:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec
		; GCN: S_CBRANCH_EXECZ %bb.2, implicit $exec
		; GCN: bb.2:
		; GCN: S_ENDPGM 0
		bb.0:
		liveins: $vgpr0, $vgpr1, $vgpr2

		%0:vgpr_32 = COPY killed $vgpr0
		%1:vgpr_32 = COPY killed $vgpr1
		%2:vgpr_32 = COPY killed $vgpr2
		%3:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, killed %1, implicit $exec
		%4:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %2, implicit $exec
		%5:sreg_64_xexec = S_MOV_B64 0

		bb.1:
		successors: %bb.2

		%6:sreg_64_xexec = COPY %3
		SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec
		%7:sreg_64_xexec = SI_IF %4, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec
		%8:sreg_64_xexec = S_MOV_B64_term %7, implicit $exec

		bb.2:
		S_ENDPGM 0

		...