This is an archive of the discontinued LLVM Phabricator instance.

Differential D20092

[AMDGPU] Fix issues introduced by aggressive block placement
AbandonedPublic

Authored by cycheng on May 10 2016, 2:04 AM.

Download Raw Diff

Details

Reviewers

tjablin
• tstellarAMD
kbarton

Summary

Patch D20017 aggressively chooses the best loop top in a loop; this introduces two issues for the AMDGPU backend:

Crash issue: it breaks the assumption about basic block order in shouldSkip(), e.g.

   bb2<--+   bb2 (From)      Block      Flow (To)
   /   \ |   bb6        => Placement => bb2 (From)
bb6 -> Flow  Flow (To)                  bb6

The original code assumes that the To MBB is placed after the From MBB, so it crashes when this is not true.

Unnecessary branch in fall through path, e.g.

   bb2<--+   Flow (Latch)  
   /   \ |   bb2  (Header) 
bb6 -> Flow  bb6
  
Flow: 
    s_cbranch_execnz bb2
    s_branch end
bb2:
    s_cbranch_execz Flow
bb6:
    s_branch Flow
end:

Flow can fall through to bb2, so we can replace the two branches with a single conditional branch.

Diff Detail

Event Timeline

cycheng updated this revision to Diff 56678.May 10 2016, 2:04 AM

cycheng retitled this revision from to [AMDGPU] Fix issues introduced by aggressive block placement.

cycheng updated this object.

cycheng added reviewers: • tstellarAMD, tjablin, kbarton.

cycheng added subscribers: hfinkel, nemanjai, amehsan, llvm-commits.

Herald added subscribers: kzhuravl, arsenm. · View Herald TranscriptMay 10 2016, 2:04 AM

cycheng added a child revision: D20017: Aggressive choosing best loop top.May 10 2016, 4:49 AM

Would implementing AnalyzeBranch etc. help, or are those not supposed to be needed for correctness?

I always thought this pass didn't do anything for AMDGPU, since AnalyzeBranch wasn't implemented.

Because D20017 has been abandoned, I am abandoning this patch, too.
We might need this patch again if "-force-precise-rotation-cost" is turned on by default and loop rotation is performed on this pattern:

          entry               
            |                 
------> loop.header (body)    
|97%    /       \             
|      /50%      \50%         
--- latch <--- if.then        
       |
       |3%
   loop.end

Thanks to all reviewers!

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SILowerControlFlow.cpp

61 lines

Diff 56678

lib/Target/AMDGPU/SILowerControlFlow.cpp

Show First 20 Lines • Show All 124 Lines • ▼ Show 20 Lines	FunctionPass *llvm::createSILowerControlFlowPass() {
return new SILowerControlFlow();		return new SILowerControlFlow();
}		}

bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,		bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
MachineBasicBlock *To) {		MachineBasicBlock *To) {

unsigned NumInstr = 0;		unsigned NumInstr = 0;

		// Check whether 'To MBB' is before 'From MBB', this is possible after Block
		// Placement Pass:
		// bb2<--+ bb2 (From) Block Flow (To)
		// / \ \| bb6 => Placement => bb2 (From)
		// bb6 -> Flow Flow (To) bb6
		for (MachineFunction::iterator MBBI = MachineFunction::iterator(From),
		ToI = MachineFunction::iterator(To);
		ToI != From->getParent()->end(); ++ToI) {
		// return true so we generate conditional branch for 'From MBB'
		if (MBBI == ToI)
		return true;
		}

for (MachineFunction::iterator MBBI = MachineFunction::iterator(From),		for (MachineFunction::iterator MBBI = MachineFunction::iterator(From),
ToI = MachineFunction::iterator(To); MBBI != ToI; ++MBBI) {		ToI = MachineFunction::iterator(To); MBBI != ToI; ++MBBI) {

MachineBasicBlock &MBB = *MBBI;		MachineBasicBlock &MBB = *MBBI;

for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();		for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
NumInstr < SkipThreshold && I != E; ++I) {		NumInstr < SkipThreshold && I != E; ++I) {

▲ Show 20 Lines • Show All 149 Lines • ▼ Show 20 Lines	void SILowerControlFlow::Loop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();		MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();		DebugLoc DL = MI.getDebugLoc();
unsigned Src = MI.getOperand(0).getReg();		unsigned Src = MI.getOperand(0).getReg();

BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)		BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)		.addReg(AMDGPU::EXEC)
.addReg(Src);		.addReg(Src);

		// After Block Placement Pass, Latch block might before Header block
		// bb2<--+ Flow (Latch) MBB
		// / \ \| bb2 (Header) NextBB
		// bb6 -> Flow bb6
		//
		// If this is the case, then 'Flow MBB' can fall through 'bb2 MBB'. But we
		// need to change branch condition for 'Flow MBB'

		// Check if Latch is before Header.
		// %Flow:
		// SI_LOOP %SGPR0_SGPR1, bb2, ..
		// S_BRANCH end
		// %bb2:
		// ..
		MachineBasicBlock &NextBB = *std::next(MachineFunction::iterator(MBB));
		if (&NextBB == MI.getOperand(1).getMBB()) {
		MachineInstr &NextMI = *std::next(MachineBasicBlock::iterator(MI));

		assert(NextMI.getOpcode() == AMDGPU::S_BRANCH &&
		"Next instruction of SI_LOOP should be S_BRANCH");

		// Result:
		// s_cbranch_execz end
		// s_branch end
		// The 's_branch end' is removed at Branch()
		BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
		.addOperand(NextMI.getOperand(0));
		}
		else {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))		BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addOperand(MI.getOperand(1));		.addOperand(MI.getOperand(1));
		}

MI.eraseFromParent();		MI.eraseFromParent();
}		}

void SILowerControlFlow::EndCf(MachineInstr &MI) {		void SILowerControlFlow::EndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();		MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();		DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();		unsigned Reg = MI.getOperand(0).getReg();

BuildMI(MBB, MBB.getFirstNonPHI(), DL,		BuildMI(MBB, MBB.getFirstNonPHI(), DL,
TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)		TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)		.addReg(AMDGPU::EXEC)
.addReg(Reg);		.addReg(Reg);

MI.eraseFromParent();		MI.eraseFromParent();
}		}

void SILowerControlFlow::Branch(MachineInstr &MI) {		void SILowerControlFlow::Branch(MachineInstr &MI) {
		// If these aren't equal, this is probably an infinite loop.
if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())		if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
MI.eraseFromParent();		MI.eraseFromParent();
		else if (&*MI.getParent()->begin() != &MI) {
		MachineInstr &PrevMI = *std::prev(MachineBasicBlock::iterator(MI));

// If these aren't equal, this is probably an infinite loop.		// Look at this pattern (see comments in Loop()):
		// s_cbranch_execz end
		// s_branch end
		// Remove 's_branch end'
		if (PrevMI.getOpcode() == AMDGPU::S_CBRANCH_EXECZ &&
		PrevMI.getOperand(0).getMBB() == MI.getOperand(0).getMBB())
		MI.eraseFromParent();
		}
}		}

void SILowerControlFlow::Kill(MachineInstr &MI) {		void SILowerControlFlow::Kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();		MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();		DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Op = MI.getOperand(0);		const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG		#ifndef NDEBUG
▲ Show 20 Lines • Show All 294 Lines • Show Last 20 Lines