This is an archive of the discontinued LLVM Phabricator instance.

[CodeGenPrepare] Improved logic to speculate calls to cttz/ctlz.
ClosedPublic

Authored by andreadb on Jan 6 2015, 5:31 AM.

Download Raw Diff

Details

Reviewers

qcolombet
RKSimon
hfinkel

Commits

rGf807a6f29703: [CodeGenPrepare] Improved logic to speculate calls to cttz/ctlz.
rL225274: [CodeGenPrepare] Improved logic to speculate calls to cttz/ctlz.

Summary

Hi all,

This patch improves the logic implemented in CodeGenPrepare (committed at revision 224899 - see review D6728) that teaches the backend when it is profitable to speculate calls to cttz/ctlz.

The original algorithm conservatively avoided speculating more than one instruction from a basic block in a control flow graph modelling an if-statement. In particular, the only allowed instruction (excluding the terminator) was a call to cttz/ctlz.
However, there are cases where we could be less conservative and still be able to speculate a call to cttz/ctlz.

Example:
```
define i64 @test(i32 %x) {
entry:

%tobool = icmp eq i32 %x, 0
br i1 %tobool, label %cond.end, label %cond.true

cond.true: ; preds = %entry

%0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
%phitmp2 = zext i32 %0 to i64
br label %cond.end

cond.end: ; preds = %entry, %cond.true

%cond = phi i64 [ %phitmp2, %cond.true ], [ 32, %entry ]
ret i64 %cond

}
```

The cttz from basic block %cond.true could be safely speculated if we know that the 'zext' is "free" for the target. The same reasoning applies to the case where the value produced by the cttz/ctlz is truncated rather than zero extended, and the extra truncate instruction is known to be "free" for the target.

The 'zext' from the example above would be "free" on an x86-64 target. So, if BMI is available on the target, then the entire code from function @test could be safely expanded into a single 'tzcntl' followed by a return statement.

With this patch, CodeGenPrepare now tries to speculate a cttz/ctlz if the result is zero extended/truncated in the same basic block, and the zext/trunc instruction is free for the target.
This fixes (i.e. improves) all the new test cases added by this patch in 'CodeGen/X86/cttz-ctlz.ll'.

Please let me know if this is OK to submit.

Thanks,
Andrea

Diff Detail

Repository: rL LLVM

Event Timeline

andreadb updated this revision to Diff 17829. Jan 6 2015, 5:31 AM

andreadb retitled this revision from to [CodeGenPrepare] Improved logic to speculate calls to cttz/ctlz..

andreadb updated this object.

andreadb edited the test plan for this revision. (Show Details)

andreadb added reviewers: hfinkel, qcolombet, RKSimon.

andreadb added subscribers: Unknown Object (MLST), • test.

Here is an updated patch.
I improved a couple of comments and added more conservative checks.

LGTM.

On PPC64, zext is not generally free, but is free after ctlz. We can deal with that later, however.

This revision is now accepted and ready to land. Jan 6 2015, 9:24 AM

Closed by commit rL225274: [CodeGenPrepare] Improved logic to speculate calls to cttz/ctlz. (authored by adibiagio). · Explain Why · Jan 6 2015, 9:42 AM

This revision was automatically updated to reflect the committed changes.

In D6853#105747, @hfinkel wrote:

LGTM.

On PPC64, zext is not generally free, but is free after ctlz. We can deal with that later, however.

Thanks for the quick review Hal!
Committed revision 225274.

-Andrea

andreadb mentioned this in D7585: [TTI] Teach the cost heuristic how to query TLI to check if a zext/trunc is "free" for the target. Feb 12 2015, 5:26 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

CodeGenPrepare.cpp

41 lines

test/

CodeGen/

X86/

cttz-ctlz.ll

172 lines

Diff 17833

llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp

	Show First 20 Lines • Show All 4,002 Lines • ▼ Show 20 Lines
	static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) {			static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) {
	assert(BrInst->isConditional() && "Expected a conditional branch!");			assert(BrInst->isConditional() && "Expected a conditional branch!");
	BasicBlock *ThenBB = BrInst->getSuccessor(1);			BasicBlock *ThenBB = BrInst->getSuccessor(1);
	BasicBlock *EndBB = BrInst->getSuccessor(0);			BasicBlock *EndBB = BrInst->getSuccessor(0);

	// See if ThenBB contains only one instruction (excluding the			// See if ThenBB contains only one instruction (excluding the
	// terminator and DbgInfoIntrinsic calls).			// terminator and DbgInfoIntrinsic calls).
	IntrinsicInst *II = nullptr;			IntrinsicInst *II = nullptr;
				CastInst *CI = nullptr;
	for (BasicBlock::iterator I = ThenBB->begin(),			for (BasicBlock::iterator I = ThenBB->begin(),
	E = std::prev(ThenBB->end()); I != E; ++I) {			E = std::prev(ThenBB->end()); I != E; ++I) {
	// Skip debug info.			// Skip debug info.
	if (isa<DbgInfoIntrinsic>(I))			if (isa<DbgInfoIntrinsic>(I))
	continue;			continue;

	if (II)			// Check if this is a zero extension or a truncate of a previously
				// matched call to intrinsic cttz/ctlz.
				if (II) {
				// Early exit if we already found a "free" zero extend/truncate.
				if (CI)
				return false;

				Type *SrcTy = II->getType();
				Type *DestTy = I->getType();
				Value *V;

				if (match(cast<Instruction>(I), m_ZExt(m_Value(V))) && V == II) {
				// Speculate this zero extend only if it is "free" for the target.
				if (TLI.isZExtFree(SrcTy, DestTy)) {
				CI = cast<CastInst>(I);
				continue;
				}
				} else if (match(cast<Instruction>(I), m_Trunc(m_Value(V))) && V == II) {
				// Speculate this truncate only if it is "free" for the target.
				if (TLI.isTruncateFree(SrcTy, DestTy)) {
				CI = cast<CastInst>(I);
				continue;
				}
				} else {
	// Avoid speculating more than one instruction.			// Avoid speculating more than one instruction.
	return false;			return false;
				}
				}

	// See if this is a call to intrinsic cttz/ctlz.			// See if this is a call to intrinsic cttz/ctlz.
	if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::cttz>())) {			if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::cttz>())) {
	// Avoid speculating expensive intrinsic calls.			// Avoid speculating expensive intrinsic calls.
	if (!TLI.isCheapToSpeculateCttz())			if (!TLI.isCheapToSpeculateCttz())
	return false;			return false;
	}			}
	else if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::ctlz>())) {			else if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::ctlz>())) {
	// Avoid speculating expensive intrinsic calls.			// Avoid speculating expensive intrinsic calls.
	if (!TLI.isCheapToSpeculateCtlz())			if (!TLI.isCheapToSpeculateCtlz())
	return false;			return false;
	} else			} else
	return false;			return false;

	II = cast<IntrinsicInst>(I);			II = cast<IntrinsicInst>(I);
	}			}

	// Look for PHI nodes with 'II' as the incoming value from 'ThenBB'.			// Look for PHI nodes with 'II' as the incoming value from 'ThenBB'.
	BasicBlock *EntryBB = BrInst->getParent();			BasicBlock *EntryBB = BrInst->getParent();
	for (BasicBlock::iterator I = EndBB->begin();			for (BasicBlock::iterator I = EndBB->begin();
	PHINode *PN = dyn_cast<PHINode>(I); ++I) {			PHINode *PN = dyn_cast<PHINode>(I); ++I) {
	Value *ThenV = PN->getIncomingValueForBlock(ThenBB);			Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
	Value *OrigV = PN->getIncomingValueForBlock(EntryBB);			Value *OrigV = PN->getIncomingValueForBlock(EntryBB);

	if (!OrigV || ThenV != II)			if (!OrigV)
				return false;

				if (ThenV != II && (!CI || ThenV != CI))
	return false;			return false;

	if (ConstantInt *CInt = dyn_cast<ConstantInt>(OrigV)) {			if (ConstantInt *CInt = dyn_cast<ConstantInt>(OrigV)) {
	unsigned BitWidth = ThenV->getType()->getIntegerBitWidth();			unsigned BitWidth = II->getType()->getIntegerBitWidth();

	// Don't try to simplify this phi node if 'ThenV' is a cttz/ctlz			// Don't try to simplify this phi node if 'ThenV' is a cttz/ctlz
	// intrinsic call, but 'OrigV' is not equal to the 'size-of' in bits			// intrinsic call, but 'OrigV' is not equal to the 'size-of' in bits
	// of the value in input to the cttz/ctlz.			// of the value in input to the cttz/ctlz.
	if (CInt->getValue() != BitWidth)			if (CInt->getValue() != BitWidth)
	return false;			return false;

	// Hoist the call to cttz/ctlz from ThenBB into EntryBB.			// Hoist the call to cttz/ctlz from ThenBB into EntryBB.
	EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(),			EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(),
	ThenBB->begin(), std::prev(ThenBB->end()));			ThenBB->begin(), std::prev(ThenBB->end()));

	// Update PN setting ThenV as the incoming value from both 'EntryBB'			// Update PN setting ThenV as the incoming value from both 'EntryBB'
	// and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this			// and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this
	// phi node if all the incoming values are the same.			// phi node if all the incoming values are the same.
	PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV);			PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV);
	PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV);			PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV);

	// Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call.			// Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call.
	if (cast<ConstantInt>(II->getArgOperand(1))->isOne()) {			if (cast<ConstantInt>(II->getArgOperand(1))->isOne()) {
	Type *Ty = II->getArgOperand(0)->getType();			Type *Ty = II->getArgOperand(0)->getType();
	Value *Args[] = { II->getArgOperand(0),			Value *Args[] = { II->getArgOperand(0),
	ConstantInt::getFalse(II->getContext()) };			ConstantInt::getFalse(II->getContext()) };
	Module *M = EntryBB->getParent()->getParent();			Module *M = EntryBB->getParent()->getParent();
	Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty);			Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty);
	IRBuilder<> Builder(BrInst);			IRBuilder<> Builder(II);
	Instruction *NewI = Builder.CreateCall(IF, Args);			Instruction *NewI = Builder.CreateCall(IF, Args);

	// Replace the old call to cttz/ctlz.			// Replace the old call to cttz/ctlz.
	II->replaceAllUsesWith(NewI);			II->replaceAllUsesWith(NewI);
	II->eraseFromParent();			II->eraseFromParent();
	}			}

	// Update BrInst condition so that the branch to EndBB is always taken.			// Update BrInst condition so that the branch to EndBB is always taken.
	▲ Show 20 Lines • Show All 546 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/cttz-ctlz.ll

Show First 20 Lines • Show All 235 Lines • ▼ Show 20 Lines	cond.true: ; preds = %entry
%0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)		%0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
br label %cond.end		br label %cond.end

cond.end: ; preds = %entry, %cond.true		cond.end: ; preds = %entry, %cond.true
%cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]		%cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
ret i16 %cond		ret i16 %cond
}		}

		; The following tests verify that calls to cttz/ctlz are speculated even if
		; basic block %cond.true has an extra zero extend/truncate which is "free"
		; for the target.

		define i64 @test1e(i32 %x) {
		; ALL-LABEL: @test1e(
		; LZCNT: icmp eq i32 %x, 0
		; LZCNT: call i32 @llvm.cttz.i32(i32 %x, i1 true)
		; BMI: call i32 @llvm.cttz.i32(i32 %x, i1 false)
		; GENERIC: icmp eq i32 %x, 0
		; GENERIC: call i32 @llvm.cttz.i32(i32 %x, i1 true)
		entry:
		%tobool = icmp eq i32 %x, 0
		br i1 %tobool, label %cond.end, label %cond.true

		cond.true: ; preds = %entry
		%0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
		%phitmp2 = zext i32 %0 to i64
		br label %cond.end

		cond.end: ; preds = %entry, %cond.true
		%cond = phi i64 [ %phitmp2, %cond.true ], [ 32, %entry ]
		ret i64 %cond
		}

		define i32 @test2e(i64 %x) {
		; ALL-LABEL: @test2e(
		; LZCNT: icmp eq i64 %x, 0
		; LZCNT: call i64 @llvm.cttz.i64(i64 %x, i1 true)
		; BMI: call i64 @llvm.cttz.i64(i64 %x, i1 false)
		; GENERIC: icmp eq i64 %x, 0
		; GENERIC: call i64 @llvm.cttz.i64(i64 %x, i1 true)
		entry:
		%tobool = icmp eq i64 %x, 0
		br i1 %tobool, label %cond.end, label %cond.true

		cond.true: ; preds = %entry
		%0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
		%cast = trunc i64 %0 to i32
		br label %cond.end

		cond.end: ; preds = %entry, %cond.true
		%cond = phi i32 [ %cast, %cond.true ], [ 64, %entry ]
		ret i32 %cond
		}

		define i64 @test3e(i32 %x) {
		; ALL-LABEL: @test3e(
		; BMI: icmp eq i32 %x, 0
		; BMI: call i32 @llvm.ctlz.i32(i32 %x, i1 true)
		; LZCNT: call i32 @llvm.ctlz.i32(i32 %x, i1 false)
		; GENERIC: icmp eq i32 %x, 0
		; GENERIC: call i32 @llvm.ctlz.i32(i32 %x, i1 true)
		entry:
		%tobool = icmp eq i32 %x, 0
		br i1 %tobool, label %cond.end, label %cond.true

		cond.true: ; preds = %entry
		%0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
		%phitmp2 = zext i32 %0 to i64
		br label %cond.end

		cond.end: ; preds = %entry, %cond.true
		%cond = phi i64 [ %phitmp2, %cond.true ], [ 32, %entry ]
		ret i64 %cond
		}

		define i32 @test4e(i64 %x) {
		; ALL-LABEL: @test4e(
		; BMI: icmp eq i64 %x, 0
		; BMI: call i64 @llvm.ctlz.i64(i64 %x, i1 true)
		; LZCNT: call i64 @llvm.ctlz.i64(i64 %x, i1 false)
		; GENERIC: icmp eq i64 %x, 0
		; GENERIC: call i64 @llvm.ctlz.i64(i64 %x, i1 true)
		entry:
		%tobool = icmp eq i64 %x, 0
		br i1 %tobool, label %cond.end, label %cond.true

		cond.true: ; preds = %entry
		%0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
		%cast = trunc i64 %0 to i32
		br label %cond.end

		cond.end: ; preds = %entry, %cond.true
		%cond = phi i32 [ %cast, %cond.true ], [ 64, %entry ]
		ret i32 %cond
		}

		define i16 @test5e(i64 %x) {
		; ALL-LABEL: @test5e(
		; BMI: icmp eq i64 %x, 0
		; BMI: call i64 @llvm.ctlz.i64(i64 %x, i1 true)
		; LZCNT: call i64 @llvm.ctlz.i64(i64 %x, i1 false)
		; GENERIC: icmp eq i64 %x, 0
		; GENERIC: call i64 @llvm.ctlz.i64(i64 %x, i1 true)
		entry:
		%tobool = icmp eq i64 %x, 0
		br i1 %tobool, label %cond.end, label %cond.true

		cond.true: ; preds = %entry
		%0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
		%cast = trunc i64 %0 to i16
		br label %cond.end

		cond.end: ; preds = %entry, %cond.true
		%cond = phi i16 [ %cast, %cond.true ], [ 64, %entry ]
		ret i16 %cond
		}

		define i16 @test6e(i32 %x) {
		; ALL-LABEL: @test6e(
		; BMI: icmp eq i32 %x, 0
		; BMI: call i32 @llvm.ctlz.i32(i32 %x, i1 true)
		; LZCNT: call i32 @llvm.ctlz.i32(i32 %x, i1 false)
		; GENERIC: icmp eq i32 %x, 0
		; GENERIC: call i32 @llvm.ctlz.i32(i32 %x, i1 true)
		entry:
		%tobool = icmp eq i32 %x, 0
		br i1 %tobool, label %cond.end, label %cond.true

		cond.true: ; preds = %entry
		%0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
		%cast = trunc i32 %0 to i16
		br label %cond.end

		cond.end: ; preds = %entry, %cond.true
		%cond = phi i16 [ %cast, %cond.true ], [ 32, %entry ]
		ret i16 %cond
		}

		define i16 @test7e(i64 %x) {
		; ALL-LABEL: @test7e(
		; LZCNT: icmp eq i64 %x, 0
		; LZCNT: call i64 @llvm.cttz.i64(i64 %x, i1 true)
		; BMI: call i64 @llvm.cttz.i64(i64 %x, i1 false)
		; GENERIC: icmp eq i64 %x, 0
		; GENERIC: call i64 @llvm.cttz.i64(i64 %x, i1 true)
		entry:
		%tobool = icmp eq i64 %x, 0
		br i1 %tobool, label %cond.end, label %cond.true

		cond.true: ; preds = %entry
		%0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
		%cast = trunc i64 %0 to i16
		br label %cond.end

		cond.end: ; preds = %entry, %cond.true
		%cond = phi i16 [ %cast, %cond.true ], [ 64, %entry ]
		ret i16 %cond
		}

		define i16 @test8e(i32 %x) {
		; ALL-LABEL: @test8e(
		; LZCNT: icmp eq i32 %x, 0
		; LZCNT: call i32 @llvm.cttz.i32(i32 %x, i1 true)
		; BMI: call i32 @llvm.cttz.i32(i32 %x, i1 false)
		; GENERIC: icmp eq i32 %x, 0
		; GENERIC: call i32 @llvm.cttz.i32(i32 %x, i1 true)
		entry:
		%tobool = icmp eq i32 %x, 0
		br i1 %tobool, label %cond.end, label %cond.true

		cond.true: ; preds = %entry
		%0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
		%cast = trunc i32 %0 to i16
		br label %cond.end

		cond.end: ; preds = %entry, %cond.true
		%cond = phi i16 [ %cast, %cond.true ], [ 32, %entry ]
		ret i16 %cond
		}


declare i64 @llvm.ctlz.i64(i64, i1)		declare i64 @llvm.ctlz.i64(i64, i1)
declare i32 @llvm.ctlz.i32(i32, i1)		declare i32 @llvm.ctlz.i32(i32, i1)
declare i16 @llvm.ctlz.i16(i16, i1)		declare i16 @llvm.ctlz.i16(i16, i1)
declare i64 @llvm.cttz.i64(i64, i1)		declare i64 @llvm.cttz.i64(i64, i1)
declare i32 @llvm.cttz.i32(i32, i1)		declare i32 @llvm.cttz.i32(i32, i1)
declare i16 @llvm.cttz.i16(i16, i1)		declare i16 @llvm.cttz.i16(i16, i1)