This is an archive of the discontinued LLVM Phabricator instance.

[CGP] despeculate expensive cttz/ctlz intrinsics
ClosedPublic

Authored by spatel on Nov 12 2015, 4:23 PM.

Download Raw Diff

Details

Reviewers

andreadb
jmolloy
hfinkel

Commits

rG4699b8ab6acf: [CGP] despeculate expensive cttz/ctlz intrinsics
rL253573: [CGP] despeculate expensive cttz/ctlz intrinsics

Summary

This is another step towards allowing SimplifyCFG to speculate harder, but then have CGP clean things up if the target doesn't like it.

Previous patches in this series:
http://reviews.llvm.org/D12882
http://reviews.llvm.org/D13297

My hope is that D13297 will catch most expensive ops, but speculation of cttz/ctlz requires special handling because of weirdness in the intrinsic definition for handling a zero input (that definition can probably be blamed on x86).

For example, if we have the usual speculated-by-select expensive op pattern like this:

%tobool = icmp eq i64 %A, 0
%0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)   ; is_zero_undef == true
%cond = select i1 %tobool, i64 64, i64 %0
ret i64 %cond

There's an instcombine that will turn it into:

%0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 false)   ; is_zero_undef == false

This CGP patch is looking for that case and despeculating it back into:

entry:
%tobool = icmp eq i64 %A, 0
br i1 %tobool, label %cond.end, label %cond.true

cond.true:
%0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)    ; is_zero_undef == true
br label %cond.end

cond.end:
%cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
ret i64 %cond

This unfortunately may lead to poorer codegen (see the changes in the existing x86 test), but if we increase speculation in SimplifyCFG (the next step in this patch series), then we should avoid those kinds of cases in the first place.

The need for this patch was originally mentioned here:
http://reviews.llvm.org/D7506
with follow-up here:
http://reviews.llvm.org/D7554

Diff Detail

Repository: rL LLVM

Event Timeline

spatel updated this revision to Diff 40092. Nov 12 2015, 4:23 PM

spatel retitled this revision to [CGP] despeculate expensive cttz/ctlz intrinsics.

spatel updated this object.

spatel added reviewers: andreadb, hfinkel, jmolloy.

spatel added a subscriber: llvm-commits.

Hi Sanjoy,

This looks fine to me, I think.

James

This revision is now accepted and ready to land. Nov 13 2015, 8:40 AM

Closed by commit rL253573: [CGP] despeculate expensive cttz/ctlz intrinsics (authored by spatel). · Explain Why · Nov 19 2015, 8:39 AM

This revision was automatically updated to reflect the committed changes.

spatel mentioned this in D15213: [SimplifyCFG] allow speculation of exactly one expensive instruction (PR24818). Dec 3 2015, 3:13 PM

spatel mentioned this in rL255660: [SimplifyCFG] allow speculation of exactly one expensive instruction (PR24818). Dec 15 2015, 9:41 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

CodeGenPrepare.cpp

84 lines

test/

CodeGen/

X86/

clz.ll

57 lines

Diff 40658

llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp

Show First 20 Lines • Show All 1,600 Lines • ▼ Show 20 Lines	for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
Instruction *OldBr = IfBlock->getTerminator();		Instruction *OldBr = IfBlock->getTerminator();
BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);		BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
OldBr->eraseFromParent();		OldBr->eraseFromParent();
IfBlock = NewIfBlock;		IfBlock = NewIfBlock;
}		}
CI->eraseFromParent();		CI->eraseFromParent();
}		}

		/// If counting leading or trailing zeros is an expensive operation and a zero
		/// input is defined, add a check for zero to avoid calling the intrinsic.
		///
		/// We want to transform:
		///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
		///
		/// into:
		///   entry:
		///     %cmpz = icmp eq i64 %A, 0
		///     br i1 %cmpz, label %cond.end, label %cond.false
		///   cond.false:
		///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
		///     br label %cond.end
		///   cond.end:
		///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
		///
		/// \param CountZeros  The cttz/ctlz intrinsic call to (possibly) despeculate.
		/// \param TLI         Target hooks used to ask whether speculating the
		///                    intrinsic is cheap; if null, nothing is transformed.
		/// \param DL          Data layout used to reject types wider than the
		///                    largest legal integer; if null, nothing is transformed.
		/// \param ModifiedDT  Set to true when the block is split (left untouched
		///                    otherwise).
		///
		/// If the transform is performed, return true and set ModifiedDT to true.
		static bool despeculateCountZeros(IntrinsicInst *CountZeros,
		                                  const TargetLowering *TLI,
		                                  const DataLayout *DL,
		                                  bool &ModifiedDT) {
		  if (!TLI || !DL)
		    return false;

		  // If a zero input is undefined, it doesn't make sense to despeculate that.
		  if (match(CountZeros->getOperand(1), m_One()))
		    return false;

		  // If it's cheap to speculate, there's nothing to do.
		  auto IntrinsicID = CountZeros->getIntrinsicID();
		  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
		      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
		    return false;

		  // Only handle legal scalar cases. Anything else requires too much work.
		  Type *Ty = CountZeros->getType();
		  unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
		  if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize())
		    return false;

		  // The intrinsic will be sunk behind a compare against zero and branch.
		  BasicBlock *StartBlock = CountZeros->getParent();
		  BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");

		  // Create another block after the count zero intrinsic. A PHI will be added
		  // in this block to select the result of the intrinsic or the bit-width
		  // constant if the input to the intrinsic is zero.
		  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
		  BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");

		  // Set up a builder to create a compare, conditional branch, and PHI.
		  IRBuilder<> Builder(CountZeros->getContext());
		  Builder.SetInsertPoint(StartBlock->getTerminator());
		  Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());

		  // Replace the unconditional branch that was created by the first split with
		  // a compare against zero and a conditional branch.
		  Value *Zero = Constant::getNullValue(Ty);
		  Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
		  Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
		  StartBlock->getTerminator()->eraseFromParent();

		  // Create a PHI in the end block to select either the output of the intrinsic
		  // or the bit width of the operand.
		  Builder.SetInsertPoint(&EndBlock->front());
		  PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
		  // Redirect existing users to the PHI *before* the intrinsic is added as an
		  // incoming value, so the PHI's own use of CountZeros is not rewritten.
		  CountZeros->replaceAllUsesWith(PN);
		  Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
		  PN->addIncoming(BitWidth, StartBlock);
		  PN->addIncoming(CountZeros, CallBlock);

		  // We are explicitly handling the zero case, so we can set the intrinsic's
		  // undefined zero argument to 'true'. This will also prevent reprocessing the
		  // intrinsic; we only despeculate when a zero input is defined.
		  CountZeros->setArgOperand(1, Builder.getTrue());
		  ModifiedDT = true;
		  return true;
		}

bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {		bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
BasicBlock *BB = CI->getParent();		BasicBlock *BB = CI->getParent();

// Lower inline assembly if we can.		// Lower inline assembly if we can.
// If we found an inline asm expession, and if the target knows how to		// If we found an inline asm expession, and if the target knows how to
// lower it to normal LLVM code, do so now.		// lower it to normal LLVM code, do so now.
if (TLI && isa<InlineAsm>(CI->getCalledValue())) {		if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
if (TLI->ExpandInlineAsm(CI)) {		if (TLI->ExpandInlineAsm(CI)) {
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines	case Intrinsic::aarch64_stxr: {
// optimizations don't touch it.		// optimizations don't touch it.
InsertedInsts.insert(ExtVal);		InsertedInsts.insert(ExtVal);
return true;		return true;
}		}
case Intrinsic::invariant_group_barrier:		case Intrinsic::invariant_group_barrier:
II->replaceAllUsesWith(II->getArgOperand(0));		II->replaceAllUsesWith(II->getArgOperand(0));
II->eraseFromParent();		II->eraseFromParent();
return true;		return true;

		case Intrinsic::cttz:
		case Intrinsic::ctlz:
		// If counting zeros is expensive, try to avoid it.
		return despeculateCountZeros(II, TLI, DL, ModifiedDT);
}		}

if (TLI) {		if (TLI) {
// Unknown address space.		// Unknown address space.
// TODO: Target hook to pick which address space the intrinsic cares		// TODO: Target hook to pick which address space the intrinsic cares
// about?		// about?
unsigned AddrSpace = ~0u;		unsigned AddrSpace = ~0u;
SmallVector<Value*, 2> PtrOps;		SmallVector<Value*, 2> PtrOps;
▲ Show 20 Lines • Show All 3,533 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/clz.ll

	Show First 20 Lines • Show All 81 Lines • ▼ Show 20 Lines
	; CHECK: # BB#0:			; CHECK: # BB#0:
	; CHECK-NEXT: bsrq %rdi, %rax			; CHECK-NEXT: bsrq %rdi, %rax
	; CHECK-NEXT: xorq $63, %rax			; CHECK-NEXT: xorq $63, %rax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%tmp = call i64 @llvm.ctlz.i64( i64 %x, i1 true )			%tmp = call i64 @llvm.ctlz.i64( i64 %x, i1 true )
	ret i64 %tmp			ret i64 %tmp
	}			}

	define i32 @ctlz_i32_cmov(i32 %n) {			define i32 @ctlz_i32_zero_test(i32 %n) {
	; CHECK-LABEL: ctlz_i32_cmov:			; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.

				; CHECK-LABEL: ctlz_i32_zero_test:
	; CHECK: # BB#0:			; CHECK: # BB#0:
	; CHECK-NEXT: bsrl %edi, %ecx			; CHECK-NEXT: movl $32, %eax
	; CHECK-NEXT: movl $63, %eax			; CHECK-NEXT: testl %edi, %edi
	; CHECK-NEXT: cmovnel %ecx, %eax			; CHECK-NEXT: je .LBB8_2
				; CHECK-NEXT: # BB#1: # %cond.false
				; CHECK-NEXT: bsrl %edi, %eax
	; CHECK-NEXT: xorl $31, %eax			; CHECK-NEXT: xorl $31, %eax
				; CHECK-NEXT: .LBB8_2: # %cond.end
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	; Generate a cmov to handle zero inputs when necessary.
	%tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false)			%tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
	ret i32 %tmp1			ret i32 %tmp1
	}			}

	define i32 @ctlz_i32_fold_cmov(i32 %n) {			define i32 @ctlz_i32_fold_cmov(i32 %n) {
				; Don't generate the cmovne when the source is known non-zero (and bsr would
				; not set ZF).
				; rdar://9490949
				; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
				; codegen doesn't know how to delete the movl and je.

	; CHECK-LABEL: ctlz_i32_fold_cmov:			; CHECK-LABEL: ctlz_i32_fold_cmov:
	; CHECK: # BB#0:			; CHECK: # BB#0:
	; CHECK-NEXT: orl $1, %edi			; CHECK-NEXT: orl $1, %edi
				; CHECK-NEXT: movl $32, %eax
				; CHECK-NEXT: je .LBB9_2
				; CHECK-NEXT: # BB#1: # %cond.false
	; CHECK-NEXT: bsrl %edi, %eax			; CHECK-NEXT: bsrl %edi, %eax
	; CHECK-NEXT: xorl $31, %eax			; CHECK-NEXT: xorl $31, %eax
				; CHECK-NEXT: .LBB9_2: # %cond.end
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	; Don't generate the cmovne when the source is known non-zero (and bsr would
	; not set ZF).
	; rdar://9490949
	%or = or i32 %n, 1			%or = or i32 %n, 1
	%tmp1 = call i32 @llvm.ctlz.i32(i32 %or, i1 false)			%tmp1 = call i32 @llvm.ctlz.i32(i32 %or, i1 false)
	ret i32 %tmp1			ret i32 %tmp1
	}			}

	define i32 @ctlz_bsr(i32 %n) {			define i32 @ctlz_bsr(i32 %n) {
				; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
				; the most significant bit, which is what 'bsr' does natively.

	; CHECK-LABEL: ctlz_bsr:			; CHECK-LABEL: ctlz_bsr:
	; CHECK: # BB#0:			; CHECK: # BB#0:
	; CHECK-NEXT: bsrl %edi, %eax			; CHECK-NEXT: bsrl %edi, %eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
	; the most significant bit, which is what 'bsr' does natively.
	%ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 true)			%ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
	%bsr = xor i32 %ctlz, 31			%bsr = xor i32 %ctlz, 31
	ret i32 %bsr			ret i32 %bsr
	}			}

	define i32 @ctlz_bsr_cmov(i32 %n) {			define i32 @ctlz_bsr_zero_test(i32 %n) {
	; CHECK-LABEL: ctlz_bsr_cmov:			; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
	; CHECK: # BB#0:			; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
	; CHECK-NEXT: bsrl %edi, %ecx			; codegen doesn't know how to combine the $32 and $31 into $63.
	; CHECK-NEXT: movl $63, %eax
	; CHECK-NEXT: cmovnel %ecx, %eax			; CHECK-LABEL: ctlz_bsr_zero_test:
				; CHECK: # BB#0:
				; CHECK-NEXT: movl $32, %eax
				; CHECK-NEXT: testl %edi, %edi
				; CHECK-NEXT: je .LBB11_2
				; CHECK-NEXT: # BB#1: # %cond.false
				; CHECK-NEXT: bsrl %edi, %eax
				; CHECK-NEXT: xorl $31, %eax
				; CHECK-NEXT: .LBB11_2: # %cond.end
				; CHECK-NEXT: xorl $31, %eax
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	; Same as ctlz_bsr, but ensure this happens even when there is a potential
	; zero.
	%ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 false)			%ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
	%bsr = xor i32 %ctlz, 31			%bsr = xor i32 %ctlz, 31
	ret i32 %bsr			ret i32 %bsr
	}			}