This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] allow more than one use for vector cast folding with selects
ClosedPublic

Authored by spatel on Jun 9 2016, 10:50 AM.

Download Raw Diff

Details

Reviewers

RKSimon
majnemer
eli.friedman

Commits

rG216d8cf72018: [InstCombine] allow more than one use for vector bitcast folding with selects
rL273011: [InstCombine] allow more than one use for vector bitcast folding with selects

Summary

The motivating example for this transform is similar to D20774 where bitcasts interfere with a single cmp/select sequence, but in this case we have 2 uses of each bitcast to produce min and max ops:

define void @minmax_bc_store(<4 x float> %a, <4 x float> %b, <4 x float>* %ptr1, <4 x float>* %ptr2) {
  %cmp = fcmp olt <4 x float> %a, %b
  %bc1 = bitcast <4 x float> %a to <4 x i32>
  %bc2 = bitcast <4 x float> %b to <4 x i32>
  %sel1 = select <4 x i1> %cmp, <4 x i32> %bc1, <4 x i32> %bc2
  %sel2 = select <4 x i1> %cmp, <4 x i32> %bc2, <4 x i32> %bc1
  %bc3 = bitcast <4 x float>* %ptr1 to <4 x i32>*
  store <4 x i32> %sel1, <4 x i32>* %bc3
  %bc4 = bitcast <4 x float>* %ptr2 to <4 x i32>*
  store <4 x i32> %sel2, <4 x i32>* %bc4
  ret void
}

With this patch, we move the selects up to use the input args which allows getting rid of all of the bitcasts:

define void @minmax_bc_store(<4 x float> %a, <4 x float> %b, <4 x float>* %ptr1, <4 x float>* %ptr2) {
  %cmp = fcmp olt <4 x float> %a, %b
  %sel1.v = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
  %sel2.v = select <4 x i1> %cmp, <4 x float> %b, <4 x float> %a
  store <4 x float> %sel1.v, <4 x float>* %ptr1, align 16
  store <4 x float> %sel2.v, <4 x float>* %ptr2, align 16
  ret void
}

The asm for x86 SSE then improves from:

movaps	%xmm0, %xmm2
cmpltps	%xmm1, %xmm2
movaps	%xmm2, %xmm3
andnps	%xmm1, %xmm3
movaps	%xmm2, %xmm4
andnps	%xmm0, %xmm4
andps	%xmm2, %xmm0
orps	%xmm3, %xmm0
andps	%xmm1, %xmm2
orps	%xmm4, %xmm2
movaps	%xmm0, (%rdi)
movaps	%xmm2, (%rsi)

To:

movaps	%xmm0, %xmm2
minps	%xmm1, %xmm2
maxps	%xmm0, %xmm1
movaps	%xmm2, (%rdi)
movaps	%xmm1, (%rsi)

Diff Detail

Event Timeline

spatel updated this revision to Diff 60197.Jun 9 2016, 10:50 AM

spatel retitled this revision from to [InstCombine] allow more than one use for vector cast folding with selects.

spatel updated this object.

spatel added reviewers: majnemer, eli.friedman, RKSimon.

spatel added a subscriber: llvm-commits.

Herald added a subscriber: mcrosier. · View Herald TranscriptJun 9 2016, 10:50 AM

Is it possible that this could cause an infinite loop in instcombine? Previously, this transform always reduced the total number of bitcasts in the function, but this doesn't. (Consider the case where the select uses its own result.)

lib/Transforms/InstCombine/InstCombineSelect.cpp
150–156	Need to fix comment here?

In D21190#453695, @eli.friedman wrote:

Is it possible that this could cause an infinite loop in instcombine? Previously, this transform always reduced the total number of bitcasts in the function, but this doesn't. (Consider the case where the select uses its own result.)

Sorry for not seeing it - the select uses its own result via a phi? Can you show the construct that you have in mind?

Something like this?

loop:
  %sel = select <4 x i1> %cmp, <4 x i32> %selx, <4 x i32> %selx
  %selx = bitcast <4 x i32> %sel to <4 x f32>
  br label %loop

Your optimization flips the types of the select and the bitcast, I think. Obviously not an actual testcase because it's missing PHI nodes, but that's the basic idea.

In D21190#453811, @eli.friedman wrote:
Something like this?
loop:
  %sel = select <4 x i1> %cmp, <4 x i32> %selx, <4 x i32> %selx
  %selx = bitcast <4 x i32> %sel to <4 x f32>
  br label %loop
Your optimization flips the types of the select and the bitcast, I think. Obviously not an actual testcase because it's missing PHI nodes, but that's the basic idea.

Interesting...I'm trying to be more diabolical, but I can't get it yet. :)
In order for the bitcasted output to feed back into the original instruction, it would have to be bitcasted back to the original type somewhere along the way? In that case, I think we'd eliminate the bitcasts as they get paired up:

define void @infloop(<4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b) {
entry:
  br label %loop

loop:
  %phi1 = phi <4 x i32> [ %a, %entry ], [ %self1, %loop ]
  %phi2 = phi <4 x i32> [ %b, %entry ], [ %self2, %loop ]
  %selx1 = bitcast <4 x i32> %phi1 to <4 x float>
  %selx2 = bitcast <4 x i32> %phi2 to <4 x float>
  %sel1 = select <4 x i1> %cmp, <4 x float> %selx1, <4 x float> %selx2
  %sel2 = select <4 x i1> %cmp, <4 x float> %selx2, <4 x float> %selx1
  %self1 = bitcast <4 x float> %sel1 to <4 x i32>
  %self2 = bitcast <4 x float> %sel2 to <4 x i32>
  br label %loop

  ret void
}

$ ./opt -instcombine infloop.ll -S

define void @infloop(<4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b) {
entry:
  br label %loop

loop:                                             ; preds = %loop, %entry
  %phi1 = phi <4 x i32> [ %a, %entry ], [ %sel1.v, %loop ]
  %phi2 = phi <4 x i32> [ %b, %entry ], [ %sel2.v, %loop ]
  %sel1.v = select <4 x i1> %cmp, <4 x i32> %phi1, <4 x i32> %phi2
  %sel2.v = select <4 x i1> %cmp, <4 x i32> %phi2, <4 x i32> %phi1
  br label %loop
                                                  ; No predecessors!
  ret void
}

Looking a bit more, I think the infinite loop isn't possible because of the way instcombine works with PHI nodes and the reachable code restriction. Sorry about the false alarm.

In D21190#453940, @eli.friedman wrote:

Looking a bit more, I think the infinite loop isn't possible because of the way instcombine works with PHI nodes and the reachable code restriction. Sorry about the false alarm.

No problem - thanks for making me look harder at the possibilities. I'll upload a new draft with a code comment change.

Patch updated:
Add comment to better explain the one-use restriction.
Also, add a TODO comment for cleanup because there's a strange combo of isa/dyn_cast/llvm_unreachable below here.

Ping.

This patch doesn't only apply to bitcasts, which can lead to gigantic codegen changes in some cases. That isn't a new problem, though; consider:

define void @min_max_trunc(<4 x float> %a, <4 x float> %b, <4 x i64> %c, <4 x i64> %d, <4 x i32>* %ptr1, <4 x i32>* %ptr2) {
  %cmp = fcmp olt <4 x float> %a, %b
  %bc1 = trunc <4 x i64> %c to <4 x i32>
  %bc2 = trunc <4 x i64> %d to <4 x i32>
  %sel1 = select <4 x i1> %cmp, <4 x i32> %bc1, <4 x i32> %bc2
  store <4 x i32> %sel1, <4 x i32>* %ptr1
  ret void
}

instcombine makes this generate much worse code for SSE2. Although, I'm pretty sure we can blame SelectionDAG for some part of that because it's generating absolutely terrible code. Feel free to just file a bug for this, but I'm pretty sure we need some sort of target-hook for this.

It would be nice to throw together a few testcases for zext/sext/trunc/sitofp/fptoui just to make sure we have coverage.

Otherwise LGTM.

This revision is now accepted and ready to land.Jun 16 2016, 10:48 AM

In D21190#460029, @eli.friedman wrote:

This patch doesn't only apply to bitcasts, which can lead to gigantic codegen changes in some cases.

...

Feel free to just file a bug for this, but I'm pretty sure we need some sort of target-hook for this.

Thanks:
https://llvm.org/bugs/show_bug.cgi?id=28160

I think I should limit this patch to only bitcasts to make it safer?

In D21190#460241, @spatel wrote:

I think I should limit this patch to only bitcasts to make it safer?

If you like. Probably not a big deal either way, given the transforms we already perform.

Closed by commit rL273011: [InstCombine] allow more than one use for vector bitcast folding with selects (authored by spatel). · Explain WhyJun 17 2016, 9:53 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Transforms/

InstCombine/

InstCombineSelect.cpp

35 lines

test/

Transforms/

InstCombine/

select.ll

24 lines

Diff 60246

lib/Transforms/InstCombine/InstCombineSelect.cpp

Show First 20 Lines • Show All 110 Lines • ▼ Show 20 Lines	case Instruction::AShr:
return Constant::getNullValue(I->getType());		return Constant::getNullValue(I->getType());
case Instruction::And:		case Instruction::And:
return Constant::getAllOnesValue(I->getType());		return Constant::getAllOnesValue(I->getType());
case Instruction::Mul:		case Instruction::Mul:
return ConstantInt::get(I->getType(), 1);		return ConstantInt::get(I->getType(), 1);
}		}
}		}

/// Here we have (select c, TI, FI), and we know that TI and FI		/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
/// have the same opcode and only one use each. Try to simplify this.
Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,		Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
Instruction *FI) {		Instruction *FI) {
// If this is a cast from the same type, merge.		// If this is a cast from the same type, merge.
if (TI->getNumOperands() == 1 && TI->isCast()) {		if (TI->getNumOperands() == 1 && TI->isCast()) {
Type *FIOpndTy = FI->getOperand(0)->getType();		Type *FIOpndTy = FI->getOperand(0)->getType();
if (TI->getOperand(0)->getType() != FIOpndTy)		if (TI->getOperand(0)->getType() != FIOpndTy)
return nullptr;		return nullptr;

// The select condition may be a vector. We may only change the operand		// The select condition may be a vector. We may only change the operand
// type if the vector width remains the same (and matches the condition).		// type if the vector width remains the same (and matches the condition).
Type *CondTy = SI.getCondition()->getType();		Type *CondTy = SI.getCondition()->getType();
if (CondTy->isVectorTy() &&		if (CondTy->isVectorTy()) {
(!FIOpndTy->isVectorTy() ||		if (!FIOpndTy->isVectorTy())
CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()))
return nullptr;		return nullptr;
		if (CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())
		return nullptr;
		} else if (!TI->hasOneUse() || !FI->hasOneUse()) {
		// TODO: The one-use restrictions for a scalar select could be eased if
		// the fold of a select in visitLoadInst() was enhanced to match a pattern
		// that includes a cast.
		return nullptr;
		}

// Fold this by inserting a select from the input values.		// Fold this by inserting a select from the input values.
Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),		Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
FI->getOperand(0), SI.getName()+".v");		FI->getOperand(0), SI.getName()+".v");
return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,		return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
TI->getType());		TI->getType());
}		}

// Only handle binary operators here.		// TODO: This function ends awkwardly in unreachable - fix to be more normal.
if (!isa<BinaryOperator>(TI))
		// Only handle binary operators with one-use here. As with the cast case
		// above, it may be possible to relax the one-use constraint, but that needs
		// be examined carefully since it may not reduce the total number of
		// instructions.
		if (!isa<BinaryOperator>(TI) || !TI->hasOneUse() || !FI->hasOneUse())
		eli.friedmanUnsubmitted Not Done Reply Inline Actions Need to fix comment here? eli.friedman: Need to fix comment here?
return nullptr;		return nullptr;

// Figure out if the operations have any operands in common.		// Figure out if the operations have any operands in common.
Value *MatchOp, *OtherOpT, *OtherOpF;		Value *MatchOp, *OtherOpT, *OtherOpF;
bool MatchIsOpZero;		bool MatchIsOpZero;
if (TI->getOperand(0) == FI->getOperand(0)) {		if (TI->getOperand(0) == FI->getOperand(0)) {
MatchOp = TI->getOperand(0);		MatchOp = TI->getOperand(0);
OtherOpT = TI->getOperand(1);		OtherOpT = TI->getOperand(1);
▲ Show 20 Lines • Show All 897 Lines • ▼ Show 20 Lines	Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// See if we are selecting two values based on a comparison of the two values.		// See if we are selecting two values based on a comparison of the two values.
if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))		if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
if (Instruction *Result = visitSelectInstWithICmp(SI, ICI))		if (Instruction *Result = visitSelectInstWithICmp(SI, ICI))
return Result;		return Result;

if (Instruction *Add = foldAddSubSelect(SI, *Builder))		if (Instruction *Add = foldAddSubSelect(SI, *Builder))
return Add;		return Add;

		// Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
auto *TI = dyn_cast<Instruction>(TrueVal);		auto *TI = dyn_cast<Instruction>(TrueVal);
auto *FI = dyn_cast<Instruction>(FalseVal);		auto *FI = dyn_cast<Instruction>(FalseVal);
if (TI && FI && TI->hasOneUse() && FI->hasOneUse()) {		if (TI && FI && TI->getOpcode() == FI->getOpcode())
// Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
if (TI->getOpcode() == FI->getOpcode())
if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))		if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
return IV;		return IV;
}

// See if we can fold the select into one of our operands.		// See if we can fold the select into one of our operands.
if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) {		if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) {
if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal))		if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal))
return FoldI;		return FoldI;

Value *LHS, *RHS, *LHS2, *RHS2;		Value *LHS, *RHS, *LHS2, *RHS2;
Instruction::CastOps CastOp;		Instruction::CastOps CastOp;
▲ Show 20 Lines • Show All 163 Lines • Show Last 20 Lines

test/Transforms/InstCombine/select.ll

	Show First 20 Lines • Show All 717 Lines • ▼ Show 20 Lines
	define i48 @test51(<3 x i1> %icmp, <3 x i16> %tmp) {			define i48 @test51(<3 x i1> %icmp, <3 x i16> %tmp) {
	; CHECK-LABEL: @test51(			; CHECK-LABEL: @test51(
	%select = select <3 x i1> %icmp, <3 x i16> zeroinitializer, <3 x i16> %tmp			%select = select <3 x i1> %icmp, <3 x i16> zeroinitializer, <3 x i16> %tmp
	; CHECK: select <3 x i1>			; CHECK: select <3 x i1>
	%tmp2 = bitcast <3 x i16> %select to i48			%tmp2 = bitcast <3 x i16> %select to i48
	ret i48 %tmp2			ret i48 %tmp2
	}			}

				; Allow select promotion even if there are multiple uses of casted ops.
				; Hoisting the selects allows later pattern matching to see that these are min/max ops.

				define void @min_max_bitcast(<4 x float> %a, <4 x float> %b, <4 x i32>* %ptr1, <4 x i32>* %ptr2) {
				; CHECK-LABEL: @min_max_bitcast(
				; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <4 x float> %a, %b
				; CHECK-NEXT: [[SEL1_V:%.*]] = select <4 x i1> [[CMP]], <4 x float> %a, <4 x float> %b
				; CHECK-NEXT: [[SEL2_V:%.*]] = select <4 x i1> [[CMP]], <4 x float> %b, <4 x float> %a
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32>* %ptr1 to <4 x float>*
				; CHECK-NEXT: store <4 x float> [[SEL1_V]], <4 x float>* [[TMP1]], align 16
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32>* %ptr2 to <4 x float>*
				; CHECK-NEXT: store <4 x float> [[SEL2_V]], <4 x float>* [[TMP2]], align 16
				; CHECK-NEXT: ret void
				;
				%cmp = fcmp olt <4 x float> %a, %b
				%bc1 = bitcast <4 x float> %a to <4 x i32>
				%bc2 = bitcast <4 x float> %b to <4 x i32>
				%sel1 = select <4 x i1> %cmp, <4 x i32> %bc1, <4 x i32> %bc2
				%sel2 = select <4 x i1> %cmp, <4 x i32> %bc2, <4 x i32> %bc1
				store <4 x i32> %sel1, <4 x i32>* %ptr1
				store <4 x i32> %sel2, <4 x i32>* %ptr2
				ret void
				}

	; PR8575			; PR8575

	define i32 @test52(i32 %n, i32 %m) nounwind {			define i32 @test52(i32 %n, i32 %m) nounwind {
	; CHECK-LABEL: @test52(			; CHECK-LABEL: @test52(
	%cmp = icmp sgt i32 %n, %m			%cmp = icmp sgt i32 %n, %m
	%. = select i1 %cmp, i32 1, i32 3			%. = select i1 %cmp, i32 1, i32 3
	%add = add nsw i32 %., 3			%add = add nsw i32 %., 3
	%storemerge = select i1 %cmp, i32 %., i32 %add			%storemerge = select i1 %cmp, i32 %., i32 %add
	▲ Show 20 Lines • Show All 876 Lines • Show Last 20 Lines