This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] reverse 'trunc X to <N x i1>' canonicalization
ClosedPublic

Authored by spatel on Oct 1 2018, 2:02 PM.

Download Raw Diff

Details

Reviewers

efriedma
craig.topper
RKSimon
lebedev.ri
javed.absar

Commits

rG05aadf885d52: [InstCombine] reverse 'trunc X to <N x i1>' canonicalization; 2nd try
rGe9ca7ea3e5c0: [InstCombine] reverse 'trunc X to <N x i1>' canonicalization
rL344181: [InstCombine] reverse 'trunc X to <N x i1>' canonicalization; 2nd try
rL344082: [InstCombine] reverse 'trunc X to <N x i1>' canonicalization

Summary

icmp ne (and X, 1), 0 --> trunc X to N x i1

Ideally, I think we'd do the same for scalars, but I'm afraid of unintended consequences.
The motivating vector case is from PR37549:
https://bugs.llvm.org/show_bug.cgi?id=37549

define <4 x float> @bitwise_select(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) {
  %c = fcmp ole <4 x float> %x, %y
  %s = sext <4 x i1> %c to <4 x i32>
  %s1 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %s2 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  %cond = or <4 x i32> %s1, %s2
  %condtr = trunc <4 x i32> %cond to <4 x i1>
  %r = select <4 x i1> %condtr, <4 x float> %z, <4 x float> %w
  ret <4 x float> %r
}

Here's a sampling of the vector codegen for that case using mask+icmp (current behavior) vs. trunc (with this patch):

AVX before:

vcmpleps	%xmm1, %xmm0, %xmm0
vpermilps	$80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps	$250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps	%xmm0, %xmm1, %xmm0
vandps	LCPI0_0(%rip), %xmm0, %xmm0
vxorps	%xmm1, %xmm1, %xmm1
vpcmpeqd	%xmm1, %xmm0, %xmm0
vblendvps	%xmm0, %xmm3, %xmm2, %xmm0

AVX after:

vcmpleps	%xmm1, %xmm0, %xmm0
vpermilps	$80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps	$250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps	%xmm0, %xmm1, %xmm0
vblendvps	%xmm0, %xmm2, %xmm3, %xmm0

AVX512f before:

vcmpleps	%xmm1, %xmm0, %xmm0
vpermilps	$80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps	$250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps	%xmm0, %xmm1, %xmm0
vpbroadcastd	LCPI0_0(%rip), %xmm1 ## xmm1 = [1,1,1,1]
vptestnmd	%zmm1, %zmm0, %k1
vblendmps	%zmm3, %zmm2, %zmm0 {%k1}

AVX512f after:

vcmpleps	%xmm1, %xmm0, %xmm0
vpermilps	$80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1]
vpermilps	$250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3]
vorps	%xmm0, %xmm1, %xmm0
vpslld	$31, %xmm0, %xmm0
vptestmd	%zmm0, %zmm0, %k1
vblendmps	%zmm2, %zmm3, %zmm0 {%k1}

AArch64 before:

fcmge	v0.4s, v1.4s, v0.4s
zip1	v1.4s, v0.4s, v0.4s
zip2	v0.4s, v0.4s, v0.4s
orr	v0.16b, v1.16b, v0.16b
movi	v1.4s, #1
and	v0.16b, v0.16b, v1.16b
cmeq	v0.4s, v0.4s, #0
bsl	v0.16b, v3.16b, v2.16b

AArch64 after:

fcmge	v0.4s, v1.4s, v0.4s
zip1	v1.4s, v0.4s, v0.4s
zip2	v0.4s, v0.4s, v0.4s
orr	v0.16b, v1.16b, v0.16b
bsl	v0.16b, v2.16b, v3.16b

PowerPC-le before:

xvcmpgesp 34, 35, 34
vspltisw 0, 1
vmrglw 3, 2, 2
vmrghw 2, 2, 2
xxlor 0, 35, 34
xxlxor 35, 35, 35
xxland 34, 0, 32
vcmpequw 2, 2, 3
xxsel 34, 36, 37, 34

PowerPC-le after:

xvcmpgesp 34, 35, 34
vmrglw 3, 2, 2
vmrghw 2, 2, 2
xxlor 0, 35, 34
xxsel 34, 37, 36, 0

Diff Detail

Event Timeline

spatel created this revision.Oct 1 2018, 2:02 PM

Herald added a reviewer: javed.absar. · View Herald TranscriptOct 1 2018, 2:02 PM

Herald added subscribers: kristof.beyls, mcrosier. · View Herald Transcript

craig.topper added inline comments.Oct 1 2018, 11:17 PM

lib/Transforms/InstCombine/InstCombineCompares.cpp
1712	Should this be in foldICmpAndConstConst? And it should use the APInts we already extracted?

Ideally, I think we'd do the same for scalars, but I'm afraid of unintended consequences.

I *think* this originates from rL67635.

In D52747#1252143, @lebedev.ri wrote:

Ideally, I think we'd do the same for scalars, but I'm afraid of unintended consequences.

I *think* this originates from rL67635.

Thanks for digging that up! As the codegen examples here show, the icmp variant is not always better for vector codegen at least (we could just fix the backend, but since we can reduce the IR, I figured that's the better option).

My bigger worry is that we're going to expose IR-level holes for trunc patterns with scalar code (and-of-icmps and similar or the example with shift that's now shown here). Those seem less likely for vector code.

lib/Transforms/InstCombine/InstCombineCompares.cpp
1712	There are 2 independent problems here, and I should have put this in a code comment:

1. If we use the already extracted APInt values, we won't handle vectors with undefs because m_APInt doesn't match those (yet). So in cases like this, I've been using the more specific matcher even if it looks redundant. I will add a test that includes undefs in the constant vector values.

2. Depending on where we position this fold, it exposes another canonicalization question because it will affect patterns with shifts like this:

  %shr = ashr <2 x i84> %X, <i84 4, i84 4>
  %and = and <2 x i84> %shr, <i84 1, i84 1>
  %cmp = icmp ne <2 x i84> %and, zeroinitializer

Should that become:

  %m = and <2 x i84> %X, <i84 16, i84 16>
  %cmp = icmp ne <2 x i84> %m, zeroinitializer

or:

  %sh = lshr <2 x i84> %X, <i84 4, i84 4>
  %cmp = trunc <2 x i84> %sh to <2 x i1>

This patch sidesteps that question by allowing the larger pattern to match first, but we could make that a prerequisite step for this patch.

If we use the already extracted APInt values, we won't handle vectors with undefs because m_APInt doesn't match those (yet).
So in cases like this, I've been using the more specific matcher even if it looks redundant.
I will add a test that includes undefs in the constant vector values.

Looking at this closer...as the patch is written currently, we would fail to match if the compare constant (zero) has undefs because we already used m_APInt as a condition to get here in the first place.

spatel mentioned this in rL343595: [InstCombine] add tests with undef elements; NFC.Oct 2 2018, 8:02 AM

Patch updated:

Moved the and+icmp --> trunc transform earlier in visitICmpInst, so we have a better idea about potential regressions.
This required adding 2 trunc folds to avoid known regressions. These transforms have phantom (cosmetic-only) test diffs in apint-shift.ll and icmp.ll::icmp_and_or_lshr_cst_vec(), so we can see that the new code is firing on the patterns with trunc.
The other test diffs are all wins in IR (fewer instructions). Included in that, we see that 1 of the existing icmp transforms that we're replacing doesn't work if the operands are commuted.
The loop vectorizer tests (running with full -O3 in that test file...) produce mixed results in codegen: both tests improve (fewer instructions in the inner loop) on KNL, but regress (more instructions in the inner loop) with AVX2. Note: that diff should've been in the previous rev of this patch, but I missed it.

So we have IR improvements in all cases shown here (but there could be regressions for patterns that have no vector test coverage), codegen improvements for the motivating blendv cases across a range of targets, codegen improvements on larger loop tests on AVX512, but codegen regressions with that same IR on AVX2.

@craig.topper The final codegen from the updated IR in masked_load_store.ll regresses because masked stores don't take advantage of only needing the MSB of the mask vector elements (lots of SIGN_EXTEND_INREG etc.) - X86ISelLowering's combineMaskedStore only handles the PCMPGT case; how tricky would it be to replace it with a general SimplifyDemandedBits call?

In D52747#1257240, @RKSimon wrote:

@craig.topper The final codegen from the updated IR in masked_load_store.ll regresses because masked stores don't take advantage of only needing the MSB of the mask vector elements (lots of SIGN_EXTEND_INREG etc.) - X86ISelLowering's combineMaskedStore only handles the PCMPGT case; how tricky would it be to replace it with a general SimplifyDemandedBits call?

I have a draft of that patch in progress. Let me add some tests, clean it up, and post it. That's the only regression that I'm aware of from this patch, so we can make this patch dependent on that one.

spatel mentioned this in D52964: [x86] use demanded bits to simplify masked store codegen.Oct 6 2018, 7:30 AM

spatel added a parent revision: D52964: [x86] use demanded bits to simplify masked store codegen.Oct 6 2018, 7:32 AM

spatel mentioned this in rL344048: [x86] use demanded bits to simplify masked store codegen.Oct 9 2018, 7:06 AM

Now that D52964 has landed - is there anything stopping this?

In D52747#1258801, @RKSimon wrote:

Now that D52964 has landed - is there anything stopping this?

IMO, no. The known IR regressions are now handled with additional trunc pattern matching, so all changes in IR shown here are improvements. All known codegen regressions have been squashed.

LGTM - thanks

This revision is now accepted and ready to land.Oct 9 2018, 8:02 AM

Closed by commit rL344082: [InstCombine] reverse 'trunc X to <N x i1>' canonicalization (authored by spatel). · Explain WhyOct 9 2018, 2:27 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Transforms/

InstCombine/

InstCombineCasts.cpp

5 lines

InstCombineCompares.cpp

6 lines

test/

Transforms/

InstCombine/

apint-shl-trunc.ll

5 lines

vector-casts.ll

6 lines

Diff 167820

lib/Transforms/InstCombine/InstCombineCasts.cpp

Show First 20 Lines • Show All 700 Lines • ▼ Show 20 Lines	if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)		if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
return nullptr;		return nullptr;

// See if we can simplify any instructions used by the input whose sole		// See if we can simplify any instructions used by the input whose sole
// purpose is to compute bits we don't care about.		// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(CI))		if (SimplifyDemandedInstructionBits(CI))
return &CI;		return &CI;

// Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.		// Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
if (DestTy->getScalarSizeInBits() == 1) {		// TODO: Why is using an icmp preferable to the minimal form?
		if (DestTy->getPrimitiveSizeInBits() == 1) {
Constant *One = ConstantInt::get(SrcTy, 1);		Constant *One = ConstantInt::get(SrcTy, 1);
Src = Builder.CreateAnd(Src, One);		Src = Builder.CreateAnd(Src, One);
Value *Zero = Constant::getNullValue(Src->getType());		Value *Zero = Constant::getNullValue(Src->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);		return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}		}

// FIXME: Maybe combine the next two transforms to handle the no cast case		// FIXME: Maybe combine the next two transforms to handle the no cast case
// more efficiently. Support vector types. Cleanup code by using m_OneUse.		// more efficiently. Support vector types. Cleanup code by using m_OneUse.
▲ Show 20 Lines • Show All 1,680 Lines • Show Last 20 Lines

lib/Transforms/InstCombine/InstCombineCompares.cpp

Show First 20 Lines • Show All 1,702 Lines • ▼ Show 20 Lines	if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
ConstantInt *C2 = cast<ConstantInt>(Y);		ConstantInt *C2 = cast<ConstantInt>(Y);
if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))		if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))
return Res;		return Res;
}		}

if (!Cmp.isEquality())		if (!Cmp.isEquality())
return nullptr;		return nullptr;

		// For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
		// TODO: We canonicalize to the longer form for scalars. Why?
		craig.topperUnsubmitted Not Done Reply Inline Actions Should this be in foldICmpAndConstConst? And it should use the APInts we already extracted? craig.topper: Should this be in foldICmpAndConstConst? And it should use the APInts we already extracted?
		spatelAuthorUnsubmitted Not Done Reply Inline Actions There are 2 independent problems here, and I should have put this in a code comment: If we use the already extracted APInt values, we won't handle vectors with undefs because m_APInt doesn't match those (yet). So in cases like this, I've been using the more specific matcher even if it looks redundant. I will add a test that includes undefs in the constant vector values. Depending on where we position this fold, it exposes another canonicalization question because it will affect patterns with shifts like this: %shr = ashr <2 x i84> %X, <i84 4, i84 4> %and = and <2 x i84> %shr, <i84 1, i84 1> %cmp = icmp ne <2 x i84> %and, zeroinitializer Should that become: %m = and <2 x i84> %X, <i84 16, i84 16> %cmp = icmp ne <2 x i84> %m, zeroinitializer or: %sh = lshr <2 x i84> %X, <i84 4, i84 4> %cmp = trunc <2 x i84> %sh to <2 x i1> This patch sidesteps that question by allowing the larger pattern to match first, but we could make that a prerequisite step for this patch. spatel: There are 2 independent problems here, and I should have put this in a code comment: 1. If we…
		if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
		match(Cmp.getOperand(1), m_Zero()) && match(And->getOperand(1), m_One()))
		return CastInst::CreateTruncOrBitCast(And->getOperand(0), Cmp.getType());

// X & -C == -C -> X > u ~C		// X & -C == -C -> X > u ~C
// X & -C != -C -> X <= u ~C		// X & -C != -C -> X <= u ~C
// iff C is a power of 2		// iff C is a power of 2
if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {		if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {
auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT		auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT
: CmpInst::ICMP_ULE;		: CmpInst::ICMP_ULE;
return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));		return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
}		}
▲ Show 20 Lines • Show All 3,766 Lines • Show Last 20 Lines

test/Transforms/InstCombine/apint-shl-trunc.ll

	Show All 21 Lines
	;			;
	%B = lshr i799 %X, %A			%B = lshr i799 %X, %A
	%D = trunc i799 %B to i1			%D = trunc i799 %B to i1
	ret i1 %D			ret i1 %D
	}			}

	define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {			define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {
	; CHECK-LABEL: @test0vec(			; CHECK-LABEL: @test0vec(
	; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i39> <i39 1, i39 1>, [[A:%.*]]			; CHECK-NEXT: [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]]
	; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]]			; CHECK-NEXT: [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1>
	; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer
	; CHECK-NEXT: ret <2 x i1> [[D]]			; CHECK-NEXT: ret <2 x i1> [[D]]
	;			;
	%B = lshr <2 x i39> %X, %A			%B = lshr <2 x i39> %X, %A
	%D = trunc <2 x i39> %B to <2 x i1>			%D = trunc <2 x i39> %B to <2 x i1>
	ret <2 x i1> %D			ret <2 x i1> %D
	}			}

test/Transforms/InstCombine/vector-casts.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt < %s -instcombine -S | FileCheck %s			; RUN: opt < %s -instcombine -S | FileCheck %s

	; This turns into a&1 != 0			; This turns into a&1 != 0

	define <2 x i1> @trunc(<2 x i64> %a) {			define <2 x i1> @trunc(<2 x i64> %a) {
	; CHECK-LABEL: @trunc(			; CHECK-LABEL: @trunc(
	; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>			; CHECK-NEXT: [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
	; CHECK-NEXT: [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer
	; CHECK-NEXT: ret <2 x i1> [[T]]			; CHECK-NEXT: ret <2 x i1> [[T]]
	;			;
	%t = trunc <2 x i64> %a to <2 x i1>			%t = trunc <2 x i64> %a to <2 x i1>
	ret <2 x i1> %t			ret <2 x i1> %t
	}			}

	define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {			define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) {
	; CHECK-LABEL: @and_cmp_is_trunc(			; CHECK-LABEL: @and_cmp_is_trunc(
	; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], <i64 1, i64 1>			; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1>
	; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer
	; CHECK-NEXT: ret <2 x i1> [[R]]			; CHECK-NEXT: ret <2 x i1> [[R]]
	;			;
	%t = and <2 x i64> %a, <i64 1, i64 1>			%t = and <2 x i64> %a, <i64 1, i64 1>
	%r = icmp ne <2 x i64> %t, zeroinitializer			%r = icmp ne <2 x i64> %t, zeroinitializer
	ret <2 x i1> %r			ret <2 x i1> %r
	}			}

	; The ashr turns into an lshr.			; The ashr turns into an lshr.
	▲ Show 20 Lines • Show All 327 Lines • Show Last 20 Lines