This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] Remove check for sext of vector icmp from shouldOptimizeCast
ClosedPublic

Authored by craig.topper on Aug 1 2017, 11:53 PM.

Details

Summary

It looks like for 'and' and 'or' we end up performing at least some of the transformations this check is blocking in a roundabout way anyway.

For 'and sext(cmp1), sext(cmp2)' we end up later turning it into 'select cmp1, sext(cmp2), 0'. Then we optimize that back to 'sext (and cmp1, cmp2)'. This is the same result we would have gotten if shouldOptimizeCast hadn't blocked the fold. We do something analogous for 'or'.
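To make that concrete, here is a sketch of the 'and' case on a vector compare like the ones this patch affects; the function names are made up for illustration:

; Input pattern: bitwise 'and' of two sign-extended vector compares.
define <4 x i32> @and_sext_cmps(<4 x i32> %a, <4 x i32> %b) {
  %cmp1 = icmp sgt <4 x i32> %a, zeroinitializer
  %cmp2 = icmp sgt <4 x i32> %b, zeroinitializer
  %sext1 = sext <4 x i1> %cmp1 to <4 x i32>
  %sext2 = sext <4 x i1> %cmp2 to <4 x i32>
  %and = and <4 x i32> %sext1, %sext2
  ret <4 x i32> %and
}

; End result of the existing roundabout folds, and of the direct fold with this patch:
define <4 x i32> @and_sext_cmps_folded(<4 x i32> %a, <4 x i32> %b) {
  %cmp1 = icmp sgt <4 x i32> %a, zeroinitializer
  %cmp2 = icmp sgt <4 x i32> %b, zeroinitializer
  %and1 = and <4 x i1> %cmp1, %cmp2
  %r = sext <4 x i1> %and1 to <4 x i32>
  ret <4 x i32> %r
}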

With this patch we allow that transformation to happen directly in foldCastedBitwiseLogic, and we now support the same thing for 'xor'. This definitely opens up other cases, but since we already worked around the check for some of them, hopefully that's OK.
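For 'xor', which didn't have the roundabout select path, a minimal before/after sketch (again with made-up names):

define <4 x i32> @xor_sext_cmps(<4 x i32> %a, <4 x i32> %b) {
  %cmp1 = icmp sgt <4 x i32> %a, zeroinitializer
  %cmp2 = icmp sgt <4 x i32> %b, zeroinitializer
  %sext1 = sext <4 x i1> %cmp1 to <4 x i32>
  %sext2 = sext <4 x i1> %cmp2 to <4 x i32>
  %xor = xor <4 x i32> %sext1, %sext2
  ret <4 x i32> %xor
}

; With this patch the same fold should apply directly here, producing roughly:
;   %xor1 = xor <4 x i1> %cmp1, %cmp2
;   %r = sext <4 x i1> %xor1 to <4 x i32>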

Diff Detail

Repository
rL LLVM

Event Timeline

craig.topper created this revision.Aug 1 2017, 11:53 PM
spatel edited edge metadata.Aug 2 2017, 9:19 AM
spatel added subscribers: efriedma, mcrosier, t.p.northover.

I pushed 'test7' through llc for x86 and PPC64LE with no problems. But then I tried AArch64 and ARM, and they went nuts whether the logic op was an 'xor' or an 'and':

define <2 x i64> @test7(<4 x float> %a, <4 x float> %b) {
  %cmp = fcmp ult <4 x float> %a, zeroinitializer
  %cmp4 = fcmp ult <4 x float> %b, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %sext5 = sext <4 x i1> %cmp4 to <4 x i32>
  %and = and <4 x i32> %sext, %sext5
  %conv = bitcast <4 x i32> %and to <2 x i64>
  ret <2 x i64> %conv
}

define <2 x i64> @test7_better(<4 x float> %a, <4 x float> %b) {
  %cmp = fcmp ult <4 x float> %a, zeroinitializer
  %cmp4 = fcmp ult <4 x float> %b, zeroinitializer
  %and1 = and <4 x i1> %cmp, %cmp4
  %and = sext <4 x i1> %and1 to <4 x i32>
  %conv = bitcast <4 x i32> %and to <2 x i64>
  ret <2 x i64> %conv
}

$ ./llc -o - vcmp.ll -mtriple=aarch64

test7:                           // @test7
	fcmge	v0.4s, v0.4s, #0.0
	mvn	 v0.16b, v0.16b
	fcmge	v1.4s, v1.4s, #0.0
	bic	v0.16b, v0.16b, v1.16b
	ret
test7_better:                           // @test7_better
// BB#0:
	fcmge	v0.4s, v0.4s, #0.0
	fcmge	v1.4s, v1.4s, #0.0
	mvn	 v0.16b, v0.16b
	mvn	 v1.16b, v1.16b
	xtn	v0.4h, v0.4s
	xtn	v1.4h, v1.4s
	and	v0.8b, v0.8b, v1.8b
	ushll	v0.4s, v0.4h, #0
	shl	v0.4s, v0.4s, #31
	sshr	v0.4s, v0.4s, #31
	ret

Given that the more common problem patterns already exist independently of this patch, I would agree to proceed. But let's ping people with an ARM stake for their opinions - @t.p.northover @efriedma @mcrosier ?

Our handling of i1 masks in SelectionDAG is generally terrible.

x86 doesn't have this particular problem for <4 x float> because it doesn't have 64-bit vectors, and it doesn't have the problem for <8 x float> when AVX is enabled because a target-specific DAGCombine (WidenMaskArithmetic) works around the issue. But it does show up in other cases. For example, try the following on x86 without AVX:

define <8 x i32> @testa(<8 x float> %a, <8 x float> %b) {
  %cmp = fcmp ult <8 x float> %a, zeroinitializer
  %cmp4 = fcmp ult <8 x float> %b, zeroinitializer
  %and1 = and <8 x i1> %cmp, %cmp4
  %and = sext <8 x i1> %and1 to <8 x i32>
  ret <8 x i32> %and
}

Or the following with AVX (but not AVX512):

define <16 x i32> @testb(<16 x float> %a, <16 x float> %b) {
  %cmp = fcmp ult <16 x float> %a, zeroinitializer
  %cmp4 = fcmp ult <16 x float> %b, zeroinitializer
  %and1 = and <16 x i1> %cmp, %cmp4
  %and = sext <16 x i1> %and1 to <16 x i32>
  ret <16 x i32> %and
}
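
For reference, one way to push these through llc; the filename and the exact feature strings are my guess at reproducing the no-AVX and AVX-without-AVX-512 configurations:

$ llc -o - wide-cmp.ll -mtriple=x86_64 -mattr=-avx
$ llc -o - wide-cmp.ll -mtriple=x86_64 -mattr=+avx,-avx512f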

That said, I don't think it makes sense to block this patch on that issue; it's an existing problem.

spatel accepted this revision.Aug 22 2017, 4:33 PM

LGTM.

I filed an AArch64 bug to track the backend problem:
https://bugs.llvm.org/show_bug.cgi?id=34290

This revision is now accepted and ready to land.Aug 22 2017, 4:33 PM
This revision was automatically updated to reflect the committed changes.