This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine][x86] Constant fold psll intrinsics.
ClosedPublic

Authored by Bigcheese on Apr 11 2014, 11:24 AM.

Download Raw Diff

Details

Reviewers

grosbach
Bigcheese

Summary

The psll{,i}{w,d,q} instruction is almost a vector shl; however, it has the defined
behavior of evaluating to 0 for shift counts greater than or equal to the bit width of the elements.
We can’t currently represent this directly in llvm without generating extra
code, but we can handle the constant case.

This excludes avx512 as I don't have hardware to verify. It excludes _dq
variants because they are represented in the IR as <{2,4} x i64> when it's
actually a byte shift of the entire i{128,256}.

This also excludes _dq_bs as they aren't at all supported by the backend.
There are also no corresponding instructions in the ISA. I have no idea why
they exist...

Diff Detail

Event Timeline

Seems reasonable. Shouldn't we do the same thing for psllw, psllq, vpsll[wdq], though?

I agree with Jim.
You can also do something similar to simplify the packed logical shift right instructions (psrlw/psrld/psrlq/vpsrlw/vpsrld/vpsrlq).
SSE2/AVX2 packed logical shift right instructions also evaluate to 0 if the shift count is greater than or equal to the element size.

Yes, I'll add the others. Seems reasonable to have them all in the same commit.

Bigcheese updated this revision to Unknown Object (????).Apr 15 2014, 5:48 PM

Excellent.

Nadav commented on a separate patch that he's interested in these sorts of things being target DAG combines rather than InstCombines. Might want to check with him to get a bit more info on his thoughts about that.

Bigcheese accepted this revision.Apr 23 2014, 6:09 PM

Bigcheese added a reviewer: Bigcheese.

This revision is now accepted and ready to land.Apr 23 2014, 6:09 PM

Committed as r207058. Approved by Nadav.

Revision Contents

Path

Size

lib/

Transforms/

InstCombine/

InstCombineCalls.cpp

41 lines

test/

Transforms/

InstCombine/

vec_demanded_elts.ll

108 lines

Diff 8549

lib/Transforms/InstCombine/InstCombineCalls.cpp

Show First 20 Lines • Show All 548 Lines • ▼ Show 20 Lines	case Intrinsic::x86_sse2_cvttsd2si64: {
if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0),		if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0),
DemandedElts, UndefElts)) {		DemandedElts, UndefElts)) {
II->setArgOperand(0, V);		II->setArgOperand(0, V);
return II;		return II;
}		}
break;		break;
}		}

		// Constant fold <A x Bi> << Ci.
		// FIXME: We don't handle _dq because it's a shift of an i128, but is
		// represented in the IR as <2 x i64>. A per element shift is wrong.
		case Intrinsic::x86_sse2_psll_d:
		case Intrinsic::x86_sse2_psll_q:
		case Intrinsic::x86_sse2_psll_w:
		case Intrinsic::x86_sse2_pslli_d:
		case Intrinsic::x86_sse2_pslli_q:
		case Intrinsic::x86_sse2_pslli_w:
		case Intrinsic::x86_avx2_psll_d:
		case Intrinsic::x86_avx2_psll_q:
		case Intrinsic::x86_avx2_psll_w:
		case Intrinsic::x86_avx2_pslli_d:
		case Intrinsic::x86_avx2_pslli_q:
		case Intrinsic::x86_avx2_pslli_w: {
		  // Simplify if the count is constant: fold to zero when the count is
		  // greater than or equal to the element bit width, otherwise to a shl.
		  Value *CountArg = II->getArgOperand(1);
		  auto CDV = dyn_cast<ConstantDataVector>(CountArg);
		  auto CInt = dyn_cast<ConstantInt>(CountArg);
		  if (!CDV && !CInt)
		    break;

		  APInt Count(64, 0);
		  if (CDV) {
		    // The vector-count forms take the shift amount from the low 64 bits of
		    // the count operand, not just from element 0. Reassemble those 64 bits
		    // from however many count elements they span, so that a count such as
		    // <8 x i16> <0, 1, ...> is treated as 65536 rather than 0.
		    unsigned CountEltBits = CDV->getElementType()->getPrimitiveSizeInBits();
		    if (CountEltBits == 0 || 64 % CountEltBits != 0)
		      break;
		    unsigned NumSubElts = 64 / CountEltBits;
		    if (CDV->getNumElements() < NumSubElts)
		      break;
		    for (unsigned i = 0; i != NumSubElts; ++i) {
		      auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(i));
		      // Element i supplies bits [i * CountEltBits, (i + 1) * CountEltBits).
		      Count |= SubElt->getValue().zextOrTrunc(64).shl(i * CountEltBits);
		    }
		  } else {
		    // Immediate forms pass the count as a scalar integer.
		    Count = CInt->getValue().zextOrTrunc(64);
		  }

		  auto Vec = II->getArgOperand(0);
		  auto VT = cast<VectorType>(Vec->getType());
		  unsigned EltBits = VT->getElementType()->getPrimitiveSizeInBits();

		  // Out-of-range counts produce an all-zero vector.
		  if (Count.uge(EltBits))
		    return ReplaceInstUsesWith(
		        CI, ConstantAggregateZero::get(Vec->getType()));

		  // In-range counts become a shl by a splat of the count, using the
		  // element type of the first operand.
		  unsigned VWidth = VT->getNumElements();
		  auto VTCI = ConstantInt::get(VT->getElementType(), Count.getZExtValue());
		  return BinaryOperator::CreateShl(
		      Vec, Builder->CreateVectorSplat(VWidth, VTCI));
		}

case Intrinsic::x86_sse41_pmovsxbw:		case Intrinsic::x86_sse41_pmovsxbw:
case Intrinsic::x86_sse41_pmovsxwd:		case Intrinsic::x86_sse41_pmovsxwd:
case Intrinsic::x86_sse41_pmovsxdq:		case Intrinsic::x86_sse41_pmovsxdq:
case Intrinsic::x86_sse41_pmovzxbw:		case Intrinsic::x86_sse41_pmovzxbw:
case Intrinsic::x86_sse41_pmovzxwd:		case Intrinsic::x86_sse41_pmovzxwd:
case Intrinsic::x86_sse41_pmovzxdq: {		case Intrinsic::x86_sse41_pmovzxdq: {
// pmov{s\|z}x ignores the upper half of their input vectors.		// pmov{s\|z}x ignores the upper half of their input vectors.
▲ Show 20 Lines • Show All 818 Lines • Show Last 20 Lines

test/Transforms/InstCombine/vec_demanded_elts.ll

; RUN: opt < %s -instcombine -S \| FileCheck %s		; RUN: opt < %s -instcombine -S \| FileCheck %s
		target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define i16 @test1(float %f) {		define i16 @test1(float %f) {
entry:		entry:
; CHECK-LABEL: @test1(		; CHECK-LABEL: @test1(
; CHECK: fmul float		; CHECK: fmul float
; CHECK-NOT: insertelement {{.*}} 0.00		; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul		; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub		; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
▲ Show 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
%b0 = insertelement <4 x float> undef, float %g, i32 0		%b0 = insertelement <4 x float> undef, float %g, i32 0
%b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1		%b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
%b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2		%b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
%b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3		%b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
%ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3		%ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
ret <4 x float> %ret		ret <4 x float> %ret
}		}

		; Chains all six SSE2 left-shift intrinsics (psll.w/d/q with a vector count,
		; pslli.w/d/q with an i32 count) on a constant input with a shift count of 1.
		; Six in-range shifts by one multiply every lane by 64 without overflow, so
		; the whole chain is expected to fold to a single constant return value.
		define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
		%S = bitcast i32 1 to i32
		%1 = zext i32 %S to i64
		%2 = insertelement <2 x i64> undef, i64 %1, i32 0
		%3 = insertelement <2 x i64> %2, i64 0, i32 1
		%4 = bitcast <2 x i64> %3 to <8 x i16>
		%5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
		%6 = bitcast <8 x i16> %5 to <4 x i32>
		%7 = bitcast <2 x i64> %3 to <4 x i32>
		%8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
		%9 = bitcast <4 x i32> %8 to <2 x i64>
		%10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
		%11 = bitcast <2 x i64> %10 to <8 x i16>
		%12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
		%13 = bitcast <8 x i16> %12 to <4 x i32>
		%14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
		%15 = bitcast <4 x i32> %14 to <2 x i64>
		%16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
		ret <2 x i64> %16

		; CHECK-LABEL: @test_sse2_1(
		; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
		}

		; AVX2 counterpart of the in-range case: chains psll.w/d/q (128-bit vector
		; count) and pslli.w/d/q (i32 count) on a constant 256-bit input with a shift
		; count of 1. Six shifts by one multiply each i64 lane (1, 2, 3, 4) by 64, so
		; the chain is expected to fold to a single constant return value.
		define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
		%S = bitcast i32 1 to i32
		%1 = zext i32 %S to i64
		%2 = insertelement <2 x i64> undef, i64 %1, i32 0
		%3 = insertelement <2 x i64> %2, i64 0, i32 1
		%4 = bitcast <2 x i64> %3 to <8 x i16>
		%5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
		%6 = bitcast <16 x i16> %5 to <8 x i32>
		%7 = bitcast <2 x i64> %3 to <4 x i32>
		%8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
		%9 = bitcast <8 x i32> %8 to <4 x i64>
		%10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
		%11 = bitcast <4 x i64> %10 to <16 x i16>
		%12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
		%13 = bitcast <16 x i16> %12 to <8 x i32>
		%14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
		%15 = bitcast <8 x i32> %14 to <4 x i64>
		%16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
		ret <4 x i64> %16
		; CHECK-LABEL: @test_avx2_1(
		; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
		}

		; Out-of-range case for SSE2: the same chain of psll.w/d/q and pslli.w/d/q
		; calls, but with a shift count of 128, which is at least as large as every
		; element width used here (16, 32 and 64 bits). The result is expected to
		; fold to an all-zero vector.
		define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
		%S = bitcast i32 128 to i32
		%1 = zext i32 %S to i64
		%2 = insertelement <2 x i64> undef, i64 %1, i32 0
		%3 = insertelement <2 x i64> %2, i64 0, i32 1
		%4 = bitcast <2 x i64> %3 to <8 x i16>
		%5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
		%6 = bitcast <8 x i16> %5 to <4 x i32>
		%7 = bitcast <2 x i64> %3 to <4 x i32>
		%8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
		%9 = bitcast <4 x i32> %8 to <2 x i64>
		%10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
		%11 = bitcast <2 x i64> %10 to <8 x i16>
		%12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
		%13 = bitcast <8 x i16> %12 to <4 x i32>
		%14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
		%15 = bitcast <4 x i32> %14 to <2 x i64>
		%16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
		ret <2 x i64> %16

		; CHECK-LABEL: @test_sse2_0(
		; CHECK: ret <2 x i64> zeroinitializer
		}

		; Out-of-range case for AVX2: the same chain of psll.w/d/q and pslli.w/d/q
		; calls on a 256-bit input, but with a shift count of 128, which is at least
		; as large as every element width used here (16, 32 and 64 bits). The result
		; is expected to fold to an all-zero vector.
		define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
		%S = bitcast i32 128 to i32
		%1 = zext i32 %S to i64
		%2 = insertelement <2 x i64> undef, i64 %1, i32 0
		%3 = insertelement <2 x i64> %2, i64 0, i32 1
		%4 = bitcast <2 x i64> %3 to <8 x i16>
		%5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
		%6 = bitcast <16 x i16> %5 to <8 x i32>
		%7 = bitcast <2 x i64> %3 to <4 x i32>
		%8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
		%9 = bitcast <8 x i32> %8 to <4 x i64>
		%10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
		%11 = bitcast <4 x i64> %10 to <16 x i16>
		%12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
		%13 = bitcast <16 x i16> %12 to <8 x i32>
		%14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
		%15 = bitcast <8 x i32> %14 to <4 x i64>
		%16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
		ret <4 x i64> %16
		; CHECK-LABEL: @test_avx2_0(
		; CHECK: ret <4 x i64> zeroinitializer
		}

		declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
		declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
		declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
		declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
		declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
		declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
		declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
		declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
		declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
		declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
		declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
		declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1

		attributes #1 = { nounwind readnone }