This is an archive of the discontinued LLVM Phabricator instance.

Differential D11938

[InstCombine] SSE/AVX vector shifts demanded shift amount bits
ClosedPublic

Authored by RKSimon on Aug 11 2015, 6:38 AM.

Download Raw Diff

Details

Reviewers

majnemer
andreadb
mkuper

Commits

rGbecd5e8abdc4: [InstCombine] SSE/AVX vector shifts demanded shift amount bits
rL244872: [InstCombine] SSE/AVX vector shifts demanded shift amount bits

Summary

Most SSE/AVX (non-constant) vector shift instructions only use the lower 64 bits of the 128-bit shift amount vector operand; this patch calls SimplifyDemandedVectorElts to optimize for this.

I had to refactor some of my recent InstCombiner work on the vector shifts to avoid quite a bit of duplicate code. It means that SimplifyX86immshift now (re)decodes the type of shift.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 31803. Aug 11 2015, 6:38 AM

RKSimon retitled this revision to [InstCombine] SSE/AVX vector shifts demanded shift amount bits.

RKSimon updated this object.

RKSimon added reviewers: andreadb, mkuper, majnemer.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: llvm-commits.

LGTM, with a nit, and a question about unrelated code. :-)

lib/Transforms/InstCombine/InstCombineCalls.cpp
259 ↗	(On Diff #31803)	Could this code also be replaced by a call to SimplifyDemandedVectorElts(), to remove duplication? Or does it do something smarter?
876 ↗	(On Diff #31803)	all -> only?

This revision is now accepted and ready to land. Aug 12 2015, 5:35 AM

Thanks Michael.

lib/Transforms/InstCombine/InstCombineCalls.cpp
259 ↗	(On Diff #31803)	AFAICT SimplifyDemandedVectorElts doesn't help us here - what we'd really need is a helper function that bitcasts a ConstantDataVector and gives us the raw APInt values. There is similar code in ConstantFolding.cpp (and DAGCombiner::ConstantFoldBITCASTofBUILD_VECTOR) so it isn't out of the question but beyond the scope of this patch.
876 ↗	(On Diff #31803)	OK.

mkuper added inline comments. Aug 12 2015, 7:37 AM

lib/Transforms/InstCombine/InstCombineCalls.cpp
259 ↗	(On Diff #31803)	Of course, I definitely didn't mean that should be part of this patch. (Also, I misread the code, it was a silly question to begin with, sorry for the noise.)

Closed by commit rL244872: [InstCombine] SSE/AVX vector shifts demanded shift amount bits (authored by RKSimon). · Explain Why · Aug 13 2015, 12:40 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Transforms/

InstCombine/

InstCombineCalls.cpp

111 lines

test/

Transforms/

InstCombine/

x86-vector-shifts.ll

148 lines

Diff 32033

llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp

Show First 20 Lines • Show All 192 Lines • ▼ Show 20 Lines	if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
MI->setLength(Constant::getNullValue(LenC->getType()));		MI->setLength(Constant::getNullValue(LenC->getType()));
return MI;		return MI;
}		}

return nullptr;		return nullptr;
}		}

static Value *SimplifyX86immshift(const IntrinsicInst &II,		static Value *SimplifyX86immshift(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder,		InstCombiner::BuilderTy &Builder) {
bool LogicalShift, bool ShiftLeft) {		bool LogicalShift = false;
		bool ShiftLeft = false;

		switch (II.getIntrinsicID()) {
		default:
		return nullptr;
		case Intrinsic::x86_sse2_psra_d:
		case Intrinsic::x86_sse2_psra_w:
		case Intrinsic::x86_sse2_psrai_d:
		case Intrinsic::x86_sse2_psrai_w:
		case Intrinsic::x86_avx2_psra_d:
		case Intrinsic::x86_avx2_psra_w:
		case Intrinsic::x86_avx2_psrai_d:
		case Intrinsic::x86_avx2_psrai_w:
		LogicalShift = false; ShiftLeft = false;
		break;
		case Intrinsic::x86_sse2_psrl_d:
		case Intrinsic::x86_sse2_psrl_q:
		case Intrinsic::x86_sse2_psrl_w:
		case Intrinsic::x86_sse2_psrli_d:
		case Intrinsic::x86_sse2_psrli_q:
		case Intrinsic::x86_sse2_psrli_w:
		case Intrinsic::x86_avx2_psrl_d:
		case Intrinsic::x86_avx2_psrl_q:
		case Intrinsic::x86_avx2_psrl_w:
		case Intrinsic::x86_avx2_psrli_d:
		case Intrinsic::x86_avx2_psrli_q:
		case Intrinsic::x86_avx2_psrli_w:
		LogicalShift = true; ShiftLeft = false;
		break;
		case Intrinsic::x86_sse2_psll_d:
		case Intrinsic::x86_sse2_psll_q:
		case Intrinsic::x86_sse2_psll_w:
		case Intrinsic::x86_sse2_pslli_d:
		case Intrinsic::x86_sse2_pslli_q:
		case Intrinsic::x86_sse2_pslli_w:
		case Intrinsic::x86_avx2_psll_d:
		case Intrinsic::x86_avx2_psll_q:
		case Intrinsic::x86_avx2_psll_w:
		case Intrinsic::x86_avx2_pslli_d:
		case Intrinsic::x86_avx2_pslli_q:
		case Intrinsic::x86_avx2_pslli_w:
		LogicalShift = true; ShiftLeft = true;
		break;
		}
assert((LogicalShift \|\| !ShiftLeft) && "Only logical shifts can shift left");		assert((LogicalShift \|\| !ShiftLeft) && "Only logical shifts can shift left");

// Simplify if count is constant.		// Simplify if count is constant.
auto Arg1 = II.getArgOperand(1);		auto Arg1 = II.getArgOperand(1);
auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);		auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
auto CDV = dyn_cast<ConstantDataVector>(Arg1);		auto CDV = dyn_cast<ConstantDataVector>(Arg1);
auto CInt = dyn_cast<ConstantInt>(Arg1);		auto CInt = dyn_cast<ConstantInt>(Arg1);
if (!CAZ && !CDV && !CInt)		if (!CAZ && !CDV && !CInt)
▲ Show 20 Lines • Show All 572 Lines • ▼ Show 20 Lines	if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0),
DemandedElts, UndefElts)) {		DemandedElts, UndefElts)) {
II->setArgOperand(0, V);		II->setArgOperand(0, V);
return II;		return II;
}		}
break;		break;
}		}

// Constant fold ashr( <A x Bi>, Ci ).		// Constant fold ashr( <A x Bi>, Ci ).
case Intrinsic::x86_sse2_psra_d:		// Constant fold lshr( <A x Bi>, Ci ).
case Intrinsic::x86_sse2_psra_w:		// Constant fold shl( <A x Bi>, Ci ).
case Intrinsic::x86_sse2_psrai_d:		case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_sse2_psrai_w:		case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_avx2_psra_d:
case Intrinsic::x86_avx2_psra_w:
case Intrinsic::x86_avx2_psrai_d:		case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:		case Intrinsic::x86_avx2_psrai_w:
if (Value V = SimplifyX86immshift(II, *Builder, false, false))
return ReplaceInstUsesWith(*II, V);
break;

// Constant fold lshr( <A x Bi>, Ci ).
case Intrinsic::x86_sse2_psrl_d:
case Intrinsic::x86_sse2_psrl_q:
case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_sse2_psrli_d:		case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:		case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_sse2_psrli_w:		case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_avx2_psrl_d:
case Intrinsic::x86_avx2_psrl_q:
case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx2_psrli_d:		case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:		case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:		case Intrinsic::x86_avx2_psrli_w:
if (Value V = SimplifyX86immshift(II, *Builder, true, false))		case Intrinsic::x86_sse2_pslli_d:
		case Intrinsic::x86_sse2_pslli_q:
		case Intrinsic::x86_sse2_pslli_w:
		case Intrinsic::x86_avx2_pslli_d:
		case Intrinsic::x86_avx2_pslli_q:
		case Intrinsic::x86_avx2_pslli_w:
		if (Value V = SimplifyX86immshift(II, *Builder))
return ReplaceInstUsesWith(*II, V);		return ReplaceInstUsesWith(*II, V);
break;		break;

// Constant fold shl( <A x Bi>, Ci ).		case Intrinsic::x86_sse2_psra_d:
		case Intrinsic::x86_sse2_psra_w:
		case Intrinsic::x86_avx2_psra_d:
		case Intrinsic::x86_avx2_psra_w:
		case Intrinsic::x86_sse2_psrl_d:
		case Intrinsic::x86_sse2_psrl_q:
		case Intrinsic::x86_sse2_psrl_w:
		case Intrinsic::x86_avx2_psrl_d:
		case Intrinsic::x86_avx2_psrl_q:
		case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_sse2_psll_d:		case Intrinsic::x86_sse2_psll_d:
case Intrinsic::x86_sse2_psll_q:		case Intrinsic::x86_sse2_psll_q:
case Intrinsic::x86_sse2_psll_w:		case Intrinsic::x86_sse2_psll_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_avx2_psll_d:		case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:		case Intrinsic::x86_avx2_psll_q:
case Intrinsic::x86_avx2_psll_w:		case Intrinsic::x86_avx2_psll_w: {
case Intrinsic::x86_avx2_pslli_d:		if (Value V = SimplifyX86immshift(II, *Builder))
case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
if (Value V = SimplifyX86immshift(II, *Builder, true, true))
return ReplaceInstUsesWith(*II, V);		return ReplaceInstUsesWith(*II, V);

		// SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
		// operand to compute the shift amount.
		auto ShiftAmt = II->getArgOperand(1);
		auto ShiftType = cast<VectorType>(ShiftAmt->getType());
		assert(ShiftType->getPrimitiveSizeInBits() == 128 &&
		"Unexpected packed shift size");
		unsigned VWidth = ShiftType->getNumElements();

		APInt DemandedElts = APInt::getLowBitsSet(VWidth, VWidth / 2);
		APInt UndefElts(VWidth, 0);
		if (Value *V =
		SimplifyDemandedVectorElts(ShiftAmt, DemandedElts, UndefElts)) {
		II->setArgOperand(1, V);
		return II;
		}
break;		break;
		}

case Intrinsic::x86_sse41_pmovsxbd:		case Intrinsic::x86_sse41_pmovsxbd:
case Intrinsic::x86_sse41_pmovsxbq:		case Intrinsic::x86_sse41_pmovsxbq:
case Intrinsic::x86_sse41_pmovsxbw:		case Intrinsic::x86_sse41_pmovsxbw:
case Intrinsic::x86_sse41_pmovsxdq:		case Intrinsic::x86_sse41_pmovsxdq:
case Intrinsic::x86_sse41_pmovsxwd:		case Intrinsic::x86_sse41_pmovsxwd:
case Intrinsic::x86_sse41_pmovsxwq:		case Intrinsic::x86_sse41_pmovsxwq:
case Intrinsic::x86_avx2_pmovsxbd:		case Intrinsic::x86_avx2_pmovsxbd:
▲ Show 20 Lines • Show All 1,175 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll

	Show First 20 Lines • Show All 820 Lines • ▼ Show 20 Lines
	define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) nounwind readnone uwtable {			define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) nounwind readnone uwtable {
	; CHECK-LABEL: @avx2_psll_q_64			; CHECK-LABEL: @avx2_psll_q_64
	; CHECK-NEXT: ret <4 x i64> zeroinitializer			; CHECK-NEXT: ret <4 x i64> zeroinitializer
	%1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> <i64 64, i64 9999>)			%1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> <i64 64, i64 9999>)
	ret <4 x i64> %1			ret <4 x i64> %1
	}			}

	;			;
				; Vector Demanded Bits
				;

				define <8 x i16> @sse2_psra_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @sse2_psra_w_var
				; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %a)
				; CHECK-NEXT: ret <8 x i16> %1
				%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
				%2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1)
				ret <8 x i16> %2
				}

				define <4 x i32> @sse2_psra_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @sse2_psra_d_var
				; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %a)
				; CHECK-NEXT: ret <4 x i32> %1
				%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> %1)
				ret <4 x i32> %2
				}

				define <16 x i16> @avx2_psra_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @avx2_psra_w_var
				; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %a)
				; CHECK-NEXT: ret <16 x i16> %1
				%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
				%2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> %1)
				ret <16 x i16> %2
				}

				define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @avx2_psra_d_var
				; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %a)
				; CHECK-NEXT: ret <8 x i32> %1
				%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1)
				ret <8 x i32> %2
				}

				define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @sse2_psrl_w_var
				; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a)
				; CHECK-NEXT: ret <8 x i16> %1
				%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
				%2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %1)
				ret <8 x i16> %2
				}

				define <4 x i32> @sse2_psrl_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @sse2_psrl_d_var
				; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %a)
				; CHECK-NEXT: ret <4 x i32> %1
				%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1)
				ret <4 x i32> %2
				}

				define <2 x i64> @sse2_psrl_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @sse2_psrl_q_var
				; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %a)
				; CHECK-NEXT: ret <2 x i64> %1
				%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
				%2 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %v, <2 x i64> %1)
				ret <2 x i64> %2
				}

				define <16 x i16> @avx2_psrl_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @avx2_psrl_w_var
				; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %a)
				; CHECK-NEXT: ret <16 x i16> %1
				%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
				%2 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %v, <8 x i16> %1)
				ret <16 x i16> %2
				}

				define <8 x i32> @avx2_psrl_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @avx2_psrl_d_var
				; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %a)
				; CHECK-NEXT: ret <8 x i32> %1
				%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%2 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %v, <4 x i32> %1)
				ret <8 x i32> %2
				}

				define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @avx2_psrl_q_var
				; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %a)
				; CHECK-NEXT: ret <4 x i64> %1
				%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
				%2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1)
				ret <4 x i64> %2
				}

				define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @sse2_psll_w_var
				; CHECK-NEXT: %1 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a)
				; CHECK-NEXT: ret <8 x i16> %1
				%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
				%2 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %1)
				ret <8 x i16> %2
				}

				define <4 x i32> @sse2_psll_d_var(<4 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @sse2_psll_d_var
				; CHECK-NEXT: %1 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %a)
				; CHECK-NEXT: ret <4 x i32> %1
				%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%2 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %v, <4 x i32> %1)
				ret <4 x i32> %2
				}

				define <2 x i64> @sse2_psll_q_var(<2 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @sse2_psll_q_var
				; CHECK-NEXT: %1 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %a)
				; CHECK-NEXT: ret <2 x i64> %1
				%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
				%2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1)
				ret <2 x i64> %2
				}

				define <16 x i16> @avx2_psll_w_var(<16 x i16> %v, <8 x i16> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @avx2_psll_w_var
				; CHECK-NEXT: %1 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %a)
				; CHECK-NEXT: ret <16 x i16> %1
				%1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
				%2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1)
				ret <16 x i16> %2
				}

				define <8 x i32> @avx2_psll_d_var(<8 x i32> %v, <4 x i32> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @avx2_psll_d_var
				; CHECK-NEXT: %1 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %a)
				; CHECK-NEXT: ret <8 x i32> %1
				%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				%2 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %v, <4 x i32> %1)
				ret <8 x i32> %2
				}

				define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) nounwind readnone uwtable {
				; CHECK-LABEL: @avx2_psll_q_var
				; CHECK-NEXT: %1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %a)
				; CHECK-NEXT: ret <4 x i64> %1
				%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
				%2 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> %1)
				ret <4 x i64> %2
				}

				;
	; Constant Folding			; Constant Folding
	;			;

	define <8 x i16> @test_sse2_psra_w_0(<8 x i16> %A) {			define <8 x i16> @test_sse2_psra_w_0(<8 x i16> %A) {
	; CHECK-LABEL: @test_sse2_psra_w_0			; CHECK-LABEL: @test_sse2_psra_w_0
	; CHECK-NEXT: ret <8 x i16> %A			; CHECK-NEXT: ret <8 x i16> %A
	%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)			%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
	%2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)			%2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
	▲ Show 20 Lines • Show All 290 Lines • Show Last 20 Lines