This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/trunk/
-
trunk/
-
lib/
-
Target/X86/
-
X86/
-
X86ISelLowering.cpp
-
Transforms/InstCombine/
-
InstCombine/
-
InstCombineCalls.cpp
-
test/
-
CodeGen/X86/
-
X86/
-
combine-avx2-intrinsics.ll
-
combine-sse2-intrinsics.ll
-
Transforms/InstCombine/
-
InstCombine/
-
x86-vector-shifts.ll

Differential D11886

[InstCombine] Move SSE2/AVX2 arithmetic vector shift folding to instcombiner
ClosedPublic

Authored by RKSimon on Aug 9 2015, 6:16 AM.

Download Raw Diff

Details

Reviewers

qcolombet
andreadb
mkuper

Commits

rGa3a72b41de52: [InstCombine] Move SSE2/AVX2 arithmetic vector shift folding to instcombiner
rL244495: [InstCombine] Move SSE2/AVX2 arithmetic vector shift folding to instcombiner

Summary

As discussed in D11760, this patch moves the (V)PSRA(WD) arithmetic shift-by-constant folding to InstCombine to match the logical shift implementations.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 31618. Aug 9 2015, 6:16 AM

RKSimon retitled this revision to [InstCombine] Move SSE2/AVX2 arithmetic vector shift folding to instcombiner.

RKSimon updated this object.

RKSimon added reviewers: andreadb, qcolombet, mkuper.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: llvm-commits.

Thanks Simon,

I have a few minor questions about testing, other than that, looks good.

lib/Transforms/InstCombine/InstCombineCalls.cpp
261 ↗	(On Diff #31618)	Do we have an ISel test that these AShrs get lowered correctly? If we don't, should we?
test/CodeGen/X86/combine-avx2-intrinsics.ll
6 ↗	(On Diff #31618)	We still want to test these combines, right? (Only as part of InstCombine, not ISel)

RKSimon added inline comments. Aug 9 2015, 8:30 AM

lib/Transforms/InstCombine/InstCombineCalls.cpp
261 ↗	(On Diff #31618)	We have the tests in test\CodeGen\X86\vector-shift-ashr-*.ll
test/CodeGen/X86/combine-avx2-intrinsics.ll
6 ↗	(On Diff #31618)	I can add these shift accumulation tests as well if you wish but I will keep the simple tests in there too. The x86-vector-shifts.ll test file already has some general constant folding tests at the end that do various forms of accumulation.

mkuper added inline comments. Aug 9 2015, 8:32 AM

lib/Transforms/InstCombine/InstCombineCalls.cpp
261 ↗	(On Diff #31618)	Ok.
test/CodeGen/X86/combine-avx2-intrinsics.ll
6 ↗	(On Diff #31618)	I'm just against removing (working) regression tests on principle. :-) But yes, I meant in addition, not instead of the simple test.

majnemer added a subscriber: majnemer. Aug 9 2015, 8:39 AM

majnemer added inline comments.

lib/Transforms/InstCombine/InstCombineCalls.cpp
202 ↗	(On Diff #31618)	Would it make sense to assert `(LogicalShift || !ShiftLeft)`, seeing as there is no arithmetic left shift? Alternatively, you could make that state impossible by construction by using an enum for the three states.

Thanks guys - I'll get an updated patch up as soon as I can.

lib/Transforms/InstCombine/InstCombineCalls.cpp
202 ↗	(On Diff #31618)	No problem - I'll add the assert.
test/CodeGen/X86/combine-avx2-intrinsics.ll
6 ↗	(On Diff #31618)	OK I'll transfer the tests over - note that I'll have to refactor them as they won't lower anymore.

Hi Simon,

I saw that Michael and David already reviewed your patch.
If you address their comments then the patch looks good to me too. Thanks!

In future, we should also move the target specific combine rules on sse/avx blend intrinsic calls from 'PerformINTRINSIC_WO_CHAINCombine' to InstCombine.

Thanks,
-Andrea

Updated based on feedback.

LGTM

This revision is now accepted and ready to land. Aug 10 2015, 7:08 AM

Closed by commit rL244495: [InstCombine] Move SSE2/AVX2 arithmetic vector shift folding to instcombiner (authored by RKSimon). · Explain Why · Aug 10 2015, 1:21 PM

This revision was automatically updated to reflect the committed changes.

RKSimon mentioned this in D11934: [InstCombine] Move SSE/AVX vector blend folding to instcombiner. Aug 11 2015, 3:37 AM

RKSimon mentioned this in rL244723: [InstCombine] Move SSE/AVX vector blend folding to instcombiner. Aug 12 2015, 1:10 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

44 lines

Transforms/

InstCombine/

InstCombineCalls.cpp

38 lines

test/

CodeGen/

X86/

combine-avx2-intrinsics.ll

45 lines

combine-sse2-intrinsics.ll

53 lines

Transforms/

InstCombine/

x86-vector-shifts.ll

309 lines

Diff 31713

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 23,458 Lines • ▼ Show 20 Lines	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
if (C->isNullValue())		if (C->isNullValue())
return Op0;		return Op0;
if (C->isAllOnesValue())		if (C->isAllOnesValue())
return Op1;		return Op1;
}		}

return SDValue();		return SDValue();
}		}

// Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_sse2_psra_d:
case Intrinsic::x86_avx2_psra_w:
case Intrinsic::x86_avx2_psra_d: {
SDValue Op0 = N->getOperand(1);
SDValue Op1 = N->getOperand(2);
EVT VT = Op0.getValueType();
assert(VT.isVector() && "Expected a vector type!");

if (isa<BuildVectorSDNode>(Op1))
Op1 = Op1.getOperand(0);

if (!isa<ConstantSDNode>(Op1))
return SDValue();

EVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();

ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
uint64_t ShAmt = C.getZExtValue();

// Don't try to convert this shift into a ISD::SRA if the shift
// count is bigger than or equal to the element size.
if (ShAmt >= SVTBits)
return SDValue();

// Trivial case: if the shift count is zero, then fold this
// into the first operand.
if (ShAmt == 0)
return Op0;

// Replace this packed shift intrinsic with a target independent
// shift dag node.
SDLoc DL(N);
SDValue Splat = DAG.getConstant(C, DL, VT);
return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat);
}
}		}
}		}

/// PerformMulCombine - Optimize a single multiply with constant into two		/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.		/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.		/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,		static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {		TargetLowering::DAGCombinerInfo &DCI) {
▲ Show 20 Lines • Show All 3,059 Lines • Show Last 20 Lines

llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp

Show First 20 Lines • Show All 193 Lines • ▼ Show 20 Lines	if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
return MI;		return MI;
}		}

return nullptr;		return nullptr;
}		}

static Value *SimplifyX86immshift(const IntrinsicInst &II,		static Value *SimplifyX86immshift(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder,		InstCombiner::BuilderTy &Builder,
bool ShiftLeft) {		bool LogicalShift, bool ShiftLeft) {
		assert((LogicalShift \|\| !ShiftLeft) && "Only logical shifts can shift left");

// Simplify if count is constant.		// Simplify if count is constant.
auto Arg1 = II.getArgOperand(1);		auto Arg1 = II.getArgOperand(1);
auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);		auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
auto CDV = dyn_cast<ConstantDataVector>(Arg1);		auto CDV = dyn_cast<ConstantDataVector>(Arg1);
auto CInt = dyn_cast<ConstantInt>(Arg1);		auto CInt = dyn_cast<ConstantInt>(Arg1);
if (!CAZ && !CDV && !CInt)		if (!CAZ && !CDV && !CInt)
return nullptr;		return nullptr;

Show All 22 Lines	static Value *SimplifyX86immshift(const IntrinsicInst &II,
auto SVT = VT->getElementType();		auto SVT = VT->getElementType();
unsigned VWidth = VT->getNumElements();		unsigned VWidth = VT->getNumElements();
unsigned BitWidth = SVT->getPrimitiveSizeInBits();		unsigned BitWidth = SVT->getPrimitiveSizeInBits();

// If shift-by-zero then just return the original value.		// If shift-by-zero then just return the original value.
if (Count == 0)		if (Count == 0)
return Vec;		return Vec;

// Handle cases when Shift >= BitWidth - just return zero.		// Handle cases when Shift >= BitWidth.
if (Count.uge(BitWidth))		if (Count.uge(BitWidth)) {
		// If LogicalShift - just return zero.
		if (LogicalShift)
return ConstantAggregateZero::get(VT);		return ConstantAggregateZero::get(VT);

		// If ArithmeticShift - clamp Shift to (BitWidth - 1).
		Count = APInt(64, BitWidth - 1);
		}

// Get a constant vector of the same type as the first operand.		// Get a constant vector of the same type as the first operand.
auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));		auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);		auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

if (ShiftLeft)		if (ShiftLeft)
return Builder.CreateShl(Vec, ShiftVec);		return Builder.CreateShl(Vec, ShiftVec);

		if (LogicalShift)
return Builder.CreateLShr(Vec, ShiftVec);		return Builder.CreateLShr(Vec, ShiftVec);

		return Builder.CreateAShr(Vec, ShiftVec);
}		}

static Value *SimplifyX86extend(const IntrinsicInst &II,		static Value *SimplifyX86extend(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder,		InstCombiner::BuilderTy &Builder,
bool SignExtend) {		bool SignExtend) {
VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType());		VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType());
VectorType *DstTy = cast<VectorType>(II.getType());		VectorType *DstTy = cast<VectorType>(II.getType());
unsigned NumDstElts = DstTy->getNumElements();		unsigned NumDstElts = DstTy->getNumElements();
▲ Show 20 Lines • Show All 510 Lines • ▼ Show 20 Lines	case Intrinsic::x86_sse2_cvttsd2si64: {
if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0),		if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0),
DemandedElts, UndefElts)) {		DemandedElts, UndefElts)) {
II->setArgOperand(0, V);		II->setArgOperand(0, V);
return II;		return II;
}		}
break;		break;
}		}

		// Constant fold ashr( <A x Bi>, Ci ).
		case Intrinsic::x86_sse2_psra_d:
		case Intrinsic::x86_sse2_psra_w:
		case Intrinsic::x86_sse2_psrai_d:
		case Intrinsic::x86_sse2_psrai_w:
		case Intrinsic::x86_avx2_psra_d:
		case Intrinsic::x86_avx2_psra_w:
		case Intrinsic::x86_avx2_psrai_d:
		case Intrinsic::x86_avx2_psrai_w:
		if (Value *V = SimplifyX86immshift(*II, *Builder, false, false))
		return ReplaceInstUsesWith(*II, V);
		break;

// Constant fold lshr( <A x Bi>, Ci ).		// Constant fold lshr( <A x Bi>, Ci ).
case Intrinsic::x86_sse2_psrl_d:		case Intrinsic::x86_sse2_psrl_d:
case Intrinsic::x86_sse2_psrl_q:		case Intrinsic::x86_sse2_psrl_q:
case Intrinsic::x86_sse2_psrl_w:		case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_sse2_psrli_d:		case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:		case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_sse2_psrli_w:		case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_avx2_psrl_d:		case Intrinsic::x86_avx2_psrl_d:
case Intrinsic::x86_avx2_psrl_q:		case Intrinsic::x86_avx2_psrl_q:
case Intrinsic::x86_avx2_psrl_w:		case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx2_psrli_d:		case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:		case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:		case Intrinsic::x86_avx2_psrli_w:
if (Value *V = SimplifyX86immshift(*II, *Builder, false))		if (Value *V = SimplifyX86immshift(*II, *Builder, true, false))
return ReplaceInstUsesWith(*II, V);		return ReplaceInstUsesWith(*II, V);
break;		break;

// Constant fold shl( <A x Bi>, Ci ).		// Constant fold shl( <A x Bi>, Ci ).
case Intrinsic::x86_sse2_psll_d:		case Intrinsic::x86_sse2_psll_d:
case Intrinsic::x86_sse2_psll_q:		case Intrinsic::x86_sse2_psll_q:
case Intrinsic::x86_sse2_psll_w:		case Intrinsic::x86_sse2_psll_w:
case Intrinsic::x86_sse2_pslli_d:		case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:		case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_pslli_w:		case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_avx2_psll_d:		case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:		case Intrinsic::x86_avx2_psll_q:
case Intrinsic::x86_avx2_psll_w:		case Intrinsic::x86_avx2_psll_w:
case Intrinsic::x86_avx2_pslli_d:		case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:		case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:		case Intrinsic::x86_avx2_pslli_w:
if (Value *V = SimplifyX86immshift(*II, *Builder, true))		if (Value *V = SimplifyX86immshift(*II, *Builder, true, true))
return ReplaceInstUsesWith(*II, V);		return ReplaceInstUsesWith(*II, V);
break;		break;

case Intrinsic::x86_sse41_pmovsxbd:		case Intrinsic::x86_sse41_pmovsxbd:
case Intrinsic::x86_sse41_pmovsxbq:		case Intrinsic::x86_sse41_pmovsxbq:
case Intrinsic::x86_sse41_pmovsxbw:		case Intrinsic::x86_sse41_pmovsxbw:
case Intrinsic::x86_sse41_pmovsxdq:		case Intrinsic::x86_sse41_pmovsxdq:
case Intrinsic::x86_sse41_pmovsxwd:		case Intrinsic::x86_sse41_pmovsxwd:
▲ Show 20 Lines • Show All 1,166 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/combine-avx2-intrinsics.ll

	; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 \| FileCheck %s			; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 \| FileCheck %s

	; Verify that the backend correctly combines AVX2 builtin intrinsics.			; Verify that the backend correctly combines AVX2 builtin intrinsics.


	define <8 x i32> @test_psra_1(<8 x i32> %A) {
	%1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 3)
	%2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
	%3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 2)
	ret <8 x i32> %3
	}
	; CHECK-LABEL: test_psra_1
	; CHECK: vpsrad $8, %ymm0, %ymm0
	; CHECK-NEXT: ret

	define <16 x i16> @test_psra_2(<16 x i16> %A) {
	%1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 3)
	%2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
	%3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 2)
	ret <16 x i16> %3
	}
	; CHECK-LABEL: test_psra_2
	; CHECK: vpsraw $8, %ymm0, %ymm0
	; CHECK-NEXT: ret

	define <16 x i16> @test_psra_3(<16 x i16> %A) {
	%1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
	%2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
	%3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0)
	ret <16 x i16> %3
	}
	; CHECK-LABEL: test_psra_3
	; CHECK-NOT: vpsraw
	; CHECK: ret

	define <8 x i32> @test_psra_4(<8 x i32> %A) {
	%1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
	%2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
	%3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0)
	ret <8 x i32> %3
	}
	; CHECK-LABEL: test_psra_4
	; CHECK-NOT: vpsrad
	; CHECK: ret


	define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {			define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {
	%res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a0, <32 x i8> %a1)			%res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a0, <32 x i8> %a1)
	ret <32 x i8> %res			ret <32 x i8> %res
	}			}
	; CHECK-LABEL: test_x86_avx2_pblendvb			; CHECK-LABEL: test_x86_avx2_pblendvb
	; CHECK-NOT: vpblendvb			; CHECK-NOT: vpblendvb
	; CHECK: ret			; CHECK: ret

	▲ Show 20 Lines • Show All 97 Lines • ▼ Show 20 Lines
	; CHECK-NOT: vpblendd			; CHECK-NOT: vpblendd
	; CHECK: ret			; CHECK: ret


	declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)			declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
	declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32)			declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32)
	declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32)			declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32)
	declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32)			declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32)
	declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>)
	declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32)
	declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>)
	declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32)

llvm/trunk/test/CodeGen/X86/combine-sse2-intrinsics.ll

	; RUN: llc < %s -march=x86 -mcpu=core2 \| FileCheck %s
	; RUN: llc < %s -march=x86-64 -mcpu=corei7 \| FileCheck %s

	; Verify that the backend correctly combines SSE2 builtin intrinsics.


	define <4 x i32> @test_psra_1(<4 x i32> %A) {
	%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3)
	%2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
	%3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 2)
	ret <4 x i32> %3
	}
	; CHECK-LABEL: test_psra_1
	; CHECK: psrad $8, %xmm0
	; CHECK-NEXT: ret

	define <8 x i16> @test_psra_2(<8 x i16> %A) {
	%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 3)
	%2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
	%3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 2)
	ret <8 x i16> %3
	}
	; CHECK-LABEL: test_psra_2
	; CHECK: psraw $8, %xmm0
	; CHECK-NEXT: ret

	define <4 x i32> @test_psra_3(<4 x i32> %A) {
	%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
	%2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
	%3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 0)
	ret <4 x i32> %3
	}
	; CHECK-LABEL: test_psra_3
	; CHECK-NOT: psrad
	; CHECK: ret


	define <8 x i16> @test_psra_4(<8 x i16> %A) {
	%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
	%2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
	%3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0)
	ret <8 x i16> %3
	}
	; CHECK-LABEL: test_psra_4
	; CHECK-NOT: psraw
	; CHECK: ret


	declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>)
	declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
	declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>)
	declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)

llvm/trunk/test/Transforms/InstCombine/x86-vector-shifts.ll

; RUN: opt < %s -instcombine -S \| FileCheck %s		; RUN: opt < %s -instcombine -S \| FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"		target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

;		;
		; ASHR - Immediate
		;

		define <8 x i16> @sse2_psrai_w_0(<8 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psrai_w_0
		; CHECK-NEXT: ret <8 x i16> %v
		%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 0)
		ret <8 x i16> %1
		}

		define <8 x i16> @sse2_psrai_w_15(<8 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psrai_w_15
		; CHECK-NEXT: %1 = ashr <8 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <8 x i16> %1
		%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 15)
		ret <8 x i16> %1
		}

		define <8 x i16> @sse2_psrai_w_64(<8 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psrai_w_64
		; CHECK-NEXT: %1 = ashr <8 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <8 x i16> %1
		%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 64)
		ret <8 x i16> %1
		}

		define <4 x i32> @sse2_psrai_d_0(<4 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psrai_d_0
		; CHECK-NEXT: ret <4 x i32> %v
		%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 0)
		ret <4 x i32> %1
		}

		define <4 x i32> @sse2_psrai_d_15(<4 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psrai_d_15
		; CHECK-NEXT: %1 = ashr <4 x i32> %v, <i32 15, i32 15, i32 15, i32 15>
		; CHECK-NEXT: ret <4 x i32> %1
		%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 15)
		ret <4 x i32> %1
		}

		define <4 x i32> @sse2_psrai_d_64(<4 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psrai_d_64
		; CHECK-NEXT: %1 = ashr <4 x i32> %v, <i32 31, i32 31, i32 31, i32 31>
		; CHECK-NEXT: ret <4 x i32> %1
		%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 64)
		ret <4 x i32> %1
		}

		define <16 x i16> @avx2_psrai_w_0(<16 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psrai_w_0
		; CHECK-NEXT: ret <16 x i16> %v
		%1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 0)
		ret <16 x i16> %1
		}

		define <16 x i16> @avx2_psrai_w_15(<16 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psrai_w_15
		; CHECK-NEXT: %1 = ashr <16 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <16 x i16> %1
		%1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 15)
		ret <16 x i16> %1
		}

		define <16 x i16> @avx2_psrai_w_64(<16 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psrai_w_64
		; CHECK-NEXT: %1 = ashr <16 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <16 x i16> %1
		%1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 64)
		ret <16 x i16> %1
		}

		define <8 x i32> @avx2_psrai_d_0(<8 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psrai_d_0
		; CHECK-NEXT: ret <8 x i32> %v
		%1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 0)
		ret <8 x i32> %1
		}

		define <8 x i32> @avx2_psrai_d_15(<8 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psrai_d_15
		; CHECK-NEXT: %1 = ashr <8 x i32> %v, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
		; CHECK-NEXT: ret <8 x i32> %1
		%1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 15)
		ret <8 x i32> %1
		}

		define <8 x i32> @avx2_psrai_d_64(<8 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psrai_d_64
		; CHECK-NEXT: %1 = ashr <8 x i32> %v, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
		; CHECK-NEXT: ret <8 x i32> %1
		%1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 64)
		ret <8 x i32> %1
		}

		;
; LSHR - Immediate		; LSHR - Immediate
;		;

define <8 x i16> @sse2_psrli_w_0(<8 x i16> %v) nounwind readnone uwtable {		define <8 x i16> @sse2_psrli_w_0(<8 x i16> %v) nounwind readnone uwtable {
; CHECK-LABEL: @sse2_psrli_w_0		; CHECK-LABEL: @sse2_psrli_w_0
; CHECK-NEXT: ret <8 x i16> %v		; CHECK-NEXT: ret <8 x i16> %v
%1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 0)		%1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %v, i32 0)
ret <8 x i16> %1		ret <8 x i16> %1
▲ Show 20 Lines • Show All 256 Lines • ▼ Show 20 Lines
define <4 x i64> @avx2_pslli_q_64(<4 x i64> %v) nounwind readnone uwtable {		define <4 x i64> @avx2_pslli_q_64(<4 x i64> %v) nounwind readnone uwtable {
; CHECK-LABEL: @avx2_pslli_q_64		; CHECK-LABEL: @avx2_pslli_q_64
; CHECK-NEXT: ret <4 x i64> zeroinitializer		; CHECK-NEXT: ret <4 x i64> zeroinitializer
%1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 64)		%1 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %v, i32 64)
ret <4 x i64> %1		ret <4 x i64> %1
}		}

;		;
		; ASHR - Constant Vector
		;

		define <8 x i16> @sse2_psra_w_0(<8 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psra_w_0
		; CHECK-NEXT: ret <8 x i16> %v
		%1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> zeroinitializer)
		ret <8 x i16> %1
		}

		define <8 x i16> @sse2_psra_w_15(<8 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psra_w_15
		; CHECK-NEXT: %1 = ashr <8 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <8 x i16> %1
		%1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> <i16 15, i16 0, i16 0, i16 0, i16 9999, i16 9999, i16 9999, i16 9999>)
		ret <8 x i16> %1
		}

		define <8 x i16> @sse2_psra_w_15_splat(<8 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psra_w_15_splat
		; CHECK-NEXT: %1 = ashr <8 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <8 x i16> %1
		%1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>)
		ret <8 x i16> %1
		}

		define <8 x i16> @sse2_psra_w_64(<8 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psra_w_64
		; CHECK-NEXT: %1 = ashr <8 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <8 x i16> %1
		%1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> <i16 64, i16 0, i16 0, i16 0, i16 9999, i16 9999, i16 9999, i16 9999>)
		ret <8 x i16> %1
		}

		define <4 x i32> @sse2_psra_d_0(<4 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psra_d_0
		; CHECK-NEXT: ret <4 x i32> %v
		%1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> zeroinitializer)
		ret <4 x i32> %1
		}

		define <4 x i32> @sse2_psra_d_15(<4 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psra_d_15
		; CHECK-NEXT: %1 = ashr <4 x i32> %v, <i32 15, i32 15, i32 15, i32 15>
		; CHECK-NEXT: ret <4 x i32> %1
		%1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> <i32 15, i32 0, i32 9999, i32 9999>)
		ret <4 x i32> %1
		}

		define <4 x i32> @sse2_psra_d_15_splat(<4 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psra_d_15_splat
		; CHECK-NEXT: %1 = ashr <4 x i32> %v, <i32 31, i32 31, i32 31, i32 31>
		; CHECK-NEXT: ret <4 x i32> %1
		%1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> <i32 15, i32 15, i32 15, i32 15>)
		ret <4 x i32> %1
		}

		define <4 x i32> @sse2_psra_d_64(<4 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @sse2_psra_d_64
		; CHECK-NEXT: %1 = ashr <4 x i32> %v, <i32 31, i32 31, i32 31, i32 31>
		; CHECK-NEXT: ret <4 x i32> %1
		%1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> <i32 64, i32 0, i32 9999, i32 9999>)
		ret <4 x i32> %1
		}

		define <16 x i16> @avx2_psra_w_0(<16 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psra_w_0
		; CHECK-NEXT: ret <16 x i16> %v
		%1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> zeroinitializer)
		ret <16 x i16> %1
		}

		define <16 x i16> @avx2_psra_w_15(<16 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psra_w_15
		; CHECK-NEXT: %1 = ashr <16 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <16 x i16> %1
		%1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> <i16 15, i16 0, i16 0, i16 0, i16 9999, i16 9999, i16 9999, i16 9999>)
		ret <16 x i16> %1
		}

		define <16 x i16> @avx2_psra_w_15_splat(<16 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psra_w_15_splat
		; CHECK-NEXT: %1 = ashr <16 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <16 x i16> %1
		%1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>)
		ret <16 x i16> %1
		}

		define <16 x i16> @avx2_psra_w_64(<16 x i16> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psra_w_64
		; CHECK-NEXT: %1 = ashr <16 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
		; CHECK-NEXT: ret <16 x i16> %1
		%1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> <i16 64, i16 0, i16 0, i16 0, i16 9999, i16 9999, i16 9999, i16 9999>)
		ret <16 x i16> %1
		}

		define <8 x i32> @avx2_psra_d_0(<8 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psra_d_0
		; CHECK-NEXT: ret <8 x i32> %v
		%1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> zeroinitializer)
		ret <8 x i32> %1
		}

		define <8 x i32> @avx2_psra_d_15(<8 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psra_d_15
		; CHECK-NEXT: %1 = ashr <8 x i32> %v, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
		; CHECK-NEXT: ret <8 x i32> %1
		%1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> <i32 15, i32 0, i32 9999, i32 9999>)
		ret <8 x i32> %1
		}

		define <8 x i32> @avx2_psra_d_15_splat(<8 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psra_d_15_splat
		; CHECK-NEXT: %1 = ashr <8 x i32> %v, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
		; CHECK-NEXT: ret <8 x i32> %1
		%1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> <i32 15, i32 15, i32 15, i32 15>)
		ret <8 x i32> %1
		}

		define <8 x i32> @avx2_psra_d_64(<8 x i32> %v) nounwind readnone uwtable {
		; CHECK-LABEL: @avx2_psra_d_64
		; CHECK-NEXT: %1 = ashr <8 x i32> %v, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
		; CHECK-NEXT: ret <8 x i32> %1
		%1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> <i32 64, i32 0, i32 9999, i32 9999>)
		ret <8 x i32> %1
		}

		;
; LSHR - Constant Vector		; LSHR - Constant Vector
;		;

define <8 x i16> @sse2_psrl_w_0(<8 x i16> %v) nounwind readnone uwtable {		define <8 x i16> @sse2_psrl_w_0(<8 x i16> %v) nounwind readnone uwtable {
; CHECK-LABEL: @sse2_psrl_w_0		; CHECK-LABEL: @sse2_psrl_w_0
; CHECK-NEXT: ret <8 x i16> %v		; CHECK-NEXT: ret <8 x i16> %v
%1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> zeroinitializer)		%1 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> zeroinitializer)
ret <8 x i16> %1		ret <8 x i16> %1
▲ Show 20 Lines • Show All 315 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret <4 x i64> zeroinitializer
%1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> <i64 64, i64 9999>)		%1 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %v, <2 x i64> <i64 64, i64 9999>)
ret <4 x i64> %1		ret <4 x i64> %1
}		}

;		;
; Constant Folding		; Constant Folding
;		;

		define <8 x i16> @test_sse2_psra_w_0(<8 x i16> %A) {
		; CHECK-LABEL: @test_sse2_psra_w_0
		; CHECK-NEXT: ret <8 x i16> %A
		%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
		%2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
		%3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0)
		ret <8 x i16> %3
		}

		; Constant-folding case for the 128-bit i16 arithmetic right shifts.
		; Each i64 of the input is 0x1000200040008000, i.e. the i16 lanes
		; 0x8000, 0x4000, 0x2000, 0x1000. The three shifts (3, then 3 taken from
		; the low elements of the shift vector, then 2) add up to an arithmetic
		; shift right by 8, which yields the expected lanes -128, 64, 32, 16.
		; The 7 in element 4 of the shift vector does not affect the result.
		define <8 x i16> @test_sse2_psra_w_8() {
		; CHECK-LABEL: @test_sse2_psra_w_8
		; CHECK-NEXT: ret <8 x i16> <i16 -128, i16 64, i16 32, i16 16, i16 -128, i16 64, i16 32, i16 16>
		%1 = bitcast <2 x i64> <i64 1152956690052710400, i64 1152956690052710400> to <8 x i16>
		%2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %1, i32 3)
		%3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %2, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
		%4 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %3, i32 2)
		ret <8 x i16> %4
		}

		; Zero-count case for the 128-bit i32 arithmetic right shifts.
		; Chains psrai.d by 0, psra.d by a shift vector whose low two i32
		; elements are zero, and psrai.d by 0 again. The expected output is %A
		; unchanged, so the 7 in element 2 of the shift vector must not
		; contribute to the shift count.
		define <4 x i32> @test_sse2_psra_d_0(<4 x i32> %A) {
		; CHECK-LABEL: @test_sse2_psra_d_0
		; CHECK-NEXT: ret <4 x i32> %A
		%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
		%2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
		; Feed %2 (not %1) into the final shift so the psra.d call above is
		; actually part of the value being returned and therefore exercised.
		%3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 0)
		ret <4 x i32> %3
		}

		; Constant-folding case for the 128-bit i32 arithmetic right shifts.
		; Each i64 of the input is 0x1000200040008000, i.e. the i32 lanes
		; 0x40008000 and 0x10002000. The three shifts (3, then 3 taken from the
		; low elements of the shift vector, then 2) add up to an arithmetic
		; shift right by 8, which yields the expected lanes 4194432 (0x00400080)
		; and 1048608 (0x00100020). The 7 in element 2 of the shift vector does
		; not affect the result.
		; NOTE(review): unlike the sibling constant-folding tests, this function
		; name has no "test_" prefix.
		define <4 x i32> @sse2_psra_d_8() {
		; CHECK-LABEL: @sse2_psra_d_8
		; CHECK-NEXT: ret <4 x i32> <i32 4194432, i32 1048608, i32 4194432, i32 1048608>
		%1 = bitcast <2 x i64> <i64 1152956690052710400, i64 1152956690052710400> to <4 x i32>
		%2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 3)
		%3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %2, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
		%4 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %3, i32 2)
		ret <4 x i32> %4
		}

		; Zero-count case for the 256-bit i16 arithmetic right shifts.
		; Chains psrai.w by 0, psra.w by an <8 x i16> shift vector whose low four
		; elements are all zero, and psrai.w by 0 again. The expected output is
		; %A unchanged, so the 7 in element 4 of the shift vector must not
		; contribute to the shift count.
		define <16 x i16> @test_avx2_psra_w_0(<16 x i16> %A) {
		; CHECK-LABEL: @test_avx2_psra_w_0
		; CHECK-NEXT: ret <16 x i16> %A
		%1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
		%2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
		%3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0)
		ret <16 x i16> %3
		}

		; Constant-folding case for the 256-bit i16 arithmetic right shifts.
		; Each i64 of the input is 0x1000200040008000, i.e. the i16 lanes
		; 0x8000, 0x4000, 0x2000, 0x1000. The three shifts (3, then 3 taken from
		; the low elements of the shift vector, then 2) add up to an arithmetic
		; shift right by 8, which yields the expected lanes -128, 64, 32, 16.
		; The 7 in element 4 of the shift vector does not affect the result.
		; NOTE(review): the %A parameter is never referenced in the body; the
		; other *_8 tests take no arguments.
		define <16 x i16> @test_avx2_psra_w_8(<16 x i16> %A) {
		; CHECK-LABEL: @test_avx2_psra_w_8
		; CHECK-NEXT: ret <16 x i16> <i16 -128, i16 64, i16 32, i16 16, i16 -128, i16 64, i16 32, i16 16, i16 -128, i16 64, i16 32, i16 16, i16 -128, i16 64, i16 32, i16 16>
		%1 = bitcast <4 x i64> <i64 1152956690052710400, i64 1152956690052710400, i64 1152956690052710400, i64 1152956690052710400> to <16 x i16>
		%2 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %1, i32 3)
		%3 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %2, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
		%4 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %3, i32 2)
		ret <16 x i16> %4
		}

		; Zero-count case for the 256-bit i32 arithmetic right shifts.
		; Chains psrai.d by 0, psra.d by a <4 x i32> shift vector whose low two
		; elements are zero, and psrai.d by 0 again. The expected output is %A
		; unchanged, so the 7 in element 2 of the shift vector must not
		; contribute to the shift count.
		define <8 x i32> @test_avx2_psra_d_0(<8 x i32> %A) {
		; CHECK-LABEL: @test_avx2_psra_d_0
		; CHECK-NEXT: ret <8 x i32> %A
		%1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
		%2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
		%3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0)
		ret <8 x i32> %3
		}

		; Constant-folding case for the 256-bit i32 arithmetic right shifts.
		; Each i64 of the input is 0x1000200040008000, i.e. the i32 lanes
		; 0x40008000 and 0x10002000. The three shifts (3, then 3 taken from the
		; low elements of the shift vector, then 2) add up to an arithmetic
		; shift right by 8, which yields the expected lanes 4194432 (0x00400080)
		; and 1048608 (0x00100020). The 7 in element 2 of the shift vector does
		; not affect the result.
		define <8 x i32> @test_avx2_psra_d_8() {
		; CHECK-LABEL: @test_avx2_psra_d_8
		; CHECK-NEXT: ret <8 x i32> <i32 4194432, i32 1048608, i32 4194432, i32 1048608, i32 4194432, i32 1048608, i32 4194432, i32 1048608>
		%1 = bitcast <4 x i64> <i64 1152956690052710400, i64 1152956690052710400, i64 1152956690052710400, i64 1152956690052710400> to <8 x i32>
		%2 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %1, i32 3)
		%3 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %2, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
		%4 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %3, i32 2)
		ret <8 x i32> %4
		}

define <2 x i64> @test_sse2_1() nounwind readnone uwtable {		define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
%S = bitcast i32 1 to i32		%S = bitcast i32 1 to i32
%1 = zext i32 %S to i64		%1 = zext i32 %S to i64
%2 = insertelement <2 x i64> undef, i64 %1, i32 0		%2 = insertelement <2 x i64> undef, i64 %1, i32 0
%3 = insertelement <2 x i64> %2, i64 0, i32 1		%3 = insertelement <2 x i64> %2, i64 0, i32 1
%4 = bitcast <2 x i64> %3 to <8 x i16>		%4 = bitcast <2 x i64> %3 to <8 x i16>
%5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)		%5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
%6 = bitcast <8 x i16> %5 to <4 x i32>		%6 = bitcast <8 x i16> %5 to <4 x i32>
▲ Show 20 Lines • Show All 193 Lines • ▼ Show 20 Lines
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1		declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1		declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1		declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1		declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1		declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1		declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1		declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1

		; Declarations of the SSE2/AVX2 arithmetic right shift intrinsics used by
		; the psra constant-folding tests: the psrai.* forms take the shift count
		; as an i32, the psra.* forms take it as a 128-bit vector operand.
		declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) #1
		declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) #1
		declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) #1
		declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) #1
		declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) #1
		declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) #1
		declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) #1
		declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) #1

attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }