This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/Target/X86/
-
Target/
-
X86/
-
X86ISelLowering.cpp
-
test/CodeGen/X86/
-
CodeGen/
-
X86/
-
combine-shl.ll
2
lower-vec-shift.ll
-
vec_shift6.ll
-
widen_arith-4.ll

Differential D48936

[X86][SSE] Prefer BLEND(SHL(v,c1),SHL(v,c2)) over MUL(v, c3)
ClosedPublic

Authored by RKSimon on Jul 4 2018, 6:40 AM.

Download Raw Diff

Details

Reviewers

craig.topper
spatel
andreadb
lebedev.ri

Commits

rGd32ca2c0b78d: [X86][SSE] Prefer BLEND(SHL(v,c1),SHL(v,c2)) over MUL(v, c3)
rL336642: [X86][SSE] Prefer BLEND(SHL(v,c1),SHL(v,c2)) over MUL(v, c3)

Summary

Now that rL336250 has (hopefully) landed, I'd like to prefer 2 immediate shifts + a shuffle blend over performing a multiply. Despite the increase in instructions, this is quicker (especially for slow v4i32 multiplies) and avoids loads and constant pool usage. It does mean, however, that we increase register pressure. The code size will go up a little, but by less than what we save on the constant pool data.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision.Jul 4 2018, 6:40 AM

Dammit, just realised that pre-SSE41 targets might introduce AND/ANDN/OR blend masks, which will be even more costly - I'll see if there is a better way to do this

RKSimon mentioned this in rL336271: [X86][SSE] Add SSE2 target to some shift tests.Jul 4 2018, 7:03 AM

Make vXi16 "2shifts+select" more selective - only do it on pre-SSE41 if the shuffle can be widened. Only do it on SSE41+ if a single PBLENDW can be used.

lebedev.ri added inline comments.Jul 8 2018, 9:46 AM

test/CodeGen/X86/lower-vec-shift.ll
211–231	Subj only talks about `mul`, but this is `div`. This is intended to be changed by this patch? If yes, there is no `lshr` test as far as i can tell.

RKSimon added inline comments.Jul 8 2018, 10:20 AM

test/CodeGen/X86/lower-vec-shift.ll
211–231	This is a side effect of only accepting v8i16 2shifts+blend on pre-SSE41 (no PBLENDW) if the shuffle can be widened to v4i32, as without PBLENDW we have to perform a bitmask with OR(ANDN,AND) - but for other shifts we'd end up doing that anyway - I suppose I could limit this to SHL cases only?

Still perform non-SHL shifts without PBLENDW / v4i32 widening

LGTM

This revision is now accepted and ready to land.Jul 9 2018, 12:00 PM

Closed by commit rL336642: [X86][SSE] Prefer BLEND(SHL(v,c1),SHL(v,c2)) over MUL(v, c3) (authored by RKSimon). · Explain WhyJul 10 2018, 1:03 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

	X86ISelLowering.cpp
	X86ISelLowering.cpp (revision 336515)

33 lines

test/

CodeGen/

X86/

	combine-shl.ll
	combine-shl.ll (revision 336515)

34 lines

	lower-vec-shift.ll
	lower-vec-shift.ll (revision 336515)

25 lines

	vec_shift6.ll
	vec_shift6.ll (revision 336515)

17 lines

	widen_arith-4.ll
	widen_arith-4.ll (revision 336515)

12 lines

Diff 154525

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,955 Lines • ▼ Show 20 Lines	for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
return false;		return false;
}		}
assert(WidenedMask.size() == Mask.size() / 2 &&		assert(WidenedMask.size() == Mask.size() / 2 &&
"Incorrect size of mask after widening the elements!");		"Incorrect size of mask after widening the elements!");

return true;		return true;
}		}

		static bool canWidenShuffleElements(ArrayRef<int> Mask) {
		SmallVector<int, 32> WidenedMask;
		return canWidenShuffleElements(Mask, WidenedMask);
		}

/// Returns true if Elt is a constant zero or a floating point constant +0.0.		/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {		bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) \|\| isNullFPConstant(Elt);		return isNullConstant(Elt) \|\| isNullFPConstant(Elt);
}		}

// Build a vector of constants.		// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.		// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.		// Split 64-bit constants in the 32-bit mode.
▲ Show 20 Lines • Show All 3,963 Lines • ▼ Show 20 Lines

/// Test whether a shuffle mask is equivalent within each 128-bit lane.		/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool		static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,		is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {		SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);		return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}		}

		static bool
		is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
		SmallVector<int, 32> RepeatedMask;
		return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
		}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.		/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool		static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,		is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {		SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);		return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}		}

/// Test whether a target shuffle mask is equivalent within each sub-lane.		/// Test whether a target shuffle mask is equivalent within each sub-lane.
▲ Show 20 Lines • Show All 14,467 Lines • ▼ Show 20 Lines	if ((VT == MVT::v2i64 \|\| (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);		SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);		SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);		R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);		R = DAG.getNode(ISD::XOR, dl, VT, R, M);
R = DAG.getNode(ISD::SUB, dl, VT, R, M);		R = DAG.getNode(ISD::SUB, dl, VT, R, M);
return R;		return R;
}		}

// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
if (Op.getOpcode() == ISD::SHL)
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);

// If possible, lower this shift as a sequence of two shifts by		// If possible, lower this shift as a sequence of two shifts by
// constant plus a BLENDing shuffle instead of scalarizing it.		// constant plus a BLENDing shuffle instead of scalarizing it.
// Example:		// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))		// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//		//
// Could be rewritten as:		// Could be rewritten as:
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))		// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
//		//
// The advantage is that the two shifts from the example would be		// The advantage is that the two shifts from the example would be
// lowered as X86ISD::VSRLI nodes in parallel before blending.		// lowered as X86ISD::VSRLI nodes in parallel before blending.
if (ConstantAmt && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) {		if (ConstantAmt && (VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\|
		(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue Amt1, Amt2;		SDValue Amt1, Amt2;
unsigned NumElts = VT.getVectorNumElements();		unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask;		SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i != NumElts; ++i) {		for (unsigned i = 0; i != NumElts; ++i) {
SDValue A = Amt->getOperand(i);		SDValue A = Amt->getOperand(i);
if (A.isUndef()) {		if (A.isUndef()) {
ShuffleMask.push_back(SM_SentinelUndef);		ShuffleMask.push_back(SM_SentinelUndef);
continue;		continue;
}		}
if (!Amt1 \|\| Amt1 == A) {		if (!Amt1 \|\| Amt1 == A) {
ShuffleMask.push_back(i);		ShuffleMask.push_back(i);
Amt1 = A;		Amt1 = A;
continue;		continue;
}		}
if (!Amt2 \|\| Amt2 == A) {		if (!Amt2 \|\| Amt2 == A) {
ShuffleMask.push_back(i + NumElts);		ShuffleMask.push_back(i + NumElts);
Amt2 = A;		Amt2 = A;
continue;		continue;
}		}
break;		break;
}		}

		// Only perform this blend if we can perform it without loading a mask.
if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&		if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) {		isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
		(VT != MVT::v16i16 \|\|
		is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
		(VT == MVT::v4i32 \|\| Subtarget.hasSSE41() \|\|
		Op.getOpcode() != ISD::SHL \|\| canWidenShuffleElements(ShuffleMask))) {
SDValue Splat1 =		SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);		DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);		SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
SDValue Splat2 =		SDValue Splat2 =
DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);		DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);		SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);		return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}		}
}		}

		// If possible, lower this packed shift into a vector multiply instead of
		// expanding it into a sequence of scalar shifts.
		if (Op.getOpcode() == ISD::SHL)
		if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
		return DAG.getNode(ISD::MUL, dl, VT, R, Scale);

// v4i32 Non Uniform Shifts.		// v4i32 Non Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2		// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64		// immediate shifts, else we need to zero-extend each lane to the lower i64
// and shift using the SSE2 variable shifts.		// and shift using the SSE2 variable shifts.
// The separate results can then be blended together.		// The separate results can then be blended together.
if (VT == MVT::v4i32) {		if (VT == MVT::v4i32) {
unsigned Opc = Op.getOpcode();		unsigned Opc = Op.getOpcode();
SDValue Amt0, Amt1, Amt2, Amt3;		SDValue Amt0, Amt1, Amt2, Amt3;
▲ Show 20 Lines • Show All 17,064 Lines • Show Last 20 Lines

test/CodeGen/X86/combine-shl.ll

	Show First 20 Lines • Show All 258 Lines • ▼ Show 20 Lines
	define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {			define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
	; SSE2-LABEL: combine_vec_shl_ext_shl1:			; SSE2-LABEL: combine_vec_shl_ext_shl1:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0			; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
	; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]			; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
	; SSE2-NEXT: psrad $16, %xmm1			; SSE2-NEXT: psrad $16, %xmm1
	; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]			; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
	; SSE2-NEXT: psrad $16, %xmm0			; SSE2-NEXT: psrad $16, %xmm0
	; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,1073741824,1073741824]			; SSE2-NEXT: movdqa %xmm0, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]			; SSE2-NEXT: pslld $31, %xmm2
	; SSE2-NEXT: pmuludq %xmm2, %xmm0			; SSE2-NEXT: pslld $30, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]			; SSE2-NEXT: movdqa %xmm1, %xmm2
	; SSE2-NEXT: pmuludq %xmm3, %xmm2			; SSE2-NEXT: pslld $29, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]			; SSE2-NEXT: pslld $28, %xmm1
	; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]			; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
	; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [536870912,536870912,268435456,268435456]
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
	; SSE2-NEXT: pmuludq %xmm2, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSE2-NEXT: pmuludq %xmm3, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
	; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: combine_vec_shl_ext_shl1:			; SSE41-LABEL: combine_vec_shl_ext_shl1:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0			; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
	; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; SSE41-NEXT: pmovsxwd %xmm1, %xmm1			; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
	; SSE41-NEXT: pmovsxwd %xmm0, %xmm0			; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
	; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0			; SSE41-NEXT: movdqa %xmm0, %xmm2
	; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1			; SSE41-NEXT: pslld $30, %xmm2
				; SSE41-NEXT: pslld $31, %xmm0
				; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
				; SSE41-NEXT: movdqa %xmm1, %xmm2
				; SSE41-NEXT: pslld $28, %xmm2
				; SSE41-NEXT: pslld $29, %xmm1
				; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: combine_vec_shl_ext_shl1:			; AVX-LABEL: combine_vec_shl_ext_shl1:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0			; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
	; AVX-NEXT: vpmovsxwd %xmm0, %ymm0			; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
	; AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0			; AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	▲ Show 20 Lines • Show All 578 Lines • Show Last 20 Lines

test/CodeGen/X86/lower-vec-shift.ll

	Show First 20 Lines • Show All 202 Lines • ▼ Show 20 Lines
	; AVX2-LABEL: test8:			; AVX2-LABEL: test8:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0			; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>			%lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
	ret <4 x i32> %lshr			ret <4 x i32> %lshr
	}			}

	define <8 x i16> @test9(<8 x i16> %a) {			define <8 x i16> @test9(<8 x i16> %a) {
	; SSE-LABEL: test9:			; SSE-LABEL: test9:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movdqa %xmm0, %xmm1			; SSE-NEXT: movdqa %xmm0, %xmm1
	; SSE-NEXT: psraw $3, %xmm1			; SSE-NEXT: psraw $3, %xmm1
	; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]			; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
	; SSE-NEXT: psraw $1, %xmm0			; SSE-NEXT: psraw $1, %xmm0
	; SSE-NEXT: pand %xmm2, %xmm0			; SSE-NEXT: pand %xmm2, %xmm0
	; SSE-NEXT: pandn %xmm1, %xmm2			; SSE-NEXT: pandn %xmm1, %xmm2
	; SSE-NEXT: por %xmm2, %xmm0			; SSE-NEXT: por %xmm2, %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: test9:			; AVX-LABEL: test9:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpsraw $3, %xmm0, %xmm1			; AVX-NEXT: vpsraw $3, %xmm0, %xmm1
	; AVX-NEXT: vpsraw $1, %xmm0, %xmm0			; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
	; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]			; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%lshr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>			%lshr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
	ret <8 x i16> %lshr			ret <8 x i16> %lshr
	}			}
				lebedev.riUnsubmitted Not Done Reply Inline Actions Subj only talks about `mul`, but this is `div`. This is intended to be changed by this patch? If yes, there is no `lshr` test as far as i can tell. lebedev.ri: Subj only talks about `mul`, but this is `div`. This is intended to be changed by this patch?
				RKSimonAuthorUnsubmitted Not Done Reply Inline Actions This is a side effect of only accepting v8i16 2shifts+blend on pre-SSE41 (no PBLENDW) if the shuffle can be widened to v4i32, as without PBLENDW we have to perform a bitmask with OR(ANDN,AND) - but for other shifts we'd end up doing that anyway - I suppose I could limit this to SHL cases only? RKSimon: This is a side effect of only accepting v8i16 2shifts+blend on pre-SSE41 (no PBLENDW) if the…

	define <8 x i32> @test10(<8 x i32>* %a) {			define <8 x i32> @test10(<8 x i32>* %a) {
	; SSE-LABEL: test10:			; SSE-LABEL: test10:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movdqa (%rdi), %xmm0			; SSE-NEXT: movdqa (%rdi), %xmm0
	; SSE-NEXT: movdqa 16(%rdi), %xmm1			; SSE-NEXT: movdqa 16(%rdi), %xmm1
	; SSE-NEXT: psrad %xmm0, %xmm1			; SSE-NEXT: psrad %xmm0, %xmm1
	; SSE-NEXT: psrad $1, %xmm0			; SSE-NEXT: psrad $1, %xmm0
	Show All 21 Lines
	; SSE-LABEL: test11:			; SSE-LABEL: test11:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0			; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
	; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1			; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX1-LABEL: test11:			; AVX1-LABEL: test11:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0			; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
	; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0			; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0			; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
				; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
				; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
				; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
				; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: test11:			; AVX2-LABEL: test11:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0			; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>			%lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>
	ret <16 x i16> %lshr			ret <16 x i16> %lshr
	}			}

	define <16 x i16> @test12(<16 x i16> %a) {			define <16 x i16> @test12(<16 x i16> %a) {
	; SSE-LABEL: test12:			; SSE-LABEL: test12:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]			; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
	; SSE-NEXT: pmullw %xmm2, %xmm0			; SSE-NEXT: pmullw %xmm2, %xmm0
	; SSE-NEXT: pmullw %xmm2, %xmm1			; SSE-NEXT: pmullw %xmm2, %xmm1
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX1-LABEL: test12:			; AVX1-LABEL: test12:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
	; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]			; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
	; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1			; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
	; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0			; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
				; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
				; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
				; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: test12:			; AVX2-LABEL: test12:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0			; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
				; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
				; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>			%lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
	ret <16 x i16> %lshr			ret <16 x i16> %lshr
	}			}

test/CodeGen/X86/vec_shift6.ll

	Show First 20 Lines • Show All 61 Lines • ▼ Show 20 Lines
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>			%shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
	ret <4 x i32> %shl			ret <4 x i32> %shl
	}			}

	define <4 x i32> @test4(<4 x i32> %a) {			define <4 x i32> @test4(<4 x i32> %a) {
	; SSE2-LABEL: test4:			; SSE2-LABEL: test4:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2,2]			; SSE2-NEXT: movdqa %xmm0, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]			; SSE2-NEXT: pslld $1, %xmm1
	; SSE2-NEXT: pmuludq %xmm1, %xmm0			; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]			; SSE2-NEXT: movapd %xmm1, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
	; SSE2-NEXT: pmuludq %xmm2, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
	; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: test4:			; SSE41-LABEL: test4:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0			; SSE41-NEXT: movdqa %xmm0, %xmm1
				; SSE41-NEXT: pslld $1, %xmm1
				; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
				; SSE41-NEXT: movdqa %xmm1, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: test4:			; AVX-LABEL: test4:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0			; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>			%shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
	ret <4 x i32> %shl			ret <4 x i32> %shl
	▲ Show 20 Lines • Show All 191 Lines • Show Last 20 Lines

test/CodeGen/X86/widen_arith-4.ll

	Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	; SSE41-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)			; SSE41-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
	; SSE41-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)			; SSE41-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
	; SSE41-NEXT: movl %edx, -{{[0-9]+}}(%rsp)			; SSE41-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
	; SSE41-NEXT: movq {{.*}}(%rip), %rax			; SSE41-NEXT: movq {{.*}}(%rip), %rax
	; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp)			; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
	; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp)			; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp)
	; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp)			; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp)
	; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>			; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
	; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
	; SSE41-NEXT: jmp .LBB0_1			; SSE41-NEXT: jmp .LBB0_1
	; SSE41-NEXT: .p2align 4, 0x90			; SSE41-NEXT: .p2align 4, 0x90
	; SSE41-NEXT: .LBB0_2: # %forbody			; SSE41-NEXT: .LBB0_2: # %forbody
	; SSE41-NEXT: # in Loop: Header=BB0_1 Depth=1			; SSE41-NEXT: # in Loop: Header=BB0_1 Depth=1
	; SSE41-NEXT: movslq -{{[0-9]+}}(%rsp), %rax			; SSE41-NEXT: movslq -{{[0-9]+}}(%rsp), %rax
	; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rcx			; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
	; SSE41-NEXT: shlq $4, %rax			; SSE41-NEXT: shlq $4, %rax
	; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdx			; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
	; SSE41-NEXT: movdqa (%rdx,%rax), %xmm2			; SSE41-NEXT: movdqa (%rdx,%rax), %xmm1
	; SSE41-NEXT: psubw %xmm0, %xmm2			; SSE41-NEXT: psubw %xmm0, %xmm1
	; SSE41-NEXT: pmullw %xmm1, %xmm2			; SSE41-NEXT: movdqa %xmm1, %xmm2
	; SSE41-NEXT: pextrw $4, %xmm2, 8(%rcx,%rax)			; SSE41-NEXT: psllw $2, %xmm2
				; SSE41-NEXT: psllw $1, %xmm1
				; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
				; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
	; SSE41-NEXT: movq %xmm2, (%rcx,%rax)			; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
	; SSE41-NEXT: incl -{{[0-9]+}}(%rsp)			; SSE41-NEXT: incl -{{[0-9]+}}(%rsp)
	; SSE41-NEXT: .LBB0_1: # %forcond			; SSE41-NEXT: .LBB0_1: # %forcond
	; SSE41-NEXT: # =>This Inner Loop Header: Depth=1			; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
	; SSE41-NEXT: movl -{{[0-9]+}}(%rsp), %eax			; SSE41-NEXT: movl -{{[0-9]+}}(%rsp), %eax
	; SSE41-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax			; SSE41-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
	; SSE41-NEXT: jl .LBB0_2			; SSE41-NEXT: jl .LBB0_2
	; SSE41-NEXT: # %bb.3: # %afterfor			; SSE41-NEXT: # %bb.3: # %afterfor
	▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines