This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Add INSERTPS target shuffle combines.
ClosedPublic

Authored by RKSimon on Jan 11 2016, 10:00 AM.

Download Raw Diff

Details

Reviewers

spatel
chandlerc
andreadb

Commits

rGe74653b67abf: [X86][SSE] Add INSERTPS target shuffle combines.
rL258205: [X86][SSE] Add INSERTPS target shuffle combines.

Summary

As vector shuffles can only reference two inputs, many (V)INSERTPS patterns end up being split over two target shuffles.

This patch adds combines to attempt to combine (V)INSERTPS nodes with input/output nodes that are just zeroing out these additional vector elements.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 44524. Jan 11 2016, 10:00 AM

RKSimon retitled this revision to [X86][SSE] Add INSERTPS target shuffle combines.

RKSimon updated this object.

RKSimon added reviewers: spatel, andreadb, chandlerc.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: llvm-commits.

I applied the patch to r258047, and I get a 'make check' failure on:
FAIL: LLVM :: CodeGen/X86/merge-consecutive-loads-128.ll (6972 of 15632)

lib/Target/X86/X86ISelLowering.cpp
7021	Why do we need this intermediate variable? I.e., couldn't we just set the appropriate elements of the mask directly, rather than filling this bitvector which then gets copied to the mask?
23709	"resolveTarget...", or, if the previous inline comment applies, combine the 2 helper functions and call it "setShuffleMaskZeroElements"?
23882	warning: comparison of integers of different signs

Thanks Sanjay, I'll get an updated patch out soon - the merge-consecutive-loads failures were due to some extra tests I added for D16217.

lib/Target/X86/X86ISelLowering.cpp
23709	I'll merge them - computeKnownZeroShuffleElements was overzealous future-proofing on my part.

RKSimon updated this revision to Diff 45261. Jan 19 2016, 7:53 AM

RKSimon edited edge metadata.

LGTM.

This revision is now accepted and ready to land. Jan 19 2016, 10:29 AM

Closed by commit rL258205: [X86][SSE] Add INSERTPS target shuffle combines. (authored by RKSimon). · Explain Why · Jan 19 2016, 2:28 PM

This revision was automatically updated to reflect the committed changes.

RKSimon marked 3 inline comments as done.

RKSimon mentioned this in D16652: [X86][SSE] Find source of the inserted element of INSERTPS. Jan 27 2016, 2:53 PM

Revision Contents

Path

Size

lib/

Target/

X86/

	X86ISelLowering.cpp
	X86ISelLowering.cpp (revision 258113)

137 lines

test/

CodeGen/

X86/

	insertps-combine.ll
	insertps-combine.ll (revision 258113)

16 lines

	merge-consecutive-loads-128.ll
	merge-consecutive-loads-128.ll (revision 258113)

12 lines

	vector-shuffle-128-v4.ll
	vector-shuffle-128-v4.ll (revision 258113)

8 lines

Diff 45261

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,012 Lines • ▼ Show 20 Lines	static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
}		}

return Zeroable;		return Zeroable;
}		}

// X86 has dedicated unpack instructions that can handle specific blend		// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.		// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,		static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
SDValue V1, SDValue V2,		SDValue V1, SDValue V2,
		spatelUnsubmitted Not Done Reply Inline Actions Why do we need this intermediate variable? Ie, couldn't we just set the appropriate elements of the mask rather than this bitvector which then gets copied to the mask? spatel: Why do we need this intermediate variable? Ie, couldn't we just set the appropriate elements of…
SelectionDAG &DAG) {		SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();		int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();		int NumEltsInLane = 128 / VT.getScalarSizeInBits();
SmallVector<int, 8> Unpckl;		SmallVector<int, 8> Unpckl;
SmallVector<int, 8> Unpckh;		SmallVector<int, 8> Unpckh;

for (int i = 0; i < NumElts; ++i) {		for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;		unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
▲ Show 20 Lines • Show All 16,668 Lines • ▼ Show 20 Lines	static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
if (Old != V)		if (Old != V)
// Replace the combinable shuffle with the combined one, updating all users		// Replace the combinable shuffle with the combined one, updating all users
// so that we re-evaluate the chain here.		// so that we re-evaluate the chain here.
DCI.CombineTo(Old.getNode(), V, /AddTo/ true);		DCI.CombineTo(Old.getNode(), V, /AddTo/ true);

return true;		return true;
}		}

		/// Check a target shuffle mask's inputs to see if we can set any values to
		/// SM_SentinelZero - this is for elements that are known to be zero
		/// (not just zeroable) from their inputs.
		static bool setTargetShuffleZeroElements(SDValue N,
		spatelUnsubmitted Done Reply Inline Actions "resolveTarget..." or if the previous inline comment applies, combine the 2 helper functions and call it "setShuffleMaskZeroElements" ? spatel: "resolveTarget..." or if the previous inline comment applies, combine the 2 helper functions…
		RKSimonAuthorUnsubmitted Done Reply Inline Actions I'll merge them - computeKnownZeroShuffleElements was over zealous future proofing on my part. RKSimon: I'll merge them - computeKnownZeroShuffleElements was over zealous future proofing on my part.
		SmallVectorImpl<int> &Mask) {
		bool IsUnary;
		if (!isTargetShuffle(N.getOpcode()))
		return false;
		if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Mask,
		IsUnary))
		return false;

		SDValue V1 = N.getOperand(0);
		SDValue V2 = IsUnary ? V1 : N.getOperand(1);

		while (V1.getOpcode() == ISD::BITCAST)
		V1 = V1->getOperand(0);
		while (V2.getOpcode() == ISD::BITCAST)
		V2 = V2->getOperand(0);

		for (int i = 0, Size = Mask.size(); i != Size; ++i) {
		int M = Mask[i];

		// Already decoded as SM_SentinelZero / SM_SentinelUndef.
		if (M < 0)
		continue;

		SDValue V = M < Size ? V1 : V2;

		// We are referencing an UNDEF input.
		if (V.isUndef()) {
		Mask[i] = SM_SentinelUndef;
		continue;
		}

		// TODO - handle the Size != (int)V.getNumOperands() cases in future.
		if (V.getOpcode() != ISD::BUILD_VECTOR \|\| Size != (int)V.getNumOperands())
		continue;
		if (!X86::isZeroNode(V.getOperand(M % Size)))
		continue;
		Mask[i] = SM_SentinelZero;
		}

		return true;
		}

/// \brief Try to combine x86 target specific shuffles.		/// \brief Try to combine x86 target specific shuffles.
static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,		static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {		const X86Subtarget *Subtarget) {
SDLoc DL(N);		SDLoc DL(N);
MVT VT = N.getSimpleValueType();		MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;		SmallVector<int, 4> Mask;

▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	case X86ISD::BLENDI: {

if (VT == MVT::v2f64)		if (VT == MVT::v2f64)
if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))		if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {		if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);		SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);		return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
}		}

		// Attempt to merge blend(insertps(x,y),zero).
		if (V0.getOpcode() == X86ISD::INSERTPS \|\|
		V1.getOpcode() == X86ISD::INSERTPS) {
		assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");

		// Determine which elements are known to be zero.
		SmallVector<int, 8> TargetMask;
		if (!setTargetShuffleZeroElements(N, TargetMask))
		return SDValue();

		// Helper function to take inner insertps node and attempt to
		// merge the blend with zero into its zero mask.
		auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
		if (V.getOpcode() != X86ISD::INSERTPS)
		return SDValue();
		SDValue Op0 = V.getOperand(0);
		SDValue Op1 = V.getOperand(1);
		SDValue Op2 = V.getOperand(2);
		unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();

		// Check each element of the blend node's target mask - must either
		// be zeroable (and update the zero mask) or selects the element from
		// the inner insertps node.
		for (int i = 0; i != 4; ++i)
		if (TargetMask[i] < 0)
		InsertPSMask \|= (1u << i);
		else if (TargetMask[i] != (i + Offset))
		return SDValue();
		return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
		DAG.getConstant(InsertPSMask, DL, MVT::i8));
		};

		if (SDValue V = MergeInsertPSAndBlend(V0, 0))
		return V;
		if (SDValue V = MergeInsertPSAndBlend(V1, 4))
		return V;
		}
		return SDValue();
		}
		case X86ISD::INSERTPS: {
		assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
		SDValue Op0 = N.getOperand(0);
		SDValue Op1 = N.getOperand(1);
		SDValue Op2 = N.getOperand(2);
		unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
		unsigned DstIdx = (InsertPSMask >> 4) & 3;

		// Attempt to merge insertps with an inner target shuffle node.
		SmallVector<int, 8> TargetMask;
		if (!setTargetShuffleZeroElements(Op0, TargetMask))
		return SDValue();

		bool Updated = false;
		bool UseInput00 = false;
		bool UseInput01 = false;
		for (int i = 0; i != 4; ++i) {
		int M = TargetMask[i];
		if ((InsertPSMask & (1u << i)) \|\| (i == (int)DstIdx)) {
		spatelUnsubmitted Done Reply Inline Actions warning: comparison of integers of different signs spatel: warning: comparison of integers of different signs
		// No change if element is already zero or the inserted element.
		continue;
		} else if (M < 0) {
		// If the target mask is undef/zero then we must zero the element.
		InsertPSMask \|= (1u << i);
		Updated = true;
		continue;
		}

		// The input vector element must be inline.
		if (M != i && M != (i + 4))
		return SDValue();

		// Determine which inputs of the target shuffle we're using.
		UseInput00 \|= (0 <= M && M < 4);
		UseInput01 \|= (4 <= M);
		}

		// If we're not using both inputs of the target shuffle then use the
		// referenced input directly.
		if (UseInput00 && !UseInput01) {
		Updated = true;
		Op0 = Op0.getOperand(0);
		} else if (!UseInput00 && UseInput01) {
		Updated = true;
		Op0 = Op0.getOperand(1);
		}

		if (Updated)
		return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
		DAG.getConstant(InsertPSMask, DL, MVT::i8));

return SDValue();		return SDValue();
}		}
default:		default:
return SDValue();		return SDValue();
}		}

// Nuke no-op shuffles that show up after combining.		// Nuke no-op shuffles that show up after combining.
if (isNoopShuffleMask(Mask))		if (isNoopShuffleMask(Mask))
▲ Show 20 Lines • Show All 4,341 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);		case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG:		case ISD::SIGN_EXTEND_INREG:
return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);		return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);		case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);		case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);		case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);		case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles		case X86ISD::SHUFP: // Handle all target specific shuffles
		case X86ISD::INSERTPS:
case X86ISD::PALIGNR:		case X86ISD::PALIGNR:
case X86ISD::BLENDI:		case X86ISD::BLENDI:
case X86ISD::UNPCKH:		case X86ISD::UNPCKH:
case X86ISD::UNPCKL:		case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:		case X86ISD::MOVHLPS:
case X86ISD::MOVLHPS:		case X86ISD::MOVLHPS:
case X86ISD::PSHUFB:		case X86ISD::PSHUFB:
case X86ISD::PSHUFD:		case X86ISD::PSHUFD:
▲ Show 20 Lines • Show All 870 Lines • Show Last 20 Lines

test/CodeGen/X86/insertps-combine.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 \| FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41		; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 \| FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx \| FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1		; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx \| FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 \| FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2		; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 \| FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {		define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuffle_v4f32_0z27:		; SSE-LABEL: shuffle_v4f32_0z27:
; SSE: # BB#0:		; SSE: # BB#0:
; SSE-NEXT: xorps %xmm2, %xmm2		; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: shuffle_v4f32_0z27:		; AVX-LABEL: shuffle_v4f32_0z27:
; AVX: # BB#0:		; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2		; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; AVX-NEXT: retq		; AVX-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0		%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0		%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1		%vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
%vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>		%vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
%vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>		%vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
ret <4 x float> %vecinit5		ret <4 x float> %vecinit5
}		}
Show All 18 Lines	; AVX-NEXT: retq
%vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2		%vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
%vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>		%vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x float> %vecinit4		ret <4 x float> %vecinit4
}		}

define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {		define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0z24:		; SSE-LABEL: shuffle_v4f32_0z24:
; SSE: # BB#0:		; SSE: # BB#0:
; SSE-NEXT: xorps %xmm2, %xmm2		; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: shuffle_v4f32_0z24:		; AVX-LABEL: shuffle_v4f32_0z24:
; AVX: # BB#0:		; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2		; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq		; AVX-NEXT: retq
%vecext = extractelement <4 x float> %xyzw, i32 0		%vecext = extractelement <4 x float> %xyzw, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0		%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1		%vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
%vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %xyzw, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>		%vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %xyzw, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
%vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>		%vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x float> %vecinit5		ret <4 x float> %vecinit5
}		}
▲ Show 20 Lines • Show All 74 Lines • Show Last 20 Lines

test/CodeGen/X86/merge-consecutive-loads-128.ll

	Show First 20 Lines • Show All 153 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero			; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]			; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: merge_4f32_f32_012u:			; SSE41-LABEL: merge_4f32_f32_012u:
	; SSE41: # BB#0:			; SSE41: # BB#0:
	; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
	; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
	; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]			; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: merge_4f32_f32_012u:			; AVX-LABEL: merge_4f32_f32_012u:
	; AVX: # BB#0:			; AVX: # BB#0:
	; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]			; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%ptr0 = getelementptr inbounds float, float* %ptr, i64 0			%ptr0 = getelementptr inbounds float, float* %ptr, i64 0
	%ptr1 = getelementptr inbounds float, float* %ptr, i64 1			%ptr1 = getelementptr inbounds float, float* %ptr, i64 1
	%ptr2 = getelementptr inbounds float, float* %ptr, i64 2			%ptr2 = getelementptr inbounds float, float* %ptr, i64 2
	%val0 = load float, float* %ptr0			%val0 = load float, float* %ptr0
	%val1 = load float, float* %ptr1			%val1 = load float, float* %ptr1
	%val2 = load float, float* %ptr2			%val2 = load float, float* %ptr2
	Show All 11 Lines
	; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero			; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]			; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: merge_4f32_f32_019u:			; SSE41-LABEL: merge_4f32_f32_019u:
	; SSE41: # BB#0:			; SSE41: # BB#0:
	; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
	; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
	; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]			; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: merge_4f32_f32_019u:			; AVX-LABEL: merge_4f32_f32_019u:
	; AVX: # BB#0:			; AVX: # BB#0:
	; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]			; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%ptr0 = getelementptr inbounds float, float* %ptr, i64 0			%ptr0 = getelementptr inbounds float, float* %ptr, i64 0
	%ptr1 = getelementptr inbounds float, float* %ptr, i64 1			%ptr1 = getelementptr inbounds float, float* %ptr, i64 1
	%ptr2 = getelementptr inbounds float, float* %ptr, i64 9			%ptr2 = getelementptr inbounds float, float* %ptr, i64 9
	%val0 = load float, float* %ptr0			%val0 = load float, float* %ptr0
	%val1 = load float, float* %ptr1			%val1 = load float, float* %ptr1
	%val2 = load float, float* %ptr2			%val2 = load float, float* %ptr2
	▲ Show 20 Lines • Show All 334 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-128-v4.ll

	Show First 20 Lines • Show All 1,074 Lines • ▼ Show 20 Lines
	; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]			; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
	; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]			; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
	; SSSE3-NEXT: movaps %xmm1, %xmm0			; SSSE3-NEXT: movaps %xmm1, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: shuffle_v4f32_0zz6:			; SSE41-LABEL: shuffle_v4f32_0zz6:
	; SSE41: # BB#0:			; SSE41: # BB#0:
	; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]			; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
	; SSE41-NEXT: xorps %xmm1, %xmm1
	; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: shuffle_v4f32_0zz6:			; AVX-LABEL: shuffle_v4f32_0zz6:
	; AVX: # BB#0:			; AVX: # BB#0:
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]			; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
	; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
	; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>			%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
	%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>			%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
	ret <4 x float> %shuffle1			ret <4 x float> %shuffle1
	}			}

	define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {			define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
	; SSE2-LABEL: shuffle_v4f32_0z24:			; SSE2-LABEL: shuffle_v4f32_0z24:
	Show All 24 Lines
	; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]			; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
	; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]			; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
	; SSSE3-NEXT: movaps %xmm1, %xmm0			; SSSE3-NEXT: movaps %xmm1, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: shuffle_v4f32_0z24:			; SSE41-LABEL: shuffle_v4f32_0z24:
	; SSE41: # BB#0:			; SSE41: # BB#0:
	; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]			; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
	; SSE41-NEXT: xorps %xmm1, %xmm1
	; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: shuffle_v4f32_0z24:			; AVX-LABEL: shuffle_v4f32_0z24:
	; AVX: # BB#0:			; AVX: # BB#0:
	; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]			; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
	; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
	; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>			%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
	%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>			%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
	ret <4 x float> %shuffle1			ret <4 x float> %shuffle1
	}			}

	define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {			define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
	; SSE2-LABEL: shuffle_v4i32_4zzz:			; SSE2-LABEL: shuffle_v4i32_4zzz:
	▲ Show 20 Lines • Show All 991 Lines • Show Last 20 Lines