Diff 279163

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 44,352 Lines • ▼ Show 20 Lines
/// and		/// and
/// B = < float b0, float b1, float b2, float b3 >		/// B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is		/// then the result of doing a horizontal operation on A and B is
/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.		/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form		/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is		/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.		/// set to A, RHS to B, and the routine returns 'true'.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,		static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
const X86Subtarget &Subtarget,		const X86Subtarget &Subtarget, bool IsCommutative,
bool IsCommutative) {		SmallVectorImpl<int> &PostShuffleMask) {
// If either operand is undef, bail out. The binop should be simplified.		// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() \|\| RHS.isUndef())		if (LHS.isUndef() \|\| RHS.isUndef())
return false;		return false;

// Look for the following pattern:		// Look for the following pattern:
// A = < float a0, float a1, float a2, float a3 >		// A = < float a0, float a1, float a2, float a3 >
// B = < float b0, float b1, float b2, float b3 >		// B = < float b0, float b1, float b2, float b3 >
// and		// and
▲ Show 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
}		}

if (RMask.empty()) {		if (RMask.empty()) {
C = RHS;		C = RHS;
for (unsigned i = 0; i != NumElts; ++i)		for (unsigned i = 0; i != NumElts; ++i)
RMask.push_back(i);		RMask.push_back(i);
}		}

		if (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) \|\|
		isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask))
		return false;
		[Inline comment — spatel (unsubmitted, not done)] Reduce negative logic? // Avoid 128-bit lane crossing if this is pre-AVX2 and FP (integer will be split). if (!Subtarget.hasAVX2 && VT.isFloatingPoint() && ...)

// If A and B occur in reverse order in RHS, then canonicalize by commuting		// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.		// RHS operands and shuffle mask.
if (A != C) {		if (A != C) {
std::swap(C, D);		std::swap(C, D);
ShuffleVectorSDNode::commuteMask(RMask);		ShuffleVectorSDNode::commuteMask(RMask);
}		}
// Check that the shuffles are both shuffling the same vectors.		// Check that the shuffles are both shuffling the same vectors.
if (!(A == C && B == D))		if (!(A == C && B == D))
return false;		return false;

		PostShuffleMask.clear();
		PostShuffleMask.append(NumElts, SM_SentinelUndef);

// LHS and RHS are now:		// LHS and RHS are now:
// LHS = shuffle A, B, LMask		// LHS = shuffle A, B, LMask
// RHS = shuffle A, B, RMask		// RHS = shuffle A, B, RMask
// Check that the masks correspond to performing a horizontal operation.		// Check that the masks correspond to performing a horizontal operation.
// AVX defines horizontal add/sub to operate independently on 128-bit lanes,		// AVX defines horizontal add/sub to operate independently on 128-bit lanes,
// so we just repeat the inner loop if this is a 256-bit op.		// so we just repeat the inner loop if this is a 256-bit op.
unsigned Num128BitChunks = VT.getSizeInBits() / 128;		unsigned Num128BitChunks = VT.getSizeInBits() / 128;
unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;		unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
		unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
assert((NumEltsPer128BitChunk % 2 == 0) &&		assert((NumEltsPer128BitChunk % 2 == 0) &&
"Vector type should have an even number of elements in each lane");		"Vector type should have an even number of elements in each lane");
for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {		for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {		for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
// Ignore undefined components.		// Ignore undefined components.
int LIdx = LMask[i + j], RIdx = RMask[i + j];		int LIdx = LMask[i + j], RIdx = RMask[i + j];
if (LIdx < 0 \|\| RIdx < 0 \|\|		if (LIdx < 0 \|\| RIdx < 0 \|\|
(!A.getNode() && (LIdx < (int)NumElts \|\| RIdx < (int)NumElts)) \|\|		(!A.getNode() && (LIdx < (int)NumElts \|\| RIdx < (int)NumElts)) \|\|
(!B.getNode() && (LIdx >= (int)NumElts \|\| RIdx >= (int)NumElts)))		(!B.getNode() && (LIdx >= (int)NumElts \|\| RIdx >= (int)NumElts)))
continue;		continue;

		// Check that successive odd/even elements are being operated on. If not,
		// this is not a horizontal operation.
		if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
		!((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
		return false;

		// Compute the post-shuffle mask index based on where the element
		// is stored in the HOP result, and where it needs to be moved to.
		int Base = LIdx & ~1u;
		int Index = ((Base % NumEltsPer128BitChunk) / 2) +
		((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));

// The low half of the 128-bit result must choose from A.		// The low half of the 128-bit result must choose from A.
// The high half of the 128-bit result must choose from B,		// The high half of the 128-bit result must choose from B,
// unless B is undef. In that case, we are always choosing from A.		// unless B is undef. In that case, we are always choosing from A.
unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;		if ((B && Base >= (int)NumElts) \|\| (!B && i >= NumEltsPer64BitChunk))
unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;		Index += NumEltsPer64BitChunk;
		PostShuffleMask[i + j] = Index;
// Check that successive elements are being operated on. If not, this is
// not a horizontal operation.
int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
if (!(LIdx == Index && RIdx == Index + 1) &&
!(IsCommutative && LIdx == Index + 1 && RIdx == Index))
return false;
}		}
}		}

LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.		LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.		RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))		bool IsIdentityPostShuffle =
		isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
		if (IsIdentityPostShuffle)
		PostShuffleMask.clear();

		// Assume a SingleSource HOP if we only shuffle one input and don't need to
		// shuffle the result.
		if (!shouldUseHorizontalOp(LHS == RHS &&
		(NumShuffles < 2 \|\| !IsIdentityPostShuffle),
		DAG, Subtarget))
return false;		return false;

LHS = DAG.getBitcast(VT, LHS);		LHS = DAG.getBitcast(VT, LHS);
RHS = DAG.getBitcast(VT, RHS);		RHS = DAG.getBitcast(VT, RHS);
return true;		return true;
}		}

/// Do target-specific dag combines on floating-point adds/subs.		/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,		static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);		SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);		SDValue RHS = N->getOperand(1);
bool IsFadd = N->getOpcode() == ISD::FADD;		bool IsFadd = N->getOpcode() == ISD::FADD;
auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;		auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
assert((IsFadd \|\| N->getOpcode() == ISD::FSUB) && "Wrong opcode");		assert((IsFadd \|\| N->getOpcode() == ISD::FSUB) && "Wrong opcode");

// Try to synthesize horizontal add/sub from adds/subs of shuffles.		// Try to synthesize horizontal add/sub from adds/subs of shuffles.
		SmallVector<int, 8> PostShuffleMask;
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 \|\| VT == MVT::v2f64)) \|\|		if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 \|\| VT == MVT::v2f64)) \|\|
(Subtarget.hasAVX() && (VT == MVT::v8f32 \|\| VT == MVT::v4f64))) &&		(Subtarget.hasAVX() && (VT == MVT::v8f32 \|\| VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))		isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);		SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
		if (!PostShuffleMask.empty())
		HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
		DAG.getUNDEF(VT), PostShuffleMask);
		return HorizBinOp;
		}

// NOTE: isHorizontalBinOp may have changed LHS/RHS variables.		// NOTE: isHorizontalBinOp may have changed LHS/RHS variables.

return SDValue();		return SDValue();
}		}

/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify		/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.		/// the codegen.
▲ Show 20 Lines • Show All 3,085 Lines • ▼ Show 20 Lines
static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,		static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);		SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);		SDValue Op1 = N->getOperand(1);
bool IsAdd = N->getOpcode() == ISD::ADD;		bool IsAdd = N->getOpcode() == ISD::ADD;
assert((IsAdd \|\| N->getOpcode() == ISD::SUB) && "Wrong opcode");		assert((IsAdd \|\| N->getOpcode() == ISD::SUB) && "Wrong opcode");

		SmallVector<int, 8> PostShuffleMask;
if ((VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\| VT == MVT::v16i16 \|\|		if ((VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\| VT == MVT::v16i16 \|\|
VT == MVT::v8i32) &&		VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&		Subtarget.hasSSSE3() &&
isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {		isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,		auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {		ArrayRef<SDValue> Ops) {
return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,		return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
DL, Ops[0].getValueType(), Ops);		Ops[0].getValueType(), Ops);
};		};
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},		SDValue HorizBinOp =
HOpBuilder);		SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
		if (!PostShuffleMask.empty())
		HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
		DAG.getUNDEF(VT), PostShuffleMask);
		return HorizBinOp;
}		}

return SDValue();		return SDValue();
}		}

static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,		static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
▲ Show 20 Lines • Show All 2,577 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/haddsub-3.ll

	Show All 11 Lines
	; SSE2-NEXT: movaps %xmm0, %xmm1			; SSE2-NEXT: movaps %xmm0, %xmm1
	; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]			; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
	; SSE2-NEXT: addps %xmm0, %xmm1			; SSE2-NEXT: addps %xmm0, %xmm1
	; SSE2-NEXT: movaps %xmm1, %xmm0			; SSE2-NEXT: movaps %xmm1, %xmm0
	; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]			; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
	; SSE2-NEXT: addss %xmm1, %xmm0			; SSE2-NEXT: addss %xmm1, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: pr26491:			; SSSE3-SLOW-LABEL: pr26491:
	; SSSE3: # %bb.0:			; SSSE3-SLOW: # %bb.0:
	; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]			; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
	; SSSE3-NEXT: addps %xmm0, %xmm1			; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
	; SSSE3-NEXT: movaps %xmm1, %xmm0			; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
	; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]			; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
	; SSSE3-NEXT: addss %xmm1, %xmm0			; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-SLOW-NEXT: retq
	;			;
	; AVX-LABEL: pr26491:			; SSSE3-FAST-LABEL: pr26491:
	; AVX: # %bb.0:			; SSSE3-FAST: # %bb.0:
	; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]			; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
	; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0			; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
	; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]			; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
	; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0			; SSSE3-FAST-NEXT: addss %xmm0, %xmm1
	; AVX-NEXT: retq			; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
				; SSSE3-FAST-NEXT: retq
				;
				; AVX1-SLOW-LABEL: pr26491:
				; AVX1-SLOW: # %bb.0:
				; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
				; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
				; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
				; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
				; AVX1-SLOW-NEXT: retq
				;
				; AVX1-FAST-LABEL: pr26491:
				; AVX1-FAST: # %bb.0:
				; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
				; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
				; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
				; AVX1-FAST-NEXT: retq
				;
				; AVX2-LABEL: pr26491:
				; AVX2: # %bb.0:
				; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
				; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
				; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: retq
	%1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>			%1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
	%2 = fadd <4 x float> %1, %a0			%2 = fadd <4 x float> %1, %a0
	%3 = extractelement <4 x float> %2, i32 2			%3 = extractelement <4 x float> %2, i32 2
	%4 = extractelement <4 x float> %2, i32 0			%4 = extractelement <4 x float> %2, i32 0
	%5 = fadd float %3, %4			%5 = fadd float %3, %4
	ret float %5			ret float %5
	}			}

	▲ Show 20 Lines • Show All 93 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/haddsub-shuf.ll

Show First 20 Lines • Show All 873 Lines • ▼ Show 20 Lines	; AVX2-NEXT: retq
ret <4 x float> %2		ret <4 x float> %2
}		}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)		declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)

define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {		define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
; SSSE3_SLOW-LABEL: PR34724_1:		; SSSE3_SLOW-LABEL: PR34724_1:
; SSSE3_SLOW: # %bb.0:		; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSSE3_SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]		; SSSE3_SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSSE3_SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]		; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3_SLOW-NEXT: addps %xmm0, %xmm2		; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]		; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2
; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0		; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[1,0]		; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0
; SSSE3_SLOW-NEXT: retq		; SSSE3_SLOW-NEXT: retq
;		;
; SSSE3_FAST-LABEL: PR34724_1:		; SSSE3_FAST-LABEL: PR34724_1:
; SSSE3_FAST: # %bb.0:		; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2]
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSSE3_FAST-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]		; SSSE3_FAST-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSSE3_FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]		; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3_FAST-NEXT: addps %xmm0, %xmm2
; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1		; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0]		; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]		; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0
; SSSE3_FAST-NEXT: retq		; SSSE3_FAST-NEXT: retq
;		;
; AVX1_SLOW-LABEL: PR34724_1:		; AVX1_SLOW-LABEL: PR34724_1:
; AVX1_SLOW: # %bb.0:		; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]		; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero		; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]		; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1		; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3]		; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX1_SLOW-NEXT: retq		; AVX1_SLOW-NEXT: retq
;		;
; AVX1_FAST-LABEL: PR34724_1:		; AVX1_FAST-LABEL: PR34724_1:
; AVX1_FAST: # %bb.0:		; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]		; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero		; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1		; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3]		; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX1_FAST-NEXT: retq		; AVX1_FAST-NEXT: retq
;		;
; AVX2_SLOW-LABEL: PR34724_1:		; AVX2_SLOW-LABEL: PR34724_1:
; AVX2_SLOW: # %bb.0:		; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]		; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero		; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]		; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1		; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3]		; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX2_SLOW-NEXT: retq		; AVX2_SLOW-NEXT: retq
;		;
; AVX2_FAST-LABEL: PR34724_1:		; AVX2_FAST-LABEL: PR34724_1:
; AVX2_FAST: # %bb.0:		; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]		; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero		; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1		; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3]		; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX2_FAST-NEXT: retq		; AVX2_FAST-NEXT: retq
%t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>		%t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
%t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>		%t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
%t2 = fadd <2 x float> %t0, %t1		%t2 = fadd <2 x float> %t0, %t1
%vecinit9 = shufflevector <2 x float> %t2, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>		%vecinit9 = shufflevector <2 x float> %t2, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
%t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>		%t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
%t4 = fadd <4 x float> %t3, %b		%t4 = fadd <4 x float> %t3, %b
%vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>		%vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
ret <4 x float> %vecinit13		ret <4 x float> %vecinit13
}		}

define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {		define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
; SSSE3_SLOW-LABEL: PR34724_2:		; SSSE3_SLOW-LABEL: PR34724_2:
; SSSE3_SLOW: # %bb.0:		; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2		; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]		; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSSE3_SLOW-NEXT: addps %xmm1, %xmm3
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2		; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[1,0]		; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[2,0]		; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0
; SSSE3_SLOW-NEXT: retq		; SSSE3_SLOW-NEXT: retq
;		;
; SSSE3_FAST-LABEL: PR34724_2:		; SSSE3_FAST-LABEL: PR34724_2:
; SSSE3_FAST: # %bb.0:		; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: movaps %xmm1, %xmm3		; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0]
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0]
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
; SSSE3_FAST-NEXT: addps %xmm3, %xmm2
; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1		; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0]		; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]		; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0
; SSSE3_FAST-NEXT: retq		; SSSE3_FAST-NEXT: retq
;		;
; AVX1_SLOW-LABEL: PR34724_2:		; AVX1_SLOW-LABEL: PR34724_2:
; AVX1_SLOW: # %bb.0:		; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3]		; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3]
; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]		; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1		; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3]
; AVX1_SLOW-NEXT: retq		; AVX1_SLOW-NEXT: retq
;		;
; AVX1_FAST-LABEL: PR34724_2:		; AVX1_FAST-LABEL: PR34724_2:
; AVX1_FAST: # %bb.0:		; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3]		; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3]
; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1		; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3]
; AVX1_FAST-NEXT: retq		; AVX1_FAST-NEXT: retq
;		;
; AVX2_SLOW-LABEL: PR34724_2:		; AVX2_SLOW-LABEL: PR34724_2:
; AVX2_SLOW: # %bb.0:		; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3]		; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3]
; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]		; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1		; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3]
; AVX2_SLOW-NEXT: retq		; AVX2_SLOW-NEXT: retq
;		;
; AVX2_FAST-LABEL: PR34724_2:		; AVX2_FAST-LABEL: PR34724_2:
; AVX2_FAST: # %bb.0:		; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3]		; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3]
; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1		; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]		; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3]
; AVX2_FAST-NEXT: retq		; AVX2_FAST-NEXT: retq
%t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>		%t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
%t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>		%t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
%t2 = fadd <4 x float> %t0, %t1		%t2 = fadd <4 x float> %t0, %t1
%vecinit9 = shufflevector <4 x float> %t2, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>		%vecinit9 = shufflevector <4 x float> %t2, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
%t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>		%t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
%t4 = fadd <4 x float> %t3, %b		%t4 = fadd <4 x float> %t3, %b
%vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>		%vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
ret <4 x float> %vecinit13		ret <4 x float> %vecinit13
}		}

llvm/test/CodeGen/X86/haddsub-undef.ll

	Show First 20 Lines • Show All 825 Lines • ▼ Show 20 Lines
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>			%3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
	%4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>			%4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
	%5 = fadd <4 x double> %3, %4			%5 = fadd <4 x double> %3, %4
	ret <4 x double> %5			ret <4 x double> %5
	}			}

	define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {			define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
	; SSE-LABEL: PR45747_1:			; SSE-SLOW-LABEL: PR45747_1:
	; SSE: # %bb.0:			; SSE-SLOW: # %bb.0:
	; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]			; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
	; SSE-NEXT: addps %xmm0, %xmm1			; SSE-SLOW-NEXT: addps %xmm0, %xmm1
	; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3]			; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3]
	; SSE-NEXT: movaps %xmm1, %xmm0			; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
	; SSE-NEXT: retq			; SSE-SLOW-NEXT: retq
	;			;
	; AVX-LABEL: PR45747_1:			; SSE-FAST-LABEL: PR45747_1:
	; AVX: # %bb.0:			; SSE-FAST: # %bb.0:
	; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]			; SSE-FAST-NEXT: haddps %xmm0, %xmm0
	; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0			; SSE-FAST-NEXT: retq
	; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]			;
	; AVX-NEXT: retq			; AVX-SLOW-LABEL: PR45747_1:
				; AVX-SLOW: # %bb.0:
				; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
				; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
				; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
				; AVX-SLOW-NEXT: retq
				;
				; AVX-FAST-LABEL: PR45747_1:
				; AVX-FAST: # %bb.0:
				; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
				; AVX-FAST-NEXT: retq
	%t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>			%t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
	%t1 = fadd <4 x float> %t0, %a			%t1 = fadd <4 x float> %t0, %a
	%shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>			%shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
	ret <4 x float> %shuffle			ret <4 x float> %shuffle
	}			}

	define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {			define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
	; SSE-LABEL: PR45747_2:			; SSE-SLOW-LABEL: PR45747_2:
	; SSE: # %bb.0:			; SSE-SLOW: # %bb.0:
	; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]			; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
	; SSE-NEXT: addps %xmm1, %xmm0			; SSE-SLOW-NEXT: addps %xmm1, %xmm0
	; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]			; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
	; SSE-NEXT: retq			; SSE-SLOW-NEXT: retq
	;			;
	; AVX-LABEL: PR45747_2:			; SSE-FAST-LABEL: PR45747_2:
	; AVX: # %bb.0:			; SSE-FAST: # %bb.0:
	; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]			; SSE-FAST-NEXT: movaps %xmm1, %xmm0
	; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0			; SSE-FAST-NEXT: haddps %xmm1, %xmm0
	; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]			; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
	; AVX-NEXT: retq			; SSE-FAST-NEXT: retq
				;
				; AVX-SLOW-LABEL: PR45747_2:
				; AVX-SLOW: # %bb.0:
				; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
				; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
				; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
				; AVX-SLOW-NEXT: retq
				;
				; AVX-FAST-LABEL: PR45747_2:
				; AVX-FAST: # %bb.0:
				; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0
				; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
				; AVX-FAST-NEXT: retq
	%t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>			%t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
	%t1 = fadd <4 x float> %t0, %b			%t1 = fadd <4 x float> %t0, %b
	%shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>			%shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
	ret <4 x float> %shuffle			ret <4 x float> %shuffle
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y))
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 279163

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/haddsub-3.ll

llvm/test/CodeGen/X86/haddsub-shuf.ll

llvm/test/CodeGen/X86/haddsub-undef.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y))
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 279163

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/haddsub-3.ll

llvm/test/CodeGen/X86/haddsub-shuf.ll

llvm/test/CodeGen/X86/haddsub-undef.ll

[X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y))
ClosedPublic