Diff 129789

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,741 Lines • ▼ Show 20 Lines	return DAG.getVectorShuffle(
VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,		VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
DL, VT, V1, V2),		DL, VT, V1, V2),
DAG.getUNDEF(VT), PermMask);		DAG.getUNDEF(VT), PermMask);
}		}

return SDValue();		return SDValue();
}		}

		/// \brief Try to lower a two-input shuffle as two lane-gathering shuffles
		/// followed by one lane-repeated shuffle.
		///
		/// The pattern matched is: every defined element in the low 128-bit lane of
		/// the result is taken from V1, every defined element in the high 128-bit
		/// lane is taken from V2, and both lanes use the same element indices
		/// relative to their source vector. When that holds we build
		///   NewV1 = [ low 128-bit lane of V1 | low 128-bit lane of V2 ]
		///   NewV2 = [ high 128-bit lane of V1 | high 128-bit lane of V2 ]
		/// and finish with a shuffle of NewV1/NewV2 whose mask is identical in every
		/// 128-bit lane.
		///
		/// \param DL   Debug location attached to the created nodes.
		/// \param VT   Vector type of the shuffle.
		/// \param V1   First shuffle operand (mask indices [0, Size)).
		/// \param V2   Second shuffle operand (mask indices [Size, 2 * Size)).
		/// \param Mask Shuffle mask; negative entries are treated as undef.
		/// \param DAG  SelectionDAG used to create the new shuffle nodes.
		/// \returns The final shuffle node, or an empty SDValue if Mask does not fit
		/// the pattern.
		///
		/// NOTE(review): the source check compares the operand number (0 or 1) with
		/// the destination lane number, so this looks like it is only meant for
		/// vectors with exactly two 128-bit lanes (the visible callers pass v4f64
		/// and v8f32) - confirm before calling it with wider types.
		static SDValue lowerVectorShuffleSplitLowHigh(const SDLoc &DL,
		                                              MVT VT, SDValue V1,
		                                              SDValue V2,
		                                              ArrayRef<int> Mask,
		                                              SelectionDAG &DAG) {
		  int Size = Mask.size();
		  int LaneSize = 128 / VT.getScalarSizeInBits();

		  // Per-lane pattern shared by all lanes. It is only ever indexed with
		  // (i % LaneSize), so it needs LaneSize entries rather than Size.
		  SmallVector<int, 8> RepeatMask(LaneSize, -1);
		  for (int i = 0; i != Size; ++i) {
		    int M = Mask[i];
		    if (M < 0)
		      continue;

		    // M / Size is the source operand (0 = V1, 1 = V2) and i / LaneSize is
		    // the destination lane; they must agree.
		    if ((M / Size) != (i / LaneSize))
		      return SDValue();

		    // Index of the element inside its own source vector.
		    int LocalM = M % Size;
		    int &Repeated = RepeatMask[i % LaneSize];
		    if (Repeated < 0)
		      Repeated = LocalM;
		    else if (Repeated != LocalM)
		      return SDValue(); // The lanes disagree on this position.
		  }

		  // Gather the low lanes of V1/V2 into NewV1 and the high lanes into NewV2.
		  SmallVector<int, 8> LowLanesMask(Size, -1);
		  SmallVector<int, 8> HighLanesMask(Size, -1);
		  for (int i = 0; i != Size; ++i) {
		    LowLanesMask[i] = (i % LaneSize) + (i / LaneSize) * Size;
		    HighLanesMask[i] = LowLanesMask[i] + LaneSize;
		  }
		  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, LowLanesMask);
		  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, HighLanesMask);

		  // Build the lane-repeated mask over NewV1/NewV2. Because RepeatMask merges
		  // all lanes, a position stays undef only if it was undef in every lane of
		  // the original mask.
		  SmallVector<int, 8> PermuteMask(Size, -1);
		  for (int i = 0; i != Size; ++i) {
		    int M = RepeatMask[i % LaneSize];
		    if (M < 0)
		      continue;

		    // Elements that lived in the high lane of their source now live in
		    // NewV2, whose indices start at Size.
		    if (M >= LaneSize)
		      M += Size - LaneSize;

		    // Select the copy that sits in this destination lane.
		    PermuteMask[i] = M + (i / LaneSize) * LaneSize;
		  }

		  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, PermuteMask);
		}

/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.		/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///		///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full		/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These		/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so		/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where		/// it is better to avoid lowering through this for integer vectors where
/// possible.		/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,		static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
▲ Show 20 Lines • Show All 1,924 Lines • ▼ Show 20 Lines
/// This will only succeed when the result of fixing the 128-bit lanes results		/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in		/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lanes. This handles many cases where we can quickly blend away		/// each 128-bit lanes. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.		/// the lane crosses early and then use simpler shuffles within each lane.
///		///
/// FIXME: It might be worthwhile at some point to support this without		/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently		/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even		/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still marginally more expensive.		/// those are still marginally more expensive.
		RKSimonUnsubmitted Not Done Reply Inline Actions This comment + FIXME needs updating RKSimon: This comment + FIXME needs updating
static SDValue lowerVectorShuffleByMerging128BitLanes(		static SDValue lowerVectorShuffleByMerging128BitLanes(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,		const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {		const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");		assert(!V2.isUndef() && "This is only useful with multiple inputs.");

int Size = Mask.size();		int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();		int LaneSize = 128 / VT.getScalarSizeInBits();
int NumLanes = Size / LaneSize;		int NumLanes = Size / LaneSize;
		RKSimonUnsubmitted Not Done Reply Inline Actions There is a is128BitLaneRepeatedShuffleMask(VT, Mask) version that you can use instead RKSimon: There is a is128BitLaneRepeatedShuffleMask(VT, Mask) version that you can use instead
assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");		assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also		// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
// check whether the in-128-bit lane shuffles share a repeating pattern.		// check whether the in-128-bit lane shuffles share a repeating pattern.
SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);		SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);		SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
for (int i = 0; i < Size; ++i) {		for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)		if (Mask[i] < 0)
		RKSimonUnsubmitted Not Done Reply Inline Actions This seems easier to grok (at least to me): int Src; if (Srcs[0] < 0 \|\| Srcs[0] == LaneSrc) Src = 0; else if (Srcs[1] < 0 \|\| Srcs[1] == LaneSrc) Src = 1; else return SDValue(); Srcs[Src] = LaneSrc; InLaneMask[i] = (M % LaneSize) + Src * Size; RKSimon: This seems easier to grok (at least to me): ``` int Src; if (Srcs[0] < 0 \|\| Srcs[0] == LaneSrc)…
continue;		continue;

int j = i / LaneSize;		int j = i / LaneSize;

if (Lanes[j] < 0) {		if (Lanes[j] < 0) {
// First entry we've seen for this lane.		// First entry we've seen for this lane.
Lanes[j] = Mask[i] / LaneSize;		Lanes[j] = Mask[i] / LaneSize;
} else if (Lanes[j] != Mask[i] / LaneSize) {		} else if (Lanes[j] != Mask[i] / LaneSize) {
// This doesn't match the lane selected previously!		// This doesn't match the lane selected previously!
return SDValue();		return SDValue();
}		}

		RKSimonUnsubmitted Not Done Reply Inline Actions for (int i = 0, e = M1.size(); i != e; ++i) RKSimon: for (int i = 0, e = M1.size(); i != e; ++i)
// Check that within each lane we have a consistent shuffle mask.		// Check that within each lane we have a consistent shuffle mask.
int k = i % LaneSize;		int k = i % LaneSize;
if (InLaneMask[k] < 0) {		if (InLaneMask[k] < 0) {
InLaneMask[k] = Mask[i] % LaneSize;		InLaneMask[k] = Mask[i] % LaneSize;
} else if (InLaneMask[k] != Mask[i] % LaneSize) {		} else if (InLaneMask[k] != Mask[i] % LaneSize) {
// This doesn't fit a repeating in-lane mask.		// This doesn't fit a repeating in-lane mask.
return SDValue();		return SDValue();
}		}
		RKSimonUnsubmitted Not Done Reply Inline Actions for (int i = 0, e = MergedMask.size(); i != e; ++i) { RKSimon: for (int i = 0, e = MergedMask.size(); i != e; ++i) {
}		}

// First shuffle the lanes into place.		// First shuffle the lanes into place.
MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,		MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
VT.getSizeInBits() / 64);		VT.getSizeInBits() / 64);
SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);		SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
for (int i = 0; i < NumLanes; ++i)		for (int i = 0; i < NumLanes; ++i)
if (Lanes[i] >= 0) {		if (Lanes[i] >= 0) {
▲ Show 20 Lines • Show All 468 Lines • ▼ Show 20 Lines	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
return V;		return V;

// If we have AVX2 then we always want to lower with a blend because at v4 we		// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.		// can fully permute the elements.
if (Subtarget.hasAVX2())		if (Subtarget.hasAVX2())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,		return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
Mask, DAG);		Mask, DAG);

		// Attempt to lower a shuffle where one lane comes from V1 and the other
		// lane comes from V2 and the lanes do the same operation. We can create
		// a new V1 with the lower lane of V1 and the lower lane of V2. And a new
		// V2 with the upper lane of V1 and the upper lane of V2 and then do a
		// repeated lane shuffle.
		if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v4f64, V1, V2,
		Mask, DAG))
		return V;

// Otherwise fall back on generic lowering.		// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);		return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}		}

/// \brief Handle lowering of 4-lane 64-bit integer shuffles.		/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
///		///
/// This routine is only called when we have AVX2 and thus a reasonable		/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.		/// instruction set for v4i64 shuffling.
▲ Show 20 Lines • Show All 177 Lines • ▼ Show 20 Lines	if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
return V;		return V;

// If we have AVX2 then we always want to lower with a blend because at v8 we		// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.		// can fully permute the elements.
if (Subtarget.hasAVX2())		if (Subtarget.hasAVX2())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,		return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
Mask, DAG);		Mask, DAG);

		// Attempt to lower a shuffle where one lane comes from V1 and the other
		// lane comes from V2 and the lanes do the same operation. We can create
		// a new V1 with the lower lane of V1 and the lower lane of V2. And a new
		// V2 with the upper lane of V1 and the upper lane of V2 and then do a
		// repeated lane shuffle.
		if (SDValue V = lowerVectorShuffleSplitLowHigh(DL, MVT::v8f32, V1, V2,
		Mask, DAG))
		return V;

// Otherwise fall back on generic lowering.		// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);		return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}		}

/// \brief Handle lowering of 8-lane 32-bit integer shuffles.		/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
///		///
/// This routine is only called when we have AVX2 and thus a reasonable		/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.		/// instruction set for v8i32 shuffling.
▲ Show 20 Lines • Show All 25,417 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-256-v4.ll

	Show First 20 Lines • Show All 1,665 Lines • ▼ Show 20 Lines
	;			;
	; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z:			; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z:
	; AVX512VL-FAST: # %bb.0:			; AVX512VL-FAST: # %bb.0:
	; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero			; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
	; AVX512VL-FAST-NEXT: retq			; AVX512VL-FAST-NEXT: retq
	%1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>			%1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
	ret <4 x i64> %1			ret <4 x i64> %1
	}			}

				; Shuffles out the even-indexed (0,2,4,6) and the odd-indexed (1,3,5,7)
				; elements of the 8-element concatenation of %a and %b and adds the two
				; results with fadd. The CHECK lines below pin the expected code for each
				; check prefix.
				define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
				; AVX1-LABEL: add_v4f64_0246_1357:
				; AVX1: # %bb.0: # %entry
				; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
				; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
				; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
				; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
				; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: add_v4f64_0246_1357:
				; AVX2: # %bb.0: # %entry
				; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX2-NEXT: vaddpd %ymm0, %ymm2, %ymm0
				; AVX2-NEXT: retq
				;
				; AVX512VL-SLOW-LABEL: add_v4f64_0246_1357:
				; AVX512VL-SLOW: # %bb.0: # %entry
				; AVX512VL-SLOW-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vaddpd %ymm0, %ymm2, %ymm0
				; AVX512VL-SLOW-NEXT: retq
				;
				; AVX512VL-FAST-LABEL: add_v4f64_0246_1357:
				; AVX512VL-FAST: # %bb.0: # %entry
				; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6]
				; AVX512VL-FAST-NEXT: vpermi2pd %ymm1, %ymm0, %ymm2
				; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,3,5,7]
				; AVX512VL-FAST-NEXT: vpermi2pd %ymm1, %ymm0, %ymm3
				; AVX512VL-FAST-NEXT: vaddpd %ymm3, %ymm2, %ymm0
				; AVX512VL-FAST-NEXT: retq
				entry:
				%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
				%shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
				%add = fadd <4 x double> %shuffle, %shuffle1
				ret <4 x double> %add
				}

				; Same even/odd split-and-fadd pattern as the 0246_1357 variant, but with
				; the operand halves swapped: each shuffle takes %b's elements (indices
				; 4..7) for its low half and %a's elements (indices 0..3) for its high half.
				define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) {
				; AVX1-LABEL: add_v4f64_4602_5713:
				; AVX1: # %bb.0: # %entry
				; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
				; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
				; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
				; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
				; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: add_v4f64_4602_5713:
				; AVX2: # %bb.0: # %entry
				; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX2-NEXT: vaddpd %ymm0, %ymm2, %ymm0
				; AVX2-NEXT: retq
				;
				; AVX512VL-SLOW-LABEL: add_v4f64_4602_5713:
				; AVX512VL-SLOW: # %bb.0: # %entry
				; AVX512VL-SLOW-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vaddpd %ymm0, %ymm2, %ymm0
				; AVX512VL-SLOW-NEXT: retq
				;
				; AVX512VL-FAST-LABEL: add_v4f64_4602_5713:
				; AVX512VL-FAST: # %bb.0: # %entry
				; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6]
				; AVX512VL-FAST-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2
				; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,3,5,7]
				; AVX512VL-FAST-NEXT: vpermi2pd %ymm0, %ymm1, %ymm3
				; AVX512VL-FAST-NEXT: vaddpd %ymm3, %ymm2, %ymm0
				; AVX512VL-FAST-NEXT: retq
				entry:
				%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
				%shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
				%add = fadd <4 x double> %shuffle, %shuffle1
				ret <4 x double> %add
				}

				; Integer (<4 x i64>) version of the even/odd split test: shuffles out
				; elements 0,2,4,6 and 1,3,5,7 of the concatenation of %a and %b and adds
				; the two results with an integer add. The CHECK lines pin the expected
				; code for each check prefix.
				define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) {
				; AVX1-LABEL: add_v4i64_0246_1357:
				; AVX1: # %bb.0: # %entry
				; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
				; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
				; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
				; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
				; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
				; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
				; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
				; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
				; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: add_v4i64_0246_1357:
				; AVX2: # %bb.0: # %entry
				; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
				; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
				; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
				; AVX2-NEXT: retq
				;
				; AVX512VL-SLOW-LABEL: add_v4i64_0246_1357:
				; AVX512VL-SLOW: # %bb.0: # %entry
				; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
				; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
				; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
				; AVX512VL-SLOW-NEXT: retq
				;
				; AVX512VL-FAST-LABEL: add_v4i64_0246_1357:
				; AVX512VL-FAST: # %bb.0: # %entry
				; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
				; AVX512VL-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
				; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
				; AVX512VL-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
				; AVX512VL-FAST-NEXT: vpaddq %ymm3, %ymm2, %ymm0
				; AVX512VL-FAST-NEXT: retq
				entry:
				%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
				%shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
				%add = add <4 x i64> %shuffle, %shuffle1
				ret <4 x i64> %add
				}

				; Integer (<4 x i64>) even/odd split-and-add test with the operand halves
				; swapped: each shuffle takes %b's elements (indices 4..7) for its low half
				; and %a's elements (indices 0..3) for its high half.
				define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) {
				; AVX1-LABEL: add_v4i64_4602_5713:
				; AVX1: # %bb.0: # %entry
				; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
				; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
				; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
				; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
				; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
				; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
				; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
				; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
				; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: add_v4i64_4602_5713:
				; AVX2: # %bb.0: # %entry
				; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
				; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
				; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
				; AVX2-NEXT: retq
				;
				; AVX512VL-SLOW-LABEL: add_v4i64_4602_5713:
				; AVX512VL-SLOW: # %bb.0: # %entry
				; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
				; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
				; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
				; AVX512VL-SLOW-NEXT: retq
				;
				; AVX512VL-FAST-LABEL: add_v4i64_4602_5713:
				; AVX512VL-FAST: # %bb.0: # %entry
				; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
				; AVX512VL-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
				; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
				; AVX512VL-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm3
				; AVX512VL-FAST-NEXT: vpaddq %ymm3, %ymm2, %ymm0
				; AVX512VL-FAST-NEXT: retq
				entry:
				%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
				%shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
				%add = add <4 x i64> %shuffle, %shuffle1
				ret <4 x i64> %add
				}

test/CodeGen/X86/vector-shuffle-256-v8.ll

	Show First 20 Lines • Show All 854 Lines • ▼ Show 20 Lines
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>			%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
	ret <8 x float> %shuffle			ret <8 x float> %shuffle
	}			}

	define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {			define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
	; AVX1-LABEL: PR21138:			; AVX1-LABEL: PR21138:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2			; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
	; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1			; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
	; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
	; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: PR21138:			; AVX2-LABEL: PR21138:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]			; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
	; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]			; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	▲ Show 20 Lines • Show All 1,537 Lines • ▼ Show 20 Lines
	;			;
	; AVX512VL-LABEL: shuffle_v8i32_12345670:			; AVX512VL-LABEL: shuffle_v8i32_12345670:
	; AVX512VL: # %bb.0:			; AVX512VL: # %bb.0:
	; AVX512VL-NEXT: valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,0]			; AVX512VL-NEXT: valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,0]
	; AVX512VL-NEXT: retq			; AVX512VL-NEXT: retq
	%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>			%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
	ret <8 x i32> %shuffle			ret <8 x i32> %shuffle
	}			}

				; Shuffles out the even-indexed (0,2,...,14) and the odd-indexed
				; (1,3,...,15) elements of the 16-element concatenation of %a and %b and
				; adds the two results with fadd. The CHECK lines pin the expected code
				; for each check prefix.
				define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) {
				; AVX1-LABEL: add_v8f32_02468ACE_13579BDF:
				; AVX1: # %bb.0: # %entry
				; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
				; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
				; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
				; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
				; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
				; AVX2: # %bb.0: # %entry
				; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0
				; AVX2-NEXT: retq
				;
				; AVX512VL-SLOW-LABEL: add_v8f32_02468ACE_13579BDF:
				; AVX512VL-SLOW: # %bb.0: # %entry
				; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0
				; AVX512VL-SLOW-NEXT: retq
				;
				; AVX512VL-FAST-LABEL: add_v8f32_02468ACE_13579BDF:
				; AVX512VL-FAST: # %bb.0: # %entry
				; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
				; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm2
				; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
				; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3
				; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0
				; AVX512VL-FAST-NEXT: retq
				entry:
				%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				%shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
				%add = fadd <8 x float> %shuffle, %shuffle1
				ret <8 x float> %add
				}

				; Same even/odd split-and-fadd pattern as the 02468ACE_13579BDF variant,
				; but with the operand halves swapped: each shuffle takes %b's elements
				; (indices 8..15) for its low half and %a's elements (indices 0..7) for its
				; high half.
				define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b) {
				; AVX1-LABEL: add_v8f32_8ACE0246_9BDF1357:
				; AVX1: # %bb.0: # %entry
				; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
				; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
				; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
				; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
				; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357:
				; AVX2: # %bb.0: # %entry
				; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0
				; AVX2-NEXT: retq
				;
				; AVX512VL-SLOW-LABEL: add_v8f32_8ACE0246_9BDF1357:
				; AVX512VL-SLOW: # %bb.0: # %entry
				; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0
				; AVX512VL-SLOW-NEXT: retq
				;
				; AVX512VL-FAST-LABEL: add_v8f32_8ACE0246_9BDF1357:
				; AVX512VL-FAST: # %bb.0: # %entry
				; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
				; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2
				; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
				; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm3
				; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0
				; AVX512VL-FAST-NEXT: retq
				entry:
				%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6>
				%shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7>
				%add = fadd <8 x float> %shuffle, %shuffle1
				ret <8 x float> %add
				}

				; Integer (<8 x i32>) version of the even/odd split test: shuffles out
				; elements 0,2,...,14 and 1,3,...,15 of the concatenation of %a and %b and
				; adds the two results with an integer add. The CHECK lines pin the
				; expected code for each check prefix.
				define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) {
				; AVX1-LABEL: add_v8i32_02468ACE_13579BDF:
				; AVX1: # %bb.0: # %entry
				; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
				; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
				; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
				; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
				; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
				; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
				; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
				; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
				; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: add_v8i32_02468ACE_13579BDF:
				; AVX2: # %bb.0: # %entry
				; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
				; AVX2-NEXT: retq
				;
				; AVX512VL-SLOW-LABEL: add_v8i32_02468ACE_13579BDF:
				; AVX512VL-SLOW: # %bb.0: # %entry
				; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0
				; AVX512VL-SLOW-NEXT: retq
				;
				; AVX512VL-FAST-LABEL: add_v8i32_02468ACE_13579BDF:
				; AVX512VL-FAST: # %bb.0: # %entry
				; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
				; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
				; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
				; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm3
				; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0
				; AVX512VL-FAST-NEXT: retq
				entry:
				%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				%shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
				%add = add <8 x i32> %shuffle, %shuffle1
				ret <8 x i32> %add
				}

				; Integer (<8 x i32>) even/odd split-and-add test with the operand halves
				; swapped: each shuffle takes %b's elements (indices 8..15) for its low
				; half and %a's elements (indices 0..7) for its high half.
				define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) {
				; AVX1-LABEL: add_v8i32_8ACE0246_9BDF1357:
				; AVX1: # %bb.0: # %entry
				; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
				; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
				; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
				; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
				; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
				; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
				; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
				; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
				; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: add_v8i32_8ACE0246_9BDF1357:
				; AVX2: # %bb.0: # %entry
				; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
				; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
				; AVX2-NEXT: retq
				;
				; AVX512VL-SLOW-LABEL: add_v8i32_8ACE0246_9BDF1357:
				; AVX512VL-SLOW: # %bb.0: # %entry
				; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
				; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
				; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0
				; AVX512VL-SLOW-NEXT: retq
				;
				; AVX512VL-FAST-LABEL: add_v8i32_8ACE0246_9BDF1357:
				; AVX512VL-FAST: # %bb.0: # %entry
				; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
				; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
				; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
				; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm3
				; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0
				; AVX512VL-FAST-NEXT: retq
				entry:
				%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6>
				%shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7>
				%add = add <8 x i32> %shuffle, %shuffle1
				ret <8 x i32> %add
				}

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Improve AVX1 shuffle lowering for v8f32 shuffles where the low half comes from V1 and the high half comes from V2 and the halves do the same operation
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129789

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/vector-shuffle-256-v4.ll

test/CodeGen/X86/vector-shuffle-256-v8.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Improve AVX1 shuffle lowering for v8f32 shuffles where the low half comes from V1 and the high half comes from V2 and the halves do the same operation (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129789

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/vector-shuffle-256-v4.ll

test/CodeGen/X86/vector-shuffle-256-v8.ll

[X86] Improve AVX1 shuffle lowering for v8f32 shuffles where the low half comes from V1 and the high half comes from V2 and the halves do the same operation
ClosedPublic