Diff 375932

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 12,686 Lines • ▼ Show 20 Lines
/// blends and permutes.		/// blends and permutes.
///		///
/// This matches the extremely common pattern for handling combined		/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend		/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and		/// operations. It will try to pick the best arrangement of shuffles and
/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.		/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
static SDValue lowerShuffleAsDecomposedShuffleMerge(		static SDValue lowerShuffleAsDecomposedShuffleMerge(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,		const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {		const X86Subtarget &Subtarget, SelectionDAG &DAG,
		bool BlendOfIdentitiesOnly = false) {
int NumElts = Mask.size();		int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;		int NumLanes = VT.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;		int NumEltsPerLane = NumElts / NumLanes;

// Shuffle the input elements into the desired positions in V1 and V2 and		// Shuffle the input elements into the desired positions in V1 and V2 and
// unpack/blend them together.		// unpack/blend them together.
bool IsAlternating = true;		bool IsAlternating = true;
SmallVector<int, 32> V1Mask(NumElts, -1);		SmallVector<int, 32> V1Mask(NumElts, -1);
Show All 39 Lines	static SDValue lowerShuffleAsDecomposedShuffleMerge(
// See if we can simplify non-no-op shuffles into broadcasts,		// See if we can simplify non-no-op shuffles into broadcasts,
// which we consider to be strictly better than an arbitrary shuffle.		// which we consider to be strictly better than an arbitrary shuffle.
if (isNoopOrBroadcastShuffleMask(V1Mask) &&		if (isNoopOrBroadcastShuffleMask(V1Mask) &&
isNoopOrBroadcastShuffleMask(V2Mask)) {		isNoopOrBroadcastShuffleMask(V2Mask)) {
canonicalizeBroadcastableInput(V1, V1Mask);		canonicalizeBroadcastableInput(V1, V1Mask);
canonicalizeBroadcastableInput(V2, V2Mask);		canonicalizeBroadcastableInput(V2, V2Mask);
}		}

		if (BlendOfIdentitiesOnly &&
		!(isNoopShuffleMask(V1Mask) && isNoopShuffleMask(V2Mask)))
		return SDValue();

// Try to lower with the simpler initial blend/unpack/rotate strategies unless		// Try to lower with the simpler initial blend/unpack/rotate strategies unless
// one of the input shuffles would be a no-op. We prefer to shuffle inputs as		// one of the input shuffles would be a no-op. We prefer to shuffle inputs as
// the shuffle may be able to fold with a load or other benefit. However, when		// the shuffle may be able to fold with a load or other benefit. However, when
// we'll have to do 2x as many shuffles in order to achieve this, a 2-input		// we'll have to do 2x as many shuffles in order to achieve this, a 2-input
// pre-shuffle first is a better strategy.		// pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {		if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
// Only prefer immediate blends to unpack/rotate.		// Only prefer immediate blends to unpack/rotate.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,		if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
Show All 32 Lines	for (int i = 0; i != NumElts; i += NumEltsPerLane)
}		}
}		}

V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);		V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);		V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);		return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
}		}

		/// Restricted form of lowerShuffleAsDecomposedShuffleMerge().
		///
		/// Forwards to lowerShuffleAsDecomposedShuffleMerge() with
		/// BlendOfIdentitiesOnly set, so that routine gives up (returns an empty
		/// SDValue) unless both per-input shuffle masks it computes are no-ops.
		/// This allows the pattern to be tried at an earlier point than the full
		/// lowerShuffleAsDecomposedShuffleMerge() invocation.
		///
		/// Note: \p Zeroable is accepted for signature parity with the other
		/// lowering helpers but is not consulted here.
		static SDValue lowerShuffleAsBlendOfIdentities(
		const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, MVT VT,
		SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
		return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
		DAG,
		/*BlendOfIdentitiesOnly=*/true);
		}
		RKSimonUnsubmitted Done Reply Inline Actions lowerShuffleAsDecomposedShuffleMerge does some of this already - can we not extend that? RKSimon: lowerShuffleAsDecomposedShuffleMerge does some of this already - can we not extend that?
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions I'm not following. That is exactly what i'm doing here, extending `lowerShuffleAsDecomposedShuffleMerge()`. The point of `lowerShuffleAsBlendOfIdentities()` is to be able to run the relevant portion of `lowerShuffleAsDecomposedShuffleMerge()` at some earlier point (before the full `lowerShuffleAsDecomposedShuffleMerge()` invocation), catching the target pattern only. lebedev.ri: I'm not following. That is exactly what i'm doing here, extending…
		RKSimonUnsubmitted Done Reply Inline Actions I'm sorry, that was very badly worded - I was looking at the MVT::v4i64 case which is calling lowerShuffleAsBlendOfIdentities directly before lowerShuffleAsDecomposedShuffleMerge - not sure why I put the comment up here instead. RKSimon: I'm sorry, that was very badly worded - I was looking at the MVT::v4i64 case which is calling…
		lebedev.riAuthorUnsubmitted Done Reply Inline Actions Ah! lebedev.ri: Ah!

/// Try to lower a vector shuffle as a bit rotation.		/// Try to lower a vector shuffle as a bit rotation.
///		///
/// Look for a repeated rotation pattern in each sub group.		/// Look for a repeated rotation pattern in each sub group.
/// Returns a ISD::ROTL element rotation amount or -1 if failed.		/// Returns a ISD::ROTL element rotation amount or -1 if failed.
static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {		static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
int NumElts = Mask.size();		int NumElts = Mask.size();
assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");		assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");

▲ Show 20 Lines • Show All 4,419 Lines • ▼ Show 20 Lines	if (Subtarget.hasAVX2())
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));		getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

// Try to create an in-lane repeating shuffle mask and then shuffle the		// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.		// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(		if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;		return V;

		// FIXME: we should run `lowerShuffleAsBlendOfIdentities()` here.

// Try to permute the lanes and then use a per-lane permute.		// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,		if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
Mask, DAG, Subtarget))		Mask, DAG, Subtarget))
return V;		return V;

// Otherwise, fall back.		// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,		return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
DAG, Subtarget);		DAG, Subtarget);
}		}

// Use dedicated unpack instructions for masks that match their pattern.		// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))		if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;		return V;

if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,		if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))		Zeroable, Subtarget, DAG))
return Blend;		return Blend;

// Check if the blend happens to exactly fit that of SHUFPD.		// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,		if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))		Zeroable, Subtarget, DAG))
return Op;		return Op;

		// See if this shuffle can be represented as a broadcast of 0'th element
		// of some input, and a blend between said broadcast and an input.
		if (SDValue Blend = lowerShuffleAsBlendOfIdentities(
		DL, Mask, Zeroable, MVT::v4f64, V1, V2, Subtarget, DAG))
		return Blend;

// If we have lane crossing shuffles AND they don't all come from the lower		// If we have lane crossing shuffles AND they don't all come from the lower
// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).		// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently		// TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
// canonicalize to a blend of splat which isn't necessary for this combine.		// canonicalize to a blend of splat which isn't necessary for this combine.
if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&		if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
!all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&		!all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
(V1.getOpcode() != ISD::BUILD_VECTOR) &&		(V1.getOpcode() != ISD::BUILD_VECTOR) &&
(V2.getOpcode() != ISD::BUILD_VECTOR))		(V2.getOpcode() != ISD::BUILD_VECTOR))
if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,		if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
Mask, DAG))		Mask, DAG))
return Op;		return Op;

// If we have one input in place, then we can permute the other input and		// If we have one input in place, then we can permute the other input and
// blend the result.		// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))		if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,		return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
		RKSimonUnsubmitted Not Done Reply Inline Actions We're doing something very similar here as well - but I have a suspicion that the lowerShuffleAsLanePermuteAndSHUFP is placed very specifically ..... RKSimon: We're doing something very similar here as well - but I have a suspicion that the…
Subtarget, DAG);		Subtarget, DAG);

// Try to create an in-lane repeating shuffle mask and then shuffle the		// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.		// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(		if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;		return V;

▲ Show 20 Lines • Show All 36,722 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll

	Show First 20 Lines • Show All 145 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%r = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>			%r = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
	ret <4 x double> %r			ret <4 x double> %r
	}			}

	define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {			define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
	; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:			; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1			; CHECK-NEXT: vbroadcastsd %xmm1, %ymm1
	; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]			; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>			%r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
	ret <4 x double> %r			ret <4 x double> %r
	}			}

	define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_3_unary(<4 x double> %x) nounwind {			define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_3_unary(<4 x double> %x) nounwind {
	; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_3_unary:			; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_3_unary:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	▲ Show 20 Lines • Show All 702 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/horizontal-sum.ll

	Show First 20 Lines • Show All 267 Lines • ▼ Show 20 Lines
	; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1			; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
	; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4			; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
	; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2			; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
	; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]			; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
	; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]			; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
	; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]			; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
	; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]			; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
	; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1			; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
	; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]			; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0			; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
	; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]			; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2			; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm1
	; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2			; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
	; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1			; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
	; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]			; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
	; AVX2-SLOW-NEXT: retq			; AVX2-SLOW-NEXT: retq
				pengfeiUnsubmitted Not Done Reply Inline Actions Why are these affected? pengfei: Why are these affected?
	;			;
	; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:			; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
	; AVX2-FAST: # %bb.0:			; AVX2-FAST: # %bb.0:
	; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0			; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
	; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0			; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
	; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1			; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
	; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4			; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
	; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2			; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
	; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]			; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
	; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]			; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
	; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]			; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
	; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]			; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
	; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1			; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
	; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]			; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0			; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
	; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]			; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2			; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm1
	; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2			; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm1
	; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1			; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
	; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]			; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
	; AVX2-FAST-NEXT: retq			; AVX2-FAST-NEXT: retq
	%9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>			%9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
	%10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>			%10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
	%11 = fadd <2 x float> %9, %10			%11 = fadd <2 x float> %9, %10
	%12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>			%12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
	%13 = fadd <2 x float> %11, %12			%13 = fadd <2 x float> %11, %12
	%14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>			%14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
	%15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>			%15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
	▲ Show 20 Lines • Show All 849 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/subvector-broadcast.ll

	Show First 20 Lines • Show All 1,671 Lines • ▼ Show 20 Lines
	; X64-NEXT: retq			; X64-NEXT: retq
	%vec = load <2 x double>, <2 x double>* %vp			%vec = load <2 x double>, <2 x double>* %vp
	%shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>			%shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
	%res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default			%res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
	ret <4 x double> %res			ret <4 x double> %res
	}			}

	define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {			define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
	; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:			; X86-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
	; X86: # %bb.0:			; X86-AVX1: # %bb.0:
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vbroadcastsd (%eax), %ymm1			; X86-AVX1-NEXT: vbroadcastsd (%eax), %ymm1
	; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]			; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
	; X86-NEXT: retl			; X86-AVX1-NEXT: retl
	;			;
	; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:			; X86-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
	; X64: # %bb.0:			; X86-AVX2: # %bb.0:
	; X64-NEXT: vbroadcastsd (%rdi), %ymm1			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]			; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
	; X64-NEXT: retq			; X86-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
				; X86-AVX2-NEXT: retl
				;
				; X86-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
				; X86-AVX512: # %bb.0:
				; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
				; X86-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
				; X86-AVX512-NEXT: retl
				;
				; X64-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
				; X64-AVX1: # %bb.0:
				; X64-AVX1-NEXT: vbroadcastsd (%rdi), %ymm1
				; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
				; X64-AVX1-NEXT: retq
				;
				; X64-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
				; X64-AVX2: # %bb.0:
				; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
				; X64-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
				; X64-AVX2-NEXT: retq
				;
				; X64-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
				; X64-AVX512: # %bb.0:
				; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
				; X64-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
				; X64-AVX512-NEXT: retq
	%vec = load <2 x float>, <2 x float>* %vp			%vec = load <2 x float>, <2 x float>* %vp
	%shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>			%shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
	%res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default			%res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
	ret <8 x float> %res			ret <8 x float> %res
	}			}

	define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {			define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {
	; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:			; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
	▲ Show 20 Lines • Show All 55 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll

	Show First 20 Lines • Show All 695 Lines • ▼ Show 20 Lines
	; AVX1-LABEL: shuffle_v4f64_0044:			; AVX1-LABEL: shuffle_v4f64_0044:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]			; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v4f64_0044:			; AVX2-LABEL: shuffle_v4f64_0044:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
	; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]			; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
				; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
				pengfeiUnsubmitted Not Done Reply Inline Actions Is this a regression? pengfei: Is this a regression?
	;			;
	; AVX512VL-SLOW-LABEL: shuffle_v4f64_0044:			; AVX512VL-LABEL: shuffle_v4f64_0044:
	; AVX512VL-SLOW: # %bb.0:			; AVX512VL: # %bb.0:
	; AVX512VL-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,4,4]
	; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]			; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
	; AVX512VL-SLOW-NEXT: retq			; AVX512VL-NEXT: retq
	;
	; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0044:
	; AVX512VL-FAST-ALL: # %bb.0:
	; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,4,4]
	; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
	; AVX512VL-FAST-ALL-NEXT: retq
	;
	; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_0044:
	; AVX512VL-FAST-PERLANE: # %bb.0:
	; AVX512VL-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
	; AVX512VL-FAST-PERLANE-NEXT: retq
	%1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 4>			%1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
				pengfeiUnsubmitted Not Done Reply Inline Actions Not sure if this is the right direction. Does the `vmovlhps` + `vpermpd` have better performance for cases AVX512VL-SLOW and AVX512VL-FAST-PERLANE? Besides, this does show relationship with broadcast. pengfei: Not sure if this is the right direction. Does the `vmovlhps` + `vpermpd` have better…
	ret <4 x double> %1			ret <4 x double> %1
	}			}

	define <4 x double> @shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b) {			define <4 x double> @shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b) {
	; ALL-LABEL: shuffle_v4f64_0044_v2f64:			; ALL-LABEL: shuffle_v4f64_0044_v2f64:
	; ALL: # %bb.0:			; ALL: # %bb.0:
	; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]			; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
	; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]			; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
	▲ Show 20 Lines • Show All 1,417 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

	Show First 20 Lines • Show All 462 Lines • ▼ Show 20 Lines
	; X86-AVX1-NEXT: vzeroupper			; X86-AVX1-NEXT: vzeroupper
	; X86-AVX1-NEXT: retl			; X86-AVX1-NEXT: retl
	;			;
	; X86-AVX2-LABEL: PR48908:			; X86-AVX2-LABEL: PR48908:
	; X86-AVX2: # %bb.0:			; X86-AVX2: # %bb.0:
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3			; X86-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
	; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]			; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
	; X86-AVX2-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]			; X86-AVX2-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
	; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]			; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
	; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]			; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
	; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]			; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
	; X86-AVX2-NEXT: vmovapd %ymm3, (%edx)			; X86-AVX2-NEXT: vmovapd %ymm3, (%edx)
	; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]			; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
	; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]			; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
	; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]			; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
	; X86-AVX2-NEXT: vmovapd %ymm3, (%ecx)			; X86-AVX2-NEXT: vmovapd %ymm3, (%ecx)
	; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0			; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
	; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]			; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
	; X86-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]			; X86-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
	; X86-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]			; X86-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
	; X86-AVX2-NEXT: vmovapd %ymm0, (%eax)			; X86-AVX2-NEXT: vmovapd %ymm0, (%eax)
	; X86-AVX2-NEXT: vzeroupper			; X86-AVX2-NEXT: vzeroupper
	; X86-AVX2-NEXT: retl			; X86-AVX2-NEXT: retl
	;			;
	; X86-AVX512-LABEL: PR48908:			; X86-AVX512-LABEL: PR48908:
	; X86-AVX512: # %bb.0:			; X86-AVX512: # %bb.0:
	; X86-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2			; X86-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
	; X86-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1			; X86-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
	; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0			; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
	; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx			; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3			; X86-AVX512-NEXT: vbroadcastsd %xmm1, %ymm3
	; X86-AVX512-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]			; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3]
	; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]			; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
	; X86-AVX512-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]			; X86-AVX512-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
	; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0]			; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0]
	; X86-AVX512-NEXT: vpermt2pd %zmm2, %zmm5, %zmm3			; X86-AVX512-NEXT: vpermt2pd %zmm2, %zmm5, %zmm3
	; X86-AVX512-NEXT: vmovapd %ymm3, (%edx)			; X86-AVX512-NEXT: vmovapd %ymm3, (%edx)
	; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0]			; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0]
	; X86-AVX512-NEXT: vpermt2pd %zmm0, %zmm3, %zmm4			; X86-AVX512-NEXT: vpermt2pd %zmm0, %zmm3, %zmm4
	; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)			; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
	Show All 25 Lines
	; X64-AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]			; X64-AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
	; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]			; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
	; X64-AVX1-NEXT: vmovapd %ymm0, (%rdx)			; X64-AVX1-NEXT: vmovapd %ymm0, (%rdx)
	; X64-AVX1-NEXT: vzeroupper			; X64-AVX1-NEXT: vzeroupper
	; X64-AVX1-NEXT: retq			; X64-AVX1-NEXT: retq
	;			;
	; X64-AVX2-LABEL: PR48908:			; X64-AVX2-LABEL: PR48908:
	; X64-AVX2: # %bb.0:			; X64-AVX2: # %bb.0:
	; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3			; X64-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
	; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]			; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
	; X64-AVX2-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]			; X64-AVX2-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
	; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]			; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
	; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]			; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
	; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]			; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
	; X64-AVX2-NEXT: vmovapd %ymm3, (%rdi)			; X64-AVX2-NEXT: vmovapd %ymm3, (%rdi)
	; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]			; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
	; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]			; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
	; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]			; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
	; X64-AVX2-NEXT: vmovapd %ymm3, (%rsi)			; X64-AVX2-NEXT: vmovapd %ymm3, (%rsi)
	; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0			; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
	; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]			; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
	; X64-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]			; X64-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
	; X64-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]			; X64-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
	; X64-AVX2-NEXT: vmovapd %ymm0, (%rdx)			; X64-AVX2-NEXT: vmovapd %ymm0, (%rdx)
	; X64-AVX2-NEXT: vzeroupper			; X64-AVX2-NEXT: vzeroupper
	; X64-AVX2-NEXT: retq			; X64-AVX2-NEXT: retq
	;			;
	; X64-AVX512-LABEL: PR48908:			; X64-AVX512-LABEL: PR48908:
	; X64-AVX512: # %bb.0:			; X64-AVX512: # %bb.0:
	; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2			; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
	; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1			; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
	; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0			; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
	; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3			; X64-AVX512-NEXT: vbroadcastsd %xmm1, %ymm3
	; X64-AVX512-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]			; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3]
	; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]			; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
	; X64-AVX512-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]			; X64-AVX512-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
	; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,3,8,1]			; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,3,8,1]
	; X64-AVX512-NEXT: vpermt2pd %zmm2, %zmm5, %zmm3			; X64-AVX512-NEXT: vpermt2pd %zmm2, %zmm5, %zmm3
	; X64-AVX512-NEXT: vmovapd %ymm3, (%rdi)			; X64-AVX512-NEXT: vmovapd %ymm3, (%rdi)
	; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1]			; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1]
	; X64-AVX512-NEXT: vpermt2pd %zmm0, %zmm3, %zmm4			; X64-AVX512-NEXT: vpermt2pd %zmm0, %zmm3, %zmm4
	; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi)			; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi)
	▲ Show 20 Lines • Show All 75 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[WIP][X86] Introduce 'blend with broadcast' shuffle lowering strategy (PR50971)
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 375932

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll

llvm/test/CodeGen/X86/horizontal-sum.ll

llvm/test/CodeGen/X86/subvector-broadcast.ll

llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

This is an archive of the discontinued LLVM Phabricator instance.

[WIP][X86] Introduce 'blend with broadcast' shuffle lowering strategy (PR50971)AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 375932

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll

llvm/test/CodeGen/X86/horizontal-sum.ll

llvm/test/CodeGen/X86/subvector-broadcast.ll

llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

[WIP][X86] Introduce 'blend with broadcast' shuffle lowering strategy (PR50971)
AbandonedPublic