This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX] Generalized matching for target shuffle combines
ClosedPublic

Authored by RKSimon on Apr 17 2016, 8:10 AM.

Details

Summary

This patch is a first step towards a more extensible method of matching combined target shuffle masks.

Initially this just pulls out the existing basic mask matches and adds support for some 256/512-bit equivalents. Future patterns will require a number of features to be added, but I wanted to keep this patch simple.

My hope is that we can avoid duplication between shuffle lowering and combining and share the more complex pattern-matching functions in future commits (permutes + insertps are my next targets).
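As a rough illustration of the direction (the helper name, signature and the specific patterns below are assumptions for this summary, not the committed code), the basic mask matches end up in a shape along these lines:

// Minimal sketch only: map an already-combined shuffle mask to a single x86
// shuffle node. The name and parameter list are illustrative.
static bool matchUnaryShuffleSketch(MVT MaskVT, ArrayRef<int> Mask,
                                    const X86Subtarget &Subtarget,
                                    unsigned &Shuffle, MVT &ShuffleVT) {
  bool FloatDomain = MaskVT.isFloatingPoint();

  // <0,0> splat of a 128-bit FP vector -> MOVDDUP (SSE3+).
  if (FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE3() &&
      Mask.size() == 2 && Mask[0] == 0 && Mask[1] == 0) {
    Shuffle = X86ISD::MOVDDUP;
    ShuffleVT = MVT::v2f64;
    return true;
  }

  // The 256/512-bit MOVDDUP/MOVSLDUP/MOVSHDUP and UNPCK equivalents would be
  // matched here in the same style.
  return false;
}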

Diff Detail

Repository
rL LLVM

Event Timeline

RKSimon updated this revision to Diff 54006.Apr 17 2016, 8:10 AM
RKSimon retitled this revision from to [X86][AVX] Generalized matching for target shuffle combines.
RKSimon updated this object.
RKSimon added reviewers: chandlerc, delena, andreadb, spatel.
RKSimon set the repository for this revision to rL LLVM.
RKSimon added a subscriber: llvm-commits.
RKSimon updated this revision to Diff 56151.May 4 2016, 7:53 AM
RKSimon updated this object.
delena added inline comments.May 8 2016, 10:41 PM
lib/Target/X86/X86ISelLowering.cpp
24007

Why is float/int so important? I think the one-cycle penalty for switching domains only applied to old (SSE) processors.

test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
25

What happens when the mask is not -1?

RKSimon added inline comments.May 9 2016, 3:03 AM
lib/Target/X86/X86ISelLowering.cpp
24007

Intel SB/IV, HW/BW and SK still exhibit it (although, according to Agner, the number of cases is diminishing with each iteration), and the AMD Jaguar / Bulldozer families still have this as well.

http://www.agner.org/optimize/microarchitecture.pdf

In the longer term this may turn into a case where we should move as much of the shuffle combining as possible to the MC pass (as discussed on PR26183) to allow shuffle combines based on the target machine scheduler model, but nobody seems keen to take on that job...

test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
25

I'll add tests to check what is going on.

RKSimon updated this revision to Diff 56558.May 9 2016, 5:28 AM

Added AVX512 mask tests - the masked operations are still there.

Annoyingly, the shuffle comments don't refer to the mask k-register - is there an accepted convention we could use to make this clearer?

delena edited edge metadata.May 9 2016, 6:05 AM

Added AVX512 mask tests - the masked operations are still there.

Annoyingly, the shuffle comments don't refer to the mask k-register - is there an accepted convention we could use to make this clearer?

We should add the mask register to the printed shuffle comment. Not in this patch, of course.

lib/Target/X86/X86ISelLowering.cpp
24007

I still think that when you replace a shuffle that loads its indices with a MOVDDUP, the MOVDDUP is better.

24054

Can you keep the original VT here?

24092

The unpack operation is not in the FP domain. Again, even if you switch domains, the penalty on old processors is very low.

You may still see a slowdown on some benchmarks, since switching from load+shuffle to shuffle moves the work to a different port. Just FYI.

In D19198#424562, @delena wrote:
We should add the mask register to the printed shuffle comment. Not in this patch, of course.

I'll add a TODO comment to the relevant files.

lib/Target/X86/X86ISelLowering.cpp
24007

The intention is that those will be dealt with by a permute instruction (PSHUFD-int or VPERMILPS-fp) in a future patch.

This patch is just about trying to set up a 'shuffle matching' framework - later patches will then add support for additional matches (permute / broadcast being early targets). I only added the 256/512-bit versions as it was a minor extension to what's already there for 128-bit.
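For context, here is a hedged sketch of roughly how that future permute match could slot into the same kind of matcher; the predicates and opcodes are existing LLVM/X86ISD identifiers, but the structure and placement are assumptions rather than the committed code:

// Illustrative fragment, assumed to live inside a matcher like the one
// sketched in the summary (MaskVT, Mask, Subtarget, Shuffle, ShuffleVT).
bool FloatDomain = MaskVT.isFloatingPoint();

// Any in-lane permute of a single 128-bit input: pick the opcode by domain so
// FP data stays on VPERMILPS and integer data stays on PSHUFD.
if (MaskVT.is128BitVector() && Mask.size() == 4 &&
    llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; })) {
  if (FloatDomain && Subtarget.hasAVX()) {
    Shuffle = X86ISD::VPERMILPI; // FP-domain permute
    ShuffleVT = MVT::v4f32;
    return true;
  }
  if (!FloatDomain && Subtarget.hasSSE2()) {
    Shuffle = X86ISD::PSHUFD;    // integer-domain permute
    ShuffleVT = MVT::v4i32;
    return true;
  }
  // (The caller would build the shuffle immediate from Mask.)
}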

24054

Annoyingly, no - but between the bitcast combines and the target shuffle combines it doesn't seem to cause any problems.

Please see my comments above about UNPCK and FloatDomain.

lib/Target/X86/X86ISelLowering.cpp
24054

The type may vary from v16f32 to v16i32, or from v8i64 to v8f64, but the number of vector elements should remain the same. Otherwise we can't put a mask on this operation.

RKSimon updated this revision to Diff 56697.May 10 2016, 5:53 AM
RKSimon edited edge metadata.

Elena - I've added a test demonstrating the MOVDDUP combine affecting a masked v16f32 shuffle. As you said, this causes the shuffle and the mask move to be split apart - when is this beneficial? Is there a minimum combine depth that you think we should require for combines of EVEX shuffles where the number of vector elements changes? I can add such a depth check if you think it worthwhile.

Regarding the UNPACK cases, just adding support for integer-domain combines at this stage will cause some existing PSHUFD unary permutes to be replaced with unfoldable UNPACKs with repeated inputs.

As I said previously, this patch is about splitting the matching of shuffles from the combines themselves to reduce code duplication with shuffle lowering. Although I have added 256/512-bit support for some existing shuffles, the intention is to add support for new shuffles later on - PSHUFD/VPERMILPS permutes are among the first cases that will be added (along with BROADCAST).

If you prefer, I can change the order of this work - reduce this patch to just pulling out the existing shuffle matching code, and then have future patches add support for matching permutes (and broadcast / insertps / etc.) and 256/512-bit vectors?

Elena - I've added a test demonstrating the MOVDDUP combine affecting a masked v16f32 shuffle. As you said, this causes the shuffle and the mask move to be split apart - when is this beneficial? Is there a minimum combine depth that you think we should require for combines of EVEX shuffles where the number of vector elements changes? I can add such a depth check if you think it worthwhile.

I'm not sure that it is beneficial at depth 1, because the sequence "vmovdqa32 + vpermt2ps" allows the vmovdqa32 to be hoisted up, outside a loop for example, while "vmovddup + vmovaps" are two dependent instructions. Depth 2 and more would have to be measured; I don't know how to estimate it.
I talked with our people about this optimization, and I'd suggest refraining from optimizing masked shuffles on AVX-512 at this stage.

As for non-masked instructions, an additional issue may be with folding loads. Could you please check what happens with load folding when you change the VT?

if (SrcVT.is256BitVector()) {

The target may be skylake-avx512 (HasVLX), where we still have to check masks.

If you prefer, I can change the order of this work - reduce this patch to just pulling out the existing shuffle matching code, and then have future patches add support for matching permutes (and broadcast / insertps / etc.) and 256/512-bit vectors?

The review will be easier, of course.

RKSimon updated this revision to Diff 56746.May 10 2016, 10:24 AM

Disabled target shuffle combines on 512-bit vectors, or when VLX is enabled, if the combined vector element size differs from that of the root type of the combine, as this would prevent writemasks from being reused. I couldn't find an easy way to detect whether a writemask was actually used, so this restriction may turn out to be too harsh.
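A minimal sketch of the kind of guard described above, paraphrased from the wording here (variable names and exact placement are assumptions):

// Illustrative sketch: if the root shuffle may carry an AVX-512 write-mask
// (512-bit vectors, or 128/256-bit vectors with VLX), don't combine to a
// shuffle type with a different element count - the existing k-register mask
// could no longer be applied to the new node.
bool MaskableRoot =
    (RootVT.is512BitVector() && Subtarget.hasAVX512()) ||
    ((RootVT.is128BitVector() || RootVT.is256BitVector()) &&
     Subtarget.hasVLX());
if (MaskableRoot &&
    ShuffleVT.getVectorNumElements() != RootVT.getVectorNumElements())
  return false; // keep the element count so the write-mask can be reused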

Elena - are you happy with this, or would you prefer that I reduce this patch to just splitting out the original code (NFC)?

Ping? Elena, I don't know if you saw it, but I added a check to only combine on AVX512/AVX512VL if the number of vector elements doesn't change.

delena accepted this revision.May 18 2016, 11:32 AM
delena edited edge metadata.

Yes, thank you.

This revision is now accepted and ready to land.May 18 2016, 11:32 AM
This revision was automatically updated to reflect the committed changes.