Diff 57939

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 24,048 Lines • ▼ Show 20 Lines	if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);		SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);		SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
return DCI.CombineTo(N, InsV);		return DCI.CombineTo(N, InsV);
}		}

return SDValue();		return SDValue();
}		}

		// Try to recognise a combined single-input shuffle mask as one of the
		// dedicated unary shuffle nodes (VZEXT_MOVL, MOVDDUP, MOVSLDUP, MOVSHDUP).
		//
		// On a match, the chosen opcode is written to Shuffle, the value type the
		// node should be built with is written to ShuffleVT, and true is returned.
		// When nothing matches, false is returned and both outputs are left
		// untouched.
		// TODO: Investigate sharing more of this with shuffle lowering.
		// TODO: Investigate using isShuffleEquivalent() instead of Mask.equals().
		static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
		                                    const X86Subtarget &Subtarget,
		                                    unsigned &Shuffle, MVT &ShuffleVT) {
		  // Records the selected node/type pair and reports success.
		  auto Select = [&Shuffle, &ShuffleVT](unsigned Opcode, MVT OpVT) {
		    Shuffle = Opcode;
		    ShuffleVT = OpVT;
		    return true;
		  };

		  // Integer domain: the only pattern handled is a 128-bit, two-element
		  // mask that keeps element 0 and has a negative (sentinel) upper element,
		  // which maps onto VZEXT_MOVL (MOVQ).
		  if (!SrcVT.isFloatingPoint()) {
		    if (SrcVT.is128BitVector() && Mask.size() == 2 && Mask[0] == 0 &&
		        Mask[1] < 0)
		      return Select(X86ISD::VZEXT_MOVL, MVT::v2i64);
		    return false;
		  }

		  // Floating point domain from here on; dispatch on the vector width.
		  if (SrcVT.is128BitVector()) {
		    // The duplicating moves need SSE3. They are no slower than UNPCKLPD
		    // but have the option to fold the input operand into even an
		    // unaligned memory load.
		    if (!Subtarget.hasSSE3())
		      return false;
		    if (Mask.equals({0, 0}))
		      return Select(X86ISD::MOVDDUP, MVT::v2f64);
		    if (Mask.equals({0, 0, 2, 2}))
		      return Select(X86ISD::MOVSLDUP, MVT::v4f32);
		    if (Mask.equals({1, 1, 3, 3}))
		      return Select(X86ISD::MOVSHDUP, MVT::v4f32);
		  } else if (SrcVT.is256BitVector()) {
		    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
		    if (Mask.equals({0, 0, 2, 2}))
		      return Select(X86ISD::MOVDDUP, MVT::v4f64);
		    if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6}))
		      return Select(X86ISD::MOVSLDUP, MVT::v8f32);
		    if (Mask.equals({1, 1, 3, 3, 5, 5, 7, 7}))
		      return Select(X86ISD::MOVSHDUP, MVT::v8f32);
		  } else if (SrcVT.is512BitVector()) {
		    assert(Subtarget.hasAVX512() &&
		           "AVX512 required for 512-bit vector shuffles");
		    if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6}))
		      return Select(X86ISD::MOVDDUP, MVT::v8f64);
		    if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}))
		      return Select(X86ISD::MOVSLDUP, MVT::v16f32);
		    if (Mask.equals({1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}))
		      return Select(X86ISD::MOVSHDUP, MVT::v16f32);
		  }

		  return false;
		}

		// Attempt to match a combined unary shuffle mask against supported binary
		// shuffle instructions (MOVLHPS, MOVHLPS, UNPCKL, UNPCKH).
		//
		// Only 128-bit source types are handled. On a match, the chosen opcode is
		// written to Shuffle, the value type the node should be built with is
		// written to ShuffleVT, and true is returned. Otherwise false is returned
		// and both outputs are left untouched.
		// NOTE(review): the mask is unary, so the caller presumably feeds the same
		// input as both operands of the binary node - confirm at the call site.
		// TODO: Investigate sharing more of this with shuffle lowering.
		// TODO: Investigate using isShuffleEquivalent() instead of Mask.equals().
		static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
		                                     unsigned &Shuffle, MVT &ShuffleVT) {
		  bool FloatDomain = SrcVT.isFloatingPoint();

		  if (SrcVT.is128BitVector()) {
		    // Two- and four-element masks are only matched in the floating point
		    // domain, and every such match is performed as v4f32.
		    if (Mask.equals({0, 0}) && FloatDomain) {
		      Shuffle = X86ISD::MOVLHPS;
		      ShuffleVT = MVT::v4f32;
		      return true;
		    }
		    if (Mask.equals({1, 1}) && FloatDomain) {
		      Shuffle = X86ISD::MOVHLPS;
		      ShuffleVT = MVT::v4f32;
		      return true;
		    }
		    if (Mask.equals({0, 0, 1, 1}) && FloatDomain) {
		      Shuffle = X86ISD::UNPCKL;
		      ShuffleVT = MVT::v4f32;
		      return true;
		    }
		    if (Mask.equals({2, 2, 3, 3}) && FloatDomain) {
		      Shuffle = X86ISD::UNPCKH;
		      ShuffleVT = MVT::v4f32;
		      return true;
		    }

		    // Eight- and sixteen-element masks map to the v8i16 / v16i8 unpacks;
		    // the mask length selects which of the two types is used.
		    if (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
		        Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
		      Shuffle = X86ISD::UNPCKL;
		      ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
		      return true;
		    }
		    if (Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
		        Mask.equals(
		            {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15})) {
		      Shuffle = X86ISD::UNPCKH;
		      ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
		      return true;
		    }
		  }

		  return false;
		}

/// \brief Combine an arbitrary chain of shuffles into a single instruction if		/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.		/// possible.
///		///
/// This is the leaf of the recursive combine below. When we have found some		/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined		/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask		/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction		/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general		/// for this operation, or into a PSHUFB instruction which is a fully general
Show All 25 Lines	static bool combineX86ShuffleChain(SDValue Input, SDValue Root,

unsigned RootSizeInBits = RootVT.getSizeInBits();		unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;		unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

// TODO - handle 128/256-bit wide vector shuffles.		// TODO - handle 128/256-bit wide vector shuffles.
if (MaskEltSizeInBits > 64)		if (MaskEltSizeInBits > 64)
return false;		return false;

// Use the float domain if the operand type is a floating point type.		// Don't combine if we are a AVX512/EVEX target and the mask element size
bool FloatDomain = VT.isFloatingPoint();		// is different from the root element size - this would prevent writemasks
		// from being reused.
		// TODO - check for writemasks usage instead of always preventing combining.
		// TODO - attempt to narrow Mask back to writemask size.
		if (RootVT.getScalarSizeInBits() != MaskEltSizeInBits &&
		(RootSizeInBits == 512 \|\|
		(Subtarget.hasVLX() && RootSizeInBits >= 128))) {
		return false;
		}

// For floating point shuffles, we don't have free copies in the shuffle		// Attempt to match the mask against known shuffle patterns.
// instructions or the ability to load as part of the instruction, so
// canonicalize their shuffles to UNPCK or MOV variants.
//
// Note that even with AVX we prefer the PSHUFD form of shuffle for integer
// vectors because it can have a load folded into it that UNPCK cannot. This
// doesn't preclude something switching to the shorter encoding post-RA.
//
// FIXME: Should teach these routines about AVX vector widths.
if (FloatDomain && VT.is128BitVector()) {
if (Mask.equals({0, 0}) \|\| Mask.equals({1, 1})) {
bool Lo = Mask.equals({0, 0});
unsigned Shuffle;
MVT ShuffleVT;		MVT ShuffleVT;
// Check if we have SSE3 which will let us use MOVDDUP. That instruction		unsigned Shuffle;
// is no slower than UNPCKLPD but has the option to fold the input operand
// into even an unaligned memory load.
if (Lo && Subtarget.hasSSE3()) {
Shuffle = X86ISD::MOVDDUP;
ShuffleVT = MVT::v2f64;
} else {
// We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
// than the UNPCK variants.
Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
}
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Res.getNode());
if (Shuffle == X86ISD::MOVDDUP)
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
else
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/AddTo/ true);
return true;
}
if (Subtarget.hasSSE3() &&
(Mask.equals({0, 0, 2, 2}) \|\| Mask.equals({1, 1, 3, 3}))) {
bool Lo = Mask.equals({0, 0, 2, 2});
unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
MVT ShuffleVT = MVT::v4f32;
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/AddTo/ true);
return true;
}
if (Mask.equals({0, 0, 1, 1}) \|\| Mask.equals({2, 2, 3, 3})) {
bool Lo = Mask.equals({0, 0, 1, 1});
unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
MVT ShuffleVT = MVT::v4f32;
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/AddTo/ true);
return true;
}
}

// We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK		if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
// variants as none of these have single-instruction variants that are
// superior to the UNPCK formulation.
if (!FloatDomain && VT.is128BitVector() &&
(Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) \|\|
Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) \|\|
Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) \|\|
Mask.equals(
{8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
bool Lo = Mask[0] == 0;
unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
if (Depth == 1 && Root.getOpcode() == Shuffle)		if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!		return false; // Nothing to do!
MVT ShuffleVT;
switch (NumMaskElts) {
case 8:
ShuffleVT = MVT::v8i16;
break;
case 16:
ShuffleVT = MVT::v16i8;
break;
default:
llvm_unreachable("Impossible mask size!");
};
Res = DAG.getBitcast(ShuffleVT, Input);		Res = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Res.getNode());		DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);		Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
DCI.AddToWorklist(Res.getNode());		DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),		DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/AddTo/ true);		/AddTo/ true);
return true;		return true;
}		}

// Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.		if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
if (!FloatDomain && VT.is128BitVector() &&
Mask.size() == 2 && Mask[0] == 0 && Mask[1] < 0) {
unsigned Shuffle = X86ISD::VZEXT_MOVL;
MVT ShuffleVT = MVT::v2i64;
if (Depth == 1 && Root.getOpcode() == Shuffle)		if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!		return false; // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, Input);		Res = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Res.getNode());		DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);		Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
DCI.AddToWorklist(Res.getNode());		DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),		DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/AddTo/ true);		/AddTo/ true);
▲ Show 20 Lines • Show All 6,310 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll

Show First 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	; ALL-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)		%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)		%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
ret <8 x float> %2		ret <8 x float> %2
}		}

define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {		define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_movddup:		; ALL-LABEL: combine_vpermilvar_8f32_movddup:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]		; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT: retq		; ALL-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)		%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
ret <8 x float> %1		ret <8 x float> %1
}		}
define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {		define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
; ALL-LABEL: combine_vpermilvar_8f32_movddup_load:		; ALL-LABEL: combine_vpermilvar_8f32_movddup_load:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vmovaps (%rdi), %ymm0		; ALL-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; ALL-NEXT: retq		; ALL-NEXT: retq
%1 = load <8 x float>, <8 x float> *%a0		%1 = load <8 x float>, <8 x float> *%a0
%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)		%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
ret <8 x float> %2		ret <8 x float> %2
}		}

define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {		define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_movshdup:		; ALL-LABEL: combine_vpermilvar_8f32_movshdup:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]		; ALL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT: retq		; ALL-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>)		%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>)
ret <8 x float> %1		ret <8 x float> %1
}		}

define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {		define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_8f32_movsldup:		; ALL-LABEL: combine_vpermilvar_8f32_movsldup:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]		; ALL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; ALL-NEXT: retq		; ALL-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)		%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
ret <8 x float> %1		ret <8 x float> %1
}		}

define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {		define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
; ALL-LABEL: combine_vpermilvar_2f64_identity:		; ALL-LABEL: combine_vpermilvar_2f64_identity:
; ALL: # BB#0:		; ALL: # BB#0:
Show All 20 Lines	; ALL-NEXT: retq
%1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)		%1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
%2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)		%2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
ret <4 x double> %2		ret <4 x double> %2
}		}

define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {		define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
; ALL-LABEL: combine_vpermilvar_4f64_movddup:		; ALL-LABEL: combine_vpermilvar_4f64_movddup:
; ALL: # BB#0:		; ALL: # BB#0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]		; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT: retq		; ALL-NEXT: retq
%1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)		%1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
ret <4 x double> %1		ret <4 x double> %1
}		}

define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {		define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_4stage:		; ALL-LABEL: combine_vpermilvar_4f32_4stage:
; ALL: # BB#0:		; ALL: # BB#0:
Show All 34 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll

Show All 29 Lines	; CHECK-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)		%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)		%res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
ret <8 x double> %res1		ret <8 x double> %res1
}		}

define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {		define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8f64_movddup:		; CHECK-LABEL: combine_vpermt2var_8f64_movddup:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6]		; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)		%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
ret <8 x double> %res0		ret <8 x double> %res0
}		}
define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {		define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8f64_movddup_load:		; CHECK-LABEL: combine_vpermt2var_8f64_movddup_load:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vmovapd (%rdi), %zmm1		; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6]
; CHECK-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%x0 = load <8 x double>, <8 x double> *%p0		%x0 = load <8 x double>, <8 x double> *%p0
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)		%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
ret <8 x double> %res0		ret <8 x double> %res0
}		}
define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {		define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; CHECK-LABEL: combine_vpermt2var_8f64_movddup_mask:		; CHECK-LABEL: combine_vpermt2var_8f64_movddup_mask:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: kmovw %edi, %k1		; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6]		; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)		%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
ret <8 x double> %res0		ret <8 x double> %res0
}		}

define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {		define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_vpermt2var_8i64_identity:		; CHECK-LABEL: combine_vpermt2var_8i64_identity:
; CHECK: # BB#0:		; CHECK: # BB#0:
▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines	; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0		%x0 = load <16 x float>, <16 x float> *%p0
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)		%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0		ret <16 x float> %res0
}		}

define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {		define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:		; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]		; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)		%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
ret <16 x float> %res0		ret <16 x float> %res0
}		}
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {		define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_load:		; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vmovaps (%rdi), %zmm1		; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0		%x0 = load <16 x float>, <16 x float> *%p0
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)		%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
ret <16 x float> %res0		ret <16 x float> %res0
}		}
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {		define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:		; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: kmovw %edi, %k1		; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]		; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)		%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0		ret <16 x float> %res0
}		}

define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {		define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:		; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]		; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)		%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
ret <16 x float> %res0		ret <16 x float> %res0
}		}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {		define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_load:		; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: vmovaps (%rdi), %zmm1		; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0		%x0 = load <16 x float>, <16 x float> *%p0
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)		%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
ret <16 x float> %res0		ret <16 x float> %res0
}		}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {		define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:		; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: kmovw %edi, %k1		; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]		; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)		%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0		ret <16 x float> %res0
}		}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {		define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:		; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; CHECK: # BB#0:		; CHECK: # BB#0:
; CHECK-NEXT: kmovw %esi, %k1		; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm1		; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0		%x0 = load <16 x float>, <16 x float> *%p0
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)		%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0		ret <16 x float> %res0
}		}

define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {		define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermt2var_16i32_identity:		; CHECK-LABEL: combine_vpermt2var_16i32_identity:
▲ Show 20 Lines • Show All 69 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX] Generalized matching for target shuffle combines
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 57939

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll

llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX] Generalized matching for target shuffle combines (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 57939

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll

llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll

[X86][AVX] Generalized matching for target shuffle combines
ClosedPublic