Diff 43391

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
#include "llvm/Support/ErrorHandling.h"		#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"		#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"		#include "llvm/Target/TargetOptions.h"
#include "X86IntrinsicsInfo.h"		#include "X86IntrinsicsInfo.h"
#include <bitset>		#include <bitset>
#include <numeric>		#include <numeric>
#include <cctype>		#include <cctype>
using namespace llvm;		using namespace llvm;

		delenaUnsubmitted Not Done Reply Inline Actions Hi Simon, Why are you doing this in PerformShuffleCombine and not in lowerVectorShuffle()? As far as I understand, we call "combine" in order to combine multiple nodes. In this case, you just optimize one node. delena: Hi Simon, Why are you doing this in PerformShuffleCombine and not in lowerVectorShuffle()?
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions Mainly because this has more in common with the 2 extract/insert patterns above than the canonicalization in lowerVectorShuffle. But I'm happy to move it (and the other 2?) there if you think necessary. RKSimon: Mainly because this has more in common with the 2 extract/insert patterns above than the…
		delenaUnsubmitted Not Done Reply Inline Actions Yes. Thank you. It is the compilation time first of all. delena: Yes. Thank you. It is the compilation time first of all.
#define DEBUG_TYPE "x86-isel"		#define DEBUG_TYPE "x86-isel"
		delenaUnsubmitted Not Done Reply Inline Actions Could you please move it into a static function? Maybe call it from lower256BitVectorShuffle? delena: Could you please move it into a static function? Maybe call it from lower256BitVectorShuffle…
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions No problem. RKSimon: No problem.

STATISTIC(NumTailCalls, "Number of tail calls");		STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(		static cl::opt<bool> ExperimentalVectorWideningLegalization(
"x86-experimental-vector-widening-legalization", cl::init(false),		"x86-experimental-vector-widening-legalization", cl::init(false),
cl::desc("Enable an experimental vector type legalization through widening "		cl::desc("Enable an experimental vector type legalization through widening "
"rather than promotion."),		"rather than promotion."),
cl::Hidden);		cl::Hidden);

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,		X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
		conghUnsubmitted Not Done Reply Inline Actions Early exit if AllLowerHalf is false? congh: Early exit if AllLowerHalf is false?
const X86Subtarget &STI)		const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {		: TargetLowering(TM), Subtarget(&STI) {
X86ScalarSSEf64 = Subtarget->hasSSE2();		X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();		X86ScalarSSEf32 = Subtarget->hasSSE1();
MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());		MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

// Set up the TargetLowering object.		// Set up the TargetLowering object.

// X86 is weird. It always uses i8 for shift amounts and setcc results.		// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);		setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.		// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);		setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

		conghUnsubmitted Not Done Reply Inline Actions Can we always guarantee that V is a 128-bit vector? I remember VECTOR_SHUFFLE can have different types for its operands and result. congh: Can we always guarantee that V is a 128-bit vector? I remember VECTOR_SHUFFLE can have…
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions At this stage yes we know that the result and operands are all 256-bit vectors - shuffles in the DAG have to have consistent types. I did add the FIXME comment mentioning that this could be generalised to 256 or 512 bit vectors though - at that point it would need to be refactored. RKSimon: At this stage yes we know that the result and operands are all 256-bit vectors - shuffles in…
// For 64-bit, since we have so many registers, use the ILP scheduler.		// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.		// For 32-bit, use the register pressure specific scheduling.
		delenaUnsubmitted Not Done Reply Inline Actions Why do you check only UndefUpper? What about UndefLower? delena: Why do you check only UndefUpper? What about UndefLower?
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions OK - I can add this - it will still initially just support shuffling with the lower half vectors is that OK? RKSimon: OK - I can add this - it will still initially just support shuffling with the lower half…
// For Atom, always use ILP scheduling.		// For Atom, always use ILP scheduling.
if (Subtarget->isAtom())		if (Subtarget->isAtom())
setSchedulingPreference(Sched::ILP);		setSchedulingPreference(Sched::ILP);
else if (Subtarget->is64Bit())		else if (Subtarget->is64Bit())
setSchedulingPreference(Sched::ILP);		setSchedulingPreference(Sched::ILP);
else		else
setSchedulingPreference(Sched::RegPressure);		setSchedulingPreference(Sched::RegPressure);
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();		const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());		setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

// Bypass expensive divides on Atom when compiling with O2.		// Bypass expensive divides on Atom when compiling with O2.
if (TM.getOptLevel() >= CodeGenOpt::Default) {		if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget->hasSlowDivide32())		if (Subtarget->hasSlowDivide32())
addBypassSlowDiv(32, 8);		addBypassSlowDiv(32, 8);
if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())		if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
		delenaUnsubmitted Not Done Reply Inline Actions I don't understand this code. You are running inside loop. for (unsigned i = 0; i != HalfNumElts; ++i) for v32i8 you have 16 iterations. Do you create EXTRACT_SUBVECTOR 16 times? delena: I don't understand this code. You are running inside loop. for (unsigned i = 0; i !=…
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions The DAG.getNode logic will find an equivalent node if it already exists in the DAG (search for FindNodeOrInsertPos) so although we call getNode(ISD::EXTRACT_SUBVECTOR, ...) up to HalfNumElts times, it will return at most 2 values on success - in the failure case it will return a 3rd value which will then fail to match either of the Half variables. But I can see it won't be clear, and slower than necessary, so I'll replace it with an integer index approach. RKSimon: The DAG.getNode logic will find an equivalent node if it already exists in the DAG (search for…
addBypassSlowDiv(64, 16);		addBypassSlowDiv(64, 16);
}		}

if (Subtarget->isTargetKnownWindowsMSVC()) {		if (Subtarget->isTargetKnownWindowsMSVC()) {
// Setup Windows compiler runtime calls.		// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");		setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");		setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
setLibcallName(RTLIB::SREM_I64, "_allrem");		setLibcallName(RTLIB::SREM_I64, "_allrem");
▲ Show 20 Lines • Show All 10,241 Lines • ▼ Show 20 Lines	for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)		if (Mask[i] >= 0)
NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;		NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&		assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
"Must not introduce lane crosses at this point!");		"Must not introduce lane crosses at this point!");

return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);		return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}		}

		/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
		/// This allows for fast cases such as subvector extraction/insertion
		/// or shuffling smaller vector types which can lower more efficiently.
		///
		/// \p Mask is examined as two halves of NumElts / 2 entries each. An empty
		/// SDValue is returned (the caller then continues with other lowerings) when:
		///  - neither half of \p Mask is entirely undef,
		///  - the lower half is undef on AVX2 and \p VT is v4f64 or v4i64,
		///  - a defined element reads from the upper half of \p V1 or \p V2 and the
		///    mask is not one of the whole-subvector cases handled first.
		/// Otherwise the result is an INSERT_SUBVECTOR of a half-width value into an
		/// UNDEF vector of type \p VT.
		static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
		SDValue V2, ArrayRef<int> Mask,
		const X86Subtarget *Subtarget,
		SelectionDAG &DAG) {
		assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");

		// HalfVT has the same element type as VT but half as many elements.
		unsigned NumElts = VT.getVectorNumElements();
		unsigned HalfNumElts = NumElts / 2;
		MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

		// Nothing to do here unless one complete half of the result is undef.
		bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
		bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
		if (!UndefLower && !UndefUpper)
		return SDValue();

		// Upper half is undef and lower half is whole upper subvector.
		// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
		if (UndefUpper &&
		isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
		SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
		DAG.getIntPtrConstant(HalfNumElts, DL));
		return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
		DAG.getIntPtrConstant(0, DL));
		}

		// Lower half is undef and upper half is whole lower subvector.
		// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
		if (UndefLower &&
		isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
		// NOTE(review): despite its name, 'Hi' here is the subvector extracted at
		// index 0 of V1; it is then inserted at index HalfNumElts of the result.
		SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
		DAG.getIntPtrConstant(0, DL));
		return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
		DAG.getIntPtrConstant(HalfNumElts, DL));
		}

		// AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
		if (UndefLower && Subtarget->hasAVX2() &&
		(VT == MVT::v4f64 \|\| VT == MVT::v4i64))
		return SDValue();

		// If the shuffle only uses the lower halves of the input operands,
		// then extract them and perform the 'half' shuffle at half width.
		// e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
		// HalfIdx1/HalfIdx2 record which half vectors feed the new shuffle
		// (-1 means not yet assigned). Offset is the start of the defined half of
		// Mask and is reused below as the insertion index of the result.
		int HalfIdx1 = -1, HalfIdx2 = -1;
		SmallVector<int, 8> HalfMask;
		unsigned Offset = UndefLower ? HalfNumElts : 0;
		for (unsigned i = 0; i != HalfNumElts; ++i) {
		int M = Mask[i + Offset];
		// Negative (undef) mask entries are copied through unchanged.
		if (M < 0) {
		HalfMask.push_back(M);
		continue;
		}
		delenaUnsubmitted Not Done Reply Inline Actions You can exit from function in this case. I mean "return SDValue()", right? delena: You can exit from function in this case. I mean "return SDValue()", right?
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions Yes - I'll change it to return SDValue(). RKSimon: Yes - I'll change it to return SDValue().

		// Determine which of the 4 half vectors this element is from.
		// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
		delenaUnsubmitted Not Done Reply Inline Actions Let's assume that the original mask was: 0, 1, 3, 3, 8, 8, 10, 11 You want to take V1-Lo and V2-Lo the new mask should be 0, 1, 3, 3, 4, 4, 6, 7 But M %= NumElts will not convert 8 to 4 and 10 to 6. delena: Let's assume that the original mask was: 0, 1, 3, 3, 8, 8, 10, 11 You want to take V1-Lo and V2…
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions I think this is for a v816 - then the modulo will create: 0, 1, 3, 3, 0, 0, 2, 3 The lines below then offset the second half vector by halfnumelts: 0, 1, 3, 3, 4, 4, 6, 7 RKSimon: I think this is for a v816 - then the modulo will create: 0, 1, 3, 3, 0, 0, 2, 3 The lines…
		delenaUnsubmitted Not Done Reply Inline Actions Lets assume that original VT was v8i32. NumElts = 8. M %= NumElts -? may be M %= HalfNumElts ? delena: Lets assume that original VT was v8i32. NumElts = 8. M %= NumElts -? may be M %= HalfNumElts ?
		int HalfIdx = M / HalfNumElts;

		// Only shuffle using the lower halves of the inputs.
		// TODO: Investigate usefulness of shuffling with upper halves.
		if (HalfIdx != 0 && HalfIdx != 2)
		return SDValue();

		// Determine the element index into its half vector source.
		int HalfElt = M % HalfNumElts;

		// We can shuffle with up to 2 half vectors, set the new 'half'
		// shuffle mask accordingly.
		delenaUnsubmitted Not Done Reply Inline Actions Do you leave the loop at this point? delena: Do you leave the loop at this point?
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions Yes - I'll change it to return SDValue(). RKSimon: Yes - I'll change it to return SDValue().
		// Elements from the first recorded half keep their index; elements from the
		// second recorded half are offset by HalfNumElts so they address the
		// second operand of the half-width shuffle.
		if (-1 == HalfIdx1 \|\| HalfIdx1 == HalfIdx) {
		HalfMask.push_back(HalfElt);
		HalfIdx1 = HalfIdx;
		continue;
		}
		if (-1 == HalfIdx2 \|\| HalfIdx2 == HalfIdx) {
		HalfMask.push_back(HalfElt + HalfNumElts);
		HalfIdx2 = HalfIdx;
		continue;
		}

		// Too many half vectors referenced.
		// NOTE(review): since only HalfIdx 0 and 2 pass the check above, this
		// looks unreachable today; presumably kept for the TODO - confirm.
		return SDValue();
		}
		assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
		delenaUnsubmitted Not Done Reply Inline Actions extra line delena: extra line

		// Turn a half index into a value: UNDEF for a negative index, otherwise an
		// EXTRACT_SUBVECTOR from V1 (indices 0, 1) or V2 (indices 2, 3) starting at
		// element 0 (even index) or HalfNumElts (odd index).
		auto GetHalfVector = [&](int HalfIdx) {
		if (HalfIdx < 0)
		return DAG.getUNDEF(HalfVT);
		SDValue V = (HalfIdx < 2 ? V1 : V2);
		HalfIdx = (HalfIdx % 2) * HalfNumElts;
		return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
		DAG.getIntPtrConstant(HalfIdx, DL));
		};

		// Shuffle at half width, then place the result in the half of the wide
		// vector that was not undef (element index Offset).
		SDValue Half1 = GetHalfVector(HalfIdx1);
		SDValue Half2 = GetHalfVector(HalfIdx2);
		SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
		return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
		DAG.getIntPtrConstant(Offset, DL));
		}

/// \brief Test whether the specified input (0 or 1) is in-place blended by the		/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.		/// given mask.
///		///
/// This returns true if the elements from a particular input are already in the		/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.		/// slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {		static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
assert((Input == 0 \|\| Input == 1) && "Only two inputs to shuffles.");		assert((Input == 0 \|\| Input == 1) && "Only two inputs to shuffles.");
int Size = Mask.size();		int Size = Mask.size();
▲ Show 20 Lines • Show All 553 Lines • ▼ Show 20 Lines	int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
return M >= NumElts;		return M >= NumElts;
});		});

if (NumV2Elements == 1 && Mask[0] >= NumElts)		if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(		if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Subtarget, DAG))		DL, VT, V1, V2, Mask, Subtarget, DAG))
return Insertion;		return Insertion;

		// Handle special cases where the lower or upper half is UNDEF.
		if (SDValue V =
		lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
		return V;

// There is a really nice hard cut-over between AVX1 and AVX2 that means we		// There is a really nice hard cut-over between AVX1 and AVX2 that means we
// can check for those subtargets here and avoid much of the subtarget		// can check for those subtargets here and avoid much of the subtarget
// querying in the per-vector-type lowering routines. With AVX1 we have		// querying in the per-vector-type lowering routines. With AVX1 we have
// essentially zero ability to manipulate a 256-bit vector with integer		// essentially zero ability to manipulate a 256-bit vector with integer
// types. Since we'll use floating point types there eventually, just		// types. Since we'll use floating point types there eventually, just
// immediately cast everything to a float and operate entirely in that domain.		// immediately cast everything to a float and operate entirely in that domain.
if (VT.isInteger() && !Subtarget->hasAVX2()) {		if (VT.isInteger() && !Subtarget->hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();		int ElementBits = VT.getScalarSizeInBits();
▲ Show 20 Lines • Show All 199 Lines • ▼ Show 20 Lines	static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
SDLoc DL(Op);		SDLoc DL(Op);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);		ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();		ArrayRef<int> Mask = SVOp->getMask();
assert(Subtarget->hasAVX512() &&		assert(Subtarget->hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");		"Cannot lower 512-bit vectors w/ basic ISA!");

// Check for being able to broadcast a single element.		// Check for being able to broadcast a single element.
if (SDValue Broadcast =		if (SDValue Broadcast =
		delenaUnsubmitted Not Done Reply Inline Actions I'm not sure that that is a beneficial variant for AVX-512. Some reasons: AVX-512 has many additional shuffles, we plan to add more patterns in the future In KNL target (no VLX, BWI, DQ) you are moving down to AVX2, we have less registers, less shuffles. SKX (withVLX, BWI, DQ) supports 256-bit vectors, but we currently don't have any additional optimization for 256-bit vectors on SKX. delena: I'm not sure that that is a beneficial variant for AVX-512. Some reasons: - AVX-512 has many…
		RKSimonAuthorUnsubmitted Not Done Reply Inline Actions OK - I'll drop AVX512 support. RKSimon: OK - I'll drop AVX512 support.
lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))		lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
return Broadcast;		return Broadcast;

// Dispatch to each element type for lowering. If we don't have support for		// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and		// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that		// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.		// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {		switch (VT.SimpleTy) {
▲ Show 20 Lines • Show All 8,710 Lines • ▼ Show 20 Lines	else {
if (IndexVT.getScalarType() == MVT::i32)		if (IndexVT.getScalarType() == MVT::i32)
Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);		Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

// Mask		// Mask
// At this point we have promoted mask operand		// At this point we have promoted mask operand
assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");		assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);		MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
// Use the original mask here, do not modify the mask twice		// Use the original mask here, do not modify the mask twice
Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);		Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

// The value that should be stored		// The value that should be stored
MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);		MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
Src = ExtendToType(Src, NewVT, DAG);		Src = ExtendToType(Src, NewVT, DAG);
}		}
}		}
// If the mask is "wide" at this point - truncate it to i1 vector		// If the mask is "wide" at this point - truncate it to i1 vector
MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);		MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
▲ Show 20 Lines • Show All 2,726 Lines • ▼ Show 20 Lines	if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();		GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();		Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
return true;		return true;
}		}
}		}
return TargetLowering::isGAPlusOffset(N, GA, Offset);		return TargetLowering::isGAPlusOffset(N, GA, Offset);
}		}

/// isShuffleHigh128VectorInsertLow - Returns true when the shuffle mask is
/// equivalent to extracting the high 128-bit part of a 256-bit vector and
/// inserting it into the low part of a new 256-bit vector, i.e. every entry
/// in the low half of the mask is undef or selects the matching element of
/// the high half, and every entry in the high half of the mask is undef.
static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
  const unsigned HalfNumElems =
      SVOp->getValueType(0).getVectorNumElements() / 2;

  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  for (unsigned LoIdx = 0; LoIdx != HalfNumElems; ++LoIdx) {
    const unsigned HiIdx = LoIdx + HalfNumElems;

    // The low result element must be undef or read the paired high element.
    if (!isUndefOrEqual(SVOp->getMaskElt(LoIdx), HiIdx))
      return false;

    // The high result element must be undef.
    if (SVOp->getMaskElt(HiIdx) >= 0)
      return false;
  }

  return true;
}

/// isShuffleLow128VectorInsertHigh - Returns true when the shuffle mask is
/// equivalent to extracting the low 128-bit part of a 256-bit vector and
/// inserting it into the high part of a new 256-bit vector, i.e. every entry
/// in the high half of the mask is undef or selects the matching element of
/// the low half, and every entry in the low half of the mask is undef.
static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
  const unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  const unsigned FirstHiIdx = NumElems / 2;

  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  for (unsigned HiIdx = FirstHiIdx; HiIdx != NumElems; ++HiIdx) {
    const unsigned LoIdx = HiIdx - FirstHiIdx;

    // The high result element must be undef or read the paired low element.
    if (!isUndefOrEqual(SVOp->getMaskElt(HiIdx), LoIdx))
      return false;

    // The low result element must be undef.
    if (SVOp->getMaskElt(LoIdx) >= 0)
      return false;
  }

  return true;
}

/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.		/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
		/// FIXME: This could be expanded to support 512 bit vectors as well.
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,		static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget* Subtarget) {		const X86Subtarget* Subtarget) {
SDLoc dl(N);		SDLoc dl(N);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);		ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
SDValue V1 = SVOp->getOperand(0);		SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);		SDValue V2 = SVOp->getOperand(1);
MVT VT = SVOp->getSimpleValueType(0);		MVT VT = SVOp->getSimpleValueType(0);
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	if (V1.getOpcode() == ISD::CONCAT_VECTORS &&

// Emit a zeroed vector and insert the desired subvector on its		// Emit a zeroed vector and insert the desired subvector on its
// first half.		// first half.
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);		SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);		SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
return DCI.CombineTo(N, InsV);		return DCI.CombineTo(N, InsV);
}		}

//===--------------------------------------------------------------------===//
// Combine some shuffles into subvector extracts and inserts:
//

// vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
if (isShuffleHigh128VectorInsertLow(SVOp)) {
SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}

// vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (isShuffleLow128VectorInsertHigh(SVOp)) {
SDValue V = Extract128BitVector(V1, 0, DAG, dl);
SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
return DCI.CombineTo(N, InsV);
}

return SDValue();		return SDValue();
}		}

/// \brief Combine an arbitrary chain of shuffles into a single instruction if		/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.		/// possible.
///		///
/// This is the leaf of the recursive combining below. When we have found some		/// This is the leaf of the recursive combining below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined		/// chain of single-use x86 shuffle instructions and accumulated the combined
▲ Show 20 Lines • Show All 3,810 Lines • ▼ Show 20 Lines	static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);		SDLoc DL(N);

// Let legalize expand this if it isn't a legal type yet.		// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))		if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();		return SDValue();

// If we're negating a FMUL node on a target with FMA, then we can avoid the		// If we're negating a FMUL node on a target with FMA, then we can avoid the
// use of a constant by performing (-0 - A*B) instead.		// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.		// FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 \|\| SVT == MVT::f64) &&		if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 \|\| SVT == MVT::f64) &&
Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {		Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);		SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),		return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);		Arg.getOperand(1), Zero);
}		}

// If we're negating a FMA node, then we can adjust the		// If we're negating a FMA node, then we can adjust the
▲ Show 20 Lines • Show All 1,853 Lines • Show Last 20 Lines

test/CodeGen/X86/avx-splat.ll

Show First 20 Lines • Show All 93 Lines • ▼ Show 20 Lines	__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499
%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]		%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
ret <8 x float> %load_broadcast12281250		ret <8 x float> %load_broadcast12281250
}		}

define <8 x float> @funcF(i32 %val) nounwind {		define <8 x float> @funcF(i32 %val) nounwind {
; CHECK-LABEL: funcF:		; CHECK-LABEL: funcF:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmovd %edi, %xmm0		; CHECK-NEXT: vmovd %edi, %xmm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,0]		; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0		; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq		; CHECK-NEXT: retq
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6		%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7		%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
%tmp = bitcast <8 x i32> %ret7 to <8 x float>		%tmp = bitcast <8 x i32> %ret7 to <8 x float>
ret <8 x float> %tmp		ret <8 x float> %tmp
}		}

▲ Show 20 Lines • Show All 63 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-256-v16.ll

	Show All 26 Lines
	; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]			; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
	; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4]			; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:			; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1			; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
	; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]			; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
				delenaUnsubmitted Not Done Reply Inline Actions This test and all tests below check the situation when one of the input vectors is not in use. But you, actually, optimize the case when the shuffle uses a half of V1 and a half of V2, right? delena: This test and all tests below check the situation when one of the input vectors is not in use.
	; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0			; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>			%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
	ret <16 x i16> %shuffle			ret <16 x i16> %shuffle
	}			}

	define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {			define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
	; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:			; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
	▲ Show 20 Lines • Show All 110 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]			; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]			; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
	; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]			; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:			; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]			; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
	; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]			; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
	; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0			; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]			; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
	; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0			; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
	ret <16 x i16> %shuffle			ret <16 x i16> %shuffle
	}			}

	define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {			define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
	; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:			; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	▲ Show 20 Lines • Show All 3,070 Lines • ▼ Show 20 Lines
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8,9,10,11],ymm1[12],ymm0[13,14],ymm1[15]			; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8,9,10,11],ymm1[12],ymm0[13,14],ymm1[15]
	; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15,30,31,30,31,22,23,22,23,24,25,24,25,26,27,30,31]			; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15,30,31,30,31,22,23,22,23,24,25,24,25,26,27,30,31]
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 undef, i32 3, i32 undef, i32 20, i32 20, i32 5, i32 undef, i32 31, i32 undef, i32 11, i32 undef, i32 28, i32 28, i32 13, i32 undef>			%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 undef, i32 3, i32 undef, i32 20, i32 20, i32 5, i32 undef, i32 31, i32 undef, i32 11, i32 undef, i32 28, i32 28, i32 13, i32 undef>
	ret <16 x i16> %shuffle			ret <16 x i16> %shuffle
	}			}

				define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> %a, <16 x i16> %b) {
				; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
				; AVX1: # BB#0:
				; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
				; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
				; AVX2: # BB#0:
				; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
				; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
				; AVX2-NEXT: retq
				%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19>
				ret <16 x i16> %shuffle
				}

	define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {			define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
	; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:			; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]			; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:			; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
	Show All 18 Lines
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0			; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
	; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0			; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0			; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>			%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
	ret <16 x i16> %shuffle			ret <16 x i16> %shuffle
	}			}

				define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
				; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u:
				; ALL: # BB#0:
				; ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
				; ALL-NEXT: retq
				%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <16 x i16> %shuffle
				}

				define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
				; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
				; ALL: # BB#0:
				; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
				; ALL-NEXT: retq
				%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <16 x i16> %shuffle
				}

	define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {			define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
	; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:			; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
	; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]			; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:			; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
	▲ Show 20 Lines • Show All 128 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-256-v32.ll

	Show First 20 Lines • Show All 319 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:			; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]			; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
	; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2			; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
	; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1			; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
	; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0			; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]			; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
	; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0			; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
	ret <32 x i8> %shuffle			ret <32 x i8> %shuffle
	}			}

	define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {			define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
	▲ Show 20 Lines • Show All 1,671 Lines • ▼ Show 20 Lines
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0			; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
	; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0			; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0			; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>			%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
	ret <32 x i8> %shuffle			ret <32 x i8> %shuffle
	}			}

				define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
				; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
				; AVX1: # BB#0:
				; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
				; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
				; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
				; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
				; AVX1-NEXT: retq
				;
				; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
				; AVX2: # BB#0:
				; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
				; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
				; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
				; AVX2-NEXT: retq
				%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <32 x i8> %shuffle
				}

				define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
				; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
				; ALL: # BB#0:
				; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
				; ALL-NEXT: retq
				%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <32 x i8> %shuffle
				}

	define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {			define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
	; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:			; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
	; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]			; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:			; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
	▲ Show 20 Lines • Show All 111 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-256-v4.ll

	Show First 20 Lines • Show All 230 Lines • ▼ Show 20 Lines
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]			; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>			%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
	ret <4 x double> %shuffle			ret <4 x double> %shuffle
	}			}

	define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {			define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
	; AVX1-LABEL: shuffle_v4f64_0423:			; ALL-LABEL: shuffle_v4f64_0423:
	; AVX1: # BB#0:			; ALL: # BB#0:
	; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]			; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
	; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]			; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
	; AVX1-NEXT: retq			; ALL-NEXT: retq
	;
	; AVX2-LABEL: shuffle_v4f64_0423:
	; AVX2: # BB#0:
	; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
	; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
	; AVX2-NEXT: retq
	;
	; AVX512VL-LABEL: shuffle_v4f64_0423:
	; AVX512VL: # BB#0:
	; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1
	; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
	; AVX512VL-NEXT: retq
	%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>			%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
	ret <4 x double> %shuffle			ret <4 x double> %shuffle
	}			}

	define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {			define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
	; ALL-LABEL: shuffle_v4f64_0462:			; ALL-LABEL: shuffle_v4f64_0462:
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]			; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
	▲ Show 20 Lines • Show All 219 Lines • ▼ Show 20 Lines
	; ALL-LABEL: shuffle_v4f64_u062:			; ALL-LABEL: shuffle_v4f64_u062:
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]			; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 2>			%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 2>
	ret <4 x double> %shuffle			ret <4 x double> %shuffle
	}			}

				define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
				; ALL-LABEL: shuffle_v4f64_15uu:
				; ALL: # BB#0:
				; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
				; ALL-NEXT: retq
				%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
				ret <4 x double> %shuffle
				}

	define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {			define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
	; ALL-LABEL: shuffle_v4f64_11uu:			; ALL-LABEL: shuffle_v4f64_11uu:
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]			; ALL-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>			%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
	ret <4 x double> %shuffle			ret <4 x double> %shuffle
	}			}

	define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) {			define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) {
	; AVX1-LABEL: shuffle_v4f64_22uu:			; AVX1-LABEL: shuffle_v4f64_22uu:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	▲ Show 20 Lines • Show All 242 Lines • ▼ Show 20 Lines
	; AVX512VL-NEXT: retq			; AVX512VL-NEXT: retq
	%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>			%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
	ret <4 x i64> %shuffle			ret <4 x i64> %shuffle
	}			}

	define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {			define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
	; AVX1-LABEL: shuffle_v4i64_0142:			; AVX1-LABEL: shuffle_v4i64_0142:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
	; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]			; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
	; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]			; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v4i64_0142:			; AVX2-LABEL: shuffle_v4i64_0142:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1			; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
	; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]			; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
	; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]			; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512VL-LABEL: shuffle_v4i64_0142:			; AVX512VL-LABEL: shuffle_v4i64_0142:
	; AVX512VL: # BB#0:			; AVX512VL: # BB#0:
	; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1			; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm1
	; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]			; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
	; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]			; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
	; AVX512VL-NEXT: retq			; AVX512VL-NEXT: retq
	%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>			%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
	ret <4 x i64> %shuffle			ret <4 x i64> %shuffle
	}			}

	define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {			define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
	; AVX1-LABEL: shuffle_v4i64_0412:			; AVX1-LABEL: shuffle_v4i64_0412:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
	; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]			; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
	; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]			; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
	; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]			; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v4i64_0412:			; AVX2-LABEL: shuffle_v4i64_0412:
	; AVX2: # BB#0:			; AVX2: # BB#0:
				; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
	; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]			; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
	; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
	; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]			; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512VL-LABEL: shuffle_v4i64_0412:			; AVX512VL-LABEL: shuffle_v4i64_0412:
	; AVX512VL: # BB#0:			; AVX512VL: # BB#0:
				; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
	; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]			; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
	; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
	; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]			; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
	; AVX512VL-NEXT: retq			; AVX512VL-NEXT: retq
	%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>			%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
	ret <4 x i64> %shuffle			ret <4 x i64> %shuffle
	}			}

	define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {			define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
	; AVX1-LABEL: shuffle_v4i64_4012:			; AVX1-LABEL: shuffle_v4i64_4012:
	▲ Show 20 Lines • Show All 325 Lines • ▼ Show 20 Lines
	; AVX512VL-LABEL: shuffle_v4i64_40u2:			; AVX512VL-LABEL: shuffle_v4i64_40u2:
	; AVX512VL: # BB#0:			; AVX512VL: # BB#0:
	; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]			; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
	; AVX512VL-NEXT: retq			; AVX512VL-NEXT: retq
	%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 2>			%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 2>
	ret <4 x i64> %shuffle			ret <4 x i64> %shuffle
	}			}

				define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
				; ALL-LABEL: shuffle_v4i64_15uu:
				; ALL: # BB#0:
				; ALL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
				; ALL-NEXT: retq
				%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
				ret <4 x i64> %shuffle
				}

	define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {			define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
	; AVX1-LABEL: shuffle_v4i64_11uu:			; ALL-LABEL: shuffle_v4i64_11uu:
	; AVX1: # BB#0:			; ALL: # BB#0:
	; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]			; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
	; AVX1-NEXT: retq			; ALL-NEXT: retq
	;
	; AVX2-LABEL: shuffle_v4i64_11uu:
	; AVX2: # BB#0:
	; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
	; AVX2-NEXT: retq
	;
	; AVX512VL-LABEL: shuffle_v4i64_11uu:
	; AVX512VL: # BB#0:
	; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
	; AVX512VL-NEXT: retq
	%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>			%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
	ret <4 x i64> %shuffle			ret <4 x i64> %shuffle
	}			}

	define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {			define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
	; AVX1-LABEL: shuffle_v4i64_22uu:			; AVX1-LABEL: shuffle_v4i64_22uu:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
	▲ Show 20 Lines • Show All 352 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-256-v8.ll

	Show First 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
	ret <8 x float> %shuffle			ret <8 x float> %shuffle
	}			}

	define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {			define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
	; AVX1-LABEL: shuffle_v8f32_00040000:			; AVX1-LABEL: shuffle_v8f32_00040000:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]			; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
	; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]			; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
	; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]			; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
	; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]			; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8f32_00040000:			; AVX2-LABEL: shuffle_v8f32_00040000:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]			; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
	; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
	▲ Show 20 Lines • Show All 739 Lines • ▼ Show 20 Lines
	; ALL: # BB#0:			; ALL: # BB#0:
	; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]			; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
	; ALL-NEXT: retq			; ALL-NEXT: retq
	%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 10, i32 2, i32 undef, i32 3, i32 14, i32 6, i32 15, i32 7>			%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 10, i32 2, i32 undef, i32 3, i32 14, i32 6, i32 15, i32 7>
	ret <8 x float> %shuffle			ret <8 x float> %shuffle
	}			}

	define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {			define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
	; AVX1-LABEL: shuffle_v8f32_uuuu1111:			; ALL-LABEL: shuffle_v8f32_uuuu1111:
	; AVX1: # BB#0:			; ALL: # BB#0:
	; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]			; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq			; ALL-NEXT: retq
	;
	; AVX2-LABEL: shuffle_v8f32_uuuu1111:
	; AVX2: # BB#0:
	; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
	; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>			%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
	ret <8 x float> %shuffle			ret <8 x float> %shuffle
	}			}

	define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {			define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
	; AVX1-LABEL: shuffle_v8f32_44444444:			; AVX1-LABEL: shuffle_v8f32_44444444:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
	; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]			; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8f32_44444444:			; AVX2-LABEL: shuffle_v8f32_44444444:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1			; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
	; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>			%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
	ret <8 x float> %shuffle			ret <8 x float> %shuffle
	}			}

				define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
				; ALL-LABEL: shuffle_v8f32_1188uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
				; ALL-NEXT: retq
				%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x float> %shuffle
				}

				define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
				; ALL-LABEL: shuffle_v8f32_uuuu3210:
				; ALL: # BB#0:
				; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
				; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
				; ALL-NEXT: retq
				%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
				ret <8 x float> %shuffle
				}

				define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
				; ALL-LABEL: shuffle_v8f32_uuuu1188:
				; ALL: # BB#0:
				; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
				; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
				; ALL-NEXT: retq
				%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8>
				ret <8 x float> %shuffle
				}

				define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
				; ALL-LABEL: shuffle_v8f32_1111uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
				; ALL-NEXT: retq
				%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x float> %shuffle
				}

	define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {			define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
	; AVX1-LABEL: shuffle_v8f32_5555uuuu:			; AVX1-LABEL: shuffle_v8f32_5555uuuu:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
	; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]			; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8f32_5555uuuu:			; AVX2-LABEL: shuffle_v8f32_5555uuuu:
	▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
	ret <8 x i32> %shuffle			ret <8 x i32> %shuffle
	}			}

	define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {			define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
	; AVX1-LABEL: shuffle_v8i32_00040000:			; AVX1-LABEL: shuffle_v8i32_00040000:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]			; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
	; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]			; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
	; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]			; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
	; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]			; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8i32_00040000:			; AVX2-LABEL: shuffle_v8i32_00040000:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]			; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
	; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
	▲ Show 20 Lines • Show All 979 Lines • ▼ Show 20 Lines
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 0, i32 undef, i32 1, i32 12, i32 4, i32 undef, i32 undef>			%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 0, i32 undef, i32 1, i32 12, i32 4, i32 undef, i32 undef>
	ret <8 x i32> %shuffle			ret <8 x i32> %shuffle
	}			}

	define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {			define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {
	; AVX1-LABEL: shuffle_v8i32_uuuu1111:			; AVX1-LABEL: shuffle_v8i32_uuuu1111:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]			; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8i32_uuuu1111:			; AVX2-LABEL: shuffle_v8i32_uuuu1111:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1			; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
	; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>			%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
	ret <8 x i32> %shuffle			ret <8 x i32> %shuffle
	}			}

				define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
				; ALL-LABEL: shuffle_v8i32_2222uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
				; ALL-NEXT: retq
				%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i32> %shuffle
				}

				define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
				; ALL-LABEL: shuffle_v8i32_2A3Buuuu:
				; ALL: # BB#0:
				; ALL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
				; ALL-NEXT: retq
				%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i32> %shuffle
				}

	define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {			define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
	; AVX1-LABEL: shuffle_v8i32_44444444:			; AVX1-LABEL: shuffle_v8i32_44444444:
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
	; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]			; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	▲ Show 20 Lines • Show All 284 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-zext.ll

	Show First 20 Lines • Show All 1,161 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero			; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
	; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero			; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
	; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero			; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:			; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
	; AVX2: # BB#0: # %entry			; AVX2: # BB#0: # %entry
	; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero			; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
	; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero			; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	entry:			entry:
	%B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>			%B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
	%Z = bitcast <32 x i8> %B to <4 x i64>			%Z = bitcast <32 x i8> %B to <4 x i64>
	ret <4 x i64> %Z			ret <4 x i64> %Z
	}			}

	▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero			; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
	; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
	; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero			; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:			; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
	; AVX2: # BB#0: # %entry			; AVX2: # BB#0: # %entry
	; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,2,3,5,6,6,7]			; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
	; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero			; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	entry:			entry:
	%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>			%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
	%Z = bitcast <16 x i16> %B to <4 x i64>			%Z = bitcast <16 x i16> %B to <4 x i64>
	ret <4 x i64> %Z			ret <4 x i64> %Z
	}			}

	▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2			; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]			; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
	; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero			; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
	; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:			; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
	; AVX2: # BB#0: # %entry			; AVX2: # BB#0: # %entry
	; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero			; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
	; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	entry:			entry:
	%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8>			%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8>
	%Z = bitcast <16 x i16> %B to <8 x i32>			%Z = bitcast <16 x i16> %B to <8 x i32>
	ret <8 x i32> %Z			ret <8 x i32> %Z
	}			}

	Show All 30 Lines
	; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2			; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
	; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]			; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
	; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:			; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
	; AVX2: # BB#0: # %entry			; AVX2: # BB#0: # %entry
	; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]			; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
	; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	entry:			entry:
	%B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16>			%B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16>
	%Z = bitcast <16 x i16> %B to <8 x i32>			%Z = bitcast <16 x i16> %B to <8 x i32>
	ret <8 x i32> %Z			ret <8 x i32> %Z
	}			}

	▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines
	; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[3],zero,zero,zero			; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[3],zero,zero,zero
	; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2			; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
	; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]			; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
	; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:			; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
	; AVX2: # BB#0: # %entry			; AVX2: # BB#0: # %entry
	; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]			; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
	; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero			; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	entry:			entry:
	%B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4>			%B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4>
	%Z = bitcast <8 x i32> %B to <4 x i64>			%Z = bitcast <8 x i32> %B to <4 x i64>
	ret <4 x i64> %Z			ret <4 x i64> %Z
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX] Only shuffle the lower half of vectors if the upper half is undefined
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 43391

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/avx-splat.ll

test/CodeGen/X86/vector-shuffle-256-v16.ll

test/CodeGen/X86/vector-shuffle-256-v32.ll

test/CodeGen/X86/vector-shuffle-256-v4.ll

test/CodeGen/X86/vector-shuffle-256-v8.ll

test/CodeGen/X86/vector-zext.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX] Only shuffle the lower half of vectors if the upper half is undefined
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 43391

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/avx-splat.ll

test/CodeGen/X86/vector-shuffle-256-v16.ll

test/CodeGen/X86/vector-shuffle-256-v32.ll

test/CodeGen/X86/vector-shuffle-256-v4.ll

test/CodeGen/X86/vector-shuffle-256-v8.ll

test/CodeGen/X86/vector-zext.ll

[X86][AVX] Only shuffle the lower half of vectors if the upper half is undefined
ClosedPublic