This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE4A] Shuffle lowering using SSE4A EXTRQ/INSERTQ instructions
ClosedPublic

Authored by RKSimon on May 30 2015, 12:22 PM.

Download Raw Diff

Details

Reviewers

spatel
chandlerc
andreadb

Commits

rGd85cae3d52a3: [X86][SSE4A] Shuffle lowering using SSE4A EXTRQ/INSERTQ instructions
rL241508: [X86][SSE4A] Shuffle lowering using SSE4A EXTRQ/INSERTQ instructions

Summary

This patch adds support for v8i16 and v16i8 shuffle lowering using the immediate versions of the SSE4A EXTRQ and INSERTQ instructions. Although rather limited (they can only act on the lower 64-bits of the source vectors, leave the upper 64-bits of the result vector undefined and don't have VEX encoded variants), the instructions are still useful for the zero extension of any lane (EXTRQ) or inserting a lane into another vector (INSERTQ). Testing demonstrated that it wasn't typically worth it to use these instructions for v2i64 or v4i32 vector shuffles although they are capable of it.

As well as adding specific pattern matching for the shuffles, the patch uses EXTRQ for zero extension cases where SSE41 isn't available and it is more efficient than the SSE2 'unpack' default approach. It also adds shuffle decode support for the EXTRQ / INSERTQ cases when the instructions are handling full byte-sized extractions / insertions.

From this foundation, future patches will be able to make use of the instructions for situations that use their ability to extract/insert at the bit level.

As with any AMD-only instructions - if you have experience with these, please consider reviewing as we need all the help we can get ;-)

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 26848.May 30 2015, 12:22 PM

RKSimon retitled this revision from to [X86][SSE4A] Shuffle lowering using SSE4A EXTRQ/INSERTQ instructions.

RKSimon updated this object.

RKSimon edited the test plan for this revision. (Show Details)

RKSimon added reviewers: andreadb, spatel, chandlerc.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: Unknown Object (MLST).

ping

LGTM (although I've never used extrq/insertq).

For the test file, would you please specify the CPU attributes directly rather than using btver1 / btver2? So -mattr=sse4a and -mattr=sse4a,sse4.2 (or avx)?

This revision is now accepted and ready to land.Jun 30 2015, 3:32 PM

Closed by commit rL241508: [X86][SSE4A] Shuffle lowering using SSE4A EXTRQ/INSERTQ instructions (authored by RKSimon). · Explain WhyJul 6 2015, 1:47 PM

This revision was automatically updated to reflect the committed changes.

In D10146#197416, @spatel wrote:

For the test file, would you please specify the CPU attributes directly rather than using btver1 / btver2? So -mattr=sse4a and -mattr=sse4a,sse4.2 (or avx)?

Thanks Sanjay, I've committed with your requested mattr fix

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

InstPrinter/

X86InstComments.cpp

23 lines

Utils/

8 lines

74 lines

3 lines

180 lines

X86InstrFragmentsSIMD.td

8 lines

X86InstrSSE.td

6 lines

X86IntrinsicsInfo.h

4 lines

test/

CodeGen/

X86/

vector-shuffle-sse4a.ll

221 lines

Diff 29122

llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp

Show First 20 Lines • Show All 872 Lines • ▼ Show 20 Lines	case X86::VMOVZPQILo2PQIrm:
DestName = getRegName(MI->getOperand(0).getReg());		DestName = getRegName(MI->getOperand(0).getReg());
break;		break;
case X86::MOVDI2PDIrm:		case X86::MOVDI2PDIrm:
case X86::VMOVDI2PDIrm:		case X86::VMOVDI2PDIrm:
DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);		DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());		DestName = getRegName(MI->getOperand(0).getReg());
break;		break;

		case X86::EXTRQI:
		if (MI->getOperand(2).isImm() &&
		MI->getOperand(3).isImm())
		DecodeEXTRQIMask(MI->getOperand(2).getImm(),
		MI->getOperand(3).getImm(),
		ShuffleMask);

		DestName = getRegName(MI->getOperand(0).getReg());
		Src1Name = getRegName(MI->getOperand(1).getReg());
		break;

		case X86::INSERTQI:
		if (MI->getOperand(3).isImm() &&
		MI->getOperand(4).isImm())
		DecodeINSERTQIMask(MI->getOperand(3).getImm(),
		MI->getOperand(4).getImm(),
		ShuffleMask);

		DestName = getRegName(MI->getOperand(0).getReg());
		Src1Name = getRegName(MI->getOperand(1).getReg());
		Src2Name = getRegName(MI->getOperand(2).getReg());
		break;

case X86::PMOVZXBWrr:		case X86::PMOVZXBWrr:
case X86::PMOVZXBDrr:		case X86::PMOVZXBDrr:
case X86::PMOVZXBQrr:		case X86::PMOVZXBQrr:
case X86::PMOVZXWDrr:		case X86::PMOVZXWDrr:
case X86::PMOVZXWQrr:		case X86::PMOVZXWQrr:
case X86::PMOVZXDQrr:		case X86::PMOVZXDQrr:
case X86::VPMOVZXBWrr:		case X86::VPMOVZXBWrr:
case X86::VPMOVZXBDrr:		case X86::VPMOVZXBDrr:
▲ Show 20 Lines • Show All 93 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h

Show First 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
SmallVectorImpl<int> &ShuffleMask);		SmallVectorImpl<int> &ShuffleMask);

/// \brief Decode a move lower and zero upper instruction as a shuffle mask.		/// \brief Decode a move lower and zero upper instruction as a shuffle mask.
void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);		void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);

/// \brief Decode a scalar float move instruction as a shuffle mask.		/// \brief Decode a scalar float move instruction as a shuffle mask.
void DecodeScalarMoveMask(MVT VT, bool IsLoad,		void DecodeScalarMoveMask(MVT VT, bool IsLoad,
SmallVectorImpl<int> &ShuffleMask);		SmallVectorImpl<int> &ShuffleMask);

		/// \brief Decode an SSE4A EXTRQ (immediate form) instruction as a v16i8
		/// shuffle mask.
		///
		/// \p Len and \p Idx are the instruction's bit-length and bit-index
		/// immediates (only their low 6 bits are used). Nothing is appended to
		/// \p ShuffleMask unless both describe a whole number of bytes.
		void DecodeEXTRQIMask(int Len, int Idx,
		SmallVectorImpl<int> &ShuffleMask);

		/// \brief Decode an SSE4A INSERTQ (immediate form) instruction as a v16i8
		/// shuffle mask.
		///
		/// \p Len and \p Idx are the instruction's bit-length and bit-index
		/// immediates (only their low 6 bits are used). Nothing is appended to
		/// \p ShuffleMask unless both describe a whole number of bytes.
		void DecodeINSERTQIMask(int Len, int Idx,
		SmallVectorImpl<int> &ShuffleMask);
} // llvm namespace		} // llvm namespace

#endif		#endif

llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp

	Show First 20 Lines • Show All 425 Lines • ▼ Show 20 Lines
	void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {			void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
	// First element comes from the first element of second source.			// First element comes from the first element of second source.
	// Remaining elements: Load zero extends / Move copies from first source.			// Remaining elements: Load zero extends / Move copies from first source.
	unsigned NumElts = VT.getVectorNumElements();			unsigned NumElts = VT.getVectorNumElements();
	Mask.push_back(NumElts);			Mask.push_back(NumElts);
	for (unsigned i = 1; i < NumElts; i++)			for (unsigned i = 1; i < NumElts; i++)
	Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);			Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
	}			}

				/// Decode the immediate form of SSE4A EXTRQ as a 16-entry byte shuffle mask.
				///
				/// \param Len  Bit length of the extracted field (low 6 bits used; 0 means 64).
				/// \param Idx  Bit index of the extracted field (low 6 bits used).
				/// \param ShuffleMask  Receives the decoded mask. It is left untouched when
				///        the field is not byte aligned, and is filled with 16 undef
				///        sentinels when the field runs past the low 64 bits.
				void DecodeEXTRQIMask(int Len, int Idx,
				                      SmallVectorImpl<int> &ShuffleMask) {
				  // Only the bottom 6 bits are valid for each immediate.
				  Len &= 0x3F;
				  Idx &= 0x3F;

				  // We can only decode this bit extraction instruction as a shuffle if both
				  // the length and index work with whole bytes.
				  if (0 != (Len % 8) || 0 != (Idx % 8))
				    return;

				  // A length of zero is equivalent to a bit length of 64.
				  if (Len == 0)
				    Len = 64;

				  // If the length + index exceeds the bottom 64 bits the result is undefined.
				  if ((Len + Idx) > 64) {
				    ShuffleMask.append(16, SM_SentinelUndef);
				    return;
				  }

				  // Convert length and index to work with bytes.
				  Len /= 8;
				  Idx /= 8;

				  // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes
				  // of the lower 64-bits. The upper 64-bits are undefined.
				  for (int i = 0; i != Len; ++i)
				    ShuffleMask.push_back(i + Idx);
				  for (int i = Len; i != 8; ++i)
				    ShuffleMask.push_back(SM_SentinelZero);
				  for (int i = 8; i != 16; ++i)
				    ShuffleMask.push_back(SM_SentinelUndef);
				}

				/// Decode the immediate form of SSE4A INSERTQ as a 16-entry byte shuffle mask.
				///
				/// Mask entries 0-7 select bytes of the first source; entries starting at 16
				/// select bytes of the second source.
				///
				/// \param Len  Bit length of the inserted field (low 6 bits used; 0 means 64).
				/// \param Idx  Bit index at which the field is inserted (low 6 bits used).
				/// \param ShuffleMask  Receives the decoded mask. It is left untouched when
				///        the field is not byte aligned, and is filled with 16 undef
				///        sentinels when the field runs past the low 64 bits.
				void DecodeINSERTQIMask(int Len, int Idx,
				                        SmallVectorImpl<int> &ShuffleMask) {
				  // Only the bottom 6 bits are valid for each immediate.
				  Len &= 0x3F;
				  Idx &= 0x3F;

				  // We can only decode this bit insertion instruction as a shuffle if both
				  // the length and index work with whole bytes.
				  if (0 != (Len % 8) || 0 != (Idx % 8))
				    return;

				  // A length of zero is equivalent to a bit length of 64.
				  if (Len == 0)
				    Len = 64;

				  // If the length + index exceeds the bottom 64 bits the result is undefined.
				  if ((Len + Idx) > 64) {
				    ShuffleMask.append(16, SM_SentinelUndef);
				    return;
				  }

				  // Convert length and index to work with bytes.
				  Len /= 8;
				  Idx /= 8;

				  // INSERTQ: Extract lowest Len bytes from lower half of second source and
				  // insert over first source starting at Idx byte. The upper 64-bits are
				  // undefined.
				  for (int i = 0; i != Idx; ++i)
				    ShuffleMask.push_back(i);
				  for (int i = 0; i != Len; ++i)
				    ShuffleMask.push_back(i + 16);
				  for (int i = Idx + Len; i != 8; ++i)
				    ShuffleMask.push_back(i);
				  for (int i = 8; i != 16; ++i)
				    ShuffleMask.push_back(SM_SentinelUndef);
				}

	} // llvm namespace			} // llvm namespace

llvm/trunk/lib/Target/X86/X86ISelLowering.h

Show First 20 Lines • Show All 388 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {
// Broadcast scalar to vector		// Broadcast scalar to vector
VBROADCAST,		VBROADCAST,
// Broadcast subvector to vector		// Broadcast subvector to vector
SUBV_BROADCAST,		SUBV_BROADCAST,
// Insert/Extract vector element		// Insert/Extract vector element
VINSERT,		VINSERT,
VEXTRACT,		VEXTRACT,

		/// SSE4A Extraction and Insertion.
		EXTRQI, INSERTQI,

// Vector multiply packed unsigned doubleword integers		// Vector multiply packed unsigned doubleword integers
PMULUDQ,		PMULUDQ,
// Vector multiply packed signed doubleword integers		// Vector multiply packed signed doubleword integers
PMULDQ,		PMULDQ,
// Vector Multiply Packed UnsignedIntegers with Round and Scale		// Vector Multiply Packed UnsignedIntegers with Round and Scale
MULHRS,		MULHRS,

// FMA nodes		// FMA nodes
▲ Show 20 Lines • Show All 718 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,932 Lines • ▼ Show 20 Lines	bool X86TargetLowering::isCheapToSpeculateCttz() const {
return Subtarget->hasBMI();		return Subtarget->hasBMI();
}		}

bool X86TargetLowering::isCheapToSpeculateCtlz() const {		bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Speculate ctlz only if we can directly use LZCNT.		// Speculate ctlz only if we can directly use LZCNT.
return Subtarget->hasLZCNT();		return Subtarget->hasLZCNT();
}		}

		/// isUndefInRange - Return true if each of the Size entries of Mask in the
		/// half-open range [Pos, Pos+Size) is negative, i.e. none of them selects a
		/// source element.
		static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
		  unsigned Cur = Pos;
		  for (unsigned Remaining = Size; Remaining != 0; --Remaining, ++Cur) {
		    // Any non-negative entry refers to a real element, so the range is not
		    // entirely undef.
		    if (Mask[Cur] >= 0)
		      return false;
		  }
		  return true;
		}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within		/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].		/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {		static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val < 0) \|\| (Val >= Low && Val < Hi);		return (Val < 0) \|\| (Val >= Low && Val < Hi);
}		}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the		/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.		/// specified value.
▲ Show 20 Lines • Show All 2,960 Lines • ▼ Show 20 Lines	for (int Shift = 1; Shift != Scale; ++Shift)
for (SDValue V : {V1, V2})		for (SDValue V : {V1, V2})
if (SDValue Match = MatchShift(Shift, Scale, Left, V))		if (SDValue Match = MatchShift(Shift, Scale, Left, V))
return Match;		return Match;

// no match		// no match
return SDValue();		return SDValue();
}		}

		/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
		///
		/// Both instructions only define the lower half of their result, so the
		/// upper half of \p Mask must be entirely undef. Two patterns are tried in
		/// order:
		///  - EXTRQ: a run of sequential elements taken from the lower half of one
		///    source, followed only by zeroable elements.
		///  - INSERTQ: the lowest elements of one source inserted into the lower
		///    half of another.
		///
		/// \returns the EXTRQI / INSERTQI node on a match, or an empty SDValue when
		/// neither pattern matches.
		static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
		                                           SDValue V2, ArrayRef<int> Mask,
		                                           SelectionDAG &DAG) {
		  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
		  assert(!Zeroable.all() && "Fully zeroable shuffle mask");

		  int Size = Mask.size();
		  int HalfSize = Size / 2;
		  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

		  // Upper half must be undefined.
		  if (!isUndefInRange(Mask, HalfSize, HalfSize))
		    return SDValue();

		  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
		  // Remainder of lower half result is zero and upper half is all undef.
		  auto LowerAsEXTRQ = [&]() -> SDValue {
		    // Determine the extraction length from the part of the
		    // lower half that isn't zeroable. Stop at Len == 0 so that
		    // Zeroable[Len - 1] is never read out of bounds.
		    int Len = HalfSize;
		    for (; Len > 0; --Len)
		      if (!Zeroable[Len - 1])
		        break;
		    assert(Len > 0 && "Zeroable shuffle mask");

		    // Attempt to match first Len sequential elements from the lower half.
		    SDValue Src;
		    int Idx = -1;
		    for (int i = 0; i != Len; ++i) {
		      int M = Mask[i];
		      if (M < 0)
		        continue;
		      SDValue &V = (M < Size ? V1 : V2);
		      M = M % Size;

		      // All mask elements must be in the lower half. An element that would
		      // need a negative start index (M < i) can never be part of a
		      // sequential run, and must not be confused with the "Idx not yet
		      // set" state below.
		      if (M < i || M >= HalfSize)
		        return SDValue();

		      if (Idx < 0 || (Src == V && Idx == (M - i))) {
		        Src = V;
		        Idx = M - i;
		        continue;
		      }
		      return SDValue();
		    }

		    if (Idx < 0)
		      return SDValue();

		    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
		    // The immediates are 6 bits wide; a full 64-bit length wraps to 0.
		    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
		    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
		    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
		                       DAG.getConstant(BitLen, DL, MVT::i8),
		                       DAG.getConstant(BitIdx, DL, MVT::i8));
		  };

		  if (SDValue ExtrQ = LowerAsEXTRQ())
		    return ExtrQ;

		  // INSERTQ: Extract lowest Len elements from lower half of second source and
		  // insert over first source, starting at Idx.
		  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
		  auto LowerAsInsertQ = [&]() -> SDValue {
		    for (int Idx = 0; Idx != HalfSize; ++Idx) {
		      SDValue Base;

		      // Attempt to match first source from mask before insertion point.
		      if (isUndefInRange(Mask, 0, Idx)) {
		        /* EMPTY */
		      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
		        Base = V1;
		      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
		        Base = V2;
		      } else {
		        continue;
		      }

		      // Extend the extraction length looking to match both the insertion of
		      // the second source and the remaining elements of the first.
		      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
		        SDValue Insert;
		        int Len = Hi - Idx;

		        // Match insertion.
		        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
		          Insert = V1;
		        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
		          Insert = V2;
		        } else {
		          continue;
		        }

		        // Match the remaining elements of the lower half.
		        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
		          /* EMPTY */
		        } else if ((!Base || (Base == V1)) &&
		                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
		          Base = V1;
		        } else if ((!Base || (Base == V2)) &&
		                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
		                                              Size + Hi)) {
		          Base = V2;
		        } else {
		          continue;
		        }

		        // We may not have a base (first source) - this can safely be undefined.
		        if (!Base)
		          Base = DAG.getUNDEF(VT);

		        // The immediates are 6 bits wide; a full 64-bit length wraps to 0.
		        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
		        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
		        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
		                           DAG.getConstant(BitLen, DL, MVT::i8),
		                           DAG.getConstant(BitIdx, DL, MVT::i8));
		      }
		    }

		    return SDValue();
		  };

		  if (SDValue InsertQ = LowerAsInsertQ())
		    return InsertQ;

		  return SDValue();
		}

/// \brief Lower a vector shuffle as a zero or any extension.		/// \brief Lower a vector shuffle as a zero or any extension.
///		///
/// Given a specific number of elements, element bit width, and extension		/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available		/// stride, produce either a zero or any extension based on the available
/// features of the subtarget.		/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(		static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,		SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {		ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");		assert(Scale > 1 && "Need a scale to extend.");
int NumElements = VT.getVectorNumElements();		int NumElements = VT.getVectorNumElements();
int EltBits = VT.getScalarSizeInBits();		int EltBits = VT.getScalarSizeInBits();
assert((EltBits == 8 \|\| EltBits == 16 \|\| EltBits == 32) &&		assert((EltBits == 8 \|\| EltBits == 16 \|\| EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");		"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");		assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");

// Found a valid zext mask! Try various lowering strategies based on the		// Found a valid zext mask! Try various lowering strategies based on the
Show All 20 Lines	InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));		getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFHWMask[4] = {1, -1, -1, -1};		int PSHUFHWMask[4] = {1, -1, -1, -1};
return DAG.getBitcast(		return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,		VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),		DAG.getBitcast(MVT::v8i16, InputV),
getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));		getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
}		}

		// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
		// to 64-bits.
		if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
		assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
		assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");

		SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
		DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
		DAG.getConstant(EltBits, DL, MVT::i8),
		DAG.getConstant(0, DL, MVT::i8)));
		if (isUndefInRange(Mask, NumElements/2, NumElements/2))
		return DAG.getNode(ISD::BITCAST, DL, VT, Lo);

		SDValue Hi =
		DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
		DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
		DAG.getConstant(EltBits, DL, MVT::i8),
		DAG.getConstant(EltBits, DL, MVT::i8)));
		return DAG.getNode(ISD::BITCAST, DL, VT,
		DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
		}

// If this would require more than 2 unpack instructions to expand, use		// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions		// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.		// when zero extending i8 elements which also makes it easier to use pshufb.
if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {		if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");		assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];		SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i)		for (int i = 0; i < 16; ++i)
PSHUFBMask[i] =		PSHUFBMask[i] =
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	auto Lower = [&](int Scale) -> SDValue {

// If we fail to find an input, we have a zero-shuffle which should always		// If we fail to find an input, we have a zero-shuffle which should always
// have already been handled.		// have already been handled.
// FIXME: Maybe handle this here in case during blending we end up with one?		// FIXME: Maybe handle this here in case during blending we end up with one?
if (!InputV)		if (!InputV)
return SDValue();		return SDValue();

return lowerVectorShuffleAsSpecificZeroOrAnyExtend(		return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);		DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
};		};

// The widest scale possible for extending is to a 64-bit integer.		// The widest scale possible for extending is to a 64-bit integer.
assert(Bits % 64 == 0 &&		assert(Bits % 64 == 0 &&
"The number of bits in a vector must be divisible by 64 on x86!");		"The number of bits in a vector must be divisible by 64 on x86!");
int NumExtElements = Bits / 64;		int NumExtElements = Bits / 64;

// Each iteration, try extending the elements half as much, but into twice as		// Each iteration, try extending the elements half as much, but into twice as
▲ Show 20 Lines • Show All 1,510 Lines • ▼ Show 20 Lines	assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
"All single-input shuffles should be canonicalized to be V1-input "		"All single-input shuffles should be canonicalized to be V1-input "
"shuffles.");		"shuffles.");

// Try to use shift instructions.		// Try to use shift instructions.
if (SDValue Shift =		if (SDValue Shift =
lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))		lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Shift;		return Shift;

		// See if we can use SSE4A Extraction / Insertion.
		if (Subtarget->hasSSE4A())
		if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
		return V;

// There are special ways we can lower some single-element blends.		// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)		if (NumV2Inputs == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,		if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))		Mask, Subtarget, DAG))
return V;		return V;

// We have different paths for blend lowering, but they all must use the		// We have different paths for blend lowering, but they all must use the
// exact same predicate.		// exact same predicate.
▲ Show 20 Lines • Show All 136 Lines • ▼ Show 20 Lines	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;		return Rotate;

// Try to use a zext lowering.		// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(		if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))		DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return ZExt;		return ZExt;

		// See if we can use SSE4A Extraction / Insertion.
		if (Subtarget->hasSSE4A())
		if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
		return V;

int NumV2Elements =		int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });		std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });

// For single-input shuffles, there are some nicer lowering tricks we can use.		// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {		if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.		// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,		if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
Mask, Subtarget, DAG))		Mask, Subtarget, DAG))
▲ Show 20 Lines • Show All 6,373 Lines • ▼ Show 20 Lines	if (IntrData) {
case INTR_TYPE_1OP:		case INTR_TYPE_1OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));		return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
case INTR_TYPE_2OP:		case INTR_TYPE_2OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),		return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));		Op.getOperand(2));
case INTR_TYPE_3OP:		case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),		return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));		Op.getOperand(2), Op.getOperand(3));
		case INTR_TYPE_4OP:
		return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
		Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK_RM: {		case INTR_TYPE_1OP_MASK_RM: {
SDValue Src = Op.getOperand(1);		SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);		SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);		SDValue Mask = Op.getOperand(3);
SDValue RoundingMode;		SDValue RoundingMode;
if (Op.getNumOperands() == 4)		if (Op.getNumOperands() == 4)
RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);		RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
else		else
▲ Show 20 Lines • Show All 3,377 Lines • ▼ Show 20 Lines	const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FMAX: return "X86ISD::FMAX";		case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";		case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";		case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";		case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
case X86ISD::FMAXC: return "X86ISD::FMAXC";		case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";		case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";		case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";		case X86ISD::FRCP: return "X86ISD::FRCP";
		case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
		case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";		case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";		case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";		case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";		case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";		case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";		case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";		case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";		case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
▲ Show 20 Lines • Show All 7,311 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td

	Show First 20 Lines • Show All 198 Lines • ▼ Show 20 Lines

	def X86pmuludq : SDNode<"X86ISD::PMULUDQ",			def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
	SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,			SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
	SDTCisSameAs<1,2>]>>;			SDTCisSameAs<1,2>]>>;
	def X86pmuldq : SDNode<"X86ISD::PMULDQ",			def X86pmuldq : SDNode<"X86ISD::PMULDQ",
	SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,			SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
	SDTCisSameAs<1,2>]>>;			SDTCisSameAs<1,2>]>>;

				// SSE4A EXTRQ (immediate form): one v2i64 result and three operands - the
				// source vector (same type as the result) followed by two i8 values, which
				// the EXTRQI instruction pattern binds to its length and index immediates.
				def X86extrqi : SDNode<"X86ISD::EXTRQI",
				SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
				SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>;
				// SSE4A INSERTQ (immediate form): one v2i64 result and four operands - two
				// source vectors (both the same type as the result) followed by two i8
				// values, which the INSERTQI instruction pattern binds to its length and
				// index immediates.
				def X86insertqi : SDNode<"X86ISD::INSERTQI",
				SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
				SDTCisSameAs<1,2>, SDTCisVT<3, i8>,
				SDTCisVT<4, i8>]>>;

	// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get			// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
	// translated into one of the target nodes below during lowering.			// translated into one of the target nodes below during lowering.
	// Note: this is a work in progress...			// Note: this is a work in progress...
	def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;			def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
	def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,			def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<0,2>]>;			SDTCisSameAs<0,2>]>;
	def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,			def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;			SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
	▲ Show 20 Lines • Show All 592 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 7,767 Lines • ▼ Show 20 Lines
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	let Predicates = [HasSSE4A] in {			let Predicates = [HasSSE4A] in {

	let Constraints = "$src = $dst" in {			let Constraints = "$src = $dst" in {
	def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),			def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
	(ins VR128:$src, u8imm:$len, u8imm:$idx),			(ins VR128:$src, u8imm:$len, u8imm:$idx),
	"extrq\t{$idx, $len, $src\|$src, $len, $idx}",			"extrq\t{$idx, $len, $src\|$src, $len, $idx}",
	[(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,			[(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
	imm:$idx))]>, PD;			imm:$idx))]>, PD;
	def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),			def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src, VR128:$mask),			(ins VR128:$src, VR128:$mask),
	"extrq\t{$mask, $src\|$src, $mask}",			"extrq\t{$mask, $src\|$src, $mask}",
	[(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,			[(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
	VR128:$mask))]>, PD;			VR128:$mask))]>, PD;

	def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),			def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),			(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
	"insertq\t{$idx, $len, $src2, $src\|$src, $src2, $len, $idx}",			"insertq\t{$idx, $len, $src2, $src\|$src, $src2, $len, $idx}",
	[(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,			[(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
	VR128:$src2, imm:$len, imm:$idx))]>, XD;			imm:$len, imm:$idx))]>, XD;
	def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),			def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src, VR128:$mask),			(ins VR128:$src, VR128:$mask),
	"insertq\t{$mask, $src\|$src, $mask}",			"insertq\t{$mask, $src\|$src, $mask}",
	[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,			[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
	VR128:$mask))]>, XD;			VR128:$mask))]>, XD;
	}			}

	def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),			def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
	▲ Show 20 Lines • Show All 1,106 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h

Show All 13 Lines
#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H		#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H		#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H

namespace llvm {		namespace llvm {

// Tags used as the second argument of X86_INTRINSIC_DATA table entries. The right-hand column of this diff adds INTR_TYPE_4OP after INTR_TYPE_3OP; every other enumerator is the same in both columns.
enum IntrinsicType {		enum IntrinsicType {
INTR_NO_TYPE,		INTR_NO_TYPE,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,		GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,		INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,		CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,		INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,		INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
VPERM_3OP_MASKZ,		VPERM_3OP_MASKZ,
INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,		INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
EXPAND_FROM_MEM, BLEND		EXPAND_FROM_MEM, BLEND
};		};

▲ Show 20 Lines • Show All 1,043 Lines • ▼ Show 20 Lines	static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),		X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),		X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),		X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0),		X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),		X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),		X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),		X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0),
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),		X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
		X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
		X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),		X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),		X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),		X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),		X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),		X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),		X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),		X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),		X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vector-shuffle-sse4a.ll

				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a \| FileCheck %s --check-prefix=ALL --check-prefix=BTVER1
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a \| FileCheck %s --check-prefix=ALL --check-prefix=BTVER2

				;
				; EXTRQI
				;

				; v16i8: byte 0 of %a0 followed by three zero bytes (index 16 selects from zeroinitializer); the other twelve lanes are undef.
				; The +ssse3,+sse4a run expects a single extrq; the +avx,+sse4a run expects vpmovzxbq instead.
				define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
				; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
				; BTVER1: # BB#0:
				; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; BTVER1-NEXT: retq
				;
				; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu:
				; BTVER2: # BB#0:
				; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
				; BTVER2-NEXT: retq
				%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <16 x i8> %s
				}

				; v16i8: byte 0 of %a0 then seven zero bytes, byte 1 of %a0 then seven zero bytes (no undef lanes).
				; The +ssse3,+sse4a run expects two extrq results joined by punpcklqdq; the +avx,+sse4a run expects one vpmovzxbq.
				define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
				; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
				; BTVER1: # BB#0:
				; BTVER1-NEXT: movaps %xmm0, %xmm1
				; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
				; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
				; BTVER1-NEXT: retq
				;
				; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
				; BTVER2: # BB#0:
				; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
				; BTVER2-NEXT: retq
				%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
				ret <16 x i8> %s
				}

				; v16i8: bytes 0 and 1 of %a0 followed by two zero bytes; the other twelve lanes are undef.
				; The +ssse3,+sse4a run expects a single extrq; the +avx,+sse4a run expects vpmovzxwq instead.
				define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
				; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
				; BTVER1: # BB#0:
				; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; BTVER1-NEXT: retq
				;
				; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu:
				; BTVER2: # BB#0:
				; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
				; BTVER2-NEXT: retq
				%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <16 x i8> %s
				}

				; v16i8: bytes 0-1 of %a0 then six zero bytes, bytes 2-3 of %a0 then six zero bytes (no undef lanes).
				; The +ssse3,+sse4a run expects two extrq results joined by punpcklqdq; the +avx,+sse4a run expects one vpmovzxwq.
				define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
				; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
				; BTVER1: # BB#0:
				; BTVER1-NEXT: movaps %xmm0, %xmm1
				; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
				; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
				; BTVER1-NEXT: retq
				;
				; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
				; BTVER2: # BB#0:
				; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
				; BTVER2-NEXT: retq
				%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
				ret <16 x i8> %s
				}

				; v16i8: byte 1 of %a0 followed by three zero bytes; the other twelve lanes are undef.
				; Both RUN configurations expect the same single extrq.
				define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) {
				; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu:
				; ALL: # BB#0:
				; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <16 x i8> %s
				}

				; v8i16: word 1 of %a0 followed by three zero words (index 8 selects from zeroinitializer); the upper four lanes are undef.
				; Both RUN configurations expect a single extrq taking bytes 2-3.
				define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) {
				; ALL-LABEL: shuf_1zzzuuuu:
				; ALL: # BB#0:
				; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}

				; v8i16: words 1 and 2 of %a0 followed by two zero words; the upper four lanes are undef.
				; Both RUN configurations expect a single extrq taking bytes 2-5.
				define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
				; ALL-LABEL: shuf_12zzuuuu:
				; ALL: # BB#0:
				; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}

				; v8i16: words 0-2 of %a0 followed by one zero word; the upper four lanes are undef.
				; Both RUN configurations expect a single extrq taking bytes 0-5.
				define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
				; ALL-LABEL: shuf_012zuuuu:
				; ALL: # BB#0:
				; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}

				; v8i16: word 0 of %a0 then three zero words, word 1 of %a0 then three zero words (no undef lanes).
				; The +ssse3,+sse4a run expects two extrq results joined by punpcklqdq; the +avx,+sse4a run expects one vpmovzxwq.
				define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
				; BTVER1-LABEL: shuf_0zzz1zzz:
				; BTVER1: # BB#0:
				; BTVER1-NEXT: movaps %xmm0, %xmm1
				; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
				; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
				; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
				; BTVER1-NEXT: retq
				;
				; BTVER2-LABEL: shuf_0zzz1zzz:
				; BTVER2: # BB#0:
				; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
				; BTVER2-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8>
				ret <8 x i16> %s
				}

				; v4i32: dword 0 of %a0, zero, dword 1 of %a0, zero (index 4 selects from zeroinitializer).
				; Neither RUN configuration expects extrq here: the +ssse3,+sse4a run expects pxor + punpckldq, the +avx,+sse4a run expects vpmovzxdq.
				define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
				; BTVER1-LABEL: shuf_0z1z:
				; BTVER1: # BB#0:
				; BTVER1-NEXT: pxor %xmm1, %xmm1
				; BTVER1-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
				; BTVER1-NEXT: retq
				;
				; BTVER2-LABEL: shuf_0z1z:
				; BTVER2: # BB#0:
				; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
				; BTVER2-NEXT: retq
				%s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
				ret <4 x i32> %s
				}

				;
				; INSERTQI
				;

				; v16i8, two inputs: the mask <0,0,2,3,undef...> only reads %a0, duplicating its byte 0 into lane 1.
				; Both RUN configurations expect a single insertq whose decoded operands all come from xmm0.
				define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
				; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <16 x i8> %s
				}

				; v16i8, two inputs: byte 0 of %a1 (mask index 16) is placed in lane 1; lanes 0, 2 and 3 keep %a0's bytes, the rest are undef.
				; Both RUN configurations expect a single insertq.
				define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
				; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <16 x i8> %s
				}

				; v16i8, two inputs: byte 0 of %a1 (mask index 16) is placed in lane 0; lanes 1-3 keep %a0's bytes, the rest are undef.
				; Both RUN configurations expect a single insertq.
				define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
				; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <16 x i8> %s
				}

				; v8i16, two inputs: word 0 of %a1 (mask index 8) is placed in lane 1; lanes 0, 2 and 3 keep %a0's words, the upper four lanes are undef.
				; Both RUN configurations expect a single insertq.
				define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) {
				; ALL-LABEL: shuf_0823uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}

				; v8i16, two inputs: word 0 of %a1 (mask index 8) is placed in lane 2; lanes 0, 1 and 3 keep %a0's words, the upper four lanes are undef.
				; Both RUN configurations expect a single insertq.
				define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) {
				; ALL-LABEL: shuf_0183uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}

				; v8i16, two inputs: word 0 of %a1 (mask index 8) is placed in lane 3; lanes 0-2 keep %a0's words, the upper four lanes are undef.
				; Both RUN configurations expect a single insertq.
				define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) {
				; ALL-LABEL: shuf_0128uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}

				; v8i16, two inputs: words 0-1 of %a1 (mask indices 8, 9) are placed in lanes 1-2; lanes 0 and 3 keep %a0's words, the upper four lanes are undef.
				; Both RUN configurations expect a single insertq.
				define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) {
				; ALL-LABEL: shuf_0893uuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}

				; v8i16, two inputs: words 0-2 of %a1 (mask indices 8, 9, 10) are placed in lanes 1-3; lane 0 keeps %a0's word, the upper four lanes are undef.
				; Both RUN configurations expect a single insertq.
				define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) {
				; ALL-LABEL: shuf_089Auuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}

				; v8i16, two inputs: words 0-1 of %a1 (mask indices 8, 9) are placed in lanes 1-2; lane 0 keeps %a0's word and lanes 3-7 are undef.
				; Both RUN configurations expect a single insertq; the expected decode is the same as for shuf_0893uuuu, with the undef lane 3 shown as xmm0[6,7].
				define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
				; ALL-LABEL: shuf_089uuuuu:
				; ALL: # BB#0:
				; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
				; ALL-NEXT: retq
				%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
				ret <8 x i16> %s
				}