Diff 129323

lib/Target/X86/X86.td

	Show First 20 Lines • Show All 323 Lines • ▼ Show 20 Lines
	// Gather is available since Haswell (AVX2 set). So technically, we can			// Gather is available since Haswell (AVX2 set). So technically, we can
	// generate Gathers on all AVX2 processors. But the overhead on HSW is high.			// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
	// Skylake Client processor has faster Gathers than HSW and performance is			// Skylake Client processor has faster Gathers than HSW and performance is
	// similar to Skylake Server (AVX-512).			// similar to Skylake Server (AVX-512).
	def FeatureHasFastGather			def FeatureHasFastGather
	: SubtargetFeature<"fast-gather", "HasFastGather", "true",			: SubtargetFeature<"fast-gather", "HasFastGather", "true",
	"Indicates if gather is reasonably fast.">;			"Indicates if gather is reasonably fast.">;

				// Tuning flag: when set, the subtarget's Prefer256Bit member becomes true,
				// signalling a preference for 256-bit AVX instructions.
				def FeaturePrefer256Bit
				: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
				"Prefer 256-bit AVX instructions">;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// Register File Description			// Register File Description
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	include "X86RegisterInfo.td"			include "X86RegisterInfo.td"
	include "X86RegisterBanks.td"			include "X86RegisterBanks.td"

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	▲ Show 20 Lines • Show All 709 Lines • Show Last 20 Lines

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 14,270 Lines • ▼ Show 20 Lines	case MVT::v4i1:
ExtVT = MVT::v4i32;		ExtVT = MVT::v4i32;
break;		break;
case MVT::v8i1:		case MVT::v8i1:
// Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit		// Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
// shuffle.		// shuffle.
ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;		ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
break;		break;
case MVT::v16i1:		case MVT::v16i1:
ExtVT = MVT::v16i32;		// Take 512-bit type, unless we are avoiding 512-bit types and have the
		// 256-bit operation available.
		ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
break;		break;
case MVT::v32i1:		case MVT::v32i1:
ExtVT = MVT::v32i16;		// Take 512-bit type, unless we are avoiding 512-bit types and have the
		// 256-bit operation available.
		assert(Subtarget.hasBWI() && "Expected AVX512BW support");
		ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;		break;
case MVT::v64i1:		case MVT::v64i1:
ExtVT = MVT::v64i8;		ExtVT = MVT::v64i8;
break;		break;
}		}

V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);		V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);		V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
▲ Show 20 Lines • Show All 2,031 Lines • ▼ Show 20 Lines	static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
MVT InVT = In.getSimpleValueType();		MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");		assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
SDLoc DL(Op);		SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();		unsigned NumElts = VT.getVectorNumElements();

// Extend VT if the scalar type is i8/i16 and BWI is not supported.		// Extend VT if the scalar type is i8/i16 and BWI is not supported.
MVT ExtVT = VT;		MVT ExtVT = VT;
if (!Subtarget.hasBWI() &&		if (!Subtarget.hasBWI() &&
(VT.getVectorElementType().getSizeInBits() <= 16))		(VT.getVectorElementType().getSizeInBits() <= 16)) {
		// If v16i32 is to be avoided, we'll need to split and concatenate.
		if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
		SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i1, In,
		DAG.getIntPtrConstant(0, DL));
		SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i1, In,
		DAG.getIntPtrConstant(8, DL));
		Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Lo);
		Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Hi);
		SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i16, Lo, Hi);
		return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
		}

ExtVT = MVT::getVectorVT(MVT::i32, NumElts);		ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
		}

// Widen to 512-bits if VLX is not supported.		// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;		MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {		if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
NumElts *= 512 / ExtVT.getSizeInBits();		NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);		InVT = MVT::getVectorVT(MVT::i1, NumElts);
In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),		In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
In, DAG.getIntPtrConstant(0, DL));		In, DAG.getIntPtrConstant(0, DL));
▲ Show 20 Lines • Show All 157 Lines • ▼ Show 20 Lines	if (Subtarget.hasBWI()) {
}		}
return DAG.getNode(X86ISD::PCMPGTM, DL, VT, DAG.getConstant(0, DL, InVT),		return DAG.getNode(X86ISD::PCMPGTM, DL, VT, DAG.getConstant(0, DL, InVT),
In);		In);
}		}
// Use TESTD/Q, extended vector to packed dword/qword.		// Use TESTD/Q, extended vector to packed dword/qword.
assert((InVT.is256BitVector() \|\| InVT.is128BitVector()) &&		assert((InVT.is256BitVector() \|\| InVT.is128BitVector()) &&
"Unexpected vector type.");		"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();		unsigned NumElts = InVT.getVectorNumElements();
		// If we don't have VLX and we're trying to avoid 512-bit vectors we
		// need special handling. Without VLX we have no choice but to use 512-bit
		// vectors.
		echristoUnsubmitted Not Done Reply Inline Actions This comment could probably use some clarification: i.e. we're going to want to use blah blah blah. Also how is this going to work for a preferred 128 bit vector? echristo: This comment could probably use some clarification: i.e. we're going to want to use blah blah…
		craig.topperAuthorUnsubmitted Not Done Reply Inline Actions 128 bit vectors here would probably have to split several times using shuffles to move higher elements to the lower elements, then sign_extend_vector_inreg, testd, and concat the results. craig.topper: 128 bit vectors here would probably have to split several times using shuffles to move higher…
		if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
		// If we can't use 512-bit ops we'll need to split this to use
		// MVT::v8i32 and concat the result.
		if (InVT == MVT::v16i8) {
		// First we need to sign extend up to 256-bits so we can split that.
		InVT = MVT::v16i16;
		In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
		}
		SDValue Lo = extract128BitVector(In, 0, DAG, DL);
		SDValue Hi = extract128BitVector(In, 8, DAG, DL);
		// We're split now, just emit two truncates and a concat. The two
		// truncates will trigger legalization to come back to this function.
		Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
		Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
		return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
		}
MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);		MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);		MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);		In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;		InVT = ExtVT;
ShiftInx = InVT.getScalarSizeInBits() - 1;		ShiftInx = InVT.getScalarSizeInBits() - 1;
}		}

if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {		if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
Show All 15 Lines	assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");		"Invalid TRUNCATE operation");

if (VT.getVectorElementType() == MVT::i1)		if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);		return LowerTruncateVecI1(Op, DAG, Subtarget);

// vpmovqb/w/d, vpmovdb/w, vpmovwb		// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {		if (Subtarget.hasAVX512()) {
// word to byte only under BWI		// word to byte only under BWI
if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8		if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
		// Make sure we're allowed to promote 512-bits.
		if (Subtarget.canExtendTo512DQ())
return DAG.getNode(X86ISD::VTRUNC, DL, VT,		return DAG.getNode(X86ISD::VTRUNC, DL, VT,
getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));		getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In,
		DAG));
		} else {
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);		return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
		RKSimonUnsubmitted Not Done Reply Inline Actions Is this right? It isn't the default option for AVX512 anymore. RKSimon: Is this right? It isn't the default option for AVX512 anymore.
		craig.topperAuthorUnsubmitted Not Done Reply Inline Actions I need to rebase this to use ISD::TRUNCATE is that what you meant? craig.topper: I need to rebase this to use ISD::TRUNCATE is that what you meant?
}		}
		}

// Truncate with PACKSS if we are truncating a vector with sign-bits that		// Truncate with PACKSS if we are truncating a vector with sign-bits that
// extend all the way to the packed/truncated value.		// extend all the way to the packed/truncated value.
unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);		unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))		if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
if (SDValue V =		if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))		truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
return V;		return V;
▲ Show 20 Lines • Show All 1,948 Lines • ▼ Show 20 Lines	static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");		assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
MVT VTElt = VT.getVectorElementType();		MVT VTElt = VT.getVectorElementType();
SDLoc dl(Op);		SDLoc dl(Op);

unsigned NumElts = VT.getVectorNumElements();		unsigned NumElts = VT.getVectorNumElements();

// Extend VT if the scalar type is i8/i16 and BWI is not supported.		// Extend VT if the scalar type is i8/i16 and BWI is not supported.
MVT ExtVT = VT;		MVT ExtVT = VT;
if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)		if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
		// If v16i32 is to be avoided, we'll need to split and concatenate.
		// NOTE: Without VLX we'll end up being forced to use 512-bit anyway so
		// check that here too.
		if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
		SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
		DAG.getIntPtrConstant(0, dl));
		SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
		DAG.getIntPtrConstant(8, dl));
		Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, Lo);
		Hi = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, Hi);
		SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
		RKSimonUnsubmitted Not Done Reply Inline Actions We have enough of these that we probably need a helper function, we already have similar for unary/binary int-256 cases on AVX1. RKSimon: We have enough of these that we probably need a helper function, we already have similar for…
		return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
		}

ExtVT = MVT::getVectorVT(MVT::i32, NumElts);		ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
		}

// Widen to 512-bits if VLX is not supported.		// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;		MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {		if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
NumElts *= 512 / ExtVT.getSizeInBits();		NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);		InVT = MVT::getVectorVT(MVT::i1, NumElts);
In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),		In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
In, DAG.getIntPtrConstant(0, dl));		In, DAG.getIntPtrConstant(0, dl));
▲ Show 20 Lines • Show All 3,265 Lines • ▼ Show 20 Lines
}		}

/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.		/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
//		//
// i8/i16 vector implemented using dword LZCNT vector instruction		// i8/i16 vector implemented using dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,		// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform operation on its Lo and Hi parts and		// split the vector, perform operation on its Lo and Hi parts and
// concatenate the results.		// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {		static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
		const X86Subtarget &Subtarget) {
assert(Op.getOpcode() == ISD::CTLZ);		assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);		SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();		MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();		MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();		unsigned NumElems = VT.getVectorNumElements();

assert((EltVT == MVT::i8 \|\| EltVT == MVT::i16) &&		assert((EltVT == MVT::i8 \|\| EltVT == MVT::i16) &&
"Unsupported element type");		"Unsupported element type");

// Split vector, its Lo and Hi parts will be handled in next iteration.		// Split vector, its Lo and Hi parts will be handled in next iteration.
if (16 < NumElems)		if (NumElems > 16 \|\|
		(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
return LowerVectorIntUnary(Op, DAG);		return LowerVectorIntUnary(Op, DAG);

MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);		MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() \|\| NewVT.is512BitVector()) &&		assert((NewVT.is256BitVector() \|\| NewVT.is512BitVector()) &&
"Unsupported value type for operation");		"Unsupported value type for operation");

// Use native supported vector instruction vplzcntd.		// Use native supported vector instruction vplzcntd.
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));		Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines	static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
return Res;		return Res;
}		}

static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,		static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,		const X86Subtarget &Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();		MVT VT = Op.getSimpleValueType();

if (Subtarget.hasCDI())		if (Subtarget.hasCDI() &&
return LowerVectorCTLZ_AVX512CDI(Op, DAG);		// vXi8 vectors need to be promoted to 512-bits for vXi32.
		(Subtarget.canExtendTo512DQ() \|\| VT.getVectorElementType() != MVT::i8))
		return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);

// Decompose 256-bit ops into smaller 128-bit ops.		// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())		if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);		return Lower256IntUnary(Op, DAG);

// Decompose 512-bit ops into smaller 256-bit ops.		// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())		if (VT.is512BitVector() && !Subtarget.hasBWI())
return Lower512IntUnary(Op, DAG);		return Lower512IntUnary(Op, DAG);
▲ Show 20 Lines • Show All 399 Lines • ▼ Show 20 Lines	static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,

// AVX2 implementations - extend xmm subvectors to ymm.		// AVX2 implementations - extend xmm subvectors to ymm.
if (Subtarget.hasInt256()) {		if (Subtarget.hasInt256()) {
unsigned NumElems = VT.getVectorNumElements();		unsigned NumElems = VT.getVectorNumElements();
SDValue Lo = DAG.getIntPtrConstant(0, dl);		SDValue Lo = DAG.getIntPtrConstant(0, dl);
SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);		SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);

if (VT == MVT::v32i8) {		if (VT == MVT::v32i8) {
if (Subtarget.hasBWI()) {		if (Subtarget.canExtendTo512BW()) {
SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);		SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);		SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);		SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,		Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
DAG.getConstant(8, dl, MVT::v32i16));		DAG.getConstant(8, dl, MVT::v32i16));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);		return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}		}
SDValue ALo = extract128BitVector(A, 0, DAG, dl);		SDValue ALo = extract128BitVector(A, 0, DAG, dl);
▲ Show 20 Lines • Show All 772 Lines • ▼ Show 20 Lines	if (VT == MVT::v4i32) {
SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});		SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});		SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});		return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}		}

// It's worth extending once and using the vXi16/vXi32 shifts for smaller		// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32		// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.		// make the existing SSE solution better.
		// NOTE: We honor preferred vector width before promoting to 512-bits.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) \|\|		if ((Subtarget.hasInt256() && VT == MVT::v8i16) \|\|
(Subtarget.hasAVX512() && VT == MVT::v16i16) \|\|		(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) \|\|
(Subtarget.hasAVX512() && VT == MVT::v16i8) \|\|		(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) \|\|
(Subtarget.hasBWI() && VT == MVT::v32i8)) {		(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) \|\|
		(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
assert((!Subtarget.hasBWI() \|\| VT == MVT::v32i8 \|\| VT == MVT::v16i8) &&		assert((!Subtarget.hasBWI() \|\| VT == MVT::v32i8 \|\| VT == MVT::v16i8) &&
"Unexpected vector type");		"Unexpected vector type");
MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;		MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());		MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc =		unsigned ExtOpc =
Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;		Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
R = DAG.getNode(ExtOpc, dl, ExtVT, R);		R = DAG.getNode(ExtOpc, dl, ExtVT, R);
Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);		Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
▲ Show 20 Lines • Show All 808 Lines • ▼ Show 20 Lines	static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SDLoc DL(Op.getNode());		SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);		SDValue Op0 = Op.getOperand(0);

// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.		// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
if (Subtarget.hasVPOPCNTDQ()) {		if (Subtarget.hasVPOPCNTDQ()) {
unsigned NumElems = VT.getVectorNumElements();		unsigned NumElems = VT.getVectorNumElements();
assert((VT.getVectorElementType() == MVT::i8 \|\|		assert((VT.getVectorElementType() == MVT::i8 \|\|
VT.getVectorElementType() == MVT::i16) && "Unexpected type");		VT.getVectorElementType() == MVT::i16) && "Unexpected type");
if (NumElems <= 16) {		if (NumElems < 16 \|\| (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);		MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);		Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);		Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);		return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
}		}
}		}

if (!Subtarget.hasSSSE3()) {		if (!Subtarget.hasSSSE3()) {
▲ Show 20 Lines • Show All 14,730 Lines • Show Last 20 Lines

lib/Target/X86/X86Subtarget.h

Show First 20 Lines • Show All 346 Lines • ▼ Show 20 Lines	protected:
/// The minimum alignment known to hold of the stack frame on		/// The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.		/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;		unsigned stackAlignment;

/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.		/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///		///
unsigned MaxInlineSizeThreshold;		unsigned MaxInlineSizeThreshold;

		/// Indicates target prefers 256 bit instructions.
		bool Prefer256Bit;

/// What processor and OS we're targeting.		/// What processor and OS we're targeting.
Triple TargetTriple;		Triple TargetTriple;

/// Instruction itineraries for scheduling		/// Instruction itineraries for scheduling
InstrItineraryData InstrItins;		InstrItineraryData InstrItins;

/// GlobalISel related APIs.		/// GlobalISel related APIs.
std::unique_ptr<CallLowering> CallLoweringInfo;		std::unique_ptr<CallLowering> CallLoweringInfo;
std::unique_ptr<LegalizerInfo> Legalizer;		std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;		std::unique_ptr<RegisterBankInfo> RegBankInfo;
std::unique_ptr<InstructionSelector> InstSelector;		std::unique_ptr<InstructionSelector> InstSelector;

private:		private:
/// Override the stack alignment.		/// Override the stack alignment.
unsigned StackAlignOverride;		unsigned StackAlignOverride;

		/// Preferred vector width from function attribute.
		unsigned PreferVectorWidthOverride;

		/// Resolved preferred vector width from function attribute and subtarget
		/// features.
		unsigned PreferVectorWidth;

/// True if compiling for 64-bit, false for 16-bit or 32-bit.		/// True if compiling for 64-bit, false for 16-bit or 32-bit.
bool In64BitMode;		bool In64BitMode;

/// True if compiling for 32-bit, false for 16-bit or 64-bit.		/// True if compiling for 32-bit, false for 16-bit or 64-bit.
bool In32BitMode;		bool In32BitMode;

/// True if compiling for 16-bit, false for 32-bit or 64-bit.		/// True if compiling for 16-bit, false for 32-bit or 64-bit.
bool In16BitMode;		bool In16BitMode;
Show All 9 Lines	private:
X86TargetLowering TLInfo;		X86TargetLowering TLInfo;
X86FrameLowering FrameLowering;		X86FrameLowering FrameLowering;

public:		public:
/// This constructor initializes the data members to match that		/// This constructor initializes the data members to match that
/// of the specified triple.		/// of the specified triple.
///		///
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,		X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM, unsigned StackAlignOverride);		const X86TargetMachine &TM, unsigned StackAlignOverride,
		unsigned PreferVectorWidthOverride);

const X86TargetLowering *getTargetLowering() const override {		const X86TargetLowering *getTargetLowering() const override {
return &TLInfo;		return &TLInfo;
}		}

const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }		const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }

const X86FrameLowering *getFrameLowering() const override {		const X86FrameLowering *getFrameLowering() const override {
▲ Show 20 Lines • Show All 165 Lines • ▼ Show 20 Lines	public:
bool hasVNNI() const { return HasVNNI; }		bool hasVNNI() const { return HasVNNI; }
bool hasBITALG() const { return HasBITALG; }		bool hasBITALG() const { return HasBITALG; }
bool hasMPX() const { return HasMPX; }		bool hasMPX() const { return HasMPX; }
bool hasSHSTK() const { return HasSHSTK; }		bool hasSHSTK() const { return HasSHSTK; }
bool hasIBT() const { return HasIBT; }		bool hasIBT() const { return HasIBT; }
bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }		bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
bool hasCLWB() const { return HasCLWB; }		bool hasCLWB() const { return HasCLWB; }

		/// Returns the stored PreferVectorWidth value.
		unsigned getPreferVectorWidth() const { return PreferVectorWidth; }

		// Helper functions to determine when we should allow widening to 512-bit
		// during codegen.
		// TODO: Currently we're always allowing widening on CPUs without VLX,
		// because for many cases we don't have a better option.
		/// Returns true when AVX-512 is available and either VLX is absent or the
		/// preferred vector width is at least 512.
		bool canExtendTo512DQ() const {
		return hasAVX512() && (!hasVLX() \|\| getPreferVectorWidth() >= 512);
		}
		/// Returns true when hasBWI() holds and canExtendTo512DQ() also allows
		/// widening to 512 bits.
		bool canExtendTo512BW() const {
		return hasBWI() && canExtendTo512DQ();
		}

bool isXRaySupported() const override { return is64Bit(); }		bool isXRaySupported() const override { return is64Bit(); }

X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }		X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }

/// TODO: to be removed later and replaced with suitable properties		/// TODO: to be removed later and replaced with suitable properties
bool isAtom() const { return X86ProcFamily == IntelAtom; }		bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }		bool isSLM() const { return X86ProcFamily == IntelSLM; }
bool useSoftFloat() const { return UseSoftFloat; }		bool useSoftFloat() const { return UseSoftFloat; }
▲ Show 20 Lines • Show All 136 Lines • Show Last 20 Lines

lib/Target/X86/X86Subtarget.cpp

Show First 20 Lines • Show All 248 Lines • ▼ Show 20 Lines	void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// parameter is used for cost estimation of Gather Op and comparison with		// parameter is used for cost estimation of Gather Op and comparison with
// other alternatives.		// other alternatives.
// TODO: Remove the explicit hasAVX512()?, That would mean we would only		// TODO: Remove the explicit hasAVX512()?, That would mean we would only
// enable gather with a -march.		// enable gather with a -march.
if (hasAVX512() \|\| (hasAVX2() && hasFastGather()))		if (hasAVX512() \|\| (hasAVX2() && hasFastGather()))
GatherOverhead = 2;		GatherOverhead = 2;
if (hasAVX512())		if (hasAVX512())
ScatterOverhead = 2;		ScatterOverhead = 2;

		// Consume the vector width attribute or apply any target specific limit.
		if (PreferVectorWidthOverride)
		PreferVectorWidth = PreferVectorWidthOverride;
		RKSimonUnsubmitted Not Done Reply Inline Actions Do we need to assert for a sane value here (else in X86TargetMachine.cpp)? RKSimon: Do we need to assert for a sane value here (else in X86TargetMachine.cpp)?
		craig.topperAuthorUnsubmitted Not Done Reply Inline Actions What do you consider a sane value? There's no bounds checking on the attribute coming in. craig.topper: What do you consider a sane value? There's no bounds checking on the attribute coming in.
		RKSimonUnsubmitted Not Done Reply Inline Actions assert(IsPowerOf2(PreferVectorWidthOverride) && PreferVectorWidthOverride >= 128 )? RKSimon: assert(IsPowerOf2(PreferVectorWidthOverride) && PreferVectorWidthOverride >= 128 )?
		craig.topperAuthorUnsubmitted Not Done Reply Inline Actions It comes from an unchecked command line argument passed to clang that was intended to be target independent. There's no guarantee of any particular value or range. craig.topper: It comes from an unchecked command line argument passed to clang that was intended to be target…
		else if (Prefer256Bit)
		PreferVectorWidth = 256;
}		}

void X86Subtarget::initializeEnvironment() {		void X86Subtarget::initializeEnvironment() {
X86SSELevel = NoSSE;		X86SSELevel = NoSSE;
X863DNowLevel = NoThreeDNow;		X863DNowLevel = NoThreeDNow;
HasX87 = false;		HasX87 = false;
HasCMov = false;		HasCMov = false;
HasX86_64 = false;		HasX86_64 = false;
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	void X86Subtarget::initializeEnvironment() {
SlowIncDec = false;		SlowIncDec = false;
stackAlignment = 4;		stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?		// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;		MaxInlineSizeThreshold = 128;
UseSoftFloat = false;		UseSoftFloat = false;
X86ProcFamily = Others;		X86ProcFamily = Others;
GatherOverhead = 1024;		GatherOverhead = 1024;
ScatterOverhead = 1024;		ScatterOverhead = 1024;
		PreferVectorWidth = UINT32_MAX;
}		}

X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,		X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {		StringRef FS) {
initializeEnvironment();		initializeEnvironment();
initSubtargetFeatures(CPU, FS);		initSubtargetFeatures(CPU, FS);
return *this;		return *this;
}		}

X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,		X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,		const X86TargetMachine &TM,
unsigned StackAlignOverride)		unsigned StackAlignOverride,
		unsigned PreferVectorWidthOverride)
: X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),		: X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),		PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),		StackAlignOverride(StackAlignOverride),
		PreferVectorWidthOverride(PreferVectorWidthOverride),
In64BitMode(TargetTriple.getArch() == Triple::x86_64),		In64BitMode(TargetTriple.getArch() == Triple::x86_64),
In32BitMode(TargetTriple.getArch() == Triple::x86 &&		In32BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() != Triple::CODE16),		TargetTriple.getEnvironment() != Triple::CODE16),
In16BitMode(TargetTriple.getArch() == Triple::x86 &&		In16BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() == Triple::CODE16),		TargetTriple.getEnvironment() == Triple::CODE16),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),		InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
FrameLowering(*this, getStackAlignment()) {		FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.		// Determine the PICStyle based on the target selected.
Show All 38 Lines

lib/Target/X86/X86TargetMachine.cpp

Show First 20 Lines • Show All 251 Lines • ▼ Show 20 Lines	bool SoftFloat =
F.getFnAttribute("use-soft-float").getValueAsString() == "true";		F.getFnAttribute("use-soft-float").getValueAsString() == "true";
// If the soft float attribute is set on the function turn on the soft float		// If the soft float attribute is set on the function turn on the soft float
// subtarget feature.		// subtarget feature.
if (SoftFloat)		if (SoftFloat)
Key += FS.empty() ? "+soft-float" : ",+soft-float";		Key += FS.empty() ? "+soft-float" : ",+soft-float";

FS = Key.substr(CPU.size());		FS = Key.substr(CPU.size());

		// Translate vector width function attribute into subtarget features. This
		// overrides any CPU specific tuning parameter.
		unsigned PreferVectorWidthOverride = 0;
		if (F.hasFnAttribute("prefer-vector-width")) {
		[Inline comment] echristo: Bikeshed: "preferred-vector-width"?
		[Inline comment] craig.topper (author): It got named this way because gcc implemented their command line option as -mprefer-vector-width=, so I matched that in clang and kept the attribute name matching the command line option.
		StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString();
		unsigned Width;
		if (!Val.getAsInteger(0, Width)) {
		if (Key.size() > CPU.size())
		Key += ",";
		Key += "prefer-vector-width=";
		Key += Val;
		PreferVectorWidthOverride = Width;
		}
		}

auto &I = SubtargetMap[Key];		auto &I = SubtargetMap[Key];
if (!I) {		if (!I) {
// This needs to be done before we create a new subtarget since any		// This needs to be done before we create a new subtarget since any
// creation will depend on the TM and the code generation flags on the		// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.		// function that reside in TargetOptions.
resetTargetOptions(F);		resetTargetOptions(F);
I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,		I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
Options.StackAlignmentOverride);		Options.StackAlignmentOverride,
		PreferVectorWidthOverride);
}		}
return I.get();		return I.get();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Command line options for x86		// Command line options for x86
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
static cl::opt<bool>		static cl::opt<bool>
▲ Show 20 Lines • Show All 165 Lines • Show Last 20 Lines

lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 124 Lines • ▼ Show 20 Lines	if (ST->is64Bit()) {
if (Vector && ST->hasAVX512())		if (Vector && ST->hasAVX512())
return 32;		return 32;
return 16;		return 16;
}		}
return 8;		return 8;
}		}

unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {		unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
		unsigned PreferVectorWidth = ST->getPreferVectorWidth();
if (Vector) {		if (Vector) {
if (ST->hasAVX512())		if (ST->hasAVX512())
return 512;		return std::min(512U, PreferVectorWidth);
if (ST->hasAVX())		if (ST->hasAVX())
return 256;		return std::min(256U, PreferVectorWidth);
if (ST->hasSSE1())		if (ST->hasSSE1())
return 128;		return std::min(128U, PreferVectorWidth);
return 0;		return 0;
}		}

if (ST->is64Bit())		if (ST->is64Bit())
return 64;		return 64;

return 32;		return 32;
}		}
▲ Show 20 Lines • Show All 2,370 Lines • ▼ Show 20 Lines	bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
Type *ScalarTy = DataTy->getScalarType();		Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?		int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();		DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

// Some CPUs have better gather performance than others.		// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only		// TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
// enable gather with a -march.		// enable gather with a -march.
return (DataWidth == 32 \|\| DataWidth == 64) &&		return (DataWidth == 32 \|\| DataWidth == 64) &&
(ST->hasAVX512() \|\| (ST->hasFastGather() && ST->hasAVX2()));		(ST->hasAVX512() \|\| (ST->hasFastGather() && ST->hasAVX2()));
}		}

bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {		bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
// AVX2 doesn't support scatter		// AVX2 doesn't support scatter
if (!ST->hasAVX512())		if (!ST->hasAVX512())
return false;		return false;
return isLegalMaskedGather(DataType);		return isLegalMaskedGather(DataType);
}		}
▲ Show 20 Lines • Show All 327 Lines • Show Last 20 Lines

test/CodeGen/X86/prefer-avx256-lzcnt.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512cd,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX256
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512cd,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F

				define <8 x i16> @testv8i16(<8 x i16> %in) {
				; AVX256-LABEL: testv8i16:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
				; AVX256-NEXT: vplzcntd %ymm0, %ymm0
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
				; AVX256-NEXT: vzeroupper
				; AVX256-NEXT: retq
				;
				; AVX512VL-LABEL: testv8i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
				; AVX512VL-NEXT: vplzcntd %ymm0, %ymm0
				; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
				; AVX512VL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512F-LABEL: testv8i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
				; AVX512F-NEXT: vplzcntd %zmm0, %zmm0
				; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512F-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
				; AVX512F-NEXT: vzeroupper
				; AVX512F-NEXT: retq
				%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 false)
				ret <8 x i16> %out
				}

				define <16 x i8> @testv16i8(<16 x i8> %in) {
				; AVX256-LABEL: testv16i8:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
				; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2
				; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
				; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2
				; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0
				; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm0
				; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX256-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
				; AVX256-NEXT: vpand %xmm1, %xmm2, %xmm1
				; AVX256-NEXT: vpshufb %xmm0, %xmm3, %xmm0
				; AVX256-NEXT: vpaddb %xmm0, %xmm1, %xmm0
				; AVX256-NEXT: retq
				;
				; AVX512-LABEL: testv16i8:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512-NEXT: vplzcntd %zmm0, %zmm0
				; AVX512-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
				; AVX512-NEXT: vzeroupper
				; AVX512-NEXT: retq
				%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 false)
				ret <16 x i8> %out
				}

				define <16 x i16> @testv16i16(<16 x i16> %in) {
				; AVX256-LABEL: testv16i16:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
				; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
				; AVX256-NEXT: vplzcntd %ymm1, %ymm1
				; AVX256-NEXT: vpmovdw %ymm1, %xmm1
				; AVX256-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
				; AVX256-NEXT: vpsubw %xmm2, %xmm1, %xmm1
				; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
				; AVX256-NEXT: vplzcntd %ymm0, %ymm0
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vpsubw %xmm2, %xmm0, %xmm0
				; AVX256-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
				; AVX256-NEXT: retq
				;
				; AVX512-LABEL: testv16i16:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
				; AVX512-NEXT: vplzcntd %zmm0, %zmm0
				; AVX512-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
				; AVX512-NEXT: retq
				%out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 false)
				ret <16 x i16> %out
				}

				define <32 x i8> @testv32i8(<32 x i8> %in) {
				; AVX256-LABEL: testv32i8:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
				; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2
				; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
				; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2
				; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0
				; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
				; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
				; AVX256-NEXT: vpand %ymm1, %ymm2, %ymm1
				; AVX256-NEXT: vpshufb %ymm0, %ymm3, %ymm0
				; AVX256-NEXT: vpaddb %ymm0, %ymm1, %ymm0
				; AVX256-NEXT: retq
				;
				; AVX512-LABEL: testv32i8:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
				; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
				; AVX512-NEXT: vplzcntd %zmm1, %zmm1
				; AVX512-NEXT: vpmovdb %zmm1, %xmm1
				; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
				; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1
				; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512-NEXT: vplzcntd %zmm0, %zmm0
				; AVX512-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0
				; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
				; AVX512-NEXT: retq
				%out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 false)
				ret <32 x i8> %out
				}

				declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
				declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1)
				declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1)
				declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1)

test/CodeGen/X86/prefer-avx256-mask-extend.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX256
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F

				define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
				; AVX256-LABEL: testv8i1_sext_v8i16:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
				; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vzeroupper
				; AVX256-NEXT: retq
				;
				; AVX512VL-LABEL: testv8i1_sext_v8i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
				; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
				; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512F-LABEL: testv8i1_sext_v8i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512F-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0
				; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
				; AVX512F-NEXT: vzeroupper
				; AVX512F-NEXT: retq
				%in = load <8 x i32>, <8 x i32>* %p
				%cmp = icmp eq <8 x i32> %in, zeroinitializer
				%ext = sext <8 x i1> %cmp to <8 x i16>
				ret <8 x i16> %ext
				}

				define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
				; AVX256-LABEL: testv16i1_sext_v16i8:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
				; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
				; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
				; AVX256-NEXT: vpmovdw %ymm1, %xmm1
				; AVX256-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
				; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
				; AVX256-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
				; AVX256-NEXT: vzeroupper
				; AVX256-NEXT: retq
				;
				; AVX512VL-LABEL: testv16i1_sext_v16i8:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
				; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
				; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
				; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
				; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512F-LABEL: testv16i1_sext_v16i8:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
				; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
				; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
				; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
				; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
				; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
				; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512F-NEXT: vzeroupper
				; AVX512F-NEXT: retq
				%in = load <8 x i32>, <8 x i32>* %p
				%cmp = icmp eq <8 x i32> %in, zeroinitializer
				%in2 = load <8 x i32>, <8 x i32>* %q
				%cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
				%concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
				%ext = sext <16 x i1> %concat to <16 x i8>
				ret <16 x i8> %ext
				}

				define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
				; AVX256-LABEL: testv16i1_sext_v16i16:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
				; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
				; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
				; AVX256-NEXT: vpmovdw %ymm1, %xmm1
				; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
				; AVX256-NEXT: retq
				;
				; AVX512VL-LABEL: testv16i1_sext_v16i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
				; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
				; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
				; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
				; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512VL-NEXT: retq
				;
				; AVX512F-LABEL: testv16i1_sext_v16i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
				; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
				; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
				; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
				; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
				; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
				; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512F-NEXT: retq
				%in = load <8 x i32>, <8 x i32>* %p
				%cmp = icmp eq <8 x i32> %in, zeroinitializer
				%in2 = load <8 x i32>, <8 x i32>* %q
				%cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
				%concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
				%ext = sext <16 x i1> %concat to <16 x i16>
				ret <16 x i16> %ext
				}

				define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
				; AVX256-LABEL: testv8i1_zext_v8i16:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX256-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vzeroupper
				; AVX256-NEXT: retq
				;
				; AVX512VL-LABEL: testv8i1_zext_v8i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
				; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512F-LABEL: testv8i1_zext_v8i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512F-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0
				; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
				; AVX512F-NEXT: vzeroupper
				; AVX512F-NEXT: retq
				%in = load <8 x i32>, <8 x i32>* %p
				%cmp = icmp eq <8 x i32> %in, zeroinitializer
				%ext = zext <8 x i1> %cmp to <8 x i16>
				ret <8 x i16> %ext
				}

				define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
				; AVX256-LABEL: testv16i1_zext_v16i8:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
				; AVX256-NEXT: movl {{.*}}(%rip), %eax
				; AVX256-NEXT: vpbroadcastd %eax, %ymm0 {%k2} {z}
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
				; AVX256-NEXT: vpshufb %xmm1, %xmm0, %xmm0
				; AVX256-NEXT: vpbroadcastd %eax, %ymm2 {%k1} {z}
				; AVX256-NEXT: vpmovdw %ymm2, %xmm2
				; AVX256-NEXT: vpshufb %xmm1, %xmm2, %xmm1
				; AVX256-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
				; AVX256-NEXT: vzeroupper
				; AVX256-NEXT: retq
				;
				; AVX512VL-LABEL: testv16i1_zext_v16i8:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
				; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
				; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
				; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
				; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512F-LABEL: testv16i1_zext_v16i8:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
				; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
				; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
				; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
				; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
				; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
				; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512F-NEXT: vzeroupper
				; AVX512F-NEXT: retq
				%in = load <8 x i32>, <8 x i32>* %p
				%cmp = icmp eq <8 x i32> %in, zeroinitializer
				%in2 = load <8 x i32>, <8 x i32>* %q
				%cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
				%concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
				%ext = zext <16 x i1> %concat to <16 x i8>
				ret <16 x i8> %ext
				}

				define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
				; AVX256-LABEL: testv16i1_zext_v16i16:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
				; AVX256-NEXT: movl {{.*}}(%rip), %eax
				; AVX256-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vpbroadcastd %eax, %ymm1 {%k2} {z}
				; AVX256-NEXT: vpmovdw %ymm1, %xmm1
				; AVX256-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
				; AVX256-NEXT: retq
				;
				; AVX512VL-LABEL: testv16i1_zext_v16i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
				; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
				; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
				; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
				; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512VL-NEXT: retq
				;
				; AVX512F-LABEL: testv16i1_zext_v16i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
				; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
				; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
				; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
				; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
				; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
				; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512F-NEXT: retq
				%in = load <8 x i32>, <8 x i32>* %p
				%cmp = icmp eq <8 x i32> %in, zeroinitializer
				%in2 = load <8 x i32>, <8 x i32>* %q
				%cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
				%concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
				%ext = zext <16 x i1> %concat to <16 x i16>
				ret <16 x i16> %ext
				}

test/CodeGen/X86/prefer-avx256-mask-shuffle.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX256 --check-prefix=AVX256VL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512NOBW --check-prefix=AVX512VL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX256 --check-prefix=AVX256VLBW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VLBW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512NOBW --check-prefix=AVX512F
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512NOBW --check-prefix=AVX512F
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW

				define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 x i32>* %b) {
				; AVX256VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX256VL: # %bb.0:
				; AVX256VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX256VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX256VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
				; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
				; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
				; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
				; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z}
				; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
				; AVX256VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
				; AVX256VL-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1]
				; AVX256VL-NEXT: vpmovsxwd %xmm3, %ymm3
				; AVX256VL-NEXT: vpslld $31, %ymm3, %ymm3
				; AVX256VL-NEXT: vptestmd %ymm3, %ymm3, %k1
				; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
				; AVX256VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,12,13,2,3,14,15,6,7,6,7,14,15,0,1]
				; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7]
				; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
				; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
				; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0
				; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0
				; AVX256VL-NEXT: kshiftrw $8, %k0, %k2
				; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
				; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
				; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
				; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
				; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
				; AVX256VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
				; AVX256VL-NEXT: vzeroupper
				; AVX256VL-NEXT: retq
				;
				; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
				; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
				; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
				; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
				; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
				; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
				; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
				; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX256VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX256VLBW: # %bb.0:
				; AVX256VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX256VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
				; AVX256VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
				; AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0
				; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1
				; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
				; AVX256VLBW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
				; AVX256VLBW-NEXT: vpmovw2m %ymm2, %k0
				; AVX256VLBW-NEXT: vpmovm2b %k0, %xmm0
				; AVX256VLBW-NEXT: vzeroupper
				; AVX256VLBW-NEXT: retq
				;
				; AVX512VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX512VLBW: # %bb.0:
				; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
				; AVX512VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
				; AVX512VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
				; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
				; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
				; AVX512VLBW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
				; AVX512VLBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
				; AVX512VLBW-NEXT: vptestmd %zmm2, %zmm2, %k0
				; AVX512VLBW-NEXT: vpmovm2b %k0, %xmm0
				; AVX512VLBW-NEXT: vzeroupper
				; AVX512VLBW-NEXT: retq
				;
				; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
				; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
				; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
				; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
				; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm1, %k2
				; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
				; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
				; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
				; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
				; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
				; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
				; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512F-NEXT: vzeroupper
				; AVX512F-NEXT: retq
				;
				; AVX512BW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
				; AVX512BW-NEXT: vmovdqa (%rsi), %ymm1
				; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
				; AVX512BW-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
				; AVX512BW-NEXT: vpcmpeqd %zmm2, %zmm1, %k2
				; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
				; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
				; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
				; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
				; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k0
				; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
				; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
				; AVX512BW-NEXT: vzeroupper
				; AVX512BW-NEXT: retq

				%a1 = load <8 x i32>, <8 x i32>* %a
				%b1 = load <8 x i32>, <8 x i32>* %b
				%a2 = icmp eq <8 x i32> %a1, zeroinitializer
				%b2 = icmp eq <8 x i32> %b1, zeroinitializer
				%c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 10, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 11, i32 7, i32 0>
				ret <16 x i1> %c
				}

				; Shuffle of a <32 x i1> compare mask: %a is compared against zero and the
				; resulting mask is permuted with the indices spelled out in the function
				; name (the second shuffle operand is undef). Each CHECK group below records
				; the expected lowering for one FileCheck prefix.
				define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i8> %a) {
				; AVX256VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX256VL: # %bb.0:
				; AVX256VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX256VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
				; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
				; AVX256VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
				; AVX256VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
				; AVX256VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
				; AVX256VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
				; AVX256VL-NEXT: retq
				;
				; AVX512NOBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX512NOBW: # %bb.0:
				; AVX512NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX512NOBW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
				; AVX512NOBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
				; AVX512NOBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
				; AVX512NOBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
				; AVX512NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
				; AVX512NOBW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
				; AVX512NOBW-NEXT: retq
				;
				; AVX256VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX256VLBW: # %bb.0:
				; AVX256VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX256VLBW-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
				; AVX256VLBW-NEXT: vpmovm2b %k0, %ymm0
				; AVX256VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
				; AVX256VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
				; AVX256VLBW-NEXT: movl $-537190396, %eax # imm = 0xDFFB2004
				; AVX256VLBW-NEXT: kmovd %eax, %k1
				; AVX256VLBW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
				; AVX256VLBW-NEXT: vpmovb2m %ymm0, %k0
				; AVX256VLBW-NEXT: vpmovm2b %k0, %ymm0
				; AVX256VLBW-NEXT: retq
				;
				; AVX512VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX512VLBW: # %bb.0:
				; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX512VLBW-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
				; AVX512VLBW-NEXT: vpmovm2w %k0, %zmm0
				; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
				; AVX512VLBW-NEXT: vpermw %zmm0, %zmm1, %zmm0
				; AVX512VLBW-NEXT: vpmovw2m %zmm0, %k0
				; AVX512VLBW-NEXT: vpmovm2b %k0, %ymm0
				; AVX512VLBW-NEXT: retq
				;
				; AVX512BW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
				; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
				; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
				; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
				; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
				; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
				; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
				; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
				; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
				; AVX512BW-NEXT: retq
				%cmp = icmp eq <32 x i8> %a, zeroinitializer
				%b = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
				ret <32 x i1> %b
				}

test/CodeGen/X86/prefer-avx256-popcnt.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vpopcntdq,+prefer-256-bit | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vpopcntdq,-prefer-256-bit | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vpopcntdq,+prefer-256-bit | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vpopcntdq,-prefer-256-bit | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F

				; Population count (llvm.ctpop) of <8 x i16>. The AVX256 and AVX512VL
				; prefixes expect the input zero-extended into a ymm register for vpopcntd;
				; the AVX512F prefix expects vpopcntd and the truncate to run on zmm.
				define <8 x i16> @testv8i16(<8 x i16> %in) {
				; AVX256-LABEL: testv8i16:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
				; AVX256-NEXT: vpopcntd %ymm0, %ymm0
				; AVX256-NEXT: vpmovdw %ymm0, %xmm0
				; AVX256-NEXT: vzeroupper
				; AVX256-NEXT: retq
				;
				; AVX512VL-LABEL: testv8i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
				; AVX512VL-NEXT: vpopcntd %ymm0, %ymm0
				; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512F-LABEL: testv8i16:
				; AVX512F: # %bb.0:
				; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
				; AVX512F-NEXT: vpopcntd %zmm0, %zmm0
				; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
				; AVX512F-NEXT: vzeroupper
				; AVX512F-NEXT: retq
				%out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
				ret <8 x i16> %out
				}

				; Population count (llvm.ctpop) of <16 x i8>. The AVX256 prefix expects a
				; 128-bit vpshufb table-lookup sequence (mask with 15, look up each 4-bit
				; half, add); the AVX512 prefix expects zero-extension to zmm, vpopcntd and
				; a vpmovdb truncate.
				define <16 x i8> @testv16i8(<16 x i8> %in) {
				; AVX256-LABEL: testv16i8:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
				; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2
				; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
				; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2
				; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0
				; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm0
				; AVX256-NEXT: vpshufb %xmm0, %xmm3, %xmm0
				; AVX256-NEXT: vpaddb %xmm2, %xmm0, %xmm0
				; AVX256-NEXT: retq
				;
				; AVX512-LABEL: testv16i8:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512-NEXT: vpopcntd %zmm0, %zmm0
				; AVX512-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512-NEXT: vzeroupper
				; AVX512-NEXT: retq
				%out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
				ret <16 x i8> %out
				}

				; Population count (llvm.ctpop) of <16 x i16>. The AVX256 prefix expects a
				; 256-bit vpshufb table-lookup sequence followed by a shift/add/shift that
				; folds the two byte counts of each word together; the AVX512 prefix expects
				; zero-extension to zmm, vpopcntd and a vpmovdw truncate.
				define <16 x i16> @testv16i16(<16 x i16> %in) {
				; AVX256-LABEL: testv16i16:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
				; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2
				; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
				; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2
				; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0
				; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
				; AVX256-NEXT: vpshufb %ymm0, %ymm3, %ymm0
				; AVX256-NEXT: vpaddb %ymm2, %ymm0, %ymm0
				; AVX256-NEXT: vpsllw $8, %ymm0, %ymm1
				; AVX256-NEXT: vpaddb %ymm0, %ymm1, %ymm0
				; AVX256-NEXT: vpsrlw $8, %ymm0, %ymm0
				; AVX256-NEXT: retq
				;
				; AVX512-LABEL: testv16i16:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
				; AVX512-NEXT: vpopcntd %zmm0, %zmm0
				; AVX512-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512-NEXT: retq
				%out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
				ret <16 x i16> %out
				}

				; Population count (llvm.ctpop) of <32 x i8>. Only the common CHECK prefix
				; is used, so every RUN configuration is expected to produce the same
				; 256-bit vpshufb table-lookup sequence.
				define <32 x i8> @testv32i8(<32 x i8> %in) {
				; CHECK-LABEL: testv32i8:
				; CHECK: # %bb.0:
				; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
				; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2
				; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
				; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2
				; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
				; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
				; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0
				; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
				; CHECK-NEXT: retq
				%out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in)
				ret <32 x i8> %out
				}

				; Declarations of the llvm.ctpop vector intrinsics called by the test
				; functions in this file, one per tested vector type.
				declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
				declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
				declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
				declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)

test/CodeGen/X86/prefer-avx256-shift.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+prefer-256-bit | FileCheck %s --check-prefix=ALL --check-prefix=AVX256 --check-prefix=AVX256BW --check-prefix=AVX256BWVL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-prefer-256-bit | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW --check-prefix=AVX512BWVL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+prefer-256-bit | FileCheck %s --check-prefix=ALL --check-prefix=AVX256 --check-prefix=AVX256VL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-prefer-256-bit | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+prefer-256-bit | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW --check-prefix=AVX512BWNOVL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,-prefer-256-bit | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW --check-prefix=AVX512BWNOVL

				; Per-element variable shift left of <32 x i8>. The AVX256 and AVX512VL
				; prefixes expect a ymm-only vpsllw/vpand/vpblendvb sequence; the AVX512BW
				; prefix expects both operands zero-extended to words in zmm, a single
				; vpsllvw, and a vpmovwb truncate.
				define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) {
				; AVX256-LABEL: var_shl_v32i8:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1
				; AVX256-NEXT: vpsllw $4, %ymm0, %ymm2
				; AVX256-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX256-NEXT: vpsllw $2, %ymm0, %ymm2
				; AVX256-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1
				; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX256-NEXT: vpaddb %ymm0, %ymm0, %ymm2
				; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1
				; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX256-NEXT: retq
				;
				; AVX512BW-LABEL: var_shl_v32i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
				; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
				; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
				; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
				; AVX512BW-NEXT: retq
				;
				; AVX512VL-LABEL: var_shl_v32i8:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
				; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
				; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
				; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
				; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX512VL-NEXT: retq
				%shift = shl <32 x i8> %a, %b
				ret <32 x i8> %shift
				}

				; Per-element variable shift left of <16 x i16>. Expected lowering per
				; prefix: AVX256BW and AVX512BWVL use a single ymm vpsllvw; AVX256VL uses a
				; ymm unpack / vpsllvd / vpackusdw sequence; AVX512VL widens to dwords in
				; zmm and uses vpsllvd; AVX512BWNOVL uses vpsllvw on zmm.
				define <16 x i16> @var_shl_v16i16(<16 x i16> %a, <16 x i16> %b) {
				; AVX256BW-LABEL: var_shl_v16i16:
				; AVX256BW: # %bb.0:
				; AVX256BW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
				; AVX256BW-NEXT: retq
				;
				; AVX512BWVL-LABEL: var_shl_v16i16:
				; AVX512BWVL: # %bb.0:
				; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
				; AVX512BWVL-NEXT: retq
				;
				; AVX256VL-LABEL: var_shl_v16i16:
				; AVX256VL: # %bb.0:
				; AVX256VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
				; AVX256VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
				; AVX256VL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
				; AVX256VL-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
				; AVX256VL-NEXT: vpsrld $16, %ymm3, %ymm3
				; AVX256VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
				; AVX256VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
				; AVX256VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
				; AVX256VL-NEXT: vpsrld $16, %ymm0, %ymm0
				; AVX256VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
				; AVX256VL-NEXT: retq
				;
				; AVX512VL-LABEL: var_shl_v16i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
				; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
				; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
				; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512VL-NEXT: retq
				;
				; AVX512BWNOVL-LABEL: var_shl_v16i16:
				; AVX512BWNOVL: # %bb.0:
				; AVX512BWNOVL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
				; AVX512BWNOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
				; AVX512BWNOVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
				; AVX512BWNOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
				; AVX512BWNOVL-NEXT: retq
				%shift = shl <16 x i16> %a, %b
				ret <16 x i16> %shift
				}

				; Per-element variable shift left of <16 x i8>. Expected lowering per
				; prefix: AVX256BW and AVX512BWVL widen to words in ymm and use vpsllvw;
				; AVX256VL uses an xmm-only vpsllw/vpand/vpblendvb sequence; AVX512VL widens
				; to dwords in zmm and uses vpsllvd; AVX512BWNOVL uses vpsllvw on zmm.
				define <16 x i8> @var_shl_v16i8(<16 x i8> %a, <16 x i8> %b) {
				; AVX256BW-LABEL: var_shl_v16i8:
				; AVX256BW: # %bb.0:
				; AVX256BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX256BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
				; AVX256BW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
				; AVX256BW-NEXT: vpmovwb %ymm0, %xmm0
				; AVX256BW-NEXT: vzeroupper
				; AVX256BW-NEXT: retq
				;
				; AVX512BWVL-LABEL: var_shl_v16i8:
				; AVX512BWVL: # %bb.0:
				; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
				; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
				; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
				; AVX512BWVL-NEXT: vzeroupper
				; AVX512BWVL-NEXT: retq
				;
				; AVX256VL-LABEL: var_shl_v16i8:
				; AVX256VL: # %bb.0:
				; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1
				; AVX256VL-NEXT: vpsllw $4, %xmm0, %xmm2
				; AVX256VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
				; AVX256VL-NEXT: vpsllw $2, %xmm0, %xmm2
				; AVX256VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
				; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
				; AVX256VL-NEXT: vpaddb %xmm0, %xmm0, %xmm2
				; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
				; AVX256VL-NEXT: retq
				;
				; AVX512VL-LABEL: var_shl_v16i8:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
				; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
				; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512BWNOVL-LABEL: var_shl_v16i8:
				; AVX512BWNOVL: # %bb.0:
				; AVX512BWNOVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX512BWNOVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
				; AVX512BWNOVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
				; AVX512BWNOVL-NEXT: vpmovwb %zmm0, %ymm0
				; AVX512BWNOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
				; AVX512BWNOVL-NEXT: vzeroupper
				; AVX512BWNOVL-NEXT: retq
				%shift = shl <16 x i8> %a, %b
				ret <16 x i8> %shift
				}

				; Per-element variable logical shift right of <32 x i8>. The AVX256 and
				; AVX512VL prefixes expect a ymm-only vpsrlw/vpand/vpblendvb sequence; the
				; AVX512BW prefix expects both operands zero-extended to words in zmm, a
				; single vpsrlvw, and a vpmovwb truncate.
				define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) {
				; AVX256-LABEL: var_lshr_v32i8:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1
				; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm2
				; AVX256-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX256-NEXT: vpsrlw $2, %ymm0, %ymm2
				; AVX256-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1
				; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX256-NEXT: vpsrlw $1, %ymm0, %ymm2
				; AVX256-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1
				; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX256-NEXT: retq
				;
				; AVX512BW-LABEL: var_lshr_v32i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
				; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
				; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
				; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
				; AVX512BW-NEXT: retq
				;
				; AVX512VL-LABEL: var_lshr_v32i8:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
				; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
				; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
				; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2
				; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
				; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
				; AVX512VL-NEXT: retq
				%shift = lshr <32 x i8> %a, %b
				ret <32 x i8> %shift
				}

				; Per-element variable logical shift right of <16 x i16>. Expected lowering
				; per prefix: AVX256BW and AVX512BWVL use a single ymm vpsrlvw; AVX256VL
				; uses a ymm unpack / vpsrlvd / vpackusdw sequence; AVX512VL widens to
				; dwords in zmm and uses vpsrlvd; AVX512BWNOVL uses vpsrlvw on zmm.
				define <16 x i16> @var_lshr_v16i16(<16 x i16> %a, <16 x i16> %b) {
				; AVX256BW-LABEL: var_lshr_v16i16:
				; AVX256BW: # %bb.0:
				; AVX256BW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
				; AVX256BW-NEXT: retq
				;
				; AVX512BWVL-LABEL: var_lshr_v16i16:
				; AVX512BWVL: # %bb.0:
				; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
				; AVX512BWVL-NEXT: retq
				;
				; AVX256VL-LABEL: var_lshr_v16i16:
				; AVX256VL: # %bb.0:
				; AVX256VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
				; AVX256VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
				; AVX256VL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
				; AVX256VL-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
				; AVX256VL-NEXT: vpsrld $16, %ymm3, %ymm3
				; AVX256VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
				; AVX256VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
				; AVX256VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
				; AVX256VL-NEXT: vpsrld $16, %ymm0, %ymm0
				; AVX256VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
				; AVX256VL-NEXT: retq
				;
				; AVX512VL-LABEL: var_lshr_v16i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
				; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
				; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
				; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512VL-NEXT: retq
				;
				; AVX512BWNOVL-LABEL: var_lshr_v16i16:
				; AVX512BWNOVL: # %bb.0:
				; AVX512BWNOVL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
				; AVX512BWNOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
				; AVX512BWNOVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
				; AVX512BWNOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
				; AVX512BWNOVL-NEXT: retq
				%shift = lshr <16 x i16> %a, %b
				ret <16 x i16> %shift
				}

				; Per-element variable logical shift right of <16 x i8>. Expected lowering
				; per prefix: AVX256BW and AVX512BWVL widen to words in ymm and use vpsrlvw;
				; AVX256VL uses an xmm-only vpsrlw/vpand/vpblendvb sequence; AVX512VL widens
				; to dwords in zmm and uses vpsrlvd; AVX512BWNOVL uses vpsrlvw on zmm.
				define <16 x i8> @var_lshr_v16i8(<16 x i8> %a, <16 x i8> %b) {
				; AVX256BW-LABEL: var_lshr_v16i8:
				; AVX256BW: # %bb.0:
				; AVX256BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX256BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
				; AVX256BW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
				; AVX256BW-NEXT: vpmovwb %ymm0, %xmm0
				; AVX256BW-NEXT: vzeroupper
				; AVX256BW-NEXT: retq
				;
				; AVX512BWVL-LABEL: var_lshr_v16i8:
				; AVX512BWVL: # %bb.0:
				; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
				; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
				; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
				; AVX512BWVL-NEXT: vzeroupper
				; AVX512BWVL-NEXT: retq
				;
				; AVX256VL-LABEL: var_lshr_v16i8:
				; AVX256VL: # %bb.0:
				; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1
				; AVX256VL-NEXT: vpsrlw $4, %xmm0, %xmm2
				; AVX256VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
				; AVX256VL-NEXT: vpsrlw $2, %xmm0, %xmm2
				; AVX256VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
				; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
				; AVX256VL-NEXT: vpsrlw $1, %xmm0, %xmm2
				; AVX256VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
				; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
				; AVX256VL-NEXT: retq
				;
				; AVX512VL-LABEL: var_lshr_v16i8:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
				; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
				; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
				; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512BWNOVL-LABEL: var_lshr_v16i8:
				; AVX512BWNOVL: # %bb.0:
				; AVX512BWNOVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX512BWNOVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
				; AVX512BWNOVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
				; AVX512BWNOVL-NEXT: vpmovwb %zmm0, %ymm0
				; AVX512BWNOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
				; AVX512BWNOVL-NEXT: vzeroupper
				; AVX512BWNOVL-NEXT: retq
				%shift = lshr <16 x i8> %a, %b
				ret <16 x i8> %shift
				}

				; Per-element variable arithmetic shift right of <32 x i8>. The AVX256 and
				; AVX512VL prefixes expect a ymm-only sequence that unpacks bytes into
				; words, applies vpsraw/vpblendvb steps to each half, and repacks with
				; vpackuswb; the AVX512BW prefix expects the value sign-extended (vpmovsxbw)
				; and the shift amount zero-extended to words in zmm, then vpsravw and a
				; vpmovwb truncate.
				define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) {
				; AVX256-LABEL: var_ashr_v32i8:
				; AVX256: # %bb.0:
				; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1
				; AVX256-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
				; AVX256-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
				; AVX256-NEXT: vpsraw $4, %ymm3, %ymm4
				; AVX256-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
				; AVX256-NEXT: vpsraw $2, %ymm3, %ymm4
				; AVX256-NEXT: vpaddw %ymm2, %ymm2, %ymm2
				; AVX256-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
				; AVX256-NEXT: vpsraw $1, %ymm3, %ymm4
				; AVX256-NEXT: vpaddw %ymm2, %ymm2, %ymm2
				; AVX256-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
				; AVX256-NEXT: vpsrlw $8, %ymm2, %ymm2
				; AVX256-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
				; AVX256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
				; AVX256-NEXT: vpsraw $4, %ymm0, %ymm3
				; AVX256-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
				; AVX256-NEXT: vpsraw $2, %ymm0, %ymm3
				; AVX256-NEXT: vpaddw %ymm1, %ymm1, %ymm1
				; AVX256-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
				; AVX256-NEXT: vpsraw $1, %ymm0, %ymm3
				; AVX256-NEXT: vpaddw %ymm1, %ymm1, %ymm1
				; AVX256-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
				; AVX256-NEXT: vpsrlw $8, %ymm0, %ymm0
				; AVX256-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
				; AVX256-NEXT: retq
				;
				; AVX512BW-LABEL: var_ashr_v32i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
				; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
				; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
				; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
				; AVX512BW-NEXT: retq
				;
				; AVX512VL-LABEL: var_ashr_v32i8:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
				; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
				; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
				; AVX512VL-NEXT: vpsraw $4, %ymm3, %ymm4
				; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
				; AVX512VL-NEXT: vpsraw $2, %ymm3, %ymm4
				; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
				; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
				; AVX512VL-NEXT: vpsraw $1, %ymm3, %ymm4
				; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
				; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
				; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
				; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
				; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
				; AVX512VL-NEXT: vpsraw $4, %ymm0, %ymm3
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
				; AVX512VL-NEXT: vpsraw $2, %ymm0, %ymm3
				; AVX512VL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
				; AVX512VL-NEXT: vpsraw $1, %ymm0, %ymm3
				; AVX512VL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
				; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
				; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
				; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
				; AVX512VL-NEXT: retq
				%shift = ashr <32 x i8> %a, %b
				ret <32 x i8> %shift
				}

				; Per-element arithmetic shift right of a <16 x i16> vector by a variable
				; <16 x i16> amount. The autogenerated assertions pin one lowering per check
				; prefix (the llc options behind each prefix are set by this file's RUN
				; lines, which are outside this function):
				;   - AVX256BW and AVX512BWVL prefixes expect a single vpsravw on ymm.
				;   - AVX256VL prefix expects the words to be unpacked against zero into
				;     dword lanes, shifted with vpsravd on ymm, then repacked (vpackusdw).
				;   - AVX512VL prefix expects widening to zmm dwords, vpsravd, and a
				;     vpmovdw truncate back to ymm.
				;   - AVX512BWNOVL prefix expects vpsravw performed on the containing zmm
				;     registers.
				define <16 x i16> @var_ashr_v16i16(<16 x i16> %a, <16 x i16> %b) {
				; AVX256BW-LABEL: var_ashr_v16i16:
				; AVX256BW: # %bb.0:
				; AVX256BW-NEXT: vpsravw %ymm1, %ymm0, %ymm0
				; AVX256BW-NEXT: retq
				;
				; AVX512BWVL-LABEL: var_ashr_v16i16:
				; AVX512BWVL: # %bb.0:
				; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
				; AVX512BWVL-NEXT: retq
				;
				; AVX256VL-LABEL: var_ashr_v16i16:
				; AVX256VL: # %bb.0:
				; AVX256VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
				; AVX256VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
				; AVX256VL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
				; AVX256VL-NEXT: vpsravd %ymm3, %ymm4, %ymm3
				; AVX256VL-NEXT: vpsrld $16, %ymm3, %ymm3
				; AVX256VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
				; AVX256VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
				; AVX256VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
				; AVX256VL-NEXT: vpsrld $16, %ymm0, %ymm0
				; AVX256VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
				; AVX256VL-NEXT: retq
				;
				; AVX512VL-LABEL: var_ashr_v16i16:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
				; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
				; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
				; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
				; AVX512VL-NEXT: retq
				;
				; AVX512BWNOVL-LABEL: var_ashr_v16i16:
				; AVX512BWNOVL: # %bb.0:
				; AVX512BWNOVL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
				; AVX512BWNOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
				; AVX512BWNOVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
				; AVX512BWNOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
				; AVX512BWNOVL-NEXT: retq
				; IR under test: a single vector ashr whose shift amounts come from %b.
				%shift = ashr <16 x i16> %a, %b
				ret <16 x i16> %shift
				}

				; Per-element arithmetic shift right of a <16 x i8> vector by a variable
				; <16 x i8> amount. The autogenerated assertions pin one lowering per check
				; prefix (the llc options behind each prefix are set by this file's RUN
				; lines, which are outside this function):
				;   - AVX256BW and AVX512BWVL prefixes expect the bytes to be widened to
				;     words in ymm, shifted with vpsravw on ymm, and truncated by vpmovwb.
				;   - AVX256VL prefix expects everything to stay in xmm: the amount is
				;     moved up with vpsllw $5 and each unpacked half goes through a
				;     vpsraw $4 / $2 / $1 + vpblendvb ladder before vpackuswb.
				;   - AVX512VL prefix expects widening to dwords in zmm, vpsravd, and a
				;     vpmovdb truncate.
				;   - AVX512BWNOVL prefix expects widening to words, with vpsravw and
				;     vpmovwb issued on zmm registers.
				define <16 x i8> @var_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) {
				; AVX256BW-LABEL: var_ashr_v16i8:
				; AVX256BW: # %bb.0:
				; AVX256BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX256BW-NEXT: vpmovsxbw %xmm0, %ymm0
				; AVX256BW-NEXT: vpsravw %ymm1, %ymm0, %ymm0
				; AVX256BW-NEXT: vpmovwb %ymm0, %xmm0
				; AVX256BW-NEXT: vzeroupper
				; AVX256BW-NEXT: retq
				;
				; AVX512BWVL-LABEL: var_ashr_v16i8:
				; AVX512BWVL: # %bb.0:
				; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
				; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
				; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
				; AVX512BWVL-NEXT: vzeroupper
				; AVX512BWVL-NEXT: retq
				;
				; AVX256VL-LABEL: var_ashr_v16i8:
				; AVX256VL: # %bb.0:
				; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1
				; AVX256VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
				; AVX256VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
				; AVX256VL-NEXT: vpsraw $4, %xmm3, %xmm4
				; AVX256VL-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
				; AVX256VL-NEXT: vpsraw $2, %xmm3, %xmm4
				; AVX256VL-NEXT: vpaddw %xmm2, %xmm2, %xmm2
				; AVX256VL-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
				; AVX256VL-NEXT: vpsraw $1, %xmm3, %xmm4
				; AVX256VL-NEXT: vpaddw %xmm2, %xmm2, %xmm2
				; AVX256VL-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
				; AVX256VL-NEXT: vpsrlw $8, %xmm2, %xmm2
				; AVX256VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
				; AVX256VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
				; AVX256VL-NEXT: vpsraw $4, %xmm0, %xmm3
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
				; AVX256VL-NEXT: vpsraw $2, %xmm0, %xmm3
				; AVX256VL-NEXT: vpaddw %xmm1, %xmm1, %xmm1
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
				; AVX256VL-NEXT: vpsraw $1, %xmm0, %xmm3
				; AVX256VL-NEXT: vpaddw %xmm1, %xmm1, %xmm1
				; AVX256VL-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
				; AVX256VL-NEXT: vpsrlw $8, %xmm0, %xmm0
				; AVX256VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
				; AVX256VL-NEXT: retq
				;
				; AVX512VL-LABEL: var_ashr_v16i8:
				; AVX512VL: # %bb.0:
				; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
				; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
				; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
				; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512VL-NEXT: vzeroupper
				; AVX512VL-NEXT: retq
				;
				; AVX512BWNOVL-LABEL: var_ashr_v16i8:
				; AVX512BWNOVL: # %bb.0:
				; AVX512BWNOVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX512BWNOVL-NEXT: vpmovsxbw %xmm0, %ymm0
				; AVX512BWNOVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
				; AVX512BWNOVL-NEXT: vpmovwb %zmm0, %ymm0
				; AVX512BWNOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
				; AVX512BWNOVL-NEXT: vzeroupper
				; AVX512BWNOVL-NEXT: retq
				; IR under test: a single vector ashr whose shift amounts come from %b.
				%shift = ashr <16 x i8> %a, %b
				ret <16 x i8> %shift
				}

test/CodeGen/X86/prefer-avx256-trunc.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX256 --check-prefix=AVX256NOBW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512NOBW --check-prefix=AVX512VL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512NOBW --check-prefix=AVX512F
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512NOBW --check-prefix=AVX512F
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX256 --check-prefix=AVX256BWVL
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BWVL

				; Truncate <16 x i16> to <16 x i8>. Per the RUN lines at the top of this
				; test, the expected lowerings are:
				;   - AVX256NOBW prefix (+avx512vl,+prefer-256-bit): no widening to zmm;
				;     each 128-bit half is shuffled with vpshufb and the halves are joined
				;     with vpunpcklqdq.
				;   - AVX512NOBW prefix (avx512f, or avx512vl without the preference):
				;     widen to zmm dwords with vpmovsxwd, then vpmovdb.
				;   - AVX512BW prefix (+avx512bw without VL, either preference setting):
				;     vpmovwb issued on the containing zmm register.
				;   - AVX256BWVL and AVX512BWVL prefixes (+avx512bw,+avx512vl): a single
				;     ymm-to-xmm vpmovwb, identical for both preference settings.
				define <16 x i8> @testv16i16_trunc_v16i8(<16 x i16> %x) {
				; AVX256NOBW-LABEL: testv16i16_trunc_v16i8:
				; AVX256NOBW: # %bb.0:
				; AVX256NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
				; AVX256NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
				; AVX256NOBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
				; AVX256NOBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
				; AVX256NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
				; AVX256NOBW-NEXT: vzeroupper
				; AVX256NOBW-NEXT: retq
				;
				; AVX512NOBW-LABEL: testv16i16_trunc_v16i8:
				; AVX512NOBW: # %bb.0:
				; AVX512NOBW-NEXT: vpmovsxwd %ymm0, %zmm0
				; AVX512NOBW-NEXT: vpmovdb %zmm0, %xmm0
				; AVX512NOBW-NEXT: vzeroupper
				; AVX512NOBW-NEXT: retq
				;
				; AVX512BW-LABEL: testv16i16_trunc_v16i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
				; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
				; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
				; AVX512BW-NEXT: vzeroupper
				; AVX512BW-NEXT: retq
				;
				; AVX256BWVL-LABEL: testv16i16_trunc_v16i8:
				; AVX256BWVL: # %bb.0:
				; AVX256BWVL-NEXT: vpmovwb %ymm0, %xmm0
				; AVX256BWVL-NEXT: vzeroupper
				; AVX256BWVL-NEXT: retq
				;
				; AVX512BWVL-LABEL: testv16i16_trunc_v16i8:
				; AVX512BWVL: # %bb.0:
				; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
				; AVX512BWVL-NEXT: vzeroupper
				; AVX512BWVL-NEXT: retq
				; IR under test: a plain element-wise truncation of every i16 lane to i8.
				%trunc = trunc <16 x i16> %x to <16 x i8>
				ret <16 x i8> %trunc
				}

test/CodeGen/X86/prefer-avx256-wide-mul.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX256BW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
				; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,-prefer-256-bit \| FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW

				; Unsigned division of every byte of a <32 x i8> vector by the constant 7.
				; Per the RUN lines at the top of this test, the expected lowerings are:
				;   - AVX256BW prefix (+avx512vl,+avx512bw,+prefer-256-bit): each 128-bit
				;     half is zero-extended to words in a ymm register and multiplied with
				;     vpmullw on ymm; the halves are recombined with vperm2i128 /
				;     vinserti128 / vpackuswb. No zmm register is used.
				;   - AVX512BW prefix (all other configurations): the whole input is
				;     zero-extended into one zmm register, multiplied with a single
				;     vpmullw, and narrowed back with vpmovwb.
				; Both sequences end with the same vpsubb / vpsrlw / vpand / vpaddb tail.
				define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
				; AVX256BW-LABEL: test_div7_32i8:
				; AVX256BW: # %bb.0:
				; AVX256BW-NEXT: vextracti128 $1, %ymm0, %xmm1
				; AVX256BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
				; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
				; AVX256BW-NEXT: vpmullw %ymm2, %ymm1, %ymm1
				; AVX256BW-NEXT: vpsrlw $8, %ymm1, %ymm1
				; AVX256BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
				; AVX256BW-NEXT: vpmullw %ymm2, %ymm3, %ymm2
				; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2
				; AVX256BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
				; AVX256BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
				; AVX256BW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
				; AVX256BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
				; AVX256BW-NEXT: vpsrlw $1, %ymm0, %ymm0
				; AVX256BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
				; AVX256BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
				; AVX256BW-NEXT: vpsrlw $2, %ymm0, %ymm0
				; AVX256BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
				; AVX256BW-NEXT: retq
				;
				; AVX512BW-LABEL: test_div7_32i8:
				; AVX512BW: # %bb.0:
				; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
				; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
				; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
				; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
				; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
				; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0
				; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
				; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
				; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0
				; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
				; AVX512BW-NEXT: retq
				; IR under test: udiv by a splat of 7 across all 32 byte lanes.
				%res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
				ret <32 x i8> %res
				}

test/Transforms/LoopVectorize/X86/avx512.ll

	; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s \| llc -mattr=+avx512f \| FileCheck %s			; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s \| llc -mattr=+avx512f \| FileCheck %s
				; RUN: opt -mattr=+avx512vl,+prefer-256-bit --loop-vectorize -S < %s \| llc -mattr=+avx512f \| FileCheck %s --check-prefix=CHECK-PREFER-AVX256

	target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-apple-macosx10.9.0"			target triple = "x86_64-apple-macosx10.9.0"

	; Verify that we generate 512-bit wide vectors for a basic integer memset			; Verify that we generate 512-bit wide vectors for a basic integer memset
	; loop.			; loop.

	; CHECK-LABEL: f:			; CHECK-LABEL: f:
	; CHECK: vmovdqu32 %zmm{{.}},			; CHECK: vmovdqu32 %zmm{{.}},
	; CHECK-NOT: %ymm			; CHECK-NOT: %ymm

				; Verify that we don't generate 512-bit wide vectors when the subtarget feature says not to

				; CHECK-PREFER-AVX256-LABEL: f:
				; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}},
				; CHECK-PREFER-AVX256-NOT: %zmm

	define void @f(i32* %a, i32 %n) {			define void @f(i32* %a, i32 %n) {
	entry:			entry:
	%cmp4 = icmp sgt i32 %n, 0			%cmp4 = icmp sgt i32 %n, 0
	br i1 %cmp4, label %for.body.preheader, label %for.end			br i1 %cmp4, label %for.body.preheader, label %for.end

	for.body.preheader: ; preds = %entry			for.body.preheader: ; preds = %entry
	br label %for.body			br label %for.body

	for.body: ; preds = %for.body.preheader, %for.body			for.body: ; preds = %for.body.preheader, %for.body
	%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]			%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
	%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv			%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
	store i32 %n, i32* %arrayidx, align 4			store i32 %n, i32* %arrayidx, align 4
	%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1			%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
	%lftr.wideiv = trunc i64 %indvars.iv.next to i32			%lftr.wideiv = trunc i64 %indvars.iv.next to i32
	%exitcond = icmp eq i32 %lftr.wideiv, %n			%exitcond = icmp eq i32 %lftr.wideiv, %n
	br i1 %exitcond, label %for.end.loopexit, label %for.body			br i1 %exitcond, label %for.end.loopexit, label %for.body

	for.end.loopexit: ; preds = %for.body			for.end.loopexit: ; preds = %for.body
	br label %for.end			br label %for.end

	for.end: ; preds = %for.end.loopexit, %entry			for.end: ; preds = %for.end.loopexit, %entry
	ret void			ret void
	}			}

				; Verify that the "prefer-vector-width=256" attribute prevents the use of 512-bit
				; vectors

				; CHECK-LABEL: g:
				; CHECK: vmovdqu %ymm{{.}},
				; CHECK-NOT: %zmm

				; CHECK-PREFER-AVX256-LABEL: g:
				; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}},
				; CHECK-PREFER-AVX256-NOT: %zmm

				; Same loop as @f (store %n into a[i] for every i in [0, n)), but the
				; function carries the "prefer-vector-width"="256" attribute. The FileCheck
				; directives for this function expect ymm stores and no zmm registers under
				; both RUN configurations of this file.
				define void @g(i32* %a, i32 %n) "prefer-vector-width"="256" {
				entry:
				; Skip the loop entirely unless n > 0.
				%cmp4 = icmp sgt i32 %n, 0
				br i1 %cmp4, label %for.body.preheader, label %for.end

				for.body.preheader: ; preds = %entry
				br label %for.body

				for.body: ; preds = %for.body.preheader, %for.body
				; 64-bit induction variable starting at 0.
				%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
				%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
				; a[i] = n
				store i32 %n, i32* %arrayidx, align 4
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				; Leave the loop once the incremented index, truncated to i32, equals n.
				%lftr.wideiv = trunc i64 %indvars.iv.next to i32
				%exitcond = icmp eq i32 %lftr.wideiv, %n
				br i1 %exitcond, label %for.end.loopexit, label %for.body

				for.end.loopexit: ; preds = %for.body
				br label %for.end

				for.end: ; preds = %for.end.loopexit, %entry
				ret void
				}

				; Verify that the "prefer-vector-width=512" attribute overrides the subtarget's
				; prefer-256-bit feature and allows 512-bit vectors

				; CHECK-LABEL: h:
				; CHECK: vmovdqu32 %zmm{{.}},
				; CHECK-NOT: %ymm

				; CHECK-PREFER-AVX256-LABEL: h:
				; CHECK-PREFER-AVX256: vmovdqu32 %zmm{{.}},
				; CHECK-PREFER-AVX256-NOT: %ymm

				; Same loop as @f (store %n into a[i] for every i in [0, n)), but the
				; function carries the "prefer-vector-width"="512" attribute. The FileCheck
				; directives for this function expect zmm stores and no ymm registers under
				; both RUN configurations, including the one that enables +prefer-256-bit.
				define void @h(i32* %a, i32 %n) "prefer-vector-width"="512" {
				entry:
				; Skip the loop entirely unless n > 0.
				%cmp4 = icmp sgt i32 %n, 0
				br i1 %cmp4, label %for.body.preheader, label %for.end

				for.body.preheader: ; preds = %entry
				br label %for.body

				for.body: ; preds = %for.body.preheader, %for.body
				; 64-bit induction variable starting at 0.
				%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
				%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
				; a[i] = n
				store i32 %n, i32* %arrayidx, align 4
				%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
				; Leave the loop once the incremented index, truncated to i32, equals n.
				%lftr.wideiv = trunc i64 %indvars.iv.next to i32
				%exitcond = icmp eq i32 %lftr.wideiv, %n
				br i1 %exitcond, label %for.end.loopexit, label %for.body

				for.end.loopexit: ; preds = %for.body
				br label %for.end

				for.end: ; preds = %for.end.loopexit, %entry
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Another attempt at support prefer-vector-width function attribute
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129323

lib/Target/X86/X86.td

lib/Target/X86/X86ISelLowering.cpp

lib/Target/X86/X86Subtarget.h

lib/Target/X86/X86Subtarget.cpp

lib/Target/X86/X86TargetMachine.cpp

lib/Target/X86/X86TargetTransformInfo.cpp

test/CodeGen/X86/prefer-avx256-lzcnt.ll

test/CodeGen/X86/prefer-avx256-mask-extend.ll

test/CodeGen/X86/prefer-avx256-mask-shuffle.ll

test/CodeGen/X86/prefer-avx256-popcnt.ll

test/CodeGen/X86/prefer-avx256-shift.ll

test/CodeGen/X86/prefer-avx256-trunc.ll

test/CodeGen/X86/prefer-avx256-wide-mul.ll

test/Transforms/LoopVectorize/X86/avx512.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Another attempt at support prefer-vector-width function attribute
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 129323

lib/Target/X86/X86.td

lib/Target/X86/X86ISelLowering.cpp

lib/Target/X86/X86Subtarget.h

lib/Target/X86/X86Subtarget.cpp

lib/Target/X86/X86TargetMachine.cpp

lib/Target/X86/X86TargetTransformInfo.cpp

test/CodeGen/X86/prefer-avx256-lzcnt.ll

test/CodeGen/X86/prefer-avx256-mask-extend.ll

test/CodeGen/X86/prefer-avx256-mask-shuffle.ll

test/CodeGen/X86/prefer-avx256-popcnt.ll

test/CodeGen/X86/prefer-avx256-shift.ll

test/CodeGen/X86/prefer-avx256-trunc.ll

test/CodeGen/X86/prefer-avx256-wide-mul.ll

test/Transforms/LoopVectorize/X86/avx512.ll

[X86] Another attempt at support prefer-vector-width function attribute
ClosedPublic