Diff 123547

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 964 Lines • ▼ Show 20 Lines	if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })		for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);		setOperationAction(ISD::BITREVERSE, VT, Custom);

for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,		for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })		MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);		setOperationAction(ISD::BITREVERSE, VT, Custom);
}		}

		// Special handling for masked gather of 2 elements
		if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
		setOperationAction(ISD::MGATHER, MVT::v2i64, Custom);

if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {		if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
bool HasInt256 = Subtarget.hasInt256();		bool HasInt256 = Subtarget.hasInt256();

addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass		addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);		: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass		addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);		: &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass		addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
▲ Show 20 Lines • Show All 23,315 Lines • ▼ Show 20 Lines	Mask = DAG.getNode(ISD::TRUNCATE, dl,
MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);		MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),		return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
Mask, N->getMemoryVT(), N->getMemOperand(),		Mask, N->getMemoryVT(), N->getMemOperand(),
N->isTruncatingStore(), N->isCompressingStore());		N->isTruncatingStore(), N->isCompressingStore());
}		}

static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,		static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&		assert(Subtarget.hasAVX2() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");		"MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());		MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);		SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();		MVT VT = Op.getSimpleValueType();
SDValue Index = N->getIndex();		SDValue Index = N->getIndex();
SDValue Mask = N->getMask();		SDValue Mask = N->getMask();
SDValue Src0 = N->getValue();		SDValue Src0 = N->getValue();
MVT IndexVT = Index.getSimpleValueType();		MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();		MVT MaskVT = Mask.getSimpleValueType();

unsigned NumElts = VT.getVectorNumElements();		unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");		assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

if (!Subtarget.hasVLX() && !VT.is512BitVector() &&		if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {		!Index.getSimpleValueType().is512BitVector()) {
// AVX512F supports only 512-bit vectors: either the data or the index must		// AVX512F supports only 512-bit vectors: either the data or the index must
// be 512 bits wide. If both the index and the data are 256-bit, but		// be 512 bits wide. If both the index and the data are 256-bit, but
// the vector contains 8 elements, we just sign-extend the index.		// the vector contains 8 elements, we just sign-extend the index.
if (NumElts == 8) {		if (NumElts == 8) {
Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);		Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),		SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), Index };		N->getOperand(3), Index };
Show All 26 Lines	SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
N->getMemoryVT(), dl, Ops,		N->getMemoryVT(), dl, Ops,
N->getMemOperand());		N->getMemOperand());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,		SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewGather.getValue(0),		NewGather.getValue(0),
DAG.getIntPtrConstant(0, dl));		DAG.getIntPtrConstant(0, dl));
SDValue RetOps[] = {Extract, NewGather.getValue(1)};		SDValue RetOps[] = {Extract, NewGather.getValue(1)};
return DAG.getMergeValues(RetOps, dl);		return DAG.getMergeValues(RetOps, dl);
}		}
if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {		if (N->getMemoryVT() == MVT::v2i32) {
// There is a special case when the return type v2i32 is illegal and		// There is a special case when the return type v2i32 is illegal and
// the type legalizer extended it to v2i64. Without this conversion we end up		// the type legalizer extended it to v2i64. Without this conversion we end up
// with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.		// with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
// In order to avoid this situation, we'll build an X86 specific Gather node		// In order to avoid this situation, we'll build an X86 specific Gather node
// with index v2i64 and value type v4i32.		// with index v2i64 and value type v4i32.
assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&		assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
"Unexpected type in masked gather");		"Unexpected type in masked gather");
Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,		Src0 =
DAG.getBitcast(MVT::v4i32, Src0),		DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src0),
DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });		DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
// The mask should match the destination type. Extending mask with zeroes		// The mask should match the destination type. Extending mask with zeroes
// is not necessary since instruction itself reads only two values from		// is not necessary since instruction itself reads only two values from
// memory.		// memory.
		SDVTList VTList;
		if (Subtarget.hasVLX()) {
		Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
		VTList = DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other);
		}
		else {
		Mask =
		DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Mask),
		DAG.getUNDEF(MVT::v4i32), {0, 2, -1, -1});
		VTList = DAG.getVTList(MVT::v4i32, MVT::Other);
		}
SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };		SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(		SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other), Ops, dl,		VTList, Ops, dl, N->getMemoryVT(), N->getMemOperand());
N->getMemoryVT(), N->getMemOperand());

SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,		SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
NewGather.getValue(0), DAG);		NewGather.getValue(0), DAG);
SDValue RetOps[] = { Sext, NewGather.getValue(1) };		SDValue RetOps[] = { Sext, NewGather.getValue(1) };
return DAG.getMergeValues(RetOps, dl);		return DAG.getMergeValues(RetOps, dl);
}		}
return Op;		return Op;
}		}
▲ Show 20 Lines • Show All 13,661 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td

Show First 20 Lines • Show All 1,095 Lines • ▼ Show 20 Lines	def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncSStore node:$src1, node:$src2, node:$src3), [{		(X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;		return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;		}]>;

def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),		def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{		(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;		return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;		}]>;

		// AVX2 special nodes
		// Masked gather of AVX2 where mask elements are i32, bound to the
		// target-specific "X86ISD::MGATHER" opcode.
		// Type profile: 2 results, 3 operands (results are numbered first).
		//   0, 1 - results, both vectors with the same number of elements;
		//          result 1 has i32 elements.
		//   2    - operand with the same type as result 0.
		//   3    - operand with the same type as result 1.
		//   4    - operand of pointer type.
		// The node has a chain, may load, and carries a memory operand.
		def avx2_x86_masked_gather_32 : SDNode<"X86ISD::MGATHER",
		SDTypeProfile<2, 3, [
		SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
		SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
		[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

		// Masked gather of AVX2 where mask elements are i32, bound to the generic
		// "ISD::MGATHER" opcode. The type profile is the same as the one used for
		// the X86ISD::MGATHER variant: 2 vector results with equal element count
		// (result 1 has i32 elements), an operand typed like result 0, an operand
		// typed like result 1, and a pointer-typed operand.
		def avx2_masked_gather_32 : SDNode<"ISD::MGATHER",
		SDTypeProfile<2, 3, [
		SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
		SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
		[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

		// Masked gather of AVX2 where mask elements are i64, bound to the generic
		// "ISD::MGATHER" opcode.
		// Type profile: 2 vector results with equal element count (result 1 has
		// i64 elements), an operand typed like result 0, an operand typed like
		// result 1, and a pointer-typed operand. The node has a chain, may load,
		// and carries a memory operand.
		def avx2_masked_gather_64 : SDNode<"ISD::MGATHER",
		SDTypeProfile<2, 3, [
		SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
		SDTCisPtrTy<4>, SDTCVecEltisVT<1, i64>, SDTCisSameNumEltsAs<0, 1>]>,
		[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

		// dword gathers
		// Matches an avx2_masked_gather_32 node only when it is a
		// MaskedGatherSDNode whose index (or base pointer) value type is v4i32;
		// every other node is rejected. Passed as the 128-bit gather fragment to
		// the VPGATHERDD and VGATHERDPS definitions.
		def avx2_mvpgatherdd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
		(avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
		if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
		return (Mgt->getIndex().getValueType() == MVT::v4i32 \|\|
		Mgt->getBasePtr().getValueType() == MVT::v4i32);
		return false;
		}]>;

		// Matches an avx2_x86_masked_gather_32 (X86ISD::MGATHER) node only when it
		// is an X86MaskedGatherSDNode whose index (or base pointer) value type is
		// v2i64; every other node is rejected. Unlike the other fragments in this
		// group, this one is built on the target-specific node. Passed as the
		// 128-bit gather fragment to the VPGATHERQD and VGATHERQPS definitions.
		def avx2_mvpgatherqd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
		(avx2_x86_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
		if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
		return (Mgt->getIndex().getValueType() == MVT::v2i64 \|\|
		Mgt->getBasePtr().getValueType() == MVT::v2i64);
		return false;
		}]>;

		// Matches an avx2_masked_gather_32 node only when it is a
		// MaskedGatherSDNode whose index (or base pointer) value type is v8i32;
		// every other node is rejected. Passed as the 256-bit gather fragment to
		// the VPGATHERDD and VGATHERDPS definitions.
		def avx2_mvpgatherdd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
		(avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
		if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
		return (Mgt->getIndex().getValueType() == MVT::v8i32 \|\|
		Mgt->getBasePtr().getValueType() == MVT::v8i32);
		return false;
		}]>;

		// Matches an avx2_masked_gather_32 node only when it is a
		// MaskedGatherSDNode whose index (or base pointer) value type is v4i64;
		// every other node is rejected. Passed as the 256-bit gather fragment to
		// the VPGATHERQD and VGATHERQPS definitions, whose value type in that form
		// is still a 4-element 32-bit vector.
		def avx2_mvpgatherqd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
		(avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
		if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
		return (Mgt->getIndex().getValueType() == MVT::v4i64 \|\|
		Mgt->getBasePtr().getValueType() == MVT::v4i64);
		return false;
		}]>;

		// qwords
		// Matches an avx2_masked_gather_64 node only when it is a
		// MaskedGatherSDNode whose index (or base pointer) value type is v2i32;
		// every other node is rejected. Passed as the 128-bit gather fragment to
		// the VPGATHERDQ and VGATHERDPD definitions.
		def avx2_mvpgatherdq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
		(avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
		if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
		return (Mgt->getIndex().getValueType() == MVT::v2i32 \|\|
		Mgt->getBasePtr().getValueType() == MVT::v2i32);
		return false;
		}]>;

		// Matches an avx2_masked_gather_64 node only when it is a
		// MaskedGatherSDNode whose index (or base pointer) value type is v2i64
		// AND whose memory VT is a 128-bit vector; every other node is rejected.
		// Passed as the 128-bit gather fragment to the VPGATHERQQ and VGATHERQPD
		// definitions.
		// NOTE(review): the memory-VT check presumably keeps this fragment from
		// matching gathers whose in-memory type is narrower than the result
		// (e.g. a v2i32 memory type widened to v2i64) -- confirm against the
		// MGATHER lowering code.
		def avx2_mvpgatherqq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
		(avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
		if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
		return (Mgt->getIndex().getValueType() == MVT::v2i64 \|\|
		Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
		Mgt->getMemoryVT().is128BitVector();
		return false;
		}]>;

		// Matches an avx2_masked_gather_64 node only when it is a
		// MaskedGatherSDNode whose index (or base pointer) value type is v4i32;
		// every other node is rejected. Passed as the 256-bit gather fragment to
		// the VPGATHERDQ and VGATHERDPD definitions.
		def avx2_mvpgatherdq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
		(avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
		if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
		return (Mgt->getIndex().getValueType() == MVT::v4i32 \|\|
		Mgt->getBasePtr().getValueType() == MVT::v4i32);
		return false;
		}]>;

		// Matches an avx2_masked_gather_64 node only when it is a
		// MaskedGatherSDNode whose index (or base pointer) value type is v4i64;
		// every other node is rejected. Passed as the 256-bit gather fragment to
		// the VPGATHERQQ and VGATHERQPD definitions.
		def avx2_mvpgatherqq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
		(avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
		if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
		return (Mgt->getIndex().getValueType() == MVT::v4i64 \|\|
		Mgt->getBasePtr().getValueType() == MVT::v4i64);
		return false;
		}]>;

llvm/trunk/lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,320 Lines • ▼ Show 20 Lines	def : Pat<(v8i32 (X86vsrav VR256:$src1,
(bitconvert (loadv4i64 addr:$src2)))),		(bitconvert (loadv4i64 addr:$src2)))),
(VPSRAVDYrm VR256:$src1, addr:$src2)>;		(VPSRAVDYrm VR256:$src1, addr:$src2)>;
}		}



//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations		// VGATHER - GATHER Operations
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,		multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
		ValueType VTy, PatFrag GatherNode128,
		PatFrag GatherNode256, RegisterClass RC256,
X86MemOperand memop128, X86MemOperand memop256> {		X86MemOperand memop128, X86MemOperand memop256> {
def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),		def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
(ins VR128:$src1, memop128:$src2, VR128:$mask),		(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,		!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst\|$dst, $src2, $mask}"),		"\t{$mask, $src2, $dst\|$dst, $src2, $mask}"),
[]>, VEX;		[(set (VTx VR128:$dst), VR128:$mask_wb,
		(GatherNode128 (VTx VR128:$src1), VR128:$mask,
		vectoraddr:$src2))]>, VEX;
def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),		def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),		(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,		!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst\|$dst, $src2, $mask}"),		"\t{$mask, $src2, $dst\|$dst, $src2, $mask}"),
[]>, VEX, VEX_L;		[(set (VTy RC256:$dst), RC256:$mask_wb,
		(GatherNode256 (VTy RC256:$src1), RC256:$mask,
		vectoraddr:$src2))]>, VEX, VEX_L;
}		}

		let Predicates = [UseAVX2] in {
let mayLoad = 1, hasSideEffects = 0, Constraints		let mayLoad = 1, hasSideEffects = 0, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"		= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {		in {
defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;		defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, avx2_mvpgatherdq_pd_xmm,
defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;		avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;		defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, avx2_mvpgatherqq_pd_xmm,
defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;		avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
		defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, avx2_mvpgatherdd_ps_xmm,
		avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
		defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, avx2_mvpgatherqd_ps_xmm,
		avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>;

let ExeDomain = SSEPackedDouble in {		let ExeDomain = SSEPackedDouble in {
defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;		defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, avx2_mvpgatherdq_pd_xmm,
defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;		avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
		defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, avx2_mvpgatherqq_pd_xmm,
		avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
}		}

let ExeDomain = SSEPackedSingle in {		let ExeDomain = SSEPackedSingle in {
defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;		defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, avx2_mvpgatherdd_ps_xmm,
defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;		avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
		defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, avx2_mvpgatherqd_ps_xmm,
		avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>;
		}
}		}
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Extra selection patterns for FR128, f128, f128mem		// Extra selection patterns for FR128, f128, f128mem

// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.		// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
def : Pat<(store (f128 FR128:$src), addr:$dst),		def : Pat<(store (f128 FR128:$src), addr:$dst),
▲ Show 20 Lines • Show All 50 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86Subtarget.h

Show First 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	enum Style {
GOT, // Used on 32 bit elf on when in pic mode.		GOT, // Used on 32 bit elf on when in pic mode.
RIPRel, // Used on X86-64 when in pic mode.		RIPRel, // Used on X86-64 when in pic mode.
None // Set when not in pic mode.		None // Set when not in pic mode.
};		};

} // end namespace PICStyles		} // end namespace PICStyles

class X86Subtarget final : public X86GenSubtargetInfo {		class X86Subtarget final : public X86GenSubtargetInfo {
protected:		public:
enum X86SSEEnum {
NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
};

enum X863DNowEnum {
NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
};

enum X86ProcFamilyEnum {		enum X86ProcFamilyEnum {
Others,		Others,
IntelAtom,		IntelAtom,
IntelSLM,		IntelSLM,
IntelGLM,		IntelGLM,
IntelHaswell,		IntelHaswell,
IntelBroadwell,		IntelBroadwell,
IntelSkylake,		IntelSkylake,
IntelKNL,		IntelKNL,
IntelSKX,		IntelSKX,
IntelCannonlake,		IntelCannonlake,
IntelIcelake,		IntelIcelake,
};		};

		protected:
		enum X86SSEEnum {
		NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
		};

		enum X863DNowEnum {
		NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
		};

/// X86 processor family: Intel Atom, and others		/// X86 processor family: Intel Atom, and others
X86ProcFamilyEnum X86ProcFamily;		X86ProcFamilyEnum X86ProcFamily;

/// Which PIC style to use		/// Which PIC style to use
PICStyles::Style PICStyle;		PICStyles::Style PICStyle;

const TargetMachine &TM;		const TargetMachine &TM;

▲ Show 20 Lines • Show All 598 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 2,362 Lines • ▼ Show 20 Lines	for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
return IndexSize; // 64		return IndexSize; // 64
}		}
return (unsigned)32;		return (unsigned)32;
};		};


// Trying to reduce IndexSize to 32 bits for vector 16.		// Trying to reduce IndexSize to 32 bits for vector 16.
// By default the IndexSize is equal to pointer size.		// By default the IndexSize is equal to pointer size.
unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :		unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
DL.getPointerSizeInBits();		? getIndexSizeInBits(Ptr, DL)
		: DL.getPointerSizeInBits();

Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),		Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
IndexSize), VF);		IndexSize), VF);
std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);		std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);		std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
int SplitFactor = std::max(IdxsLT.first, SrcLT.first);		int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
if (SplitFactor > 1) {		if (SplitFactor > 1) {
// Handle splitting of vector of pointers		// Handle splitting of vector of pointers
Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);		Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,		return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
AddressSpace);		AddressSpace);
}		}

// The gather / scatter cost is given by Intel architects. It is a rough		// The gather / scatter cost is given by Intel architects. It is a rough
// number since we are looking at one instruction in a time.		// number since we are looking at one instruction in a time.
const int GSOverhead = 2;		const int GSOverhead = (Opcode == Instruction::Load)
		? ST->getGatherOverhead()
		: ST->getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),		return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace);		Alignment, AddressSpace);
}		}

/// Return the cost of full scalarization of gather / scatter operation.		/// Return the cost of full scalarization of gather / scatter operation.
///		///
/// Opcode - Load or Store instruction.		/// Opcode - Load or Store instruction.
/// SrcVTy - The type of the data vector that should be gathered or scattered.		/// SrcVTy - The type of the data vector that should be gathered or scattered.
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) \|\|
(Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))		(Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
Scalarize = true;		Scalarize = true;
// Gather / Scatter for vector 2 is not profitable on KNL / SKX		// Gather / Scatter for vector 2 is not profitable on KNL / SKX
// Vector-4 of gather/scatter instruction does not exist on KNL.		// Vector-4 of gather/scatter instruction does not exist on KNL.
// We can extend it to 8 elements, but zeroing upper bits of		// We can extend it to 8 elements, but zeroing upper bits of
// the mask vector will add more instructions. Right now we give the scalar		// the mask vector will add more instructions. Right now we give the scalar
// cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction		// cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
// is better in the VariableMask case.		// is better in the VariableMask case.
if (VF == 2 \|\| (VF == 4 && !ST->hasVLX()))		if (ST->hasAVX512() && (VF == 2 \|\| (VF == 4 && !ST->hasVLX())))
Scalarize = true;		Scalarize = true;

if (Scalarize)		if (Scalarize)
return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,		return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
AddressSpace);		AddressSpace);

return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);		return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}		}
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	if (isa<VectorType>(DataTy)) {
unsigned NumElts = DataTy->getVectorNumElements();		unsigned NumElts = DataTy->getVectorNumElements();
if (NumElts == 1 \|\| !isPowerOf2_32(NumElts))		if (NumElts == 1 \|\| !isPowerOf2_32(NumElts))
return false;		return false;
}		}
Type *ScalarTy = DataTy->getScalarType();		Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?		int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();		DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();

// AVX-512 allows gather and scatter		// AVX-512 and Skylake AVX2 allows gather and scatter
return (DataWidth == 32 \|\| DataWidth == 64) && ST->hasAVX512();		return (DataWidth == 32 \|\| DataWidth == 64) && (ST->hasAVX512() \|\|
		ST->getProcFamily() == X86Subtarget::IntelSkylake);
}		}

bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {		bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
		// AVX2 doesn't support scatter
		if (!ST->hasAVX512())
		return false;
return isLegalMaskedGather(DataType);		return isLegalMaskedGather(DataType);
}		}

bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {		bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
EVT VT = TLI->getValueType(DL, DataType);		EVT VT = TLI->getValueType(DL, DataType);
return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);		return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
}		}

▲ Show 20 Lines • Show All 322 Lines • Show Last 20 Lines

llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll

; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s \| FileCheck %s --check-prefix=AVX2		; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s \| FileCheck %s --check-prefix=AVX2
		; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skylake -cost-model -analyze < %s \| FileCheck %s --check-prefix=SKL
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s \| FileCheck %s --check-prefix=KNL		; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s \| FileCheck %s --check-prefix=KNL
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s \| FileCheck %s --check-prefix=SKX		; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s \| FileCheck %s --check-prefix=SKX


; AVX2-LABEL: test1		; AVX2-LABEL: test1
; AVX2: Found an estimated cost of 4 {{.*}}.masked		; AVX2: Found an estimated cost of 4 {{.*}}.masked
define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {		define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
%mask = icmp eq <2 x i64> %trigger, zeroinitializer		%mask = icmp eq <2 x i64> %trigger, zeroinitializer
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
ret <2 x i32> %res		ret <2 x i32> %res
}		}

define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) {		define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) {

; AVX2-LABEL: test_gather_2f64		; AVX2-LABEL: test_gather_2f64
; AVX2: Found an estimated cost of 7 {{.*}}.gather		; AVX2: Found an estimated cost of 7 {{.*}}.gather

		; SKL-LABEL: test_gather_2f64
		; SKL: Found an estimated cost of 4 {{.*}}.gather

; KNL-LABEL: test_gather_2f64		; KNL-LABEL: test_gather_2f64
; KNL: Found an estimated cost of 7 {{.*}}.gather		; KNL: Found an estimated cost of 7 {{.*}}.gather

; SKX-LABEL: test_gather_2f64		; SKX-LABEL: test_gather_2f64
; SKX: Found an estimated cost of 7 {{.*}}.gather		; SKX: Found an estimated cost of 7 {{.*}}.gather

%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)		%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
ret <2 x double> %res		ret <2 x double> %res
}		}
declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)		declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)

define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {		define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {

; AVX2-LABEL: test_gather_4i32		; AVX2-LABEL: test_gather_4i32
; AVX2: Found an estimated cost of 16 {{.*}}.gather		; AVX2: Found an estimated cost of 16 {{.*}}.gather

		; SKL-LABEL: test_gather_4i32
		; SKL: Found an estimated cost of 6 {{.*}}.gather

; KNL-LABEL: test_gather_4i32		; KNL-LABEL: test_gather_4i32
; KNL: Found an estimated cost of 16 {{.*}}.gather		; KNL: Found an estimated cost of 16 {{.*}}.gather

; SKX-LABEL: test_gather_4i32		; SKX-LABEL: test_gather_4i32
; SKX: Found an estimated cost of 6 {{.*}}.gather		; SKX: Found an estimated cost of 6 {{.*}}.gather

%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)		%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
ret <4 x i32> %res		ret <4 x i32> %res
}		}

define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) {		define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) {

; AVX2-LABEL: test_gather_4i32_const_mask		; AVX2-LABEL: test_gather_4i32_const_mask
; AVX2: Found an estimated cost of 8 {{.*}}.gather		; AVX2: Found an estimated cost of 8 {{.*}}.gather

		; SKL-LABEL: test_gather_4i32_const_mask
		; SKL: Found an estimated cost of 6 {{.*}}.gather

; KNL-LABEL: test_gather_4i32_const_mask		; KNL-LABEL: test_gather_4i32_const_mask
; KNL: Found an estimated cost of 8 {{.*}}.gather		; KNL: Found an estimated cost of 8 {{.*}}.gather

; SKX-LABEL: test_gather_4i32_const_mask		; SKX-LABEL: test_gather_4i32_const_mask
; SKX: Found an estimated cost of 6 {{.*}}.gather		; SKX: Found an estimated cost of 6 {{.*}}.gather

%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)		%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
ret <4 x i32> %res		ret <4 x i32> %res
}		}
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)		declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)

define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {		define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {

; AVX2-LABEL: test_gather_16f32_const_mask		; AVX2-LABEL: test_gather_16f32_const_mask
; AVX2: Found an estimated cost of 30 {{.*}}.gather		; AVX2: Found an estimated cost of 30 {{.*}}.gather

		; SKL-LABEL: test_gather_16f32_const_mask
		; SKL: Found an estimated cost of 24 {{.*}}.gather

; KNL-LABEL: test_gather_16f32_const_mask		; KNL-LABEL: test_gather_16f32_const_mask
; KNL: Found an estimated cost of 18 {{.*}}.gather		; KNL: Found an estimated cost of 18 {{.*}}.gather

; SKX-LABEL: test_gather_16f32_const_mask		; SKX-LABEL: test_gather_16f32_const_mask
; SKX: Found an estimated cost of 18 {{.*}}.gather		; SKX: Found an estimated cost of 18 {{.*}}.gather

%sext_ind = sext <16 x i32> %ind to <16 x i64>		%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind		%gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind

%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)		%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res		ret <16 x float>%res
}		}

define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) {		define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) {

; AVX2-LABEL: test_gather_16f32_var_mask		; AVX2-LABEL: test_gather_16f32_var_mask
; AVX2: Found an estimated cost of 62 {{.*}}.gather		; AVX2: Found an estimated cost of 62 {{.*}}.gather

		; SKL-LABEL: test_gather_16f32_var_mask
		; SKL: Found an estimated cost of 24 {{.*}}.gather

; KNL-LABEL: test_gather_16f32_var_mask		; KNL-LABEL: test_gather_16f32_var_mask
; KNL: Found an estimated cost of 18 {{.*}}.gather		; KNL: Found an estimated cost of 18 {{.*}}.gather

; SKX-LABEL: test_gather_16f32_var_mask		; SKX-LABEL: test_gather_16f32_var_mask
; SKX: Found an estimated cost of 18 {{.*}}.gather		; SKX: Found an estimated cost of 18 {{.*}}.gather

%sext_ind = sext <16 x i32> %ind to <16 x i64>		%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind		%gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind

%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)		%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
ret <16 x float>%res		ret <16 x float>%res
}		}

define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {		define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {

; AVX2-LABEL: test_gather_16f32_ra_var_mask		; AVX2-LABEL: test_gather_16f32_ra_var_mask
; AVX2: Found an estimated cost of 62 {{.*}}.gather		; AVX2: Found an estimated cost of 62 {{.*}}.gather

		; SKL-LABEL: test_gather_16f32_ra_var_mask
		; SKL: Found an estimated cost of 24 {{.*}}.gather

; KNL-LABEL: test_gather_16f32_ra_var_mask		; KNL-LABEL: test_gather_16f32_ra_var_mask
; KNL: Found an estimated cost of 20 {{.*}}.gather		; KNL: Found an estimated cost of 20 {{.*}}.gather

; SKX-LABEL: test_gather_16f32_ra_var_mask		; SKX-LABEL: test_gather_16f32_ra_var_mask
; SKX: Found an estimated cost of 20 {{.*}}.gather		; SKX: Found an estimated cost of 20 {{.*}}.gather

%sext_ind = sext <16 x i32> %ind to <16 x i64>		%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind		%gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind

%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)		%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
ret <16 x float>%res		ret <16 x float>%res
}		}

define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {		define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {

; AVX2-LABEL: test_gather_16f32_const_mask2		; AVX2-LABEL: test_gather_16f32_const_mask2
; AVX2: Found an estimated cost of 30 {{.*}}.gather		; AVX2: Found an estimated cost of 30 {{.*}}.gather

		; SKL-LABEL: test_gather_16f32_const_mask2
		; SKL: Found an estimated cost of 24 {{.*}}.gather

; KNL-LABEL: test_gather_16f32_const_mask2		; KNL-LABEL: test_gather_16f32_const_mask2
; KNL: Found an estimated cost of 18 {{.*}}.gather		; KNL: Found an estimated cost of 18 {{.*}}.gather

; SKX-LABEL: test_gather_16f32_const_mask2		; SKX-LABEL: test_gather_16f32_const_mask2
; SKX: Found an estimated cost of 18 {{.*}}.gather		; SKX: Found an estimated cost of 18 {{.*}}.gather

%broadcast.splatinsert = insertelement <16 x float> undef, float %base, i32 0		%broadcast.splatinsert = insertelement <16 x float> undef, float %base, i32 0
%broadcast.splat = shufflevector <16 x float> %broadcast.splatinsert, <16 x float> undef, <16 x i32> zeroinitializer		%broadcast.splat = shufflevector <16 x float> %broadcast.splatinsert, <16 x float> undef, <16 x i32> zeroinitializer

%sext_ind = sext <16 x i32> %ind to <16 x i64>		%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind		%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)		%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
ret <16 x float>%res		ret <16 x float>%res
}		}

define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {		define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; AVX2-LABEL: test_scatter_16i32		; AVX2-LABEL: test_scatter_16i32
; AVX2: Found an estimated cost of 64 {{.*}}.scatter		; AVX2: Found an estimated cost of 64 {{.*}}.scatter

		; SKL-LABEL: test_scatter_16i32
		; SKL: Found an estimated cost of 64 {{.*}}.scatter

; KNL-LABEL: test_scatter_16i32		; KNL-LABEL: test_scatter_16i32
; KNL: Found an estimated cost of 18 {{.*}}.scatter		; KNL: Found an estimated cost of 18 {{.*}}.scatter

; SKX-LABEL: test_scatter_16i32		; SKX-LABEL: test_scatter_16i32
; SKX: Found an estimated cost of 18 {{.*}}.scatter		; SKX: Found an estimated cost of 18 {{.*}}.scatter

%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %base, i32 0		%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %base, i32 0
%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer		%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer

%gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind		%gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
%imask = bitcast i16 %mask to <16 x i1>		%imask = bitcast i16 %mask to <16 x i1>
call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)		call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
ret void		ret void
}		}

define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {		define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
; AVX2-LABEL: test_scatter_8i32		; AVX2-LABEL: test_scatter_8i32
; AVX2: Found an estimated cost of 32 {{.*}}.scatter		; AVX2: Found an estimated cost of 32 {{.*}}.scatter

		; SKL-LABEL: test_scatter_8i32
		; SKL: Found an estimated cost of 32 {{.*}}.scatter

; KNL-LABEL: test_scatter_8i32		; KNL-LABEL: test_scatter_8i32
; KNL: Found an estimated cost of 10 {{.*}}.scatter		; KNL: Found an estimated cost of 10 {{.*}}.scatter

; SKX-LABEL: test_scatter_8i32		; SKX-LABEL: test_scatter_8i32
; SKX: Found an estimated cost of 10 {{.*}}.scatter		; SKX: Found an estimated cost of 10 {{.*}}.scatter

call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)		call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
ret void		ret void
}		}

declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)		declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)

define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {		define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
; AVX2-LABEL: test_scatter_4i32		; AVX2-LABEL: test_scatter_4i32
; AVX2: Found an estimated cost of 16 {{.*}}.scatter		; AVX2: Found an estimated cost of 16 {{.*}}.scatter

		; SKL-LABEL: test_scatter_4i32
		; SKL: Found an estimated cost of 16 {{.*}}.scatter

; KNL-LABEL: test_scatter_4i32		; KNL-LABEL: test_scatter_4i32
; KNL: Found an estimated cost of 16 {{.*}}.scatter		; KNL: Found an estimated cost of 16 {{.*}}.scatter

; SKX-LABEL: test_scatter_4i32		; SKX-LABEL: test_scatter_4i32
; SKX: Found an estimated cost of 6 {{.*}}.scatter		; SKX: Found an estimated cost of 6 {{.*}}.scatter

call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)		call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
ret void		ret void
}		}

define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) {		define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) {

; AVX2-LABEL: test_gather_4f32		; AVX2-LABEL: test_gather_4f32
; AVX2: Found an estimated cost of 15 {{.*}}.gather		; AVX2: Found an estimated cost of 15 {{.*}}.gather

		; SKL-LABEL: test_gather_4f32
		; SKL: Found an estimated cost of 6 {{.*}}.gather

; KNL-LABEL: test_gather_4f32		; KNL-LABEL: test_gather_4f32
; KNL: Found an estimated cost of 15 {{.*}}.gather		; KNL: Found an estimated cost of 15 {{.*}}.gather

; SKX-LABEL: test_gather_4f32		; SKX-LABEL: test_gather_4f32
; SKX: Found an estimated cost of 6 {{.*}}.gather		; SKX: Found an estimated cost of 6 {{.*}}.gather

%sext_ind = sext <4 x i32> %ind to <4 x i64>		%sext_ind = sext <4 x i32> %ind to <4 x i64>
%gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind		%gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind

%res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)		%res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
ret <4 x float>%res		ret <4 x float>%res
}		}

define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {		define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {

; AVX2-LABEL: test_gather_4f32_const_mask		; AVX2-LABEL: test_gather_4f32_const_mask
; AVX2: Found an estimated cost of 7 {{.*}}.gather		; AVX2: Found an estimated cost of 7 {{.*}}.gather

		; SKL-LABEL: test_gather_4f32_const_mask
		; SKL: Found an estimated cost of 6 {{.*}}.gather

; KNL-LABEL: test_gather_4f32_const_mask		; KNL-LABEL: test_gather_4f32_const_mask
; KNL: Found an estimated cost of 7 {{.*}}.gather		; KNL: Found an estimated cost of 7 {{.*}}.gather

; SKX-LABEL: test_gather_4f32_const_mask		; SKX-LABEL: test_gather_4f32_const_mask
; SKX: Found an estimated cost of 6 {{.*}}.gather		; SKX: Found an estimated cost of 6 {{.*}}.gather

%sext_ind = sext <4 x i32> %ind to <4 x i64>		%sext_ind = sext <4 x i32> %ind to <4 x i64>
%gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind		%gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
Show All 29 Lines

llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx2 \| FileCheck --check-prefix=X86 %s			; RUN: llc < %s -mcpu=skylake -mtriple=i386-unknown-linux-gnu -mattr=+avx2 \| FileCheck --check-prefix=X86 %s
	; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 \| FileCheck --check-prefix=X64 %s			; RUN: llc < %s -mcpu=skylake -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 \| FileCheck --check-prefix=X64 %s

	declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro)			declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro)

	define <2 x i32> @masked_gather_v2i32(<2 x i32> %ptr, <2 x i1> %masks, <2 x i32> %passthro) {			define <2 x i32> @masked_gather_v2i32(<2 x i32> %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
	; X86-LABEL: masked_gather_v2i32:			; X86-LABEL: masked_gather_v2i32:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero			; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
	; X86-NEXT: vpextrb $0, %xmm0, %eax			; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X86-NEXT: testb $1, %al			; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
	; X86-NEXT: # implicit-def: %XMM2			; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
	; X86-NEXT: je .LBB0_2			; X86-NEXT: vpmovsxdq %xmm1, %xmm0
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm3, %eax
	; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X86-NEXT: .LBB0_2: # %else
	; X86-NEXT: vpextrb $8, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB0_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $2, %xmm3, %eax
	; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
	; X86-NEXT: .LBB0_4: # %else2
	; X86-NEXT: vpsllq $63, %xmm0, %xmm0
	; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v2i32:			; X64-LABEL: masked_gather_v2i32:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vmovdqa (%rdi), %xmm3			; X64-NEXT: vmovdqa (%rdi), %xmm2
	; X64-NEXT: vpextrb $0, %xmm0, %eax			; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X64-NEXT: testb $1, %al			; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
	; X64-NEXT: # implicit-def: %XMM2			; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
	; X64-NEXT: je .LBB0_2			; X64-NEXT: vpmovsxdq %xmm1, %xmm0
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm3, %rax
	; X64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB0_2: # %else
	; X64-NEXT: vpextrb $8, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB0_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: movl (%rax), %eax
	; X64-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2
	; X64-NEXT: .LBB0_4: # %else2
	; X64-NEXT: vpsllq $63, %xmm0, %xmm0
	; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%ld = load <2 x i32>, <2 x i32>* %ptr			%ld = load <2 x i32>, <2 x i32>* %ptr
	%res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)			%res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
	ret <2 x i32> %res			ret <2 x i32> %res
	}			}

				define <4 x i32> @masked_gather_v2i32_concat(<2 x i32> %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
				; X86-LABEL: masked_gather_v2i32_concat:
				; X86: # BB#0: # %entry
				; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
				; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
				; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
				; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
				; X86-NEXT: vpmovsxdq %xmm1, %xmm0
				; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
				; X86-NEXT: retl
				;
				; X64-LABEL: masked_gather_v2i32_concat:
				; X64: # BB#0: # %entry
				; X64-NEXT: vmovdqa (%rdi), %xmm2
				; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
				; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
				; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
				; X64-NEXT: vpmovsxdq %xmm1, %xmm0
				; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
				; X64-NEXT: retq
				entry:
				%ld = load <2 x i32>, <2 x i32>* %ptr
				%res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
				%res2 = shufflevector <2 x i32> %res, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				ret <4 x i32> %res2
				}

	declare <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ptrs, i32 %align, <2 x i1> %masks, <2 x float> %passthro)			declare <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ptrs, i32 %align, <2 x i1> %masks, <2 x float> %passthro)

	define <2 x float> @masked_gather_v2float(<2 x float> %ptr, <2 x i1> %masks, <2 x float> %passthro) {			define <2 x float> @masked_gather_v2float(<2 x float> %ptr, <2 x i1> %masks, <2 x float> %passthro) {
	; X86-LABEL: masked_gather_v2float:			; X86-LABEL: masked_gather_v2float:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
				; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero			; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
	; X86-NEXT: vpextrb $0, %xmm0, %eax			; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1
	; X86-NEXT: testb $1, %al			; X86-NEXT: vmovaps %xmm1, %xmm0
	; X86-NEXT: # implicit-def: %XMM2
	; X86-NEXT: je .LBB1_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm3, %eax
	; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X86-NEXT: .LBB1_2: # %else
	; X86-NEXT: vpextrb $8, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB1_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $2, %xmm3, %eax
	; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
	; X86-NEXT: .LBB1_4: # %else2
	; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X86-NEXT: vpslld $31, %xmm0, %xmm0
	; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v2float:			; X64-LABEL: masked_gather_v2float:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vmovdqa (%rdi), %xmm3			; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
	; X64-NEXT: vpextrb $0, %xmm0, %eax			; X64-NEXT: vmovaps (%rdi), %xmm2
	; X64-NEXT: testb $1, %al			; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
	; X64-NEXT: # implicit-def: %XMM2			; X64-NEXT: vmovaps %xmm1, %xmm0
	; X64-NEXT: je .LBB1_2			; X64-NEXT: vzeroupper
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm3, %rax
	; X64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB1_2: # %else
	; X64-NEXT: vpextrb $8, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB1_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
	; X64-NEXT: .LBB1_4: # %else2
	; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
	; X64-NEXT: vpslld $31, %xmm0, %xmm0
	; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%ld = load <2 x float>, <2 x float>* %ptr			%ld = load <2 x float>, <2 x float>* %ptr
	%res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)			%res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
	ret <2 x float> %res			ret <2 x float> %res
	}			}

				define <4 x float> @masked_gather_v2float_concat(<2 x float> %ptr, <2 x i1> %masks, <2 x float> %passthro) {
				; X86-LABEL: masked_gather_v2float_concat:
				; X86: # BB#0: # %entry
				; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
				; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
				; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1
				; X86-NEXT: vmovaps %xmm1, %xmm0
				; X86-NEXT: retl
				;
				; X64-LABEL: masked_gather_v2float_concat:
				; X64: # BB#0: # %entry
				; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
				; X64-NEXT: vmovaps (%rdi), %xmm2
				; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
				; X64-NEXT: vmovaps %xmm1, %xmm0
				; X64-NEXT: vzeroupper
				; X64-NEXT: retq
				entry:
				%ld = load <2 x float>, <2 x float>* %ptr
				%res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
				%res2 = shufflevector <2 x float> %res, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				ret <4 x float> %res2
				}


	declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro)			declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro)

	define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) {			define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) {
	; X86-LABEL: masked_gather_v4i32:			; X86-LABEL: masked_gather_v4i32:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: vpextrb $0, %xmm1, %eax			; X86-NEXT: vpgatherdd %xmm1, (,%xmm0), %xmm2
	; X86-NEXT: testb $1, %al			; X86-NEXT: vmovdqa %xmm2, %xmm0
	; X86-NEXT: # implicit-def: %XMM3
	; X86-NEXT: je .LBB2_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm0, %eax
	; X86-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
	; X86-NEXT: .LBB2_2: # %else
	; X86-NEXT: vpextrb $4, %xmm1, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB2_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $1, %xmm0, %eax
	; X86-NEXT: vpinsrd $1, (%eax), %xmm3, %xmm3
	; X86-NEXT: .LBB2_4: # %else2
	; X86-NEXT: vpextrb $8, %xmm1, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB2_6
	; X86-NEXT: # BB#5: # %cond.load4
	; X86-NEXT: vpextrd $2, %xmm0, %eax
	; X86-NEXT: vpinsrd $2, (%eax), %xmm3, %xmm3
	; X86-NEXT: .LBB2_6: # %else5
	; X86-NEXT: vpextrb $12, %xmm1, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB2_8
	; X86-NEXT: # BB#7: # %cond.load7
	; X86-NEXT: vpextrd $3, %xmm0, %eax
	; X86-NEXT: vpinsrd $3, (%eax), %xmm3, %xmm3
	; X86-NEXT: .LBB2_8: # %else8
	; X86-NEXT: vpslld $31, %xmm1, %xmm0
	; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v4i32:			; X64-LABEL: masked_gather_v4i32:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vpextrb $0, %xmm1, %eax			; X64-NEXT: vpgatherqd %xmm1, (,%ymm0), %xmm2
	; X64-NEXT: testb $1, %al			; X64-NEXT: vmovdqa %xmm2, %xmm0
	; X64-NEXT: # implicit-def: %XMM3
	; X64-NEXT: je .LBB2_2
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm0, %rax
	; X64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB2_2: # %else
	; X64-NEXT: vpextrb $4, %xmm1, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB2_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm0, %rax
	; X64-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
	; X64-NEXT: .LBB2_4: # %else2
	; X64-NEXT: vpextrb $8, %xmm1, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB2_6
	; X64-NEXT: # BB#5: # %cond.load4
	; X64-NEXT: vextracti128 $1, %ymm0, %xmm4
	; X64-NEXT: vmovq %xmm4, %rax
	; X64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
	; X64-NEXT: .LBB2_6: # %else5
	; X64-NEXT: vpextrb $12, %xmm1, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB2_8
	; X64-NEXT: # BB#7: # %cond.load7
	; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
	; X64-NEXT: vpextrq $1, %xmm0, %rax
	; X64-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
	; X64-NEXT: .LBB2_8: # %else8
	; X64-NEXT: vpslld $31, %xmm1, %xmm0
	; X64-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
	; X64-NEXT: vzeroupper			; X64-NEXT: vzeroupper
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 0, <4 x i1> %masks, <4 x i32> %passthro)			%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 0, <4 x i1> %masks, <4 x i32> %passthro)
	ret <4 x i32> %res			ret <4 x i32> %res
	}			}

	declare <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 %align, <4 x i1> %masks, <4 x float> %passthro)			declare <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 %align, <4 x i1> %masks, <4 x float> %passthro)

	define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) {			define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) {
	; X86-LABEL: masked_gather_v4float:			; X86-LABEL: masked_gather_v4float:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: vpextrb $0, %xmm1, %eax			; X86-NEXT: vgatherdps %xmm1, (,%xmm0), %xmm2
	; X86-NEXT: testb $1, %al			; X86-NEXT: vmovaps %xmm2, %xmm0
	; X86-NEXT: # implicit-def: %XMM3
	; X86-NEXT: je .LBB3_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm0, %eax
	; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
	; X86-NEXT: .LBB3_2: # %else
	; X86-NEXT: vpextrb $4, %xmm1, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB3_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $1, %xmm0, %eax
	; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
	; X86-NEXT: .LBB3_4: # %else2
	; X86-NEXT: vpextrb $8, %xmm1, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB3_6
	; X86-NEXT: # BB#5: # %cond.load4
	; X86-NEXT: vpextrd $2, %xmm0, %eax
	; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
	; X86-NEXT: .LBB3_6: # %else5
	; X86-NEXT: vpextrb $12, %xmm1, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB3_8
	; X86-NEXT: # BB#7: # %cond.load7
	; X86-NEXT: vpextrd $3, %xmm0, %eax
	; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
	; X86-NEXT: .LBB3_8: # %else8
	; X86-NEXT: vpslld $31, %xmm1, %xmm0
	; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v4float:			; X64-LABEL: masked_gather_v4float:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vpextrb $0, %xmm1, %eax			; X64-NEXT: vgatherqps %xmm1, (,%ymm0), %xmm2
	; X64-NEXT: testb $1, %al			; X64-NEXT: vmovaps %xmm2, %xmm0
	; X64-NEXT: # implicit-def: %XMM3
	; X64-NEXT: je .LBB3_2
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm0, %rax
	; X64-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB3_2: # %else
	; X64-NEXT: vpextrb $4, %xmm1, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB3_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm0, %rax
	; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
	; X64-NEXT: .LBB3_4: # %else2
	; X64-NEXT: vpextrb $8, %xmm1, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB3_6
	; X64-NEXT: # BB#5: # %cond.load4
	; X64-NEXT: vextracti128 $1, %ymm0, %xmm4
	; X64-NEXT: vmovq %xmm4, %rax
	; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
	; X64-NEXT: .LBB3_6: # %else5
	; X64-NEXT: vpextrb $12, %xmm1, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB3_8
	; X64-NEXT: # BB#7: # %cond.load7
	; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
	; X64-NEXT: vpextrq $1, %xmm0, %rax
	; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
	; X64-NEXT: .LBB3_8: # %else8
	; X64-NEXT: vpslld $31, %xmm1, %xmm0
	; X64-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
	; X64-NEXT: vzeroupper			; X64-NEXT: vzeroupper
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%res = call <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 0, <4 x i1> %masks, <4 x float> %passthro)			%res = call <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 0, <4 x i1> %masks, <4 x float> %passthro)
	ret <4 x float> %res			ret <4 x float> %res
	}			}

	declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 %align, <8 x i1> %masks, <8 x i32> %passthro)			declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 %align, <8 x i1> %masks, <8 x i32> %passthro)

	define <8 x i32> @masked_gather_v8i32(<8 x i32> %ptr, <8 x i1> %masks, <8 x i32> %passthro) {			define <8 x i32> @masked_gather_v8i32(<8 x i32> %ptr, <8 x i1> %masks, <8 x i32> %passthro) {
	; X86-LABEL: masked_gather_v8i32:			; X86-LABEL: masked_gather_v8i32:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vmovdqa (%eax), %ymm3
	; X86-NEXT: vpextrb $0, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: # implicit-def: %YMM2
	; X86-NEXT: je .LBB4_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm3, %eax
	; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X86-NEXT: .LBB4_2: # %else
	; X86-NEXT: vpextrb $2, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB4_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $1, %xmm3, %eax
	; X86-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm4
	; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X86-NEXT: .LBB4_4: # %else2
	; X86-NEXT: vpextrb $4, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB4_6
	; X86-NEXT: # BB#5: # %cond.load4
	; X86-NEXT: vpextrd $2, %xmm3, %eax
	; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm4
	; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X86-NEXT: .LBB4_6: # %else5
	; X86-NEXT: vpextrb $6, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB4_8
	; X86-NEXT: # BB#7: # %cond.load7
	; X86-NEXT: vpextrd $3, %xmm3, %eax
	; X86-NEXT: vpinsrd $3, (%eax), %xmm2, %xmm4
	; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X86-NEXT: .LBB4_8: # %else8
	; X86-NEXT: vpextrb $8, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB4_10
	; X86-NEXT: # BB#9: # %cond.load10
	; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X86-NEXT: vmovd %xmm4, %eax
	; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
	; X86-NEXT: vpinsrd $0, (%eax), %xmm4, %xmm4
	; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; X86-NEXT: .LBB4_10: # %else11
	; X86-NEXT: vpextrb $10, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB4_12
	; X86-NEXT: # BB#11: # %cond.load13
	; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X86-NEXT: vpextrd $1, %xmm4, %eax
	; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
	; X86-NEXT: vpinsrd $1, (%eax), %xmm4, %xmm4
	; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; X86-NEXT: .LBB4_12: # %else14
	; X86-NEXT: vpextrb $12, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB4_14
	; X86-NEXT: # BB#13: # %cond.load16
	; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X86-NEXT: vpextrd $2, %xmm4, %eax
	; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
	; X86-NEXT: vpinsrd $2, (%eax), %xmm4, %xmm4
	; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; X86-NEXT: .LBB4_14: # %else17
	; X86-NEXT: vpextrb $14, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB4_16
	; X86-NEXT: # BB#15: # %cond.load19
	; X86-NEXT: vextracti128 $1, %ymm3, %xmm3
	; X86-NEXT: vpextrd $3, %xmm3, %eax
	; X86-NEXT: vextracti128 $1, %ymm2, %xmm3
	; X86-NEXT: vpinsrd $3, (%eax), %xmm3, %xmm3
	; X86-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
	; X86-NEXT: .LBB4_16: # %else20
	; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; X86-NEXT: vpslld $31, %ymm0, %ymm0			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0			; X86-NEXT: vmovdqa (%eax), %ymm2
				; X86-NEXT: vpgatherdd %ymm0, (,%ymm2), %ymm1
				; X86-NEXT: vmovdqa %ymm1, %ymm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v8i32:			; X64-LABEL: masked_gather_v8i32:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vmovdqa (%rdi), %ymm4
	; X64-NEXT: vmovdqa 32(%rdi), %ymm3
	; X64-NEXT: vpextrb $0, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: # implicit-def: %YMM2
	; X64-NEXT: je .LBB4_2
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm4, %rax
	; X64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB4_2: # %else
	; X64-NEXT: vpextrb $2, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB4_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm4, %rax
	; X64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm5
	; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
	; X64-NEXT: .LBB4_4: # %else2
	; X64-NEXT: vpextrb $4, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB4_6
	; X64-NEXT: # BB#5: # %cond.load4
	; X64-NEXT: vextracti128 $1, %ymm4, %xmm5
	; X64-NEXT: vmovq %xmm5, %rax
	; X64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm5
	; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
	; X64-NEXT: .LBB4_6: # %else5
	; X64-NEXT: vpextrb $6, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB4_8
	; X64-NEXT: # BB#7: # %cond.load7
	; X64-NEXT: vextracti128 $1, %ymm4, %xmm4
	; X64-NEXT: vpextrq $1, %xmm4, %rax
	; X64-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm4
	; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X64-NEXT: .LBB4_8: # %else8
	; X64-NEXT: vpextrb $8, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB4_10
	; X64-NEXT: # BB#9: # %cond.load10
	; X64-NEXT: vmovq %xmm3, %rax
	; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
	; X64-NEXT: vpinsrd $0, (%rax), %xmm4, %xmm4
	; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; X64-NEXT: .LBB4_10: # %else11
	; X64-NEXT: vpextrb $10, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB4_12
	; X64-NEXT: # BB#11: # %cond.load13
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
	; X64-NEXT: vpinsrd $1, (%rax), %xmm4, %xmm4
	; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; X64-NEXT: .LBB4_12: # %else14
	; X64-NEXT: vpextrb $12, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB4_14
	; X64-NEXT: # BB#13: # %cond.load16
	; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X64-NEXT: vmovq %xmm4, %rax
	; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
	; X64-NEXT: vpinsrd $2, (%rax), %xmm4, %xmm4
	; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; X64-NEXT: .LBB4_14: # %else17
	; X64-NEXT: vpextrb $14, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB4_16
	; X64-NEXT: # BB#15: # %cond.load19
	; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vextracti128 $1, %ymm2, %xmm3
	; X64-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
	; X64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
	; X64-NEXT: .LBB4_16: # %else20
	; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; X64-NEXT: vpslld $31, %ymm0, %ymm0			; X64-NEXT: vpslld $31, %ymm0, %ymm0
	; X64-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0			; X64-NEXT: vpsrad $31, %ymm0, %ymm0
				; X64-NEXT: vmovdqa (%rdi), %ymm2
				; X64-NEXT: vmovdqa 32(%rdi), %ymm3
				; X64-NEXT: vextracti128 $1, %ymm1, %xmm4
				; X64-NEXT: vextracti128 $1, %ymm0, %xmm5
				; X64-NEXT: vpgatherqd %xmm5, (,%ymm3), %xmm4
				; X64-NEXT: vpgatherqd %xmm0, (,%ymm2), %xmm1
				; X64-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%ld = load <8 x i32>, <8 x i32>* %ptr			%ld = load <8 x i32>, <8 x i32>* %ptr
	%res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ld, i32 0, <8 x i1> %masks, <8 x i32> %passthro)			%res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ld, i32 0, <8 x i1> %masks, <8 x i32> %passthro)
	ret <8 x i32> %res			ret <8 x i32> %res
	}			}

	declare <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ptrs, i32 %align, <8 x i1> %masks, <8 x float> %passthro)			declare <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ptrs, i32 %align, <8 x i1> %masks, <8 x float> %passthro)

	define <8 x float> @masked_gather_v8float(<8 x float> %ptr, <8 x i1> %masks, <8 x float> %passthro) {			define <8 x float> @masked_gather_v8float(<8 x float> %ptr, <8 x i1> %masks, <8 x float> %passthro) {
	; X86-LABEL: masked_gather_v8float:			; X86-LABEL: masked_gather_v8float:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vmovdqa (%eax), %ymm3
	; X86-NEXT: vpextrb $0, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: # implicit-def: %YMM2
	; X86-NEXT: je .LBB5_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm3, %eax
	; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X86-NEXT: .LBB5_2: # %else
	; X86-NEXT: vpextrb $2, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB5_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $1, %xmm3, %eax
	; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0],mem[0],xmm2[2,3]
	; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X86-NEXT: .LBB5_4: # %else2
	; X86-NEXT: vpextrb $4, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB5_6
	; X86-NEXT: # BB#5: # %cond.load4
	; X86-NEXT: vpextrd $2, %xmm3, %eax
	; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1],mem[0],xmm2[3]
	; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X86-NEXT: .LBB5_6: # %else5
	; X86-NEXT: vpextrb $6, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB5_8
	; X86-NEXT: # BB#7: # %cond.load7
	; X86-NEXT: vpextrd $3, %xmm3, %eax
	; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
	; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X86-NEXT: .LBB5_8: # %else8
	; X86-NEXT: vpextrb $8, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB5_10
	; X86-NEXT: # BB#9: # %cond.load10
	; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X86-NEXT: vmovd %xmm4, %eax
	; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
	; X86-NEXT: vextractf128 $1, %ymm2, %xmm5
	; X86-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
	; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
	; X86-NEXT: .LBB5_10: # %else11
	; X86-NEXT: vpextrb $10, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB5_12
	; X86-NEXT: # BB#11: # %cond.load13
	; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X86-NEXT: vpextrd $1, %xmm4, %eax
	; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
	; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
	; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
	; X86-NEXT: .LBB5_12: # %else14
	; X86-NEXT: vpextrb $12, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB5_14
	; X86-NEXT: # BB#13: # %cond.load16
	; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X86-NEXT: vpextrd $2, %xmm4, %eax
	; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
	; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
	; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
	; X86-NEXT: .LBB5_14: # %else17
	; X86-NEXT: vpextrb $14, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB5_16
	; X86-NEXT: # BB#15: # %cond.load19
	; X86-NEXT: vextracti128 $1, %ymm3, %xmm3
	; X86-NEXT: vpextrd $3, %xmm3, %eax
	; X86-NEXT: vextractf128 $1, %ymm2, %xmm3
	; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
	; X86-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
	; X86-NEXT: .LBB5_16: # %else20
	; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; X86-NEXT: vpslld $31, %ymm0, %ymm0			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0			; X86-NEXT: vmovaps (%eax), %ymm2
				; X86-NEXT: vgatherdps %ymm0, (,%ymm2), %ymm1
				; X86-NEXT: vmovaps %ymm1, %ymm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v8float:			; X64-LABEL: masked_gather_v8float:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vmovdqa (%rdi), %ymm4
	; X64-NEXT: vmovdqa 32(%rdi), %ymm3
	; X64-NEXT: vpextrb $0, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: # implicit-def: %YMM2
	; X64-NEXT: je .LBB5_2
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm4, %rax
	; X64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB5_2: # %else
	; X64-NEXT: vpextrb $2, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB5_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm4, %rax
	; X64-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0],mem[0],xmm2[2,3]
	; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
	; X64-NEXT: .LBB5_4: # %else2
	; X64-NEXT: vpextrb $4, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB5_6
	; X64-NEXT: # BB#5: # %cond.load4
	; X64-NEXT: vextracti128 $1, %ymm4, %xmm5
	; X64-NEXT: vmovq %xmm5, %rax
	; X64-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0,1],mem[0],xmm2[3]
	; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
	; X64-NEXT: .LBB5_6: # %else5
	; X64-NEXT: vpextrb $6, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB5_8
	; X64-NEXT: # BB#7: # %cond.load7
	; X64-NEXT: vextracti128 $1, %ymm4, %xmm4
	; X64-NEXT: vpextrq $1, %xmm4, %rax
	; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
	; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X64-NEXT: .LBB5_8: # %else8
	; X64-NEXT: vpextrb $8, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB5_10
	; X64-NEXT: # BB#9: # %cond.load10
	; X64-NEXT: vmovq %xmm3, %rax
	; X64-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
	; X64-NEXT: vextractf128 $1, %ymm2, %xmm5
	; X64-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
	; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
	; X64-NEXT: .LBB5_10: # %else11
	; X64-NEXT: vpextrb $10, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB5_12
	; X64-NEXT: # BB#11: # %cond.load13
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
	; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
	; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
	; X64-NEXT: .LBB5_12: # %else14
	; X64-NEXT: vpextrb $12, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB5_14
	; X64-NEXT: # BB#13: # %cond.load16
	; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X64-NEXT: vmovq %xmm4, %rax
	; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
	; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
	; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
	; X64-NEXT: .LBB5_14: # %else17
	; X64-NEXT: vpextrb $14, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB5_16
	; X64-NEXT: # BB#15: # %cond.load19
	; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
	; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
	; X64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
	; X64-NEXT: .LBB5_16: # %else20
	; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero			; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
	; X64-NEXT: vpslld $31, %ymm0, %ymm0			; X64-NEXT: vpslld $31, %ymm0, %ymm0
	; X64-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0			; X64-NEXT: vpsrad $31, %ymm0, %ymm0
				; X64-NEXT: vmovaps (%rdi), %ymm2
				; X64-NEXT: vmovaps 32(%rdi), %ymm3
				; X64-NEXT: vextractf128 $1, %ymm1, %xmm4
				; X64-NEXT: vextracti128 $1, %ymm0, %xmm5
				; X64-NEXT: vgatherqps %xmm5, (,%ymm3), %xmm4
				; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
				; X64-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%ld = load <8 x float>, <8 x float>* %ptr			%ld = load <8 x float>, <8 x float>* %ptr
	%res = call <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ld, i32 0, <8 x i1> %masks, <8 x float> %passthro)			%res = call <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ld, i32 0, <8 x i1> %masks, <8 x float> %passthro)
	ret <8 x float> %res			ret <8 x float> %res
	}			}

	declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i64> %passthro)			declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i64> %passthro)

	define <4 x i64> @masked_gather_v4i64(<4 x i64> %ptr, <4 x i1> %masks, <4 x i64> %passthro) {			define <4 x i64> @masked_gather_v4i64(<4 x i64> %ptr, <4 x i1> %masks, <4 x i64> %passthro) {
	; X86-LABEL: masked_gather_v4i64:			; X86-LABEL: masked_gather_v4i64:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vmovdqa (%eax), %xmm3
	; X86-NEXT: vpextrb $0, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: # implicit-def: %YMM2
	; X86-NEXT: je .LBB6_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm3, %eax
	; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
	; X86-NEXT: .LBB6_2: # %else
	; X86-NEXT: vpextrb $4, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB6_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $1, %xmm3, %eax
	; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm4
	; X86-NEXT: vpinsrd $3, 4(%eax), %xmm4, %xmm4
	; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X86-NEXT: .LBB6_4: # %else2
	; X86-NEXT: vpextrb $8, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB6_6
	; X86-NEXT: # BB#5: # %cond.load4
	; X86-NEXT: vpextrd $2, %xmm3, %eax
	; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
	; X86-NEXT: vpinsrd $0, (%eax), %xmm4, %xmm4
	; X86-NEXT: vpinsrd $1, 4(%eax), %xmm4, %xmm4
	; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; X86-NEXT: .LBB6_6: # %else5
	; X86-NEXT: vpextrb $12, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB6_8
	; X86-NEXT: # BB#7: # %cond.load7
	; X86-NEXT: vpextrd $3, %xmm3, %eax
	; X86-NEXT: vextracti128 $1, %ymm2, %xmm3
	; X86-NEXT: vpinsrd $2, (%eax), %xmm3, %xmm3
	; X86-NEXT: vpinsrd $3, 4(%eax), %xmm3, %xmm3
	; X86-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
	; X86-NEXT: .LBB6_8: # %else8
	; X86-NEXT: vpslld $31, %xmm0, %xmm0			; X86-NEXT: vpslld $31, %xmm0, %xmm0
				; X86-NEXT: vpsrad $31, %xmm0, %xmm0
	; X86-NEXT: vpmovsxdq %xmm0, %ymm0			; X86-NEXT: vpmovsxdq %xmm0, %ymm0
	; X86-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-NEXT: vmovdqa (%eax), %xmm2
				; X86-NEXT: vpgatherdq %ymm0, (,%xmm2), %ymm1
				; X86-NEXT: vmovdqa %ymm1, %ymm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v4i64:			; X64-LABEL: masked_gather_v4i64:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vmovdqa (%rdi), %ymm3
	; X64-NEXT: vpextrb $0, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: # implicit-def: %YMM2
	; X64-NEXT: je .LBB6_2
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm3, %rax
	; X64-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
	; X64-NEXT: .LBB6_2: # %else
	; X64-NEXT: vpextrb $4, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB6_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm4
	; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
	; X64-NEXT: .LBB6_4: # %else2
	; X64-NEXT: vpextrb $8, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB6_6
	; X64-NEXT: # BB#5: # %cond.load4
	; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X64-NEXT: vmovq %xmm4, %rax
	; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
	; X64-NEXT: vpinsrq $0, (%rax), %xmm4, %xmm4
	; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
	; X64-NEXT: .LBB6_6: # %else5
	; X64-NEXT: vpextrb $12, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB6_8
	; X64-NEXT: # BB#7: # %cond.load7
	; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vextracti128 $1, %ymm2, %xmm3
	; X64-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3
	; X64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
	; X64-NEXT: .LBB6_8: # %else8
	; X64-NEXT: vpslld $31, %xmm0, %xmm0			; X64-NEXT: vpslld $31, %xmm0, %xmm0
				; X64-NEXT: vpsrad $31, %xmm0, %xmm0
	; X64-NEXT: vpmovsxdq %xmm0, %ymm0			; X64-NEXT: vpmovsxdq %xmm0, %ymm0
	; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0			; X64-NEXT: vmovdqa (%rdi), %ymm2
				; X64-NEXT: vpgatherqq %ymm0, (,%ymm2), %ymm1
				; X64-NEXT: vmovdqa %ymm1, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%ld = load <4 x i64>, <4 x i64>* %ptr			%ld = load <4 x i64>, <4 x i64>* %ptr
	%res = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ld, i32 0, <4 x i1> %masks, <4 x i64> %passthro)			%res = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ld, i32 0, <4 x i1> %masks, <4 x i64> %passthro)
	ret <4 x i64> %res			ret <4 x i64> %res
	}			}

	declare <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ptrs, i32 %align, <4 x i1> %masks, <4 x double> %passthro)			declare <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ptrs, i32 %align, <4 x i1> %masks, <4 x double> %passthro)

	define <4 x double> @masked_gather_v4double(<4 x double> %ptr, <4 x i1> %masks, <4 x double> %passthro) {			define <4 x double> @masked_gather_v4double(<4 x double> %ptr, <4 x i1> %masks, <4 x double> %passthro) {
	; X86-LABEL: masked_gather_v4double:			; X86-LABEL: masked_gather_v4double:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vmovdqa (%eax), %xmm3
	; X86-NEXT: vpextrb $0, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: # implicit-def: %YMM2
	; X86-NEXT: je .LBB7_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm3, %eax
	; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
	; X86-NEXT: .LBB7_2: # %else
	; X86-NEXT: vpextrb $4, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB7_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $1, %xmm3, %eax
	; X86-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
	; X86-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
	; X86-NEXT: .LBB7_4: # %else2
	; X86-NEXT: vpextrb $8, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB7_6
	; X86-NEXT: # BB#5: # %cond.load4
	; X86-NEXT: vpextrd $2, %xmm3, %eax
	; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
	; X86-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
	; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
	; X86-NEXT: .LBB7_6: # %else5
	; X86-NEXT: vpextrb $12, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB7_8
	; X86-NEXT: # BB#7: # %cond.load7
	; X86-NEXT: vpextrd $3, %xmm3, %eax
	; X86-NEXT: vextractf128 $1, %ymm2, %xmm3
	; X86-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
	; X86-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
	; X86-NEXT: .LBB7_8: # %else8
	; X86-NEXT: vpslld $31, %xmm0, %xmm0			; X86-NEXT: vpslld $31, %xmm0, %xmm0
				; X86-NEXT: vpsrad $31, %xmm0, %xmm0
	; X86-NEXT: vpmovsxdq %xmm0, %ymm0			; X86-NEXT: vpmovsxdq %xmm0, %ymm0
	; X86-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X86-NEXT: vmovapd (%eax), %xmm2
				; X86-NEXT: vgatherdpd %ymm0, (,%xmm2), %ymm1
				; X86-NEXT: vmovapd %ymm1, %ymm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v4double:			; X64-LABEL: masked_gather_v4double:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vmovdqa (%rdi), %ymm3
	; X64-NEXT: vpextrb $0, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: # implicit-def: %YMM2
	; X64-NEXT: je .LBB7_2
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm3, %rax
	; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
	; X64-NEXT: .LBB7_2: # %else
	; X64-NEXT: vpextrb $4, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB7_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
	; X64-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
	; X64-NEXT: .LBB7_4: # %else2
	; X64-NEXT: vpextrb $8, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB7_6
	; X64-NEXT: # BB#5: # %cond.load4
	; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
	; X64-NEXT: vmovq %xmm4, %rax
	; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
	; X64-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
	; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
	; X64-NEXT: .LBB7_6: # %else5
	; X64-NEXT: vpextrb $12, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB7_8
	; X64-NEXT: # BB#7: # %cond.load7
	; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
	; X64-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
	; X64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
	; X64-NEXT: .LBB7_8: # %else8
	; X64-NEXT: vpslld $31, %xmm0, %xmm0			; X64-NEXT: vpslld $31, %xmm0, %xmm0
				; X64-NEXT: vpsrad $31, %xmm0, %xmm0
	; X64-NEXT: vpmovsxdq %xmm0, %ymm0			; X64-NEXT: vpmovsxdq %xmm0, %ymm0
	; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0			; X64-NEXT: vmovapd (%rdi), %ymm2
				; X64-NEXT: vgatherqpd %ymm0, (,%ymm2), %ymm1
				; X64-NEXT: vmovapd %ymm1, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%ld = load <4 x double>, <4 x double>* %ptr			%ld = load <4 x double>, <4 x double>* %ptr
	%res = call <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ld, i32 0, <4 x i1> %masks, <4 x double> %passthro)			%res = call <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ld, i32 0, <4 x i1> %masks, <4 x double> %passthro)
	ret <4 x double> %res			ret <4 x double> %res
	}			}

	declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i64> %passthro)			declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i64> %passthro)

	define <2 x i64> @masked_gather_v2i64(<2 x i64> %ptr, <2 x i1> %masks, <2 x i64> %passthro) {			define <2 x i64> @masked_gather_v2i64(<2 x i64> %ptr, <2 x i1> %masks, <2 x i64> %passthro) {
	; X86-LABEL: masked_gather_v2i64:			; X86-LABEL: masked_gather_v2i64:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero			; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
	; X86-NEXT: vpextrb $0, %xmm0, %eax			; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
	; X86-NEXT: testb $1, %al			; X86-NEXT: vmovdqa %xmm1, %xmm0
	; X86-NEXT: # implicit-def: %XMM2
	; X86-NEXT: je .LBB8_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm3, %eax
	; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
	; X86-NEXT: .LBB8_2: # %else
	; X86-NEXT: vpextrb $8, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB8_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $2, %xmm3, %eax
	; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
	; X86-NEXT: vpinsrd $3, 4(%eax), %xmm2, %xmm2
	; X86-NEXT: .LBB8_4: # %else2
	; X86-NEXT: vpsllq $63, %xmm0, %xmm0
	; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v2i64:			; X64-LABEL: masked_gather_v2i64:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vmovdqa (%rdi), %xmm3			; X64-NEXT: vmovdqa (%rdi), %xmm2
	; X64-NEXT: vpextrb $0, %xmm0, %eax			; X64-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
	; X64-NEXT: testb $1, %al			; X64-NEXT: vmovdqa %xmm1, %xmm0
	; X64-NEXT: # implicit-def: %XMM2
	; X64-NEXT: je .LBB8_2
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm3, %rax
	; X64-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
	; X64-NEXT: .LBB8_2: # %else
	; X64-NEXT: vpextrb $8, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB8_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
	; X64-NEXT: .LBB8_4: # %else2
	; X64-NEXT: vpsllq $63, %xmm0, %xmm0
	; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%ld = load <2 x i64>, <2 x i64>* %ptr			%ld = load <2 x i64>, <2 x i64>* %ptr
	%res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ld, i32 0, <2 x i1> %masks, <2 x i64> %passthro)			%res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ld, i32 0, <2 x i1> %masks, <2 x i64> %passthro)
	ret <2 x i64> %res			ret <2 x i64> %res
	}			}

	declare <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ptrs, i32 %align, <2 x i1> %masks, <2 x double> %passthro)			declare <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ptrs, i32 %align, <2 x i1> %masks, <2 x double> %passthro)

	define <2 x double> @masked_gather_v2double(<2 x double> %ptr, <2 x i1> %masks, <2 x double> %passthro) {			define <2 x double> @masked_gather_v2double(<2 x double> %ptr, <2 x i1> %masks, <2 x double> %passthro) {
	; X86-LABEL: masked_gather_v2double:			; X86-LABEL: masked_gather_v2double:
	; X86: # BB#0: # %entry			; X86: # BB#0: # %entry
	; X86-NEXT: movl {{[0-9]+}}(%esp), %eax			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero			; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
	; X86-NEXT: vpextrb $0, %xmm0, %eax			; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
	; X86-NEXT: testb $1, %al			; X86-NEXT: vmovapd %xmm1, %xmm0
	; X86-NEXT: # implicit-def: %XMM2
	; X86-NEXT: je .LBB9_2
	; X86-NEXT: # BB#1: # %cond.load
	; X86-NEXT: vmovd %xmm3, %eax
	; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
	; X86-NEXT: .LBB9_2: # %else
	; X86-NEXT: vpextrb $8, %xmm0, %eax
	; X86-NEXT: testb $1, %al
	; X86-NEXT: je .LBB9_4
	; X86-NEXT: # BB#3: # %cond.load1
	; X86-NEXT: vpextrd $2, %xmm3, %eax
	; X86-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
	; X86-NEXT: .LBB9_4: # %else2
	; X86-NEXT: vpsllq $63, %xmm0, %xmm0
	; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: masked_gather_v2double:			; X64-LABEL: masked_gather_v2double:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: vmovdqa (%rdi), %xmm3			; X64-NEXT: vmovapd (%rdi), %xmm2
	; X64-NEXT: vpextrb $0, %xmm0, %eax			; X64-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
	; X64-NEXT: testb $1, %al			; X64-NEXT: vmovapd %xmm1, %xmm0
	; X64-NEXT: # implicit-def: %XMM2
	; X64-NEXT: je .LBB9_2
	; X64-NEXT: # BB#1: # %cond.load
	; X64-NEXT: vmovq %xmm3, %rax
	; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
	; X64-NEXT: .LBB9_2: # %else
	; X64-NEXT: vpextrb $8, %xmm0, %eax
	; X64-NEXT: testb $1, %al
	; X64-NEXT: je .LBB9_4
	; X64-NEXT: # BB#3: # %cond.load1
	; X64-NEXT: vpextrq $1, %xmm3, %rax
	; X64-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
	; X64-NEXT: .LBB9_4: # %else2
	; X64-NEXT: vpsllq $63, %xmm0, %xmm0
	; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	entry:			entry:
	%ld = load <2 x double*>, <2 x double*>* %ptr			%ld = load <2 x double*>, <2 x double*>* %ptr
	%res = call <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ld, i32 0, <2 x i1> %masks, <2 x double> %passthro)			%res = call <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ld, i32 0, <2 x i1> %masks, <2 x double> %passthro)
	ret <2 x double> %res			ret <2 x double> %res
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[LV][X86] Support of AVX2 Gathers code generation and update the LV with this
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 123547

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td

llvm/trunk/lib/Target/X86/X86InstrSSE.td

llvm/trunk/lib/Target/X86/X86Subtarget.h

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll

llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LV][X86] Support of AVX2 Gathers code generation and update the LV with this
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 123547

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td

llvm/trunk/lib/Target/X86/X86InstrSSE.td

llvm/trunk/lib/Target/X86/X86Subtarget.h

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll

llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll

[LV][X86] Support of AVX2 Gathers code generation and update the LV with this
ClosedPublic