Index: ../llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- ../llvm/lib/Analysis/TargetTransformInfo.cpp
+++ ../llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -155,7 +155,7 @@
 }
 
 bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const {
-  return TTIImpl->isLegalMaskedGather(DataType);
+  return TTIImpl->isLegalMaskedScatter(DataType);
 }
 
 bool TargetTransformInfo::prefersVectorizedAddressing() const {
Index: ../llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- ../llvm/lib/Target/X86/X86ISelLowering.cpp
+++ ../llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -886,6 +886,12 @@
     }
   }
 
+  if (Subtarget.hasAVX2()) {
+    for (auto VT : { MVT::v2i64 }) {
+      setOperationAction(ISD::MGATHER, VT, Custom);
+    }
+  }
+
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
     setOperationAction(ISD::ABS, MVT::v16i8, Legal);
     setOperationAction(ISD::ABS, MVT::v8i16, Legal);
@@ -23760,9 +23766,8 @@
 
 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
-  assert(Subtarget.hasAVX512() &&
-         "MGATHER/MSCATTER are supported on AVX-512 arch only");
-
+  assert((Subtarget.hasAVX512() || Subtarget.hasAVX2()) &&
+         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
@@ -23775,7 +23780,7 @@
   unsigned NumElts = VT.getVectorNumElements();
   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
 
-  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
+  if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
       !Index.getSimpleValueType().is512BitVector()) {
     // AVX512F supports only 512-bit vectors. Or data or index should
     // be 512 bit wide. If now the both index and data are 256-bit, but
@@ -23818,7 +23823,8 @@
     SDValue RetOps[] = {Exract, NewGather.getValue(1)};
     return DAG.getMergeValues(RetOps, dl);
   }
-  if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
+  if (N->getMemoryVT() == MVT::v2i32 && (Subtarget.hasVLX() ||
+      (Subtarget.hasAVX2()))) {
     // There is a special case when the return type is v2i32 is illegal and
     // the type legaizer extended it to v2i64. Without this conversion we end up
     // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
@@ -23832,7 +23838,14 @@
     // The mask should match the destination type. Extending mask with zeroes
     // is not necessary since instruction itself reads only two values from
     // memory.
-    Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+    if (Subtarget.hasAVX2()) {
+      Mask = DAG.getVectorShuffle(MVT::v4i32, dl,
+                                  DAG.getBitcast(MVT::v4i32, Src0),
+                                  DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
+    }
+    else {
+      Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+    }
     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
     SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
         DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
@@ -23843,7 +23856,8 @@
     SDValue RetOps[] = { Sext, NewGather.getValue(1) };
     return DAG.getMergeValues(RetOps, dl);
   }
-  if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
+  if (N->getMemoryVT() == MVT::v2f32 && (Subtarget.hasVLX() ||
+      (Subtarget.hasAVX2()))) {
     // This transformation is for optimization only.
     // The type legalizer extended mask and index to 4 elements vector
     // in order to match requirements of the common gather node - same
@@ -23856,7 +23870,14 @@
         ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
         Index.getOpcode() == ISD::CONCAT_VECTORS &&
         Index.getOperand(1).isUndef()) {
-      Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
+      if (Subtarget.hasAVX2()) {
+        Mask = DAG.getVectorShuffle(MVT::v4i32, dl,
+                                    DAG.getBitcast(MVT::v4i32, Src0),
+                                    DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
+      }
+      else {
+        Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
+      }
       Index = Index.getOperand(0);
     } else
       return Op;
Index: ../llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- ../llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ ../llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -1141,3 +1141,91 @@
     PatFrag<(ops node:$src), (assertzext node:$src), [{
   return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
 }]>;
+
+// AVX2 special nodes
+// masked gather of AVX2 where mask elements are i32
+def avx2_x86_masked_gather_32 : SDNode<"X86ISD::MGATHER",
+      SDTypeProfile<2, 3, [
+        SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+        SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
+      [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def avx2_masked_gather_32 : SDNode<"ISD::MGATHER",
+      SDTypeProfile<2, 3, [
+        SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+        SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
+      [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+// masked gather of AVX2 where mask elements are i64
+def avx2_masked_gather_64 : SDNode<"ISD::MGATHER",
+      SDTypeProfile<2, 3, [
+        SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
+        SDTCisPtrTy<4>, SDTCVecEltisVT<1, i64>, SDTCisSameNumEltsAs<0, 1>]>,
+      [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+// dword gathers
+def avx2_mvpgatherdd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i32);
+  return false;
+}]>;
+
+def avx2_mvpgatherqd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_x86_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+  if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v2i64);
+  return false;
+}]>;
+
+def avx2_mvpgatherdd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v8i32);
+  return false;
+}]>;
+
+def avx2_mvpgatherqd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i64);
+  return false;
+}]>;
+
+// qwords
+def avx2_mvpgatherdq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v2i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v2i32);
+  return false;
+}]>;
+
+def avx2_mvpgatherqq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
+           Mgt->getMemoryVT().is128BitVector();
+  return false;
+}]>;
+
+def avx2_mvpgatherdq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i32);
+  return false;
+}]>;
+
+def avx2_mvpgatherqq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i64);
+  return false;
+}]>;
Index: ../llvm/lib/Target/X86/X86InstrSSE.td
===================================================================
--- ../llvm/lib/Target/X86/X86InstrSSE.td
+++ ../llvm/lib/Target/X86/X86InstrSSE.td
@@ -8621,36 +8621,52 @@
 //===----------------------------------------------------------------------===//
 // VGATHER - GATHER Operations
-multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
+                       ValueType VTy, PatFrag GatherNode128,
+                       PatFrag GatherNode256, RegisterClass RC256,
                        X86MemOperand memop128, X86MemOperand memop256> {
   def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
             (ins VR128:$src1, memop128:$src2, VR128:$mask),
             !strconcat(OpcodeStr,
               "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
-            []>, VEX;
+            [(set (VTx VR128:$dst), VR128:$mask_wb,
+                  (GatherNode128 (VTx VR128:$src1), VR128:$mask,
+                        vectoraddr:$src2))]>, VEX;
   def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
             (ins RC256:$src1, memop256:$src2, RC256:$mask),
             !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
-            []>, VEX, VEX_L;
-}
-
-let mayLoad = 1, hasSideEffects = 0, Constraints
-  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
-  in {
-  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
-  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
-  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
-  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;
-
-  let ExeDomain = SSEPackedDouble in {
-    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
-    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
-  }
-
-  let ExeDomain = SSEPackedSingle in {
-    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
-    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
+            [(set (VTy RC256:$dst), RC256:$mask_wb,
+                  (GatherNode256 (VTy RC256:$src1), RC256:$mask,
+                        vectoraddr:$src2))]>, VEX, VEX_L;
+}
+
+let Predicates = [UseAVX2] in {
+  let mayLoad = 1, hasSideEffects = 0, Constraints
+    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+    in {
+    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, avx2_mvpgatherdq_pd_xmm,
+                                  avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
+    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, avx2_mvpgatherqq_pd_xmm,
+                                  avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
+    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, avx2_mvpgatherdd_ps_xmm,
+                                  avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
+    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, avx2_mvpgatherqd_ps_xmm,
"vpgatherqd", v4i32, v4i32, avx2_mvpgatherqd_ps_xmm, + avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>; + + let ExeDomain = SSEPackedDouble in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, avx2_mvpgatherdq_pd_xmm, + avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, avx2_mvpgatherqq_pd_xmm, + avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W; + } + + let ExeDomain = SSEPackedSingle in { + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, avx2_mvpgatherdd_ps_xmm, + avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, avx2_mvpgatherqd_ps_xmm, + avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>; + } } } Index: ../llvm/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- ../llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ ../llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2085,7 +2085,7 @@ // Trying to reduce IndexSize to 32 bits for vector 16. // By default the IndexSize is equal to pointer size. - unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + unsigned IndexSize = (ST->hasAVX512() && VF >= 16) ? getIndexSizeInBits(Ptr, DL) : DL.getPointerSizeInBits(); Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), @@ -2102,7 +2102,8 @@ // The gather / scatter cost is given by Intel architects. It is a rough // number since we are looking at one instruction in a time. - const int GSOverhead = 2; + const int GSOverhead = (Opcode == Instruction::Load) ? ST->getGatherOverhead() : + ST->getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); } @@ -2173,7 +2174,7 @@ // the mask vector will add more instructions. Right now we give the scalar // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction // is better in the VariableMask case. - if (VF == 2 || (VF == 4 && !ST->hasVLX())) + if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX()))) Scalarize = true; if (Scalarize) @@ -2213,11 +2214,15 @@ int DataWidth = isa(ScalarTy) ? 
     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
 
-  // AVX-512 allows gather and scatter
-  return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
+  // AVX-512 & SKL client with AVX2 allows gather and scatter
+  return (DataWidth == 32 || DataWidth == 64) && (ST->hasAVX512() ||
+          (ST->hasAVX2()));
 }
 
 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+  // AVX2 doesn't support scatter
+  if (!ST->hasAVX512())
+    return false;
   return isLegalMaskedGather(DataType);
 }
Index: ../llvm/test/CodeGen/X86/avx2_masked_gather.ll
===================================================================
--- ../llvm/test/CodeGen/X86/avx2_masked_gather.ll
+++ ../llvm/test/CodeGen/X86/avx2_masked_gather.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=skylake 2>&1 | FileCheck --check-prefix=CHECK32 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake 2>&1 | FileCheck --check-prefix=CHECK64 %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro)
+
+define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
+entry:
+; CHECK64-LABEL: masked_gather_v2i32
+; CHECK64: vpshufd $232, %xmm1, %xmm1
+; CHECK64: vpshufd $232, %xmm1, %xmm2
+; CHECK64: vpgatherqd %xmm2, (,%xmm0), %xmm1
+  %ld = load <2 x i32*>, <2 x i32*>* %ptr
+  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
+  ret <2 x i32> %res
+}
+
+declare <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ptrs, i32 %align, <2 x i1> %masks, <2 x float> %passthro)
+
+define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) {
+entry:
+; CHECK64-LABEL: masked_gather_v2float
+; CHECK64: vgatherqps %xmm{{.*}}, (,%xmm{{.*}}), %xmm{{.*}}
+  %ld = load <2 x float*>, <2 x float*>* %ptr
+  %res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
+  ret <2 x float> %res
+}
+
+declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro)
+
+define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) {
+entry:
+; CHECK32-LABEL: masked_gather_v4i32
+; CHECK32: vpgatherdd %xmm{{.*}}, (,%xmm{{.*}}), %xmm{{.*}}
+; CHECK64-LABEL: masked_gather_v4i32
+; CHECK64: vpgatherqd %xmm{{.*}}, (,%ymm{{.*}}), %xmm{{.*}}
+  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 0, <4 x i1> %masks, <4 x i32> %passthro)
+  ret <4 x i32> %res
+}
+
+declare <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 %align, <4 x i1> %masks, <4 x float> %passthro)
+
+define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) {
+entry:
+; CHECK32-LABEL: masked_gather_v4float
+; CHECK32: vgatherdps %xmm{{.*}}, (,%xmm{{.*}}), %xmm{{.*}}
+; CHECK64-LABEL: masked_gather_v4float
+; CHECK64: vgatherqps %xmm{{.*}}, (,%ymm{{.*}}), %xmm{{.*}}
+  %res = call <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 0, <4 x i1> %masks, <4 x float> %passthro)
+  ret <4 x float> %res
+}
+
+declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 %align, <8 x i1> %masks, <8 x i32> %passthro)
+
+define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i32> %passthro) {
+entry:
+; CHECK32-LABEL: masked_gather_v8i32
+; CHECK32: vpgatherdd %ymm{{.*}}, (,%ymm{{.*}}), %ymm{{.*}}
+  %ld = load <8 x i32*>, <8 x i32*>* %ptr
+  %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ld, i32 0, <8 x i1> %masks, <8 x i32> %passthro)
+  ret <8 x i32> %res
+}
+
+declare <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ptrs, i32 %align, <8 x i1> %masks, <8 x float> %passthro)
+
+define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <8 x float> %passthro) {
+entry:
+; CHECK32-LABEL: masked_gather_v8float
+; CHECK32: vgatherdps %ymm{{.*}}, (,%ymm{{.*}}), %ymm{{.*}}
+  %ld = load <8 x float*>, <8 x float*>* %ptr
+  %res = call <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ld, i32 0, <8 x i1> %masks, <8 x float> %passthro)
+  ret <8 x float> %res
+}
+
+declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i64> %passthro)
+
+define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i64> %passthro) {
+entry:
+; CHECK32-LABEL: masked_gather_v4i64
+; CHECK32: vpgatherdq %ymm{{.*}}, (,%xmm{{.*}}), %ymm{{.*}}
+; CHECK64-LABEL: masked_gather_v4i64
+; CHECK64: vpgatherqq %ymm{{.*}}, (,%ymm{{.*}}), %ymm{{.*}}
+  %ld = load <4 x i64*>, <4 x i64*>* %ptr
+  %res = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ld, i32 0, <4 x i1> %masks, <4 x i64> %passthro)
+  ret <4 x i64> %res
+}
+
+declare <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ptrs, i32 %align, <4 x i1> %masks, <4 x double> %passthro)
+
+define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks, <4 x double> %passthro) {
+entry:
+; CHECK32-LABEL: masked_gather_v4double
+; CHECK32: vgatherdpd %ymm{{.*}}, (,%xmm{{.*}}), %ymm{{.*}}
+; CHECK64-LABEL: masked_gather_v4double
+; CHECK64: vgatherqpd %ymm{{.*}}, (,%ymm{{.*}}), %ymm{{.*}}
+  %ld = load <4 x double*>, <4 x double*>* %ptr
+  %res = call <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ld, i32 0, <4 x i1> %masks, <4 x double> %passthro)
+  ret <4 x double> %res
+}
+
+declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i64> %passthro)
+
+define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) {
+entry:
+; CHECK64-LABEL: masked_gather_v2i64
+; CHECK64: vpgatherqq %xmm{{.*}}, (,%xmm{{.*}}), %xmm{{.*}}
+  %ld = load <2 x i64*>, <2 x i64*>* %ptr
+  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ld, i32 0, <2 x i1> %masks, <2 x i64> %passthro)
+  ret <2 x i64> %res
+}
+
+declare <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ptrs, i32 %align, <2 x i1> %masks, <2 x double> %passthro)
+
+define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) {
+entry:
+; CHECK64-LABEL: masked_gather_v2double
+; CHECK64: vgatherqpd %xmm{{.*}}, (,%xmm{{.*}}), %xmm{{.*}}
+  %ld = load <2 x double*>, <2 x double*>* %ptr
+  %res = call <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ld, i32 0, <2 x i1> %masks, <2 x double> %passthro)
+  ret <2 x double> %res
+}
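
Note on the X86TargetTransformInfo.cpp hunks: the new cost code calls ST->getGatherOverhead() and ST->getScatterOverhead(), but those accessors are not defined anywhere in this diff, so the patch assumes a matching X86Subtarget change lands alongside it. A minimal sketch of what such accessors could look like is below; the field names, the placement, and the default value of 2 (taken from the constant this patch removes) are illustrative assumptions, not the actual implementation.

    // Hypothetical sketch only: per-subtarget gather/scatter overhead knobs
    // that the gather/scatter vector-cost code above is assumed to read.
    // In the real tree these would live inside X86Subtarget.
    class X86Subtarget /* : public X86GenSubtargetInfo */ {
      // Rough per-operation overhead numbers, tuned per CPU (e.g. set while
      // parsing subtarget features); 2 matches the constant removed above.
      unsigned GatherOverhead = 2;
      unsigned ScatterOverhead = 2;

    public:
      unsigned getGatherOverhead() const { return GatherOverhead; }
      unsigned getScatterOverhead() const { return ScatterOverhead; }
    };

Keeping the old constant as the default would preserve today's cost-model behaviour for subtargets that never override the overhead, while letting AVX2/AVX-512 CPUs plug in their own numbers.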