Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1007,6 +1007,13 @@ for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); + // Special handling for masked gather of 2 elements + if (Subtarget.hasAVX2()) { + for (auto VT : { MVT::v2i64 }) { + setOperationAction(ISD::MGATHER, VT, Custom); + } + } + // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -23760,9 +23767,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget.hasAVX512() && - "MGATHER/MSCATTER are supported on AVX-512 arch only"); - + assert((Subtarget.hasAVX512() || Subtarget.hasAVX2()) && + "MGATHER are supported on AVX-512/AVX-2 arch only"); MaskedGatherSDNode *N = cast(Op.getNode()); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); @@ -23775,7 +23781,7 @@ unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); - if (!Subtarget.hasVLX() && !VT.is512BitVector() && + if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. Or data or index should // be 512 bit wide. If now the both index and data are 256-bit, but @@ -23818,7 +23824,8 @@ SDValue RetOps[] = {Exract, NewGather.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } - if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) { + if (N->getMemoryVT() == MVT::v2i32 && (Subtarget.hasVLX() || + Subtarget.hasAVX2())) { // There is a special case when the return type is v2i32 is illegal and // the type legaizer extended it to v2i64. 
Without this conversion we end up // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD. @@ -23832,7 +23839,14 @@ // The mask should match the destination type. Extending mask with zeroes // is not necessary since instruction itself reads only two values from // memory. - Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); + // hasAVX2() is also true on AVX-512 targets, so test VLX first: VLX takes a + // v4i1 mask, while AVX2-only targets need the mask as a v4i32 vector. + if (Subtarget.hasVLX()) + Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); + else + Mask = DAG.getVectorShuffle(MVT::v4i32, dl, + DAG.getBitcast(MVT::v4i32, Mask), + DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; SDValue NewGather = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(), @@ -23843,7 +23857,8 @@ SDValue RetOps[] = { Sext, NewGather.getValue(1) }; return DAG.getMergeValues(RetOps, dl); } - if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) { + if (N->getMemoryVT() == MVT::v2f32 && (Subtarget.hasVLX() || + Subtarget.hasAVX2())) { // This transformation is for optimization only. 
// The type legalizer extended mask and index to 4 elements vector // in order to match requirements of the common gather node - same @@ -23856,7 +23871,14 @@ ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) && Index.getOpcode() == ISD::CONCAT_VECTORS && Index.getOperand(1).isUndef()) { - Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false); + // hasAVX2() is also true on AVX-512 targets, so test VLX first and keep + // its original lowering, which extends operand 0 of Mask to v4i1. + if (Subtarget.hasVLX()) + Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false); + else + Mask = DAG.getVectorShuffle(MVT::v4i32, dl, + DAG.getBitcast(MVT::v4i32, Mask), + DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); Index = Index.getOperand(0); } else return Op; Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -1141,3 +1141,91 @@ PatFrag<(ops node:$src), (assertzext node:$src), [{ return cast(N->getOperand(1))->getVT() == MVT::i1; }]>; + +// AVX2 special nodes +// masked gather of AVX2 where mask elements are i32 +def avx2_x86_masked_gather_32 : SDNode<"X86ISD::MGATHER", + SDTypeProfile<2, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>, + SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def avx2_masked_gather_32 : SDNode<"ISD::MGATHER", + SDTypeProfile<2, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>, + SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// masked gather of AVX2 where mask elements are i64 +def avx2_masked_gather_64 : SDNode<"ISD::MGATHER", + SDTypeProfile<2, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>, + SDTCisPtrTy<4>, SDTCVecEltisVT<1, i64>, SDTCisSameNumEltsAs<0, 1>]>, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// dword gathers +def avx2_mvpgatherdd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, 
node:$src3), + (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i32 || + Mgt->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def avx2_mvpgatherqd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_x86_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{ + if (X86MaskedGatherSDNode *Mgt = dyn_cast(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64); + return false; +}]>; + +def avx2_mvpgatherdd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast(N)) + return (Mgt->getIndex().getValueType() == MVT::v8i32 || + Mgt->getBasePtr().getValueType() == MVT::v8i32); + return false; +}]>; + +def avx2_mvpgatherqd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i64 || + Mgt->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; + +// qwords +def avx2_mvpgatherdq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i32 || + Mgt->getBasePtr().getValueType() == MVT::v2i32); + return false; +}]>; + +def avx2_mvpgatherqq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64) && + Mgt->getMemoryVT().is128BitVector(); + return false; +}]>; + +def avx2_mvpgatherdq_pd_ymm : PatFrag<(ops node:$src1, 
node:$src2, node:$src3), + (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i32 || + Mgt->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def avx2_mvpgatherqq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i64 || + Mgt->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -8621,36 +8621,52 @@ //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations -multiclass avx2_gather opc, string OpcodeStr, RegisterClass RC256, +multiclass avx2_gather opc, string OpcodeStr, ValueType VTx, + ValueType VTy, PatFrag GatherNode128, + PatFrag GatherNode256, RegisterClass RC256, X86MemOperand memop128, X86MemOperand memop256> { def rm : AVX28I, VEX; + [(set (VTx VR128:$dst), VR128:$mask_wb, + (GatherNode128 (VTx VR128:$src1), VR128:$mask, + vectoraddr:$src2))]>, VEX; def Yrm : AVX28I, VEX, VEX_L; -} - -let mayLoad = 1, hasSideEffects = 0, Constraints - = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" - in { - defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W; - defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W; - defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>; - defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>; - - let ExeDomain = SSEPackedDouble in { - defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W; - defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", 
VR256, vx128mem, vy256mem>, VEX_W; - } - - let ExeDomain = SSEPackedSingle in { - defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>; - defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>; + [(set (VTy RC256:$dst), RC256:$mask_wb, + (GatherNode256 (VTy RC256:$src1), RC256:$mask, + vectoraddr:$src2))]>, VEX, VEX_L; +} + +let Predicates = [UseAVX2] in { + let mayLoad = 1, hasSideEffects = 0, Constraints + = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" + in { + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, avx2_mvpgatherdq_pd_xmm, + avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, avx2_mvpgatherqq_pd_xmm, + avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, avx2_mvpgatherdd_ps_xmm, + avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, avx2_mvpgatherqd_ps_xmm, + avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>; + + let ExeDomain = SSEPackedDouble in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, avx2_mvpgatherdq_pd_xmm, + avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, avx2_mvpgatherqq_pd_xmm, + avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W; + } + + let ExeDomain = SSEPackedSingle in { + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, avx2_mvpgatherdd_ps_xmm, + avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, avx2_mvpgatherqd_ps_xmm, + avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>; + } } } Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ 
lib/Target/X86/X86TargetTransformInfo.cpp @@ -2085,7 +2085,7 @@ // Trying to reduce IndexSize to 32 bits for vector 16. // By default the IndexSize is equal to pointer size. - unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + unsigned IndexSize = (ST->hasAVX512() && VF >= 16) ? getIndexSizeInBits(Ptr, DL) : DL.getPointerSizeInBits(); Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), @@ -2102,7 +2102,8 @@ // The gather / scatter cost is given by Intel architects. It is a rough // number since we are looking at one instruction in a time. - const int GSOverhead = 2; + const int GSOverhead = (Opcode == Instruction::Load) ? ST->getGatherOverhead() : + ST->getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); } @@ -2173,7 +2174,7 @@ // the mask vector will add more instructions. Right now we give the scalar // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction // is better in the VariableMask case. - if (VF == 2 || (VF == 4 && !ST->hasVLX())) + if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX()))) Scalarize = true; if (Scalarize) @@ -2213,11 +2214,15 @@ int DataWidth = isa(ScalarTy) ? 
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); - // AVX-512 allows gather and scatter - return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512(); + // AVX-512 & SKL client with AVX2 allows gather + return (DataWidth == 32 || DataWidth == 64) && (ST->hasAVX512() || + ST->hasAVX2()); } bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + // AVX2 doesn't support scatter + if (!ST->hasAVX512()) + return false; return isLegalMaskedGather(DataType); } Index: test/CodeGen/X86/avx2_masked_gather.ll =================================================================== --- test/CodeGen/X86/avx2_masked_gather.ll +++ test/CodeGen/X86/avx2_masked_gather.ll @@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=skylake | FileCheck --check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake | FileCheck --check-prefix=X64 %s + +declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro) + +define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { +; X86-LABEL: masked_gather_v2i32: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; X86-NEXT: vpgatherqd %xmm2, (,%xmm0), %xmm1 +; X86-NEXT: vpmovsxdq %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v2i32: +; X64: # BB#0: # %entry +; X64-NEXT: vmovdqa (%rdi), %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; X64-NEXT: vpgatherqd %xmm2, (,%xmm0), %xmm1 +; X64-NEXT: vpmovsxdq %xmm1, %xmm0 +; X64-NEXT: retq +entry: + %ld = load <2 x i32*>, <2 x i32*>* %ptr + %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x 
i32> %passthro) + ret <2 x i32> %res +} + +declare <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ptrs, i32 %align, <2 x i1> %masks, <2 x float> %passthro) + +define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) { +; X86-LABEL: masked_gather_v2float: +; X86: # BB#0: # %entry +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vmovaps %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v2float: +; X64: # BB#0: # %entry +; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X64-NEXT: vmovaps (%rdi), %xmm2 +; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vmovaps %xmm1, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %ld = load <2 x float*>, <2 x float*>* %ptr + %res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro) + ret <2 x float> %res +} + +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro) + +define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) { +; X86-LABEL: masked_gather_v4i32: +; X86: # BB#0: # %entry +; X86-NEXT: vpgatherdd %xmm1, (,%xmm0), %xmm2 +; X86-NEXT: vmovdqa %xmm2, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v4i32: +; X64: # BB#0: # %entry +; X64-NEXT: vpgatherqd %xmm1, (,%ymm0), %xmm2 +; X64-NEXT: vmovdqa %xmm2, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 0, <4 x i1> %masks, <4 x i32> %passthro) + ret <4 x i32> %res +} + +declare <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 %align, <4 x i1> %masks, <4 x float> %passthro) + +define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) { +; 
X86-LABEL: masked_gather_v4float: +; X86: # BB#0: # %entry +; X86-NEXT: vgatherdps %xmm1, (,%xmm0), %xmm2 +; X86-NEXT: vmovaps %xmm2, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v4float: +; X64: # BB#0: # %entry +; X64-NEXT: vgatherqps %xmm1, (,%ymm0), %xmm2 +; X64-NEXT: vmovaps %xmm2, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %res = call <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 0, <4 x i1> %masks, <4 x float> %passthro) + ret <4 x float> %res +} + +declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 %align, <8 x i1> %masks, <8 x i32> %passthro) + +define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i32> %passthro) { +; X86-LABEL: masked_gather_v8i32: +; X86: # BB#0: # %entry +; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqa (%eax), %ymm2 +; X86-NEXT: vpgatherdd %ymm0, (,%ymm2), %ymm1 +; X86-NEXT: vmovdqa %ymm1, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v8i32: +; X64: # BB#0: # %entry +; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vpslld $31, %ymm0, %ymm0 +; X64-NEXT: vpsrad $31, %ymm0, %ymm0 +; X64-NEXT: vmovdqa 32(%rdi), %ymm2 +; X64-NEXT: vextracti128 $1, %ymm1, %xmm3 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm4 +; X64-NEXT: vpgatherqd %xmm4, (,%ymm2), %xmm3 +; X64-NEXT: vmovdqa (%rdi), %ymm2 +; X64-NEXT: vpgatherqd %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm0 +; X64-NEXT: retq +entry: + %ld = load <8 x i32*>, <8 x i32*>* %ptr + %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ld, i32 0, <8 x i1> %masks, <8 x i32> %passthro) + ret <8 x i32> %res +} + +declare <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ptrs, i32 %align, <8 x i1> %masks, <8 x float> 
%passthro) + +define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <8 x float> %passthro) { +; X86-LABEL: masked_gather_v8float: +; X86: # BB#0: # %entry +; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovaps (%eax), %ymm2 +; X86-NEXT: vgatherdps %ymm0, (,%ymm2), %ymm1 +; X86-NEXT: vmovaps %ymm1, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v8float: +; X64: # BB#0: # %entry +; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vpslld $31, %ymm0, %ymm0 +; X64-NEXT: vpsrad $31, %ymm0, %ymm0 +; X64-NEXT: vmovaps 32(%rdi), %ymm2 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm4 +; X64-NEXT: vgatherqps %xmm4, (,%ymm2), %xmm3 +; X64-NEXT: vmovaps (%rdi), %ymm2 +; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm0 +; X64-NEXT: retq +entry: + %ld = load <8 x float*>, <8 x float*>* %ptr + %res = call <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ld, i32 0, <8 x i1> %masks, <8 x float> %passthro) + ret <8 x float> %res +} + +declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i64> %passthro) + +define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i64> %passthro) { +; X86-LABEL: masked_gather_v4i64: +; X86: # BB#0: # %entry +; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-NEXT: vpmovsxdq %xmm0, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqa (%eax), %xmm2 +; X86-NEXT: vpgatherdq %ymm0, (,%xmm2), %ymm1 +; X86-NEXT: vmovdqa %ymm1, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v4i64: +; X64: # BB#0: # %entry +; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 
+; X64-NEXT: vpmovsxdq %xmm0, %ymm0 +; X64-NEXT: vmovdqa (%rdi), %ymm2 +; X64-NEXT: vpgatherqq %ymm0, (,%ymm2), %ymm1 +; X64-NEXT: vmovdqa %ymm1, %ymm0 +; X64-NEXT: retq +entry: + %ld = load <4 x i64*>, <4 x i64*>* %ptr + %res = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ld, i32 0, <4 x i1> %masks, <4 x i64> %passthro) + ret <4 x i64> %res +} + +declare <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ptrs, i32 %align, <4 x i1> %masks, <4 x double> %passthro) + +define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks, <4 x double> %passthro) { +; X86-LABEL: masked_gather_v4double: +; X86: # BB#0: # %entry +; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-NEXT: vpmovsxdq %xmm0, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovapd (%eax), %xmm2 +; X86-NEXT: vgatherdpd %ymm0, (,%xmm2), %ymm1 +; X86-NEXT: vmovapd %ymm1, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v4double: +; X64: # BB#0: # %entry +; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 +; X64-NEXT: vpmovsxdq %xmm0, %ymm0 +; X64-NEXT: vmovapd (%rdi), %ymm2 +; X64-NEXT: vgatherqpd %ymm0, (,%ymm2), %ymm1 +; X64-NEXT: vmovapd %ymm1, %ymm0 +; X64-NEXT: retq +entry: + %ld = load <4 x double*>, <4 x double*>* %ptr + %res = call <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ld, i32 0, <4 x i1> %masks, <4 x double> %passthro) + ret <4 x double> %res +} + +declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i64> %passthro) + +define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) { +; X86-LABEL: masked_gather_v2i64: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vmovdqa %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v2i64: +; X64: # BB#0: # 
%entry +; X64-NEXT: vmovdqa (%rdi), %xmm2 +; X64-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X64-NEXT: vmovdqa %xmm1, %xmm0 +; X64-NEXT: retq +entry: + %ld = load <2 x i64*>, <2 x i64*>* %ptr + %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ld, i32 0, <2 x i1> %masks, <2 x i64> %passthro) + ret <2 x i64> %res +} + +declare <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ptrs, i32 %align, <2 x i1> %masks, <2 x double> %passthro) + +define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) { +; X86-LABEL: masked_gather_v2double: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vmovapd %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v2double: +; X64: # BB#0: # %entry +; X64-NEXT: vmovapd (%rdi), %xmm2 +; X64-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X64-NEXT: vmovapd %xmm1, %xmm0 +; X64-NEXT: retq +entry: + %ld = load <2 x double*>, <2 x double*>* %ptr + %res = call <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ld, i32 0, <2 x i1> %masks, <2 x double> %passthro) + ret <2 x double> %res +} +