Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1014,6 +1014,10 @@ for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); + // Special handling for masked gather of 2 elements + if (Subtarget.hasAVX2()) + setOperationAction(ISD::MGATHER, MVT::v2i64, Custom); + // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -24087,9 +24091,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget.hasAVX512() && - "MGATHER/MSCATTER are supported on AVX-512 arch only"); - + assert(Subtarget.hasAVX2() && + "MGATHER are supported on AVX-512/AVX-2 arch only"); MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); @@ -24102,7 +24105,7 @@ unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); - if (!Subtarget.hasVLX() && !VT.is512BitVector() && + if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. Or data or index should // be 512 bit wide. If now the both index and data are 256-bit, but @@ -24145,7 +24148,7 @@ SDValue RetOps[] = {Exract, NewGather.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } - if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) { + if (N->getMemoryVT() == MVT::v2i32) { // There is a special case when the return type is v2i32 is illegal and // the type legaizer extended it to v2i64. Without this conversion we end up // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD. @@ -24153,13 +24156,18 @@ // with index v2i64 and value type v4i32. 
assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 && "Unexpected type in masked gather"); - Src0 = DAG.getVectorShuffle(MVT::v4i32, dl, - DAG.getBitcast(MVT::v4i32, Src0), - DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); + Src0 = + DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src0), + DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); // The mask should match the destination type. Extending mask with zeroes // is not necessary since instruction itself reads only two values from // memory. - Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); + if (Subtarget.hasVLX()) + Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); + else + Mask = + DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Mask), + DAG.getUNDEF(MVT::v4i32), {0, 2, -1, -1}); SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(), @@ -24170,7 +24178,7 @@ SDValue RetOps[] = { Sext, NewGather.getValue(1) }; return DAG.getMergeValues(RetOps, dl); } - if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) { + if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasAVX2()) { // This transformation is for optimization only. 
// The type legalizer extended mask and index to 4 elements vector // in order to match requirements of the common gather node - same @@ -24183,7 +24191,12 @@ ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) && Index.getOpcode() == ISD::CONCAT_VECTORS && Index.getOperand(1).isUndef()) { - Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false); + if (Subtarget.hasVLX()) + Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); + else + Mask = DAG.getVectorShuffle(MVT::v4i32, dl, + DAG.getBitcast(MVT::v4i32, Mask), + DAG.getUNDEF(MVT::v4i32), {0, 2, -1, -1}); Index = Index.getOperand(0); } else return Op; Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -1085,3 +1085,91 @@ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; + +// AVX2 special nodes +// masked gather of AVX2 where mask elements are i32 +def avx2_x86_masked_gather_32 : SDNode<"X86ISD::MGATHER", + SDTypeProfile<2, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>, + SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def avx2_masked_gather_32 : SDNode<"ISD::MGATHER", + SDTypeProfile<2, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>, + SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// masked gather of AVX2 where mask elements are i64 +def avx2_masked_gather_64 : SDNode<"ISD::MGATHER", + SDTypeProfile<2, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>, + SDTCisPtrTy<4>, SDTCVecEltisVT<1, i64>, SDTCisSameNumEltsAs<0, 1>]>, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// dword gathers +def avx2_mvpgatherdd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, 
node:$src3), + (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i32 || + Mgt->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def avx2_mvpgatherqd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_x86_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{ + if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64); + return false; +}]>; + +def avx2_mvpgatherdd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v8i32 || + Mgt->getBasePtr().getValueType() == MVT::v8i32); + return false; +}]>; + +def avx2_mvpgatherqd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i64 || + Mgt->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; + +// qwords +def avx2_mvpgatherdq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i32 || + Mgt->getBasePtr().getValueType() == MVT::v2i32); + return false; +}]>; + +def avx2_mvpgatherqq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64) && + Mgt->getMemoryVT().is128BitVector(); + return false; +}]>; + +def avx2_mvpgatherdq_pd_ymm : PatFrag<(ops node:$src1, 
node:$src2, node:$src3), + (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i32 || + Mgt->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def avx2_mvpgatherqq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i64 || + Mgt->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -8327,36 +8327,52 @@ //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations -multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, +multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx, + ValueType VTy, PatFrag GatherNode128, + PatFrag GatherNode256, RegisterClass RC256, X86MemOperand memop128, X86MemOperand memop256> { def rm : AVX28I, VEX; + [(set (VTx VR128:$dst), VR128:$mask_wb, + (GatherNode128 (VTx VR128:$src1), VR128:$mask, + vectoraddr:$src2))]>, VEX; def Yrm : AVX28I, VEX, VEX_L; -} - -let mayLoad = 1, hasSideEffects = 0, Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { - defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W; - defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W; - defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>; - defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>; - - let ExeDomain = SSEPackedDouble in { - defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W; - defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", 
VR256, vx128mem, vy256mem>, VEX_W; - } - - let ExeDomain = SSEPackedSingle in { - defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>; - defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>; + [(set (VTy RC256:$dst), RC256:$mask_wb, + (GatherNode256 (VTy RC256:$src1), RC256:$mask, + vectoraddr:$src2))]>, VEX, VEX_L; +} + +let Predicates = [UseAVX2] in { + let mayLoad = 1, hasSideEffects = 0, Constraints + = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" + in { + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, avx2_mvpgatherdq_pd_xmm, + avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, avx2_mvpgatherqq_pd_xmm, + avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, avx2_mvpgatherdd_ps_xmm, + avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, avx2_mvpgatherqd_ps_xmm, + avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>; + + let ExeDomain = SSEPackedDouble in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, avx2_mvpgatherdq_pd_xmm, + avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, avx2_mvpgatherqq_pd_xmm, + avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W; + } + + let ExeDomain = SSEPackedSingle in { + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, avx2_mvpgatherdd_ps_xmm, + avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, avx2_mvpgatherqd_ps_xmm, + avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>; + } } } Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ 
lib/Target/X86/X86TargetTransformInfo.cpp @@ -2368,8 +2368,9 @@ // Trying to reduce IndexSize to 32 bits for vector 16. // By default the IndexSize is equal to pointer size. - unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : - DL.getPointerSizeInBits(); + unsigned IndexSize = (ST->hasAVX512() && VF >= 16) + ? getIndexSizeInBits(Ptr, DL) + : DL.getPointerSizeInBits(); Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), IndexSize), VF); @@ -2385,7 +2386,9 @@ // The gather / scatter cost is given by Intel architects. It is a rough // number since we are looking at one instruction in a time. - const int GSOverhead = 2; + const int GSOverhead = (Opcode == Instruction::Load) + ? ST->getGatherOverhead() + : ST->getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); } @@ -2456,7 +2459,7 @@ // the mask vector will add more instructions. Right now we give the scalar // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction // is better in the VariableMask case. - if (VF == 2 || (VF == 4 && !ST->hasVLX())) + if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX()))) Scalarize = true; if (Scalarize) @@ -2507,11 +2510,15 @@ int DataWidth = isa<PointerType>(ScalarTy) ? 
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); - // AVX-512 allows gather and scatter - return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512(); + // AVX-512 & SKL client with AVX2 allows gather + return (DataWidth == 32 || DataWidth == 64) && + (ST->hasAVX512() || ST->hasAVX2()); } bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + // AVX2 doesn't support scatter + if (!ST->hasAVX512()) + return false; return isLegalMaskedGather(DataType); } Index: test/CodeGen/X86/avx2-masked-gather.ll =================================================================== --- test/CodeGen/X86/avx2-masked-gather.ll +++ test/CodeGen/X86/avx2-masked-gather.ll @@ -8,47 +8,20 @@ ; X86-LABEL: masked_gather_v2i32: ; X86: # BB#0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; X86-NEXT: vpextrb $0, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %XMM2 -; X86-NEXT: je .LBB0_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm3, %eax -; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: .LBB0_2: # %else -; X86-NEXT: vpextrb $8, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB0_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $2, %xmm3, %eax -; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2 -; X86-NEXT: .LBB0_4: # %else2 -; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vpmovsxdq %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2i32: ; X64: # BB#0: # %entry -; X64-NEXT: vmovdqa (%rdi), %xmm3 -; X64-NEXT: vpextrb $0, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %XMM2 -; X64-NEXT: je .LBB0_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm3, 
%rax -; X64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: .LBB0_2: # %else -; X64-NEXT: vpextrb $8, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB0_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: movl (%rax), %eax -; X64-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2 -; X64-NEXT: .LBB0_4: # %else2 -; X64-NEXT: vpsllq $63, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; X64-NEXT: vmovdqa (%rdi), %xmm2 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 +; X64-NEXT: vpmovsxdq %xmm1, %xmm0 ; X64-NEXT: retq entry: %ld = load <2 x i32*>, <2 x i32*>* %ptr @@ -56,54 +29,53 @@ ret <2 x i32> %res } +define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) { +; X86-LABEL: masked_gather_v2i32_concat: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vpmovsxdq %xmm1, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v2i32_concat: +; X64: # BB#0: # %entry +; X64-NEXT: vmovdqa (%rdi), %xmm2 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1 +; X64-NEXT: vpmovsxdq %xmm1, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: retq +entry: + %ld = load <2 x i32*>, <2 x i32*>* %ptr + %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro) + %res2 = shufflevector <2 x i32> %res, <2 x i32> undef, <4 x i32> + ret <4 x i32> %res2 +} + declare <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ptrs, i32 %align, <2 x 
i1> %masks, <2 x float> %passthro) define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) { ; X86-LABEL: masked_gather_v2float: ; X86: # BB#0: # %entry +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; X86-NEXT: vpextrb $0, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %XMM2 -; X86-NEXT: je .LBB1_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm3, %eax -; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: .LBB1_2: # %else -; X86-NEXT: vpextrb $8, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB1_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $2, %xmm3, %eax -; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; X86-NEXT: .LBB1_4: # %else2 -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: vpslld $31, %xmm0, %xmm0 -; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vmovaps %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2float: ; X64: # BB#0: # %entry -; X64-NEXT: vmovdqa (%rdi), %xmm3 -; X64-NEXT: vpextrb $0, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %XMM2 -; X64-NEXT: je .LBB1_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm3, %rax -; X64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: .LBB1_2: # %else -; X64-NEXT: vpextrb $8, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB1_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; X64-NEXT: .LBB1_4: # %else2 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: vpslld $31, %xmm0, %xmm0 -; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; X64-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0,2],zero,zero +; X64-NEXT: vmovaps (%rdi), %xmm2 +; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vmovaps %xmm1, %xmm0 +; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: %ld = load <2 x float*>, <2 x float*>* %ptr @@ -111,79 +83,45 @@ ret <2 x float> %res } +define <4 x float> @masked_gather_v2float_concat(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) { +; X86-LABEL: masked_gather_v2float_concat: +; X86: # BB#0: # %entry +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vmovaps %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: masked_gather_v2float_concat: +; X64: # BB#0: # %entry +; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; X64-NEXT: vmovaps (%rdi), %xmm2 +; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vmovaps %xmm1, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %ld = load <2 x float*>, <2 x float*>* %ptr + %res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro) + %res2 = shufflevector <2 x float> %res, <2 x float> undef, <4 x i32> + ret <4 x float> %res2 +} + + declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro) define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) { ; X86-LABEL: masked_gather_v4i32: ; X86: # BB#0: # %entry -; X86-NEXT: vpextrb $0, %xmm1, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %XMM3 -; X86-NEXT: je .LBB2_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm0, %eax -; X86-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X86-NEXT: .LBB2_2: # %else -; X86-NEXT: vpextrb $4, %xmm1, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB2_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $1, %xmm0, %eax -; X86-NEXT: vpinsrd $1, (%eax), 
%xmm3, %xmm3 -; X86-NEXT: .LBB2_4: # %else2 -; X86-NEXT: vpextrb $8, %xmm1, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB2_6 -; X86-NEXT: # BB#5: # %cond.load4 -; X86-NEXT: vpextrd $2, %xmm0, %eax -; X86-NEXT: vpinsrd $2, (%eax), %xmm3, %xmm3 -; X86-NEXT: .LBB2_6: # %else5 -; X86-NEXT: vpextrb $12, %xmm1, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB2_8 -; X86-NEXT: # BB#7: # %cond.load7 -; X86-NEXT: vpextrd $3, %xmm0, %eax -; X86-NEXT: vpinsrd $3, (%eax), %xmm3, %xmm3 -; X86-NEXT: .LBB2_8: # %else8 -; X86-NEXT: vpslld $31, %xmm1, %xmm0 -; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; X86-NEXT: vpgatherdd %xmm1, (,%xmm0), %xmm2 +; X86-NEXT: vmovdqa %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v4i32: ; X64: # BB#0: # %entry -; X64-NEXT: vpextrb $0, %xmm1, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %XMM3 -; X64-NEXT: je .LBB2_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm0, %rax -; X64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X64-NEXT: .LBB2_2: # %else -; X64-NEXT: vpextrb $4, %xmm1, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB2_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm0, %rax -; X64-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3 -; X64-NEXT: .LBB2_4: # %else2 -; X64-NEXT: vpextrb $8, %xmm1, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB2_6 -; X64-NEXT: # BB#5: # %cond.load4 -; X64-NEXT: vextracti128 $1, %ymm0, %xmm4 -; X64-NEXT: vmovq %xmm4, %rax -; X64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3 -; X64-NEXT: .LBB2_6: # %else5 -; X64-NEXT: vpextrb $12, %xmm1, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB2_8 -; X64-NEXT: # BB#7: # %cond.load7 -; X64-NEXT: vextracti128 $1, %ymm0, %xmm0 -; X64-NEXT: vpextrq $1, %xmm0, %rax -; X64-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3 -; X64-NEXT: .LBB2_8: # %else8 -; X64-NEXT: vpslld $31, %xmm1, %xmm0 -; X64-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; X64-NEXT: vpgatherqd %xmm1, (,%ymm0), %xmm2 +; X64-NEXT: vmovdqa %xmm2, 
%xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -196,74 +134,14 @@ define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) { ; X86-LABEL: masked_gather_v4float: ; X86: # BB#0: # %entry -; X86-NEXT: vpextrb $0, %xmm1, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %XMM3 -; X86-NEXT: je .LBB3_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm0, %eax -; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X86-NEXT: .LBB3_2: # %else -; X86-NEXT: vpextrb $4, %xmm1, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB3_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $1, %xmm0, %eax -; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] -; X86-NEXT: .LBB3_4: # %else2 -; X86-NEXT: vpextrb $8, %xmm1, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB3_6 -; X86-NEXT: # BB#5: # %cond.load4 -; X86-NEXT: vpextrd $2, %xmm0, %eax -; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] -; X86-NEXT: .LBB3_6: # %else5 -; X86-NEXT: vpextrb $12, %xmm1, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB3_8 -; X86-NEXT: # BB#7: # %cond.load7 -; X86-NEXT: vpextrd $3, %xmm0, %eax -; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] -; X86-NEXT: .LBB3_8: # %else8 -; X86-NEXT: vpslld $31, %xmm1, %xmm0 -; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; X86-NEXT: vgatherdps %xmm1, (,%xmm0), %xmm2 +; X86-NEXT: vmovaps %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v4float: ; X64: # BB#0: # %entry -; X64-NEXT: vpextrb $0, %xmm1, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %XMM3 -; X64-NEXT: je .LBB3_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm0, %rax -; X64-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X64-NEXT: .LBB3_2: # %else -; X64-NEXT: vpextrb $4, %xmm1, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB3_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm0, %rax -; X64-NEXT: vinsertps {{.*#+}} xmm3 = 
xmm3[0],mem[0],xmm3[2,3] -; X64-NEXT: .LBB3_4: # %else2 -; X64-NEXT: vpextrb $8, %xmm1, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB3_6 -; X64-NEXT: # BB#5: # %cond.load4 -; X64-NEXT: vextracti128 $1, %ymm0, %xmm4 -; X64-NEXT: vmovq %xmm4, %rax -; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] -; X64-NEXT: .LBB3_6: # %else5 -; X64-NEXT: vpextrb $12, %xmm1, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB3_8 -; X64-NEXT: # BB#7: # %cond.load7 -; X64-NEXT: vextracti128 $1, %ymm0, %xmm0 -; X64-NEXT: vpextrq $1, %xmm0, %rax -; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] -; X64-NEXT: .LBB3_8: # %else8 -; X64-NEXT: vpslld $31, %xmm1, %xmm0 -; X64-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; X64-NEXT: vgatherqps %xmm1, (,%ymm0), %xmm2 +; X64-NEXT: vmovaps %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -276,164 +154,25 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i32> %passthro) { ; X86-LABEL: masked_gather_v8i32: ; X86: # BB#0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovdqa (%eax), %ymm3 -; X86-NEXT: vpextrb $0, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %YMM2 -; X86-NEXT: je .LBB4_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm3, %eax -; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: .LBB4_2: # %else -; X86-NEXT: vpextrb $2, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB4_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $1, %xmm3, %eax -; X86-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm4 -; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X86-NEXT: .LBB4_4: # %else2 -; X86-NEXT: vpextrb $4, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB4_6 -; X86-NEXT: # BB#5: # %cond.load4 -; X86-NEXT: vpextrd $2, %xmm3, %eax -; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm4 -; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X86-NEXT: .LBB4_6: # %else5 -; 
X86-NEXT: vpextrb $6, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB4_8 -; X86-NEXT: # BB#7: # %cond.load7 -; X86-NEXT: vpextrd $3, %xmm3, %eax -; X86-NEXT: vpinsrd $3, (%eax), %xmm2, %xmm4 -; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X86-NEXT: .LBB4_8: # %else8 -; X86-NEXT: vpextrb $8, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB4_10 -; X86-NEXT: # BB#9: # %cond.load10 -; X86-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X86-NEXT: vmovd %xmm4, %eax -; X86-NEXT: vextracti128 $1, %ymm2, %xmm4 -; X86-NEXT: vpinsrd $0, (%eax), %xmm4, %xmm4 -; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; X86-NEXT: .LBB4_10: # %else11 -; X86-NEXT: vpextrb $10, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB4_12 -; X86-NEXT: # BB#11: # %cond.load13 -; X86-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X86-NEXT: vpextrd $1, %xmm4, %eax -; X86-NEXT: vextracti128 $1, %ymm2, %xmm4 -; X86-NEXT: vpinsrd $1, (%eax), %xmm4, %xmm4 -; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; X86-NEXT: .LBB4_12: # %else14 -; X86-NEXT: vpextrb $12, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB4_14 -; X86-NEXT: # BB#13: # %cond.load16 -; X86-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X86-NEXT: vpextrd $2, %xmm4, %eax -; X86-NEXT: vextracti128 $1, %ymm2, %xmm4 -; X86-NEXT: vpinsrd $2, (%eax), %xmm4, %xmm4 -; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; X86-NEXT: .LBB4_14: # %else17 -; X86-NEXT: vpextrb $14, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB4_16 -; X86-NEXT: # BB#15: # %cond.load19 -; X86-NEXT: vextracti128 $1, %ymm3, %xmm3 -; X86-NEXT: vpextrd $3, %xmm3, %eax -; X86-NEXT: vextracti128 $1, %ymm2, %xmm3 -; X86-NEXT: vpinsrd $3, (%eax), %xmm3, %xmm3 -; X86-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; X86-NEXT: .LBB4_16: # %else20 ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpslld $31, %ymm0, %ymm0 -; X86-NEXT: 
vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqa (%eax), %ymm2 +; X86-NEXT: vpgatherdd %ymm0, (,%ymm2), %ymm1 +; X86-NEXT: vmovdqa %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v8i32: ; X64: # BB#0: # %entry -; X64-NEXT: vmovdqa (%rdi), %ymm4 -; X64-NEXT: vmovdqa 32(%rdi), %ymm3 -; X64-NEXT: vpextrb $0, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %YMM2 -; X64-NEXT: je .LBB4_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm4, %rax -; X64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: .LBB4_2: # %else -; X64-NEXT: vpextrb $2, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB4_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm4, %rax -; X64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm5 -; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: .LBB4_4: # %else2 -; X64-NEXT: vpextrb $4, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB4_6 -; X64-NEXT: # BB#5: # %cond.load4 -; X64-NEXT: vextracti128 $1, %ymm4, %xmm5 -; X64-NEXT: vmovq %xmm5, %rax -; X64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm5 -; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: .LBB4_6: # %else5 -; X64-NEXT: vpextrb $6, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB4_8 -; X64-NEXT: # BB#7: # %cond.load7 -; X64-NEXT: vextracti128 $1, %ymm4, %xmm4 -; X64-NEXT: vpextrq $1, %xmm4, %rax -; X64-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm4 -; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: .LBB4_8: # %else8 -; X64-NEXT: vpextrb $8, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB4_10 -; X64-NEXT: # BB#9: # %cond.load10 -; X64-NEXT: vmovq %xmm3, %rax -; X64-NEXT: vextracti128 $1, %ymm2, %xmm4 -; X64-NEXT: vpinsrd $0, (%rax), %xmm4, %xmm4 -; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: .LBB4_10: # %else11 -; X64-NEXT: vpextrb $10, %xmm0, %eax -; X64-NEXT: testb $1, %al 
-; X64-NEXT: je .LBB4_12 -; X64-NEXT: # BB#11: # %cond.load13 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vextracti128 $1, %ymm2, %xmm4 -; X64-NEXT: vpinsrd $1, (%rax), %xmm4, %xmm4 -; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: .LBB4_12: # %else14 -; X64-NEXT: vpextrb $12, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB4_14 -; X64-NEXT: # BB#13: # %cond.load16 -; X64-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X64-NEXT: vmovq %xmm4, %rax -; X64-NEXT: vextracti128 $1, %ymm2, %xmm4 -; X64-NEXT: vpinsrd $2, (%rax), %xmm4, %xmm4 -; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: .LBB4_14: # %else17 -; X64-NEXT: vpextrb $14, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB4_16 -; X64-NEXT: # BB#15: # %cond.load19 -; X64-NEXT: vextracti128 $1, %ymm3, %xmm3 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vextracti128 $1, %ymm2, %xmm3 -; X64-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3 -; X64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; X64-NEXT: .LBB4_16: # %else20 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: vpslld $31, %ymm0, %ymm0 -; X64-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; X64-NEXT: vpsrad $31, %ymm0, %ymm0 +; X64-NEXT: vmovdqa (%rdi), %ymm2 +; X64-NEXT: vmovdqa 32(%rdi), %ymm3 +; X64-NEXT: vextracti128 $1, %ymm1, %xmm4 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm5 +; X64-NEXT: vpgatherqd %xmm5, (,%ymm3), %xmm4 +; X64-NEXT: vpgatherqd %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm0 ; X64-NEXT: retq entry: %ld = load <8 x i32*>, <8 x i32*>* %ptr @@ -446,166 +185,25 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <8 x float> %passthro) { ; X86-LABEL: masked_gather_v8float: ; X86: # BB#0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovdqa (%eax), %ymm3 -; X86-NEXT: vpextrb $0, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # 
implicit-def: %YMM2 -; X86-NEXT: je .LBB5_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm3, %eax -; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: .LBB5_2: # %else -; X86-NEXT: vpextrb $2, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB5_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $1, %xmm3, %eax -; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0],mem[0],xmm2[2,3] -; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X86-NEXT: .LBB5_4: # %else2 -; X86-NEXT: vpextrb $4, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB5_6 -; X86-NEXT: # BB#5: # %cond.load4 -; X86-NEXT: vpextrd $2, %xmm3, %eax -; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1],mem[0],xmm2[3] -; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X86-NEXT: .LBB5_6: # %else5 -; X86-NEXT: vpextrb $6, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB5_8 -; X86-NEXT: # BB#7: # %cond.load7 -; X86-NEXT: vpextrd $3, %xmm3, %eax -; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0] -; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X86-NEXT: .LBB5_8: # %else8 -; X86-NEXT: vpextrb $8, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB5_10 -; X86-NEXT: # BB#9: # %cond.load10 -; X86-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X86-NEXT: vmovd %xmm4, %eax -; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X86-NEXT: vextractf128 $1, %ymm2, %xmm5 -; X86-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X86-NEXT: .LBB5_10: # %else11 -; X86-NEXT: vpextrb $10, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB5_12 -; X86-NEXT: # BB#11: # %cond.load13 -; X86-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X86-NEXT: vpextrd $1, %xmm4, %eax -; X86-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] -; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X86-NEXT: .LBB5_12: # %else14 -; 
X86-NEXT: vpextrb $12, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB5_14 -; X86-NEXT: # BB#13: # %cond.load16 -; X86-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X86-NEXT: vpextrd $2, %xmm4, %eax -; X86-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3] -; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X86-NEXT: .LBB5_14: # %else17 -; X86-NEXT: vpextrb $14, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB5_16 -; X86-NEXT: # BB#15: # %cond.load19 -; X86-NEXT: vextracti128 $1, %ymm3, %xmm3 -; X86-NEXT: vpextrd $3, %xmm3, %eax -; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 -; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] -; X86-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-NEXT: .LBB5_16: # %else20 ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpslld $31, %ymm0, %ymm0 -; X86-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovaps (%eax), %ymm2 +; X86-NEXT: vgatherdps %ymm0, (,%ymm2), %ymm1 +; X86-NEXT: vmovaps %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v8float: ; X64: # BB#0: # %entry -; X64-NEXT: vmovdqa (%rdi), %ymm4 -; X64-NEXT: vmovdqa 32(%rdi), %ymm3 -; X64-NEXT: vpextrb $0, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %YMM2 -; X64-NEXT: je .LBB5_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm4, %rax -; X64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: .LBB5_2: # %else -; X64-NEXT: vpextrb $2, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB5_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm4, %rax -; X64-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0],mem[0],xmm2[2,3] -; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: .LBB5_4: # %else2 -; X64-NEXT: vpextrb $4, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je 
.LBB5_6 -; X64-NEXT: # BB#5: # %cond.load4 -; X64-NEXT: vextracti128 $1, %ymm4, %xmm5 -; X64-NEXT: vmovq %xmm5, %rax -; X64-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0,1],mem[0],xmm2[3] -; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: .LBB5_6: # %else5 -; X64-NEXT: vpextrb $6, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB5_8 -; X64-NEXT: # BB#7: # %cond.load7 -; X64-NEXT: vextracti128 $1, %ymm4, %xmm4 -; X64-NEXT: vpextrq $1, %xmm4, %rax -; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0] -; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: .LBB5_8: # %else8 -; X64-NEXT: vpextrb $8, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB5_10 -; X64-NEXT: # BB#9: # %cond.load10 -; X64-NEXT: vmovq %xmm3, %rax -; X64-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X64-NEXT: vextractf128 $1, %ymm2, %xmm5 -; X64-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: .LBB5_10: # %else11 -; X64-NEXT: vpextrb $10, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB5_12 -; X64-NEXT: # BB#11: # %cond.load13 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] -; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: .LBB5_12: # %else14 -; X64-NEXT: vpextrb $12, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB5_14 -; X64-NEXT: # BB#13: # %cond.load16 -; X64-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X64-NEXT: vmovq %xmm4, %rax -; X64-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3] -; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: .LBB5_14: # %else17 -; X64-NEXT: vpextrb $14, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB5_16 -; X64-NEXT: # BB#15: # %cond.load19 -; X64-NEXT: vextracti128 $1, %ymm3, %xmm3 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: 
vextractf128 $1, %ymm2, %xmm3 -; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] -; X64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-NEXT: .LBB5_16: # %else20 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: vpslld $31, %ymm0, %ymm0 -; X64-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; X64-NEXT: vpsrad $31, %ymm0, %ymm0 +; X64-NEXT: vmovaps (%rdi), %ymm2 +; X64-NEXT: vmovaps 32(%rdi), %ymm3 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm5 +; X64-NEXT: vgatherqps %xmm5, (,%ymm3), %xmm4 +; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1 +; X64-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0 ; X64-NEXT: retq entry: %ld = load <8 x float*>, <8 x float*>* %ptr @@ -618,92 +216,23 @@ define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i64> %passthro) { ; X86-LABEL: masked_gather_v4i64: ; X86: # BB#0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovdqa (%eax), %xmm3 -; X86-NEXT: vpextrb $0, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %YMM2 -; X86-NEXT: je .LBB6_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm3, %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: .LBB6_2: # %else -; X86-NEXT: vpextrb $4, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB6_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $1, %xmm3, %eax -; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm4 -; X86-NEXT: vpinsrd $3, 4(%eax), %xmm4, %xmm4 -; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X86-NEXT: .LBB6_4: # %else2 -; X86-NEXT: vpextrb $8, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB6_6 -; X86-NEXT: # BB#5: # %cond.load4 -; X86-NEXT: vpextrd $2, %xmm3, %eax -; X86-NEXT: vextracti128 $1, %ymm2, %xmm4 -; X86-NEXT: vpinsrd $0, (%eax), %xmm4, %xmm4 -; X86-NEXT: vpinsrd $1, 4(%eax), %xmm4, %xmm4 -; X86-NEXT: vinserti128 $1, 
%xmm4, %ymm2, %ymm2 -; X86-NEXT: .LBB6_6: # %else5 -; X86-NEXT: vpextrb $12, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB6_8 -; X86-NEXT: # BB#7: # %cond.load7 -; X86-NEXT: vpextrd $3, %xmm3, %eax -; X86-NEXT: vextracti128 $1, %ymm2, %xmm3 -; X86-NEXT: vpinsrd $2, (%eax), %xmm3, %xmm3 -; X86-NEXT: vpinsrd $3, 4(%eax), %xmm3, %xmm3 -; X86-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; X86-NEXT: .LBB6_8: # %else8 ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 ; X86-NEXT: vpmovsxdq %xmm0, %ymm0 -; X86-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqa (%eax), %xmm2 +; X86-NEXT: vpgatherdq %ymm0, (,%xmm2), %ymm1 +; X86-NEXT: vmovdqa %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v4i64: ; X64: # BB#0: # %entry -; X64-NEXT: vmovdqa (%rdi), %ymm3 -; X64-NEXT: vpextrb $0, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %YMM2 -; X64-NEXT: je .LBB6_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm3, %rax -; X64-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X64-NEXT: .LBB6_2: # %else -; X64-NEXT: vpextrb $4, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB6_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm4 -; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: .LBB6_4: # %else2 -; X64-NEXT: vpextrb $8, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB6_6 -; X64-NEXT: # BB#5: # %cond.load4 -; X64-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X64-NEXT: vmovq %xmm4, %rax -; X64-NEXT: vextracti128 $1, %ymm2, %xmm4 -; X64-NEXT: vpinsrq $0, (%rax), %xmm4, %xmm4 -; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: .LBB6_6: # %else5 -; X64-NEXT: vpextrb $12, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB6_8 -; X64-NEXT: # BB#7: # %cond.load7 -; X64-NEXT: vextracti128 $1, %ymm3, %xmm3 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; 
X64-NEXT: vextracti128 $1, %ymm2, %xmm3 -; X64-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3 -; X64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; X64-NEXT: .LBB6_8: # %else8 ; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 ; X64-NEXT: vpmovsxdq %xmm0, %ymm0 -; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; X64-NEXT: vmovdqa (%rdi), %ymm2 +; X64-NEXT: vpgatherqq %ymm0, (,%ymm2), %ymm1 +; X64-NEXT: vmovdqa %ymm1, %ymm0 ; X64-NEXT: retq entry: %ld = load <4 x i64*>, <4 x i64*>* %ptr @@ -716,89 +245,23 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks, <4 x double> %passthro) { ; X86-LABEL: masked_gather_v4double: ; X86: # BB#0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovdqa (%eax), %xmm3 -; X86-NEXT: vpextrb $0, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %YMM2 -; X86-NEXT: je .LBB7_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm3, %eax -; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: .LBB7_2: # %else -; X86-NEXT: vpextrb $4, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB7_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $1, %xmm3, %eax -; X86-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0] -; X86-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; X86-NEXT: .LBB7_4: # %else2 -; X86-NEXT: vpextrb $8, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB7_6 -; X86-NEXT: # BB#5: # %cond.load4 -; X86-NEXT: vpextrd $2, %xmm3, %eax -; X86-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X86-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1] -; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X86-NEXT: .LBB7_6: # %else5 -; X86-NEXT: vpextrb $12, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB7_8 -; X86-NEXT: # BB#7: # %cond.load7 -; X86-NEXT: vpextrd $3, %xmm3, %eax -; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 -; X86-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; X86-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-NEXT: 
.LBB7_8: # %else8 ; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 ; X86-NEXT: vpmovsxdq %xmm0, %ymm0 -; X86-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovapd (%eax), %xmm2 +; X86-NEXT: vgatherdpd %ymm0, (,%xmm2), %ymm1 +; X86-NEXT: vmovapd %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v4double: ; X64: # BB#0: # %entry -; X64-NEXT: vmovdqa (%rdi), %ymm3 -; X64-NEXT: vpextrb $0, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %YMM2 -; X64-NEXT: je .LBB7_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm3, %rax -; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X64-NEXT: .LBB7_2: # %else -; X64-NEXT: vpextrb $4, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB7_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0] -; X64-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; X64-NEXT: .LBB7_4: # %else2 -; X64-NEXT: vpextrb $8, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB7_6 -; X64-NEXT: # BB#5: # %cond.load4 -; X64-NEXT: vextracti128 $1, %ymm3, %xmm4 -; X64-NEXT: vmovq %xmm4, %rax -; X64-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X64-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1] -; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X64-NEXT: .LBB7_6: # %else5 -; X64-NEXT: vpextrb $12, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB7_8 -; X64-NEXT: # BB#7: # %cond.load7 -; X64-NEXT: vextracti128 $1, %ymm3, %xmm3 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 -; X64-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; X64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-NEXT: .LBB7_8: # %else8 ; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 ; X64-NEXT: vpmovsxdq %xmm0, %ymm0 -; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; X64-NEXT: vmovapd (%rdi), %ymm2 +; X64-NEXT: vgatherqpd %ymm0, 
(,%ymm2), %ymm1 +; X64-NEXT: vmovapd %ymm1, %ymm0 ; X64-NEXT: retq entry: %ld = load <4 x double*>, <4 x double*>* %ptr @@ -812,47 +275,16 @@ ; X86-LABEL: masked_gather_v2i64: ; X86: # BB#0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; X86-NEXT: vpextrb $0, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %XMM2 -; X86-NEXT: je .LBB8_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm3, %eax -; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: .LBB8_2: # %else -; X86-NEXT: vpextrb $8, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB8_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $2, %xmm3, %eax -; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2 -; X86-NEXT: vpinsrd $3, 4(%eax), %xmm2, %xmm2 -; X86-NEXT: .LBB8_4: # %else2 -; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2i64: ; X64: # BB#0: # %entry -; X64-NEXT: vmovdqa (%rdi), %xmm3 -; X64-NEXT: vpextrb $0, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %XMM2 -; X64-NEXT: je .LBB8_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm3, %rax -; X64-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; X64-NEXT: .LBB8_2: # %else -; X64-NEXT: vpextrb $8, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB8_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2 -; X64-NEXT: .LBB8_4: # %else2 -; X64-NEXT: vpsllq $63, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; X64-NEXT: vmovdqa (%rdi), %xmm2 +; X64-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1 +; X64-NEXT: vmovdqa %xmm1, %xmm0 ; X64-NEXT: retq entry: %ld = load <2 x i64*>, <2 x i64*>* %ptr @@ -866,46 +298,16 @@ ; 
X86-LABEL: masked_gather_v2double: ; X86: # BB#0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; X86-NEXT: vpextrb $0, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: # implicit-def: %XMM2 -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # BB#1: # %cond.load -; X86-NEXT: vmovd %xmm3, %eax -; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: .LBB9_2: # %else -; X86-NEXT: vpextrb $8, %xmm0, %eax -; X86-NEXT: testb $1, %al -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # BB#3: # %cond.load1 -; X86-NEXT: vpextrd $2, %xmm3, %eax -; X86-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; X86-NEXT: .LBB9_4: # %else2 -; X86-NEXT: vpsllq $63, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X86-NEXT: vmovapd %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_v2double: ; X64: # BB#0: # %entry -; X64-NEXT: vmovdqa (%rdi), %xmm3 -; X64-NEXT: vpextrb $0, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: # implicit-def: %XMM2 -; X64-NEXT: je .LBB9_2 -; X64-NEXT: # BB#1: # %cond.load -; X64-NEXT: vmovq %xmm3, %rax -; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X64-NEXT: .LBB9_2: # %else -; X64-NEXT: vpextrb $8, %xmm0, %eax -; X64-NEXT: testb $1, %al -; X64-NEXT: je .LBB9_4 -; X64-NEXT: # BB#3: # %cond.load1 -; X64-NEXT: vpextrq $1, %xmm3, %rax -; X64-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; X64-NEXT: .LBB9_4: # %else2 -; X64-NEXT: vpsllq $63, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; X64-NEXT: vmovapd (%rdi), %xmm2 +; X64-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1 +; X64-NEXT: vmovapd %xmm1, %xmm0 ; X64-NEXT: retq entry: %ld = load <2 x double*>, <2 x double*>* %ptr