Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -188,7 +188,6 @@
 private:
   void Select(SDNode *N) override;
-  bool tryGather(SDNode *N, unsigned Opc);
 
   bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
   bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -1982,39 +1981,6 @@
   llvm_unreachable("unrecognized size for LdVT");
 }
 
-/// Customized ISel for GATHER operations.
-bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
-  // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
-  SDValue Chain = Node->getOperand(0);
-  SDValue VSrc = Node->getOperand(2);
-  SDValue Base = Node->getOperand(3);
-  SDValue VIdx = Node->getOperand(4);
-  SDValue VMask = Node->getOperand(5);
-  ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
-  if (!Scale)
-    return false;
-
-  SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
-                                   MVT::Other);
-
-  SDLoc DL(Node);
-
-  // Memory Operands: Base, Scale, Index, Disp, Segment
-  SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
-  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
-  const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx,
-                          Disp, Segment, VMask, Chain};
-  SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
-  // Node has 2 outputs: VDst and MVT::Other.
-  // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
-  // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
-  // of ResNode.
-  ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
-  ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
-  CurDAG->RemoveDeadNode(Node);
-  return true;
-}
-
 void X86DAGToDAGISel::Select(SDNode *Node) {
   MVT NVT = Node->getSimpleValueType(0);
   unsigned Opc, MOpc;
@@ -2052,55 +2018,6 @@
     }
     break;
   }
-  case ISD::INTRINSIC_W_CHAIN: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
-    switch (IntNo) {
-    default: break;
-    case Intrinsic::x86_avx2_gather_d_pd:
-    case Intrinsic::x86_avx2_gather_d_pd_256:
-    case Intrinsic::x86_avx2_gather_q_pd:
-    case Intrinsic::x86_avx2_gather_q_pd_256:
-    case Intrinsic::x86_avx2_gather_d_ps:
-    case Intrinsic::x86_avx2_gather_d_ps_256:
-    case Intrinsic::x86_avx2_gather_q_ps:
-    case Intrinsic::x86_avx2_gather_q_ps_256:
-    case Intrinsic::x86_avx2_gather_d_q:
-    case Intrinsic::x86_avx2_gather_d_q_256:
-    case Intrinsic::x86_avx2_gather_q_q:
-    case Intrinsic::x86_avx2_gather_q_q_256:
-    case Intrinsic::x86_avx2_gather_d_d:
-    case Intrinsic::x86_avx2_gather_d_d_256:
-    case Intrinsic::x86_avx2_gather_q_d:
-    case Intrinsic::x86_avx2_gather_q_d_256: {
-      if (!Subtarget->hasAVX2())
-        break;
-      unsigned Opc;
-      switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic");
-      case Intrinsic::x86_avx2_gather_d_pd:     Opc = X86::VGATHERDPDrm;  break;
-      case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
-      case Intrinsic::x86_avx2_gather_q_pd:     Opc = X86::VGATHERQPDrm;  break;
-      case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
-      case Intrinsic::x86_avx2_gather_d_ps:     Opc = X86::VGATHERDPSrm;  break;
-      case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
-      case Intrinsic::x86_avx2_gather_q_ps:     Opc = X86::VGATHERQPSrm;  break;
-      case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
-      case Intrinsic::x86_avx2_gather_d_q:      Opc = X86::VPGATHERDQrm;  break;
-      case Intrinsic::x86_avx2_gather_d_q_256:  Opc = X86::VPGATHERDQYrm; break;
-      case Intrinsic::x86_avx2_gather_q_q:      Opc = X86::VPGATHERQQrm;  break;
-      case Intrinsic::x86_avx2_gather_q_q_256:  Opc = X86::VPGATHERQQYrm; break;
-      case Intrinsic::x86_avx2_gather_d_d:      Opc = X86::VPGATHERDDrm;  break;
-      case Intrinsic::x86_avx2_gather_d_d_256:  Opc = X86::VPGATHERDDYrm; break;
-      case Intrinsic::x86_avx2_gather_q_d:      Opc = X86::VPGATHERQDrm;  break;
-      case Intrinsic::x86_avx2_gather_q_d_256:  Opc = X86::VPGATHERQDYrm; break;
-      }
-      if (tryGather(Node, Opc))
-        return;
-      break;
-    }
-    }
-    break;
-  }
   case X86ISD::GlobalBaseReg:
     ReplaceNode(Node, getGlobalBaseReg());
     return;
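Note: the custom ISel hook above goes away because the AVX2 gather intrinsics are now turned into machine nodes during lowering, via getAVX2GatherNode() in X86ISelLowering.cpp below. Both the removed tryGather() and its replacement build the same machine-node operand list; as a reference, a minimal annotated sketch of that shared layout (not standalone code, names follow the removed function):

    // X86 memory operands are always the five-tuple
    // (Base, Scale, Index, Disp, Segment); the gather node wraps that
    // with the pass-through source in front and the mask and chain behind.
    SDValue Ops[] = {VSrc,     // pass-through source, tied to the destination
                     Base,     // scalar base pointer
                     Scale,    // i8 immediate: 1, 2, 4, or 8
                     VIdx,     // vector of indices
                     Disp,     // displacement, constant 0 here
                     Segment,  // segment register, none here
                     VMask,    // per-element mask, also written back
                     Chain};   // memory chain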
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -19828,6 +19828,27 @@
   }
 }
 
+static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                                 SDValue Src, SDValue Mask, SDValue Base,
+                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
+                                 const X86Subtarget &Subtarget) {
+  SDLoc dl(Op);
+  auto *C = cast<ConstantSDNode>(ScaleOp);
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+  EVT MaskVT = Mask.getValueType();
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+    Src = DAG.getUNDEF(MaskVT);
+  else if (Src.isUndef())
+    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, dl);
+}
+
 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
@@ -20155,6 +20176,16 @@
     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                        isValid, SDValue(Result.getNode(), 2));
   }
+  case GATHER_AVX2: {
+    SDValue Chain = Op.getOperand(0);
+    SDValue Src = Op.getOperand(2);
+    SDValue Base = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Mask = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+                             Scale, Chain, Subtarget);
+  }
   case GATHER: {
   //gather(v1, mask, index, base, scale);
     SDValue Chain = Op.getOperand(0);
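The interesting part of getAVX2GatherNode() is the pass-through handling: with an all-ones mask every element is rewritten, so the source can safely become UNDEF (dropping a zeroing idiom), while an undef source under a real mask is replaced with a zero vector to break the false dependency on the destination register. A small user-level illustration of the two cases, using the immintrin.h wrappers (clang expands the unmasked wrapper with an undefined source and an all-ones mask, which is what the test updates below exercise):

    #include <immintrin.h>
    // Compile with -mavx2.

    // Unmasked gather: the wrapper supplies an all-ones mask, so after this
    // patch the pass-through source is rendered UNDEF and no xor is needed.
    __m128i gather_all(const int *base, __m128i idx) {
      return _mm_i32gather_epi32(base, idx, 4); // scale: 1, 2, 4, or 8
    }

    // Masked gather: unselected lanes must keep src, so the pass-through
    // operand stays live and no zeroing is synthesized either.
    __m128i gather_some(__m128i src, const int *base, __m128i idx,
                        __m128i mask) {
      return _mm_mask_i32gather_epi32(src, base, idx, mask, 4);
    }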
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
   EXPAND_FROM_MEM, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK,
   FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
-  FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+  FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2
 };
 
 struct IntrinsicData {
@@ -67,6 +67,23 @@
   X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
   X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
 
+  X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0),
+
   X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
   X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
   X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
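For reference, a minimal sketch of how these table entries are consumed, assuming the lookup helper declared earlier in this header keeps its existing shape; the real dispatch is the GATHER_AVX2 case added to the intrinsic lowering switch above:

    // Look the intrinsic up in the table, then switch on its type.
    // For GATHER_AVX2 entries, Opc0 already holds the machine opcode,
    // e.g. X86::VPGATHERDDrm for avx2_gather_d_d.
    const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
    if (IntrData && IntrData->Type == GATHER_AVX2) {
      // ... unpack Src/Base/Index/Mask/Scale operands and call
      // getAVX2GatherNode(IntrData->Opc0, ...).
    }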
Index: test/CodeGen/X86/avx2-gather.ll
===================================================================
--- test/CodeGen/X86/avx2-gather.ll
+++ test/CodeGen/X86/avx2-gather.ll
@@ -9,12 +9,14 @@
 ; X32-LABEL: test_x86_avx2_gather_d_ps:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X32-NEXT:    vgatherdps %xmm1, (%eax,%xmm0,2), %xmm2
 ; X32-NEXT:    vmovaps %xmm2, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_x86_avx2_gather_d_ps:
 ; X64:       ## BB#0:
+; X64-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X64-NEXT:    vgatherdps %xmm1, (%rdi,%xmm0,2), %xmm2
 ; X64-NEXT:    vmovaps %xmm2, %xmm0
 ; X64-NEXT:    retq
@@ -30,12 +32,14 @@
 ; X32-LABEL: test_x86_avx2_gather_d_pd:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; X32-NEXT:    vgatherdpd %xmm1, (%eax,%xmm0,2), %xmm2
 ; X32-NEXT:    vmovapd %xmm2, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_x86_avx2_gather_d_pd:
 ; X64:       ## BB#0:
+; X64-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; X64-NEXT:    vgatherdpd %xmm1, (%rdi,%xmm0,2), %xmm2
 ; X64-NEXT:    vmovapd %xmm2, %xmm0
 ; X64-NEXT:    retq
@@ -51,12 +55,14 @@
 ; X32-LABEL: test_x86_avx2_gather_d_ps_256:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vxorps %ymm2, %ymm2, %ymm2
 ; X32-NEXT:    vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
 ; X32-NEXT:    vmovaps %ymm2, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_x86_avx2_gather_d_ps_256:
 ; X64:       ## BB#0:
+; X64-NEXT:    vxorps %ymm2, %ymm2, %ymm2
 ; X64-NEXT:    vgatherdps %ymm1, (%rdi,%ymm0,4), %ymm2
 ; X64-NEXT:    vmovaps %ymm2, %ymm0
 ; X64-NEXT:    retq
@@ -72,12 +78,14 @@
 ; X32-LABEL: test_x86_avx2_gather_d_pd_256:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
 ; X32-NEXT:    vgatherdpd %ymm1, (%eax,%xmm0,8), %ymm2
 ; X32-NEXT:    vmovapd %ymm2, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_x86_avx2_gather_d_pd_256:
 ; X64:       ## BB#0:
+; X64-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
 ; X64-NEXT:    vgatherdpd %ymm1, (%rdi,%xmm0,8), %ymm2
 ; X64-NEXT:    vmovapd %ymm2, %ymm0
 ; X64-NEXT:    retq
@@ -91,7 +99,6 @@
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -99,7 +106,6 @@
 ; X64-LABEL: test_mm_i32gather_epi32:
 ; X64:       ## BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -116,7 +122,6 @@
 ; X32-LABEL: test_mm_i32gather_pd:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; X32-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovapd %xmm1, %xmm0
@@ -124,7 +129,6 @@
 ;
 ; X64-LABEL: test_mm_i32gather_pd:
 ; X64:       ## BB#0:
-; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovapd %xmm1, %xmm0
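The same wrappers are covered twice: avx2-gather.ll above runs the normal pipeline, where the all-ones mask is visible as a build vector and the zeroing idiom disappears, while avx2-intrinsics-fast-isel.ll below gains the xor instead, presumably because on that path the mask is not recognizable as all-ones at lowering time and the undef source falls through to the zero-vector case. Distilled from getAVX2GatherNode() above:

    // Which branch fires decides whether a given test gains or loses an xor.
    if (ISD::isBuildVectorAllOnes(Mask.getNode()))
      Src = DAG.getUNDEF(MaskVT);       // every lane is overwritten anyway
    else if (Src.isUndef())
      Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);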
Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1068,6 +1068,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1075,6 +1076,7 @@
 ; X64-LABEL: test_mm_i32gather_epi32:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1112,6 +1114,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; X32-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
 ; X32-NEXT:    vmovdqa %ymm1, %ymm0
 ; X32-NEXT:    retl
@@ -1119,6 +1122,7 @@
 ; X64-LABEL: test_mm256_i32gather_epi32:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 ; X64-NEXT:    retq
@@ -1234,6 +1238,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovapd %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1241,6 +1246,7 @@
 ; X64-LABEL: test_mm_i32gather_pd:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovapd %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1318,6 +1324,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovaps %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1325,6 +1332,7 @@
 ; X64-LABEL: test_mm_i32gather_ps:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovaps %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1402,6 +1410,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1409,6 +1418,7 @@
 ; X64-LABEL: test_mm_i64gather_epi32:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1444,6 +1454,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    vzeroupper
@@ -1452,6 +1463,7 @@
 ; X64-LABEL: test_mm256_i64gather_epi32:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    vzeroupper
@@ -1564,6 +1576,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovapd %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1571,6 +1584,7 @@
 ; X64-LABEL: test_mm_i64gather_pd:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovapd %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1644,6 +1658,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovaps %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1651,6 +1666,7 @@
 ; X64-LABEL: test_mm_i64gather_ps:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovaps %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1684,6 +1700,7 @@
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
 ; X32-NEXT:    vmovaps %xmm1, %xmm0
 ; X32-NEXT:    vzeroupper
@@ -1692,6 +1709,7 @@
 ; X64-LABEL: test_mm256_i64gather_ps:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
 ; X64-NEXT:    vmovaps %xmm1, %xmm0
 ; X64-NEXT:    vzeroupper
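A small user-level example of the form the final hunks check (immintrin.h wrapper; the gathered elements are floats selected by 64-bit indices, so the result lives in an xmm while the index vector occupies a ymm, and the trailing vzeroupper in the checks is the usual cleanup for a function that touched a ymm register):

    #include <immintrin.h>
    // Compile with -mavx2.

    __m128 gather_q_ps(const float *base, __m256i idx64) {
      return _mm256_i64gather_ps(base, idx64, 4); // scale: 1, 2, 4, or 8
    }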