Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -4810,6 +4810,18 @@ Info.flags |= MachineMemOperand::MOStore; break; } + case GATHER: + case GATHER_AVX2: { + Info.ptrVal = nullptr; + MVT DataVT = MVT::getVT(I.getType()); + MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); + unsigned NumElts = std::min(DataVT.getVectorNumElements(), + IndexVT.getVectorNumElements()); + Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); + Info.align = 1; + Info.flags |= MachineMemOperand::MOLoad; + break; + } default: return false; } @@ -22376,25 +22388,26 @@ if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - EVT MaskVT = Mask.getValueType(); + EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? 
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, dl); + + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } -static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain, - const X86Subtarget &Subtarget) { +static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); auto *C = dyn_cast<ConstantSDNode>(ScaleOp); @@ -22412,17 +22425,18 @@ Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? 
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, dl); + + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + + SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -22787,7 +22801,7 @@ SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, + return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -8363,7 +8363,7 @@ VEX, VEX_L, Sched<[WriteLoad]>; } -let Predicates = [UseAVX2] in { +let Predicates = [HasAVX2] in { let mayLoad = 1, hasSideEffects = 0, Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -64,47 +64,47 @@ * the alphabetical order. 
*/ static const IntrinsicData IntrinsicsWithChain[] = { - X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0), - X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0), - X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_pd_256, 
GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, 0, 0), + X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, 0, 0), - X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), - X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), - X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), - 
X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), @@ -115,30 +115,30 @@ X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), - X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, 
GATHER, X86::VPGATHERDQZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), - X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0), + 
X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), Index: test/CodeGen/X86/avx2-gather.ll =================================================================== --- test/CodeGen/X86/avx2-gather.ll +++ test/CodeGen/X86/avx2-gather.ll @@ -152,9 +152,8 @@ ; X32-LABEL: gather_global: ; X32: # %bb.0: ; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X32-NEXT: movl $x, %eax ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm0,4), %xmm1 +; X32-NEXT: vgatherqps %xmm2, x(,%ymm0,4), %xmm1 ; X32-NEXT: vmovaps %xmm1, %xmm0 ; X32-NEXT: vzeroupper ; X32-NEXT: retl @@ -162,9 +161,8 @@ ; X64-LABEL: 
gather_global: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X64-NEXT: movl $x, %eax ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vgatherqps %xmm2, (%rax,%ymm0,4), %xmm1 +; X64-NEXT: vgatherqps %xmm2, x(,%ymm0,4), %xmm1 ; X64-NEXT: vmovaps %xmm1, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq Index: test/CodeGen/X86/avx512-gather-scatter-intrin.ll =================================================================== --- test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -881,9 +881,8 @@ ; CHECK-LABEL: gather_global: ; CHECK: # %bb.0: ; CHECK-NEXT: kxnorw %k0, %k0, %k1 -; CHECK-NEXT: movl $x, %eax ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vgatherqps (%rax,%zmm0,4), %ymm1 {%k1} +; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %3 = tail call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> zeroinitializer, i8* bitcast ([1024 x float]* @x to i8*), <8 x i64> %0, <8 x i1> , i32 4)