Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -19829,20 +19829,23 @@
 }
 
 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                              SDValue Src, SDValue Mask, SDValue Base,
-                              SDValue Index, SDValue ScaleOp, SDValue Chain,
-                              const X86Subtarget &Subtarget) {
+                             SDValue Src, SDValue Mask, SDValue Base,
+                             SDValue Index, SDValue ScaleOp, SDValue Chain,
+                             const X86Subtarget &Subtarget) {
   SDLoc dl(Op);
   auto *C = cast<ConstantSDNode>(ScaleOp);
   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                 Index.getSimpleValueType().getVectorNumElements());
+  EVT VT = Op.getValueType();
   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
-  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDVTList VTs = DAG.getVTList(VT, MaskVT, MVT::Other);
   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
-  if (Src.isUndef())
+  if (ISD::isBuildVectorAllOnes(VMask.getNode()))
+    Src = DAG.getUNDEF(VT); // If the mask is all ones we don't need a source.
+  else if (Src.isUndef())
     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
   SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
Index: test/CodeGen/X86/avx512-gather-scatter-intrin.ll
===================================================================
--- test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -297,11 +297,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovapd %xmm0, %xmm2
-; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
@@ -330,11 +329,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovapd %ymm0, %ymm2
-; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1}
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm0 {%k1}
-; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
   %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
@@ -348,11 +346,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
-; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
-; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
-; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
+; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
   %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
@@ -366,11 +363,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm2
-; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
@@ -385,7 +381,6 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
-; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
 ; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
 ; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
@@ -402,11 +397,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm2
-; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
@@ -440,11 +434,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovapd %xmm0, %xmm2
-; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
@@ -473,11 +466,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovapd %ymm0, %ymm2
-; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1}
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm0 {%k1}
-; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
   %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
@@ -506,11 +498,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm2
-; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
@@ -525,7 +516,6 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
-; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
 ; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
 ; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
@@ -542,11 +532,10 @@
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm2
-; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1}
+; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm0 {%k1}
-; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
   %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
@@ -846,7 +835,6 @@
 ; CHECK-LABEL: gather_mask_test:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vmovaps %zmm1, %zmm2
 ; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
 ; CHECK-NEXT:    kxorw %k0, %k0, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
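
For reference, a minimal IR sketch (not part of the patch) of the case the
getGatherNode change targets, reusing the gather3div2.df intrinsic exercised
by the tests above; the function name is hypothetical. With a constant
all-ones mask (i8 -1) every destination lane is written, so the pass-through
source operand is dead. Marking it undef lets isel emit the gather straight
into the result register instead of first copying the source into it with a
vmovapd, which is exactly the copy the updated CHECK lines no longer expect:

  ; Hypothetical test function, same shape as the %res1 calls above: the
  ; all-ones mask means %src is never observed in the result.
  define <2 x double> @gather_allones_mask(<2 x double> %src, i8* %ptr,
                                           <2 x i64> %idx) {
    %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(
               <2 x double> %src, i8* %ptr, <2 x i64> %idx, i8 -1, i32 2)
    ret <2 x double> %res
  }

  declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)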