diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31103,6 +31103,10 @@
     Mask = ExtendToType(Mask, MaskVT, DAG, true);
   }
 
+  // Break dependency on the data register.
+  if (PassThru.isUndef())
+    PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
   SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
                     N->getScale() };
   SDValue NewGather = DAG.getMemIntrinsicNode(
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1748,24 +1748,28 @@
 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1
 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT: kmovw %k1, %k2
-; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm1 {%k2}
+; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm2 {%k2}
 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
-; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm2 {%k1}
-; AVX512F-NEXT: vpaddd %ymm2, %ymm2, %ymm0
-; AVX512F-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm1 {%k1}
+; AVX512F-NEXT: vpaddd %ymm1, %ymm1, %ymm0
+; AVX512F-NEXT: vpaddd %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: gather_v8i32_v8i32:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [12,12,12,12,12,12,12,12]
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12]
 ; AVX512VL-NEXT: kmovw %k1, %k2
-; AVX512VL-NEXT: vpgatherdd c(,%ymm0), %ymm1 {%k2}
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [28,28,28,28,28,28,28,28]
-; AVX512VL-NEXT: vpgatherdd c(,%ymm0), %ymm2 {%k1}
-; AVX512VL-NEXT: vpaddd %ymm2, %ymm2, %ymm0
-; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpgatherdd c(,%ymm1), %ymm2 {%k2}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [28,28,28,28,28,28,28,28]
+; AVX512VL-NEXT: vpgatherdd c(,%ymm1), %ymm0 {%k1}
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT: retq
   %1 = icmp eq <8 x i32> %trigger, zeroinitializer
   %2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> getelementptr (%struct.a, <8 x %struct.a*> <%struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c, %struct.a* @c>, <8 x i64> zeroinitializer, i32 0, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>), i32 4, <8 x i1> %1, <8 x i32> undef)
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -21,6 +21,7 @@
 ; KNL_64-LABEL: test1:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT: retq
@@ -29,6 +30,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT: retl
@@ -36,6 +38,7 @@
 ; SKX-LABEL: test1:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT: vmovaps %zmm1, %zmm0
 ; SKX-NEXT: retq
@@ -44,6 +47,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT: retl
@@ -78,6 +82,7 @@
 ; KNL_64-LABEL: test2:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT: retq
@@ -86,6 +91,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT: retl
@@ -93,6 +99,7 @@
 ; SKX-LABEL: test2:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT: vmovaps %zmm1, %zmm0
 ; SKX-NEXT: retq
@@ -101,6 +108,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT: retl
@@ -119,6 +127,7 @@
 ; KNL_64-LABEL: test3:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
 ; KNL_64-NEXT: retq
@@ -127,6 +136,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; KNL_32-NEXT: retl
@@ -134,6 +144,7 @@
 ; SKX-LABEL: test3:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
 ; SKX-NEXT: retq
@@ -142,6 +153,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; SKX_32-NEXT: retl
@@ -161,6 +173,7 @@
 ; KNL_64-LABEL: test4:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: kmovw %k1, %k2
 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -172,6 +185,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: kmovw %k1, %k2
 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -182,6 +196,7 @@
 ; SKX-LABEL: test4:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: kmovw %k1, %k2
 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -193,6 +208,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: kmovw %k1, %k2
 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -292,6 +308,7 @@
 ; KNL_64-LABEL: test6:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -302,6 +319,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; KNL_32-NEXT: movw $255, %ax
 ; KNL_32-NEXT: kmovw %eax, %k1
 ; KNL_32-NEXT: kmovw %k1, %k2
@@ -313,6 +331,7 @@
 ; SKX-LABEL: test6:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: kxnorw %k0, %k0, %k2
 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -322,6 +341,7 @@
 ; SKX_32-LABEL: test6:
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k2
 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
 ; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
@@ -342,6 +362,7 @@
 ; KNL_64-NEXT: kmovw %esi, %k0
 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: kmovw %k1, %k2
 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -357,6 +378,7 @@
 ; KNL_32-NEXT: kmovw %ecx, %k0
 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: kmovw %k1, %k2
 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -367,6 +389,7 @@
 ; SKX-LABEL: test7:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: kmovw %k1, %k2
 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
 ; SKX-NEXT: vmovdqa %ymm1, %ymm2
@@ -378,6 +401,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: kmovw %k1, %k2
 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
@@ -403,20 +427,23 @@
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kmovw %edi, %k1
 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; KNL_64-NEXT: kmovw %k2, %k3
-; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
+; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3}
 ; KNL_64-NEXT: kmovw %k1, %k3
-; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
-; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
-; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
-; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
-; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k3}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4
+; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2}
+; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
+; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test8:
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: kmovw %k1, %k2
 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -428,20 +455,23 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: kmovw %edi, %k1
 ; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: kmovw %k2, %k3
-; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3}
 ; SKX-NEXT: kmovw %k1, %k3
-; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
-; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
-; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
-; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
-; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k3}
+; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2}
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
+; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
 ; SKX-NEXT: retq
 ;
 ; SKX_32-LABEL: test8:
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: kmovw %k1, %k2
 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
@@ -479,6 +509,7 @@
 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; KNL_64-NEXT: retq
 ;
@@ -494,6 +525,7 @@
 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: movw $255, %ax
 ; KNL_32-NEXT: kmovw %eax, %k1
 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
@@ -510,6 +542,7 @@
 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; SKX_SMALL-NEXT: retq
 ;
@@ -526,6 +559,7 @@
 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; SKX_LARGE-NEXT: retq
 ;
@@ -538,6 +572,7 @@
 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
 ; SKX_32-NEXT: retl
 entry:
@@ -565,6 +600,7 @@
 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; KNL_64-NEXT: retq
 ;
@@ -580,6 +616,7 @@
 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: movw $255, %ax
 ; KNL_32-NEXT: kmovw %eax, %k1
 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
@@ -596,6 +633,7 @@
 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; SKX_SMALL-NEXT: retq
 ;
@@ -612,6 +650,7 @@
 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
 ; SKX_LARGE-NEXT: retq
 ;
@@ -624,6 +663,7 @@
 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
 ; SKX_32-NEXT: retl
 entry:
@@ -643,6 +683,7 @@
 ; KNL_64-NEXT: leaq (%rdi,%rax,4), %rax
 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT: retq
 ;
@@ -653,6 +694,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: addl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
@@ -662,6 +704,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: leaq (%rdi,%rax,4), %rax
 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; SKX-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
 ; SKX-NEXT: retq
 ;
@@ -672,6 +715,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: addl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
@@ -689,6 +733,7 @@
 ; KNL_64-LABEL: test12:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT: retq
@@ -697,6 +742,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT: retl
@@ -704,6 +750,7 @@
 ; SKX-LABEL: test12:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT: vmovaps %zmm1, %zmm0
 ; SKX-NEXT: retq
@@ -712,6 +759,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT: retl
@@ -728,6 +776,7 @@
 ; KNL_64-LABEL: test13:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
 ; KNL_64-NEXT: retq
@@ -736,6 +785,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
 ; KNL_32-NEXT: retl
@@ -743,6 +793,7 @@
 ; SKX-LABEL: test13:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT: vmovaps %zmm1, %zmm0
 ; SKX-NEXT: retq
@@ -751,6 +802,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
 ; SKX_32-NEXT: retl
@@ -772,6 +824,7 @@
 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT: vpsllq $2, %zmm0, %zmm0
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
 ; KNL_64-NEXT: retq
@@ -781,6 +834,7 @@
 ; KNL_32-NEXT: vmovd %xmm0, %eax
 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
@@ -791,6 +845,7 @@
 ; SKX-NEXT: vpmovsxdq %ymm0, %zmm0
 ; SKX-NEXT: vpsllq $2, %zmm0, %zmm0
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
 ; SKX-NEXT: retq
@@ -800,6 +855,7 @@
 ; SKX_32-NEXT: vmovd %xmm0, %eax
 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
@@ -909,6 +965,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
 ; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
 ; SKX-NEXT: vmovaps %xmm1, %xmm0
 ; SKX-NEXT: retq
@@ -918,6 +975,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
 ; SKX_32-NEXT: retl
@@ -2393,6 +2451,7 @@
 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
 ; KNL_64-LABEL: test29:
 ; KNL_64: # %bb.0:
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: movw $44, %ax
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
@@ -2402,6 +2461,7 @@
 ; KNL_32-LABEL: test29:
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: movw $44, %cx
 ; KNL_32-NEXT: kmovw %ecx, %k1
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -2410,6 +2470,7 @@
 ;
 ; SKX-LABEL: test29:
 ; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: movw $44, %ax
 ; SKX-NEXT: kmovw %eax, %k1
 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
@@ -2419,6 +2480,7 @@
 ; SKX_32-LABEL: test29:
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: movw $44, %cx
 ; SKX_32-NEXT: kmovw %ecx, %k1
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -2682,16 +2744,19 @@
 ; KNL_64-LABEL: test31:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
-; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
-; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
-; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
-; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
+; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
+; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
+; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
+; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm1
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test31:
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; KNL_32-NEXT: retl
@@ -2699,16 +2764,19 @@
 ; SKX-LABEL: test31:
 ; SKX: # %bb.0:
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; SKX-NEXT: kxnorw %k0, %k0, %k2
-; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
-; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
+; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm1
 ; SKX-NEXT: retq
 ;
 ; SKX_32-LABEL: test31:
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; SKX_32-NEXT: retl
@@ -3439,6 +3507,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
 ; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -3455,6 +3524,7 @@
 ; SKX_32-NEXT: subl $32, %esp
 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
 ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
 ; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -3475,6 +3545,7 @@
 ; KNL_64-LABEL: test_global_array:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
 ; KNL_64-NEXT: retq
@@ -3482,6 +3553,7 @@
 ; KNL_32-LABEL: test_global_array:
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
 ; KNL_32-NEXT: retl
@@ -3489,6 +3561,7 @@
 ; SKX_SMALL-LABEL: test_global_array:
 ; SKX_SMALL: # %bb.0:
 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
 ; SKX_SMALL-NEXT: retq
@@ -3497,6 +3570,7 @@
 ; SKX_LARGE: # %bb.0:
 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
 ; SKX_LARGE-NEXT: retq
@@ -3504,6 +3578,7 @@
 ; SKX_32-LABEL: test_global_array:
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
 ; SKX_32-NEXT: retl
@@ -3516,6 +3591,7 @@
 ; KNL_64-LABEL: test_global_array_zeroinitializer_index:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
 ; KNL_64-NEXT: retq
@@ -3523,6 +3599,7 @@
 ; KNL_32-LABEL: test_global_array_zeroinitializer_index:
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
 ; KNL_32-NEXT: retl
@@ -3530,6 +3607,7 @@
 ; SKX_SMALL-LABEL: test_global_array_zeroinitializer_index:
 ; SKX_SMALL: # %bb.0:
 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
 ; SKX_SMALL-NEXT: retq
@@ -3538,6 +3616,7 @@
 ; SKX_LARGE: # %bb.0:
 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
 ; SKX_LARGE-NEXT: retq
@@ -3545,6 +3624,7 @@
 ; SKX_32-LABEL: test_global_array_zeroinitializer_index:
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
 ; SKX_32-NEXT: retl
@@ -3764,6 +3844,7 @@
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT: retq
 ;
@@ -3772,6 +3853,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
@@ -3779,6 +3861,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX-NEXT: retq
 ;
@@ -3787,6 +3870,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
@@ -3802,6 +3886,7 @@
 ; KNL_64-LABEL: sext_v8i8_index:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: movw $255, %ax
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
@@ -3812,6 +3897,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: movw $255, %cx
 ; KNL_32-NEXT: kmovw %ecx, %k1
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -3822,6 +3908,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpmovsxbd %xmm0, %ymm1
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
 ; SKX-NEXT: retq
 ;
@@ -3830,6 +3917,7 @@
 ; SKX_32: # %bb.0:
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
 ; SKX_32-NEXT: retl
@@ -3847,6 +3935,7 @@
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT: retq
 ;
@@ -3855,6 +3944,7 @@
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
@@ -3862,6 +3952,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX-NEXT: retq
 ;
@@ -3870,6 +3961,7 @@
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
@@ -3885,6 +3977,7 @@
 ; KNL_64-LABEL: zext_v8i8_index:
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: movw $255, %ax
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
@@ -3895,6 +3988,7 @@
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: movw $255, %cx
 ; KNL_32-NEXT: kmovw %ecx, %k1
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -3905,6 +3999,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
 ; SKX-NEXT: retq
 ;
@@ -3913,6 +4008,7 @@
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
 ; SKX_32-NEXT: retl
@@ -4041,6 +4137,7 @@
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT: retq
 ;
@@ -4049,6 +4146,7 @@
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
@@ -4056,6 +4154,7 @@
 ; SKX_SMALL: # %bb.0:
 ; SKX_SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX_SMALL-NEXT: retq
 ;
@@ -4064,6 +4163,7 @@
 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX_LARGE-NEXT: retq
 ;
@@ -4072,6 +4172,7 @@
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
   %ind_masked = and <16 x i32> %ind,
@@ -4244,6 +4345,7 @@
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: vmovaps %zmm0, (%rsi)
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; KNL_64-NEXT: vaddps %zmm1, %zmm1, %zmm0
 ; KNL_64-NEXT: retq
@@ -4254,6 +4356,7 @@
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; KNL_32-NEXT: vmovaps %zmm0, (%ecx)
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; KNL_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
 ; KNL_32-NEXT: retl
@@ -4262,6 +4365,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vmovaps %zmm0, (%rsi)
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; SKX-NEXT: vaddps %zmm1, %zmm1, %zmm0
 ; SKX-NEXT: retq
@@ -4272,6 +4376,7 @@
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; SKX_32-NEXT: vmovaps %zmm0, (%ecx)
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; SKX_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
 ; SKX_32-NEXT: retl
@@ -5127,6 +5232,7 @@
 ; KNL_64: # %bb.0: # %bb
 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1}
 ; KNL_64-NEXT: retq
 ;
@@ -5135,6 +5241,7 @@
 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT: vpgatherdq (,%ymm1), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
@@ -5142,6 +5249,7 @@
 ; SKX_SMALL: # %bb.0: # %bb
 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_SMALL-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1}
 ; SKX_SMALL-NEXT: retq
 ;
@@ -5150,6 +5258,7 @@
 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_LARGE-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1}
 ; SKX_LARGE-NEXT: retq
 ;
@@ -5157,6 +5266,7 @@
 ; SKX_32: # %bb.0: # %bb
 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
 ; SKX_32-NEXT: vpgatherdq (,%ymm1), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
 bb:
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
--- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -664,12 +664,14 @@
 ; WIDEN_SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; WIDEN_SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; WIDEN_SKX-NEXT: kxnorw %k0, %k0, %k1
-; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm2 {%k1}
+; WIDEN_SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; WIDEN_SKX-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1}
 ; WIDEN_SKX-NEXT: movw $1, %cx
 ; WIDEN_SKX-NEXT: kmovw %ecx, %k1
-; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm1,4), %zmm0 {%k1}
-; WIDEN_SKX-NEXT: vmovss %xmm0, 64(%rdi)
-; WIDEN_SKX-NEXT: vmovaps %zmm2, (%rdi)
+; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm1,4), %zmm2 {%k1}
+; WIDEN_SKX-NEXT: vmovss %xmm2, 64(%rdi)
+; WIDEN_SKX-NEXT: vmovaps %zmm3, (%rdi)
 ; WIDEN_SKX-NEXT: vzeroupper
 ; WIDEN_SKX-NEXT: retq
 ;
@@ -697,12 +699,14 @@
 ; WIDEN_KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; WIDEN_KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; WIDEN_KNL-NEXT: kxnorw %k0, %k0, %k1
-; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm2 {%k1}
+; WIDEN_KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; WIDEN_KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1}
 ; WIDEN_KNL-NEXT: movw $1, %cx
 ; WIDEN_KNL-NEXT: kmovw %ecx, %k1
-; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm1,4), %zmm0 {%k1}
-; WIDEN_KNL-NEXT: vmovss %xmm0, 64(%rdi)
-; WIDEN_KNL-NEXT: vmovaps %zmm2, (%rdi)
+; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm1,4), %zmm2 {%k1}
+; WIDEN_KNL-NEXT: vmovss %xmm2, 64(%rdi)
+; WIDEN_KNL-NEXT: vmovaps %zmm3, (%rdi)
 ; WIDEN_KNL-NEXT: vzeroupper
 ; WIDEN_KNL-NEXT: retq
 ;
@@ -711,32 +715,35 @@
 ; WIDEN_AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; WIDEN_AVX2-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
 ; WIDEN_AVX2-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; WIDEN_AVX2-NEXT: movq %rdi, %rax
 ; WIDEN_AVX2-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
 ; WIDEN_AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; WIDEN_AVX2-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; WIDEN_AVX2-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
 ; WIDEN_AVX2-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; WIDEN_AVX2-NEXT: movq %rdi, %rax
-; WIDEN_AVX2-NEXT: vmovd %edx, %xmm2
-; WIDEN_AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; WIDEN_AVX2-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2
-; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; WIDEN_AVX2-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
-; WIDEN_AVX2-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; WIDEN_AVX2-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; WIDEN_AVX2-NEXT: vmovd %edx, %xmm3
+; WIDEN_AVX2-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
+; WIDEN_AVX2-NEXT: vpinsrd $2, %r8d, %xmm3, %xmm3
+; WIDEN_AVX2-NEXT: vpinsrd $3, %r9d, %xmm3, %xmm3
 ; WIDEN_AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; WIDEN_AVX2-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm4, %xmm1
+; WIDEN_AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; WIDEN_AVX2-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; WIDEN_AVX2-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
 ; WIDEN_AVX2-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; WIDEN_AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; WIDEN_AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; WIDEN_AVX2-NEXT: vgatherdps %ymm2, (%rsi,%ymm1,4), %ymm4
-; WIDEN_AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; WIDEN_AVX2-NEXT: vgatherdps %ymm1, (%rsi,%ymm0,4), %ymm2
+; WIDEN_AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
+; WIDEN_AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; WIDEN_AVX2-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; WIDEN_AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; WIDEN_AVX2-NEXT: vxorps %xmm6, %xmm6, %xmm6
+; WIDEN_AVX2-NEXT: vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6
+; WIDEN_AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; WIDEN_AVX2-NEXT: vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1
 ; WIDEN_AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
-; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm3,4), %ymm1
-; WIDEN_AVX2-NEXT: vmovss %xmm1, 64(%rdi)
-; WIDEN_AVX2-NEXT: vmovaps %ymm2, 32(%rdi)
-; WIDEN_AVX2-NEXT: vmovaps %ymm4, (%rdi)
+; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4
+; WIDEN_AVX2-NEXT: vmovss %xmm4, 64(%rdi)
+; WIDEN_AVX2-NEXT: vmovaps %ymm1, 32(%rdi)
+; WIDEN_AVX2-NEXT: vmovaps %ymm6, (%rdi)
 ; WIDEN_AVX2-NEXT: vzeroupper
 ; WIDEN_AVX2-NEXT: retq
 {
diff --git a/llvm/test/CodeGen/X86/pr45067.ll b/llvm/test/CodeGen/X86/pr45067.ll
--- a/llvm/test/CodeGen/X86/pr45067.ll
+++ b/llvm/test/CodeGen/X86/pr45067.ll
@@ -7,8 +7,9 @@
 ; CHECK-LABEL: foo:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rax
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vpgatherdd %ymm1, (%rax,%ymm2), %ymm3
 ; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
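
Note (editorial addendum, not part of the patch): x86 gathers merge loaded elements into the destination register under the mask, so the destination is also an input. When the IR pass-through operand is undef, the gather still carries a false dependency on whatever instruction last wrote that register; substituting a zero pass-through, as the X86ISelLowering.cpp hunk above does, lets the backend xor-zero the destination first, which is the standard dependency-breaking idiom and explains all the added vxorps/vpxor lines in the test checks. A minimal IR sketch of the kind of input that exercises the changed path (hypothetical function name; the intrinsic signature and mangling mirror the tests above):

declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)

define <16 x float> @gather_undef_passthru(<16 x float*> %ptrs, <16 x i1> %mask) {
  ; The pass-through is undef, so with this patch the lowering substitutes a
  ; zero vector and the destination register is zeroed before the gather.
  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> undef)
  ret <16 x float> %res
}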