Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -42442,6 +42442,40 @@
   SDValue Base = GorS->getBasePtr();
   SDValue Scale = GorS->getScale();
 
+  // Shrink constant indices if they are larger than 32-bits.
+  // Only do this before legalize types since v2i64 could become v2i32.
+  // FIXME: We could check that the type is legal if we're after legalize types,
+  // but then we would need to construct test cases where that happens.
+  // FIXME: We could support more than just constant vectors, but we need to be
+  // careful with costing. A truncate that can be optimized out would be fine.
+  // Otherwise we might only want to create a truncate if it avoids a split.
+  if (DCI.isBeforeLegalize()) {
+    if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
+      unsigned IndexWidth = Index.getScalarValueSizeInBits();
+      if (BV->isConstant() && IndexWidth > 32 &&
+          DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+        unsigned NumElts = Index.getValueType().getVectorNumElements();
+        EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+        Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+        if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+          SDValue Ops[] = { Chain, Gather->getPassThru(),
+                            Mask, Base, Index, Scale };
+          return DAG.getMaskedGather(Gather->getVTList(),
+                                     Gather->getMemoryVT(), DL, Ops,
+                                     Gather->getMemOperand(),
+                                     Gather->getIndexType());
+        }
+        auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+        SDValue Ops[] = { Chain, Scatter->getValue(),
+                          Mask, Base, Index, Scale };
+        return DAG.getMaskedScatter(Scatter->getVTList(),
+                                    Scatter->getMemoryVT(), DL,
+                                    Ops, Scatter->getMemOperand(),
+                                    Scatter->getIndexType());
+      }
+    }
+  }
+
   if (DCI.isBeforeLegalizeOps()) {
     // Remove any sign extends from 32 or smaller to larger than 32.
     // Only do this before LegalizeOps in case we need the sign extend for
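
The guard worth dwelling on is DAG.ComputeNumSignBits(Index) > (IndexWidth - 32): it holds exactly when every lane sign-extends from its low 32 bits, so truncating the index vector to i32 loses nothing. A standalone sketch of that equivalence, not part of the patch (numSignBits is a hypothetical scalar stand-in for ComputeNumSignBits):

#include <cassert>
#include <cstdint>

// Hypothetical scalar analogue of SelectionDAG::ComputeNumSignBits: the
// number of leading bits that are all copies of the sign bit, counting the
// sign bit itself.
static unsigned numSignBits(int64_t V) {
  uint64_t U = static_cast<uint64_t>(V);
  uint64_t Sign = U >> 63;
  unsigned N = 1;
  while (N < 64 && ((U >> (63 - N)) & 1) == Sign)
    ++N;
  return N;
}

int main() {
  const unsigned IndexWidth = 64; // the scalar index type is i64 in the tests below
  const int64_t Indices[] = {0, -2, 1, -8, 65536, 2147483647, -2147483648LL};
  for (int64_t Idx : Indices) {
    // The combine's condition implies the i32 truncation round-trips.
    if (numSignBits(Idx) > IndexWidth - 32)
      assert(static_cast<int64_t>(static_cast<int32_t>(Idx)) == Idx);
  }
  // An index that genuinely needs more than 32 bits fails the condition.
  assert(numSignBits(int64_t(1) << 40) <= IndexWidth - 32);
  return 0;
}
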
Index: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
@@ -3004,9 +3004,9 @@
 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,18446744073709551614,u,u,u,u,u,u>
+; KNL_64-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4294967294,u,u,u,u,u,u>
 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL_64-NEXT: vpgatherqq (%rdi,%zmm1,8), %zmm0 {%k1}
+; KNL_64-NEXT: vpgatherdq (%rdi,%ymm1,8), %zmm0 {%k1}
 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; KNL_64-NEXT: vzeroupper
 ; KNL_64-NEXT: retq
@@ -3018,9 +3018,9 @@
 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,4294967294,4294967295]
+; KNL_32-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4294967294,u,u,u,u,u,u>
 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL_32-NEXT: vpgatherqq (%eax,%zmm1,8), %zmm0 {%k1}
+; KNL_32-NEXT: vpgatherdq (%eax,%ymm1,8), %zmm0 {%k1}
 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; KNL_32-NEXT: vzeroupper
 ; KNL_32-NEXT: retl
@@ -3029,9 +3029,9 @@
 ; SKX_SMALL: # %bb.0:
 ; SKX_SMALL-NEXT: vpsllq $63, %xmm0, %xmm0
 ; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k1
-; SKX_SMALL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,254,255,255,255,255,255,255,255]
+; SKX_SMALL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4294967294,u,u>
 ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; SKX_SMALL-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
+; SKX_SMALL-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
 ; SKX_SMALL-NEXT: retq
 ;
 ; SKX_LARGE-LABEL: gather_2i64_constant_indices:
@@ -3041,7 +3041,7 @@
 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT: vmovdqa (%rax), %xmm1
 ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; SKX_LARGE-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
+; SKX_LARGE-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
 ; SKX_LARGE-NEXT: retq
 ;
 ; SKX_32-LABEL: gather_2i64_constant_indices:
@@ -3049,9 +3049,9 @@
 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
 ; SKX_32-NEXT: vpmovq2m %xmm0, %k1
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,4294967294,4294967295]
+; SKX_32-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4294967294,u,u>
 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; SKX_32-NEXT: vpgatherqq (%eax,%xmm1,8), %xmm0 {%k1}
+; SKX_32-NEXT: vpgatherdq (%eax,%xmm1,8), %xmm0 {%k1}
 ; SKX_32-NEXT: retl
   %gep = getelementptr i64, i64* %ptr, <2 x i64> <i64 0, i64 -2>
   %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep, i32 8, <2 x i1> %mask, <2 x i64> zeroinitializer) #1
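
Concretely, the i64 index -2 that used to print as 18446744073709551614 in the wide index vector now prints as 4294967294 (0xFFFFFFFE) in the narrow one, and the dword-indexed gather sign-extends each 32-bit index before scaling, so every effective address is unchanged. A minimal model of that addressing math, not part of the patch (the base value is an arbitrary stand-in for %rdi):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Base = 0x100000;    // arbitrary stand-in for the %rdi base
  const uint64_t Scale = 8;          // element size of i64, as in the test
  const int64_t Wide[2] = {0, -2};   // original <2 x i64> indices
  const int32_t Narrow[2] = {0, -2}; // truncated indices; -2 prints as 4294967294
  for (int I = 0; I < 2; ++I) {
    // vpgatherqq: 64-bit index scaled directly.
    uint64_t QAddr = Base + static_cast<uint64_t>(Wide[I]) * Scale;
    // vpgatherdq: 32-bit index sign-extended to 64 bits, then scaled.
    uint64_t DAddr =
        Base + static_cast<uint64_t>(static_cast<int64_t>(Narrow[I])) * Scale;
    assert(QAddr == DAddr);
  }
  return 0;
}
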
@@ -3064,14 +3064,9 @@
 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
 ; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18446744073709551614,1,18446744073709551608,10,20,50,65536]
-; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16777215,2147483647,100,18446744073709549616,18446744071562067968,76897723,7,18446744073641653929]
-; KNL_64-NEXT: kshiftrw $8, %k1, %k2
-; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; KNL_64-NEXT: vpgatherqd (%rdi,%zmm1,4), %ymm3 {%k2}
-; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
-; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: gather_16i64_constant_indices:
@@ -3080,14 +3075,9 @@
 ; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4294967294,4294967295,1,0,4294967288,4294967295,10,0,20,0,50,0,65536,0]
-; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16777215,0,2147483647,0,100,0,4294965296,4294967295,2147483648,4294967295,76897723,0,7,0,4227069609,4294967295]
-; KNL_32-NEXT: kshiftrw $8, %k1, %k2
-; KNL_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL_32-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; KNL_32-NEXT: vpgatherqd (%eax,%zmm1,4), %ymm3 {%k2}
-; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
-; KNL_32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
 ; SKX_SMALL-LABEL: gather_16i64_constant_indices:
@@ -3095,14 +3085,9 @@
 ; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
 ; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
 ; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
-; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18446744073709551614,1,18446744073709551608,10,20,50,65536]
-; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16777215,2147483647,100,18446744073709549616,18446744071562067968,76897723,7,18446744073641653929]
-; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
-; SKX_SMALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX_SMALL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; SKX_SMALL-NEXT: vpgatherqd (%rdi,%zmm1,4), %ymm3 {%k2}
-; SKX_SMALL-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
-; SKX_SMALL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; SKX_SMALL-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX_SMALL-NEXT: retq
 ;
 ; SKX_LARGE-LABEL: gather_16i64_constant_indices:
@@ -3111,15 +3096,9 @@
 ; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
 ; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
-; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm0
-; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm1
-; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
-; SKX_LARGE-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX_LARGE-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; SKX_LARGE-NEXT: vpgatherqd (%rdi,%zmm1,4), %ymm3 {%k2}
-; SKX_LARGE-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
-; SKX_LARGE-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; SKX_LARGE-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX_LARGE-NEXT: retq
 ;
 ; SKX_32-LABEL: gather_16i64_constant_indices:
@@ -3128,14 +3107,9 @@
 ; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
 ; SKX_32-NEXT: vpmovd2m %zmm0, %k1
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4294967294,4294967295,1,0,4294967288,4294967295,10,0,20,0,50,0,65536,0]
-; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16777215,0,2147483647,0,100,0,4294965296,4294967295,2147483648,4294967295,76897723,0,7,0,4227069609,4294967295]
-; SKX_32-NEXT: kshiftrw $8, %k1, %k2
-; SKX_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX_32-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; SKX_32-NEXT: vpgatherqd (%eax,%zmm1,4), %ymm3 {%k2}
-; SKX_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
-; SKX_32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; SKX_32-NEXT: vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
   %gep = getelementptr i32, i32* %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
   %res = tail call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer) #1
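The 16-element case is where the payoff shows: one 16-lane vpgatherdd with a single 64-byte i32 index vector replaces two 8-lane vpgatherqd ops, two i64 index vectors totaling 128 bytes of constant-pool data, the kshiftrw mask split, and the vinserti64x4 recombine. As a scalar reference for what this test computes, not part of the patch (the constants are the signed readings of the zmm1 index vector above):

#include <cstdint>

// Scalar semantics of @gather_16i64_constant_indices: a masked gather of i32
// elements at 16 constant signed offsets, with a zero passthru for masked-off
// lanes (the IR passes <16 x i32> zeroinitializer).
void gather16Reference(const int32_t *Ptr, const bool Mask[16],
                       int32_t Result[16]) {
  static const int64_t Idx[16] = {0,        -2,         1,   -8,
                                  10,       20,         50,  65536,
                                  16777215, 2147483647, 100, -2000,
                                  -2147483648LL, 76897723, 7, -67897687};
  for (int I = 0; I < 16; ++I)
    Result[I] = Mask[I] ? Ptr[Idx[I]] : 0;
}
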
@@ -3145,26 +3119,26 @@
 define void @scatter_2i64_constant_indices(i32* %ptr, <2 x i1> %mask, <2 x i32> %src0) {
 ; KNL_64-LABEL: scatter_2i64_constant_indices:
 ; KNL_64: # %bb.0:
-; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,18446744073709551614,u,u,u,u,u,u>
-; KNL_64-NEXT: vpscatterqd %ymm1, (%rdi,%zmm0,4) {%k1}
+; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,4294967294,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
 ; KNL_64-NEXT: vzeroupper
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: scatter_2i64_constant_indices:
 ; KNL_32: # %bb.0:
-; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,4294967294,4294967295]
-; KNL_32-NEXT: vpscatterqd %ymm1, (%eax,%zmm0,4) {%k1}
+; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,4294967294,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
 ; KNL_32-NEXT: vzeroupper
 ; KNL_32-NEXT: retl
 ;
@@ -3172,8 +3146,8 @@
 ; SKX_SMALL: # %bb.0:
 ; SKX_SMALL-NEXT: vpsllq $63, %xmm0, %xmm0
 ; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k1
-; SKX_SMALL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,254,255,255,255,255,255,255,255]
-; SKX_SMALL-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
+; SKX_SMALL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4294967294,u,u>
+; SKX_SMALL-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
 ; SKX_SMALL-NEXT: retq
 ;
 ; SKX_LARGE-LABEL: scatter_2i64_constant_indices:
@@ -3182,7 +3156,7 @@
 ; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k1
 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT: vmovdqa (%rax), %xmm0
-; SKX_LARGE-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
+; SKX_LARGE-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
 ; SKX_LARGE-NEXT: retq
 ;
 ; SKX_32-LABEL: scatter_2i64_constant_indices:
@@ -3190,8 +3164,8 @@
 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
 ; SKX_32-NEXT: vpmovq2m %xmm0, %k1
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,4294967294,4294967295]
-; SKX_32-NEXT: vpscatterqd %xmm1, (%eax,%xmm0,4) {%k1}
+; SKX_32-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4294967294,u,u>
+; SKX_32-NEXT: vpscatterdd %xmm1, (%eax,%xmm0,4) {%k1}
 ; SKX_32-NEXT: retl
   %gep = getelementptr i32, i32* %ptr, <2 x i64> <i64 0, i64 -2>
   call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %src0, <2 x i32*> %gep, i32 4, <2 x i1> %mask)
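
The scatter side is the mirror image of the gather case: the same index narrowing turns vpscatterqd into vpscatterdd. A scalar reference for the two-element test, not part of the patch:

#include <cstdint>

// Scalar semantics of @scatter_2i64_constant_indices: store each active lane
// of %src0 to the constant offsets 0 and -2 from %ptr; masked-off lanes leave
// memory untouched.
void scatter2Reference(int32_t *Ptr, const bool Mask[2], const int32_t Src[2]) {
  static const int64_t Idx[2] = {0, -2};
  for (int I = 0; I < 2; ++I)
    if (Mask[I])
      Ptr[Idx[I]] = Src[I];
}
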
@@ -3204,12 +3178,8 @@
 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
 ; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16777215,2147483647,100,18446744073709549616,18446744071562067968,76897723,7,18446744073641653929]
-; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,18446744073709551614,1,18446744073709551608,10,20,50,65536]
-; KNL_64-NEXT: kshiftrw $8, %k1, %k2
-; KNL_64-NEXT: vpscatterqd %ymm1, (%rdi,%zmm2,4) {%k1}
-; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; KNL_64-NEXT: vpscatterqd %ymm1, (%rdi,%zmm0,4) {%k2}
+; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
 ; KNL_64-NEXT: vzeroupper
 ; KNL_64-NEXT: retq
 ;
@@ -3219,12 +3189,8 @@
 ; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16777215,0,2147483647,0,100,0,4294965296,4294967295,2147483648,4294967295,76897723,0,7,0,4227069609,4294967295]
-; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,4294967294,4294967295,1,0,4294967288,4294967295,10,0,20,0,50,0,65536,0]
-; KNL_32-NEXT: kshiftrw $8, %k1, %k2
-; KNL_32-NEXT: vpscatterqd %ymm1, (%eax,%zmm2,4) {%k1}
-; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; KNL_32-NEXT: vpscatterqd %ymm1, (%eax,%zmm0,4) {%k2}
+; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
 ; KNL_32-NEXT: vzeroupper
 ; KNL_32-NEXT: retl
 ;
@@ -3233,12 +3199,8 @@
 ; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
 ; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
 ; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
-; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16777215,2147483647,100,18446744073709549616,18446744071562067968,76897723,7,18446744073641653929]
-; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,18446744073709551614,1,18446744073709551608,10,20,50,65536]
-; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
-; SKX_SMALL-NEXT: vpscatterqd %ymm1, (%rdi,%zmm2,4) {%k1}
-; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; SKX_SMALL-NEXT: vpscatterqd %ymm1, (%rdi,%zmm0,4) {%k2}
+; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; SKX_SMALL-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
 ; SKX_SMALL-NEXT: vzeroupper
 ; SKX_SMALL-NEXT: retq
 ;
@@ -3249,12 +3211,7 @@
 ; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm0
-; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
-; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm2
-; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
-; SKX_LARGE-NEXT: vpscatterqd %ymm1, (%rdi,%zmm2,4) {%k1}
-; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; SKX_LARGE-NEXT: vpscatterqd %ymm1, (%rdi,%zmm0,4) {%k2}
+; SKX_LARGE-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
 ; SKX_LARGE-NEXT: vzeroupper
 ; SKX_LARGE-NEXT: retq
 ;
@@ -3264,12 +3221,8 @@
 ; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
 ; SKX_32-NEXT: vpmovd2m %zmm0, %k1
 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16777215,0,2147483647,0,100,0,4294965296,4294967295,2147483648,4294967295,76897723,0,7,0,4227069609,4294967295]
-; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,4294967294,4294967295,1,0,4294967288,4294967295,10,0,20,0,50,0,65536,0]
-; SKX_32-NEXT: kshiftrw $8, %k1, %k2
-; SKX_32-NEXT: vpscatterqd %ymm1, (%eax,%zmm2,4) {%k1}
-; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; SKX_32-NEXT: vpscatterqd %ymm1, (%eax,%zmm0,4) {%k2}
+; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
 ; SKX_32-NEXT: vzeroupper
 ; SKX_32-NEXT: retl
   %gep = getelementptr i32, i32* %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>