diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4044,10 +4044,9 @@
   Index = ModifyToType(Index, WideIndexVT);
   SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
                     Scale };
-  SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
-                                    N->getMemoryVT(), dl, Ops,
-                                    N->getMemOperand(), N->getIndexType(),
-                                    N->getExtensionType());
+  SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), WideVT,
+                                    dl, Ops, N->getMemOperand(),
+                                    N->getIndexType(), N->getExtensionType());
 
   // Legalize the chain result - switch anything that used the old chain to
   // use the new one.
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1742,6 +1742,314 @@
   ret <8 x i32> %11
 }
 
+define <17 x float> @test_mgather_v17f32(float* %base, <17 x i32> %index)
+; SSE-LABEL: test_mgather_v17f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movd %edx, %xmm8
+; SSE-NEXT:    pinsrd $1, %ecx, %xmm8
+; SSE-NEXT:    movq %rsi, %xmm1
+; SSE-NEXT:    movd %r8d, %xmm9
+; SSE-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT:    pmovsxdq %xmm3, %xmm7
+; SSE-NEXT:    psllq $2, %xmm7
+; SSE-NEXT:    paddq %xmm1, %xmm7
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    pmovsxdq %xmm1, %xmm10
+; SSE-NEXT:    psllq $2, %xmm10
+; SSE-NEXT:    paddq %xmm0, %xmm10
+; SSE-NEXT:    pmovsxdq %xmm2, %xmm11
+; SSE-NEXT:    psllq $2, %xmm11
+; SSE-NEXT:    paddq %xmm0, %xmm11
+; SSE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT:    pmovsxdq %xmm3, %xmm3
+; SSE-NEXT:    psllq $2, %xmm3
+; SSE-NEXT:    paddq %xmm0, %xmm3
+; SSE-NEXT:    pmovsxdq %xmm4, %xmm4
+; SSE-NEXT:    psllq $2, %xmm4
+; SSE-NEXT:    paddq %xmm0, %xmm4
+; SSE-NEXT:    movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT:    pmovsxdq %xmm5, %xmm5
+; SSE-NEXT:    psllq $2, %xmm5
+; SSE-NEXT:    paddq %xmm0, %xmm5
+; SSE-NEXT:    pmovsxdq %xmm6, %xmm6
+; SSE-NEXT:    psllq $2, %xmm6
+; SSE-NEXT:    paddq %xmm0, %xmm6
+; SSE-NEXT:    pinsrd $1, %r9d, %xmm9
+; SSE-NEXT:    pmovsxdq %xmm9, %xmm1
+; SSE-NEXT:    psllq $2, %xmm1
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    pmovsxdq %xmm8, %xmm2
+; SSE-NEXT:    psllq $2, %xmm2
+; SSE-NEXT:    paddq %xmm0, %xmm2
+; SSE-NEXT:    movq %xmm7, %rcx
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movq %xmm2, %rcx
+; SSE-NEXT:    movss {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; SSE-NEXT:    pextrq $1, %xmm2, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm7 = xmm7[0],mem[0],xmm7[2,3]
+; SSE-NEXT:    movq %xmm1, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm7 = xmm7[0,1],mem[0],xmm7[3]
+; SSE-NEXT:    pextrq $1, %xmm1, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm7 = xmm7[0,1,2],mem[0]
+; SSE-NEXT:    movq %xmm6, %rcx
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    pextrq $1, %xmm6, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE-NEXT:    movq %xmm5, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE-NEXT:    pextrq $1, %xmm5, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; SSE-NEXT:    movq %xmm4, %rcx
+; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    pextrq $1, %xmm4, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; SSE-NEXT:    movq %xmm3, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE-NEXT:    pextrq $1, %xmm3, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; SSE-NEXT:    movq %xmm11, %rcx
+; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT:    pextrq $1, %xmm11, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE-NEXT:    movq %xmm10, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE-NEXT:    pextrq $1, %xmm10, %rcx
+; SSE-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; SSE-NEXT:    movss %xmm0, 64(%rdi)
+; SSE-NEXT:    movaps %xmm3, 48(%rdi)
+; SSE-NEXT:    movaps %xmm2, 32(%rdi)
+; SSE-NEXT:    movaps %xmm1, 16(%rdi)
+; SSE-NEXT:    movaps %xmm7, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_mgather_v17f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    vmovq %rsi, %xmm0
+; AVX1-NEXT:    vmovd %edx, %xmm1
+; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm2
+; AVX1-NEXT:    vpinsrd $2, %r8d, %xmm2, %xmm9
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq $2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX1-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm0
+; AVX1-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm10
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpsllq $2, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vmovd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm6, %xmm6
+; AVX1-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm6, %xmm7
+; AVX1-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm7, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm7, %xmm7
+; AVX1-NEXT:    vpsllq $2, %xmm7, %xmm7
+; AVX1-NEXT:    vpaddq %xmm7, %xmm5, %xmm7
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpsllq $2, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4
+; AVX1-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm4, %xmm3
+; AVX1-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpsllq $2, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm4, %xmm4
+; AVX1-NEXT:    vpsllq $2, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpinsrd $3, %r9d, %xmm9, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vmovq %xmm8, %rcx
+; AVX1-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovq %xmm2, %rcx
+; AVX1-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0],mem[0],xmm5[2,3]
+; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT:    vmovq %xmm6, %rcx
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpextrq $1, %xmm6, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT:    vmovq %xmm7, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT:    vpextrq $1, %xmm7, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT:    vmovq %xmm1, %rcx
+; AVX1-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm5[0],mem[0],xmm5[2,3]
+; AVX1-NEXT:    vmovq %xmm10, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm10, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; AVX1-NEXT:    vmovq %xmm4, %rcx
+; AVX1-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0],mem[0],xmm5[2,3]
+; AVX1-NEXT:    vmovq %xmm3, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX1-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],mem[0]
+; AVX1-NEXT:    vmovss %xmm8, 64(%rdi)
+; AVX1-NEXT:    vmovaps %xmm3, 16(%rdi)
+; AVX1-NEXT:    vmovaps %xmm1, 48(%rdi)
+; AVX1-NEXT:    vmovaps %xmm2, 32(%rdi)
+; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_mgather_v17f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    vmovq %rsi, %xmm0
+; AVX2-NEXT:    vmovd %edx, %xmm1
+; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrd $2, %r8d, %xmm1, %xmm3
+; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm4
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpsllq $2, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpsllq $2, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm4, %ymm5
+; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpsllq $2, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpinsrd $3, %r9d, %xmm3, %xmm3
+; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT:    vpsllq $2, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpmovsxdq %xmm4, %xmm4
+; AVX2-NEXT:    vpsllq $2, %xmm4, %xmm4
+; AVX2-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovq %xmm3, %rcx
+; AVX2-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX2-NEXT:    vmovq %xmm3, %rcx
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],mem[0]
+; AVX2-NEXT:    vmovq %xmm5, %rcx
+; AVX2-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpextrq $1, %xmm5, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX2-NEXT:    vmovq %xmm5, %rcx
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX2-NEXT:    vpextrq $1, %xmm5, %rcx
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX2-NEXT:    vmovq %xmm1, %rcx
+; AVX2-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3]
+; AVX2-NEXT:    vmovq %xmm1, %rcx
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3]
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm5[0,1,2],mem[0]
+; AVX2-NEXT:    vmovq %xmm2, %rcx
+; AVX2-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3]
+; AVX2-NEXT:    vmovq %xmm2, %rcx
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3]
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0,1,2],mem[0]
+; AVX2-NEXT:    vmovss %xmm0, 64(%rdi)
+; AVX2-NEXT:    vmovaps %xmm2, 16(%rdi)
+; AVX2-NEXT:    vmovaps %xmm1, 48(%rdi)
+; AVX2-NEXT:    vmovaps %xmm4, 32(%rdi)
+; AVX2-NEXT:    vmovaps %xmm3, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_mgather_v17f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX512-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vmovd %edx, %xmm1
+; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrd $2, %r8d, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrd $3, %r9d, %xmm1, %xmm1
+; AVX512-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX512-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX512-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm2 {%k1}
+; AVX512-NEXT:    movw $1, %cx
+; AVX512-NEXT:    kmovw %ecx, %k1
+; AVX512-NEXT:    vgatherdps (%rsi,%zmm1,4), %zmm0 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, 64(%rdi)
+; AVX512-NEXT:    vmovaps %zmm2, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+{
+  %gep = getelementptr float, float* %base, <17 x i32> %index
+  %res = call <17 x float> @llvm.masked.gather.v17f32.v17p0f32(<17 x float*> %gep, i32 4, <17 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <17 x float> undef)
+  ret <17 x float> %res
+}
+
+declare <17 x float> @llvm.masked.gather.v17f32.v17p0f32(<17 x float*>, i32 immarg, <17 x i1>, <17 x float>)
+
 declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
 declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)