Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5394,8 +5394,18 @@
     IsUnary = true;
     break;
   case X86ISD::VBROADCAST: {
-    // We only decode broadcasts of same-sized vectors at the moment.
-    if (N->getOperand(0).getValueType() == VT) {
+    SDValue N0 = N->getOperand(0);
+    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+    // add the pre-extracted value to the Ops vector.
+    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N0.getOperand(0).getValueType() == VT &&
+        N0.getConstantOperandVal(1) == 0)
+      Ops.push_back(N0.getOperand(0));
+
+    // We only decode broadcasts of same-sized vectors, unless the broadcast
+    // came from an extract from the original width. If we found one, we
+    // pushed it onto the Ops vector above.
+    if (N0.getValueType() == VT || !Ops.empty()) {
       DecodeVectorBroadcast(VT, Mask);
       IsUnary = true;
       break;
@@ -9716,6 +9726,12 @@
     BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
   }
 
+  // We only support broadcasting from 128-bit vectors to minimize the
+  // number of patterns we need to deal with in isel. So extract down to
+  // 128-bits.
+  if (SrcVT.getSizeInBits() > 128)
+    V = extract128BitVector(V, 0, DAG, DL);
+
   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
 }
Index: test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- test/CodeGen/X86/masked_gather_scatter.ll
+++ test/CodeGen/X86/masked_gather_scatter.ll
@@ -714,8 +714,7 @@
 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ; KNL_64-LABEL: test14:
 ; KNL_64: # BB#0:
-; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
-; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
 ; KNL_64-NEXT: vmovd %esi, %xmm1
 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
@@ -731,8 +730,7 @@
 ;
 ; KNL_32-LABEL: test14:
 ; KNL_32: # BB#0:
-; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
@@ -742,8 +740,7 @@
 ;
 ; SKX-LABEL: test14:
 ; SKX: # BB#0:
-; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
-; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
 ; SKX-NEXT: vpbroadcastd %esi, %ymm1
 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
@@ -758,8 +755,7 @@
 ;
 ; SKX_32-LABEL: test14:
 ; SKX_32: # BB#0:
-; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
@@ -1541,8 +1537,186 @@
 ; Check non-power-of-2 case. It should be scalarized.
 declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
-; ALL-LABEL: test30:
-; ALL-NOT: gather
+; KNL_64-LABEL: test30:
+; KNL_64: # BB#0:
+; KNL_64-NEXT: andl $1, %edx
+; KNL_64-NEXT: kmovw %edx, %k0
+; KNL_64-NEXT: andl $1, %esi
+; KNL_64-NEXT: kmovw %esi, %k1
+; KNL_64-NEXT: movl %edi, %eax
+; KNL_64-NEXT: andl $1, %eax
+; KNL_64-NEXT: kmovw %eax, %k2
+; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
+; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; KNL_64-NEXT: # implicit-def: %XMM0
+; KNL_64-NEXT: testb $1, %dil
+; KNL_64-NEXT: je .LBB29_2
+; KNL_64-NEXT: # BB#1: # %cond.load
+; KNL_64-NEXT: vmovq %xmm1, %rax
+; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_64-NEXT: .LBB29_2: # %else
+; KNL_64-NEXT: kmovw %k1, %eax
+; KNL_64-NEXT: andl $1, %eax
+; KNL_64-NEXT: testb %al, %al
+; KNL_64-NEXT: je .LBB29_4
+; KNL_64-NEXT: # BB#3: # %cond.load1
+; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
+; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
+; KNL_64-NEXT: .LBB29_4: # %else2
+; KNL_64-NEXT: kmovw %k0, %eax
+; KNL_64-NEXT: andl $1, %eax
+; KNL_64-NEXT: testb %al, %al
+; KNL_64-NEXT: je .LBB29_6
+; KNL_64-NEXT: # BB#5: # %cond.load4
+; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL_64-NEXT: vmovq %xmm1, %rax
+; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
+; KNL_64-NEXT: .LBB29_6: # %else5
+; KNL_64-NEXT: kmovw %k1, %eax
+; KNL_64-NEXT: kmovw %k2, %ecx
+; KNL_64-NEXT: vmovd %ecx, %xmm1
+; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_64-NEXT: kmovw %k0, %eax
+; KNL_64-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test30:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: kmovw %eax, %k0
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: kmovw %eax, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl %eax, %ecx
+; KNL_32-NEXT: andl $1, %ecx
+; KNL_32-NEXT: kmovw %ecx, %k2
+; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
+; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; KNL_32-NEXT: # implicit-def: %XMM0
+; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: je .LBB29_2
+; KNL_32-NEXT: # BB#1: # %cond.load
+; KNL_32-NEXT: vmovd %xmm1, %eax
+; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_32-NEXT: .LBB29_2: # %else
+; KNL_32-NEXT: kmovw %k1, %eax
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: testb %al, %al
+; KNL_32-NEXT: je .LBB29_4
+; KNL_32-NEXT: # BB#3: # %cond.load1
+; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
+; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
+; KNL_32-NEXT: .LBB29_4: # %else2
+; KNL_32-NEXT: kmovw %k0, %eax
+; KNL_32-NEXT: andl $1, %eax
+; KNL_32-NEXT: testb %al, %al
+; KNL_32-NEXT: je .LBB29_6
+; KNL_32-NEXT: # BB#5: # %cond.load4
+; KNL_32-NEXT: vpextrd $2, %xmm1, %eax
+; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
+; KNL_32-NEXT: .LBB29_6: # %else5
+; KNL_32-NEXT: kmovw %k1, %eax
+; KNL_32-NEXT: kmovw %k2, %ecx
+; KNL_32-NEXT: vmovd %ecx, %xmm1
+; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; KNL_32-NEXT: kmovw %k0, %eax
+; KNL_32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test30:
+; SKX: # BB#0:
+; SKX-NEXT: vpslld $31, %xmm2, %xmm2
+; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
+; SKX-NEXT: kshiftlw $15, %k1, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
+; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: # implicit-def: %XMM0
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: je .LBB29_2
+; SKX-NEXT: # BB#1: # %cond.load
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: .LBB29_2: # %else
+; SKX-NEXT: kshiftlw $14, %k1, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: je .LBB29_4
+; SKX-NEXT: # BB#3: # %cond.load1
+; SKX-NEXT: vpextrq $1, %xmm1, %rax
+; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
+; SKX-NEXT: .LBB29_4: # %else2
+; SKX-NEXT: kshiftlw $13, %k1, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: je .LBB29_6
+; SKX-NEXT: # BB#5: # %cond.load4
+; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
+; SKX-NEXT: .LBB29_6: # %else5
+; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
+; SKX-NEXT: vmovdqa %xmm3, %xmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test30:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: subl $12, %esp
+; SKX_32-NEXT: .Lcfi0:
+; SKX_32-NEXT: .cfi_def_cfa_offset 16
+; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
+; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
+; SKX_32-NEXT: kshiftlw $15, %k1, %k0
+; SKX_32-NEXT: kshiftrw $15, %k0, %k0
+; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
+; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: andl $1, %eax
+; SKX_32-NEXT: # implicit-def: %XMM1
+; SKX_32-NEXT: testb %al, %al
+; SKX_32-NEXT: je .LBB29_2
+; SKX_32-NEXT: # BB#1: # %cond.load
+; SKX_32-NEXT: vmovd %xmm2, %eax
+; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX_32-NEXT: .LBB29_2: # %else
+; SKX_32-NEXT: kshiftlw $14, %k1, %k0
+; SKX_32-NEXT: kshiftrw $15, %k0, %k0
+; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: andl $1, %eax
+; SKX_32-NEXT: testb %al, %al
+; SKX_32-NEXT: je .LBB29_4
+; SKX_32-NEXT: # BB#3: # %cond.load1
+; SKX_32-NEXT: vpextrd $1, %xmm2, %eax
+; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: .LBB29_4: # %else2
+; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
+; SKX_32-NEXT: kshiftlw $13, %k1, %k0
+; SKX_32-NEXT: kshiftrw $15, %k0, %k0
+; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: andl $1, %eax
+; SKX_32-NEXT: testb %al, %al
+; SKX_32-NEXT: je .LBB29_6
+; SKX_32-NEXT: # BB#5: # %cond.load4
+; SKX_32-NEXT: vpextrd $2, %xmm2, %eax
+; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: .LBB29_6: # %else5
+; SKX_32-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; SKX_32-NEXT: addl $12, %esp
+; SKX_32-NEXT: retl
   %sext_ind = sext <3 x i32> %ind to <3 x i64>
   %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
@@ -2086,6 +2260,34 @@
 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; KNL_64-NEXT: retq
 ;
+; KNL_32-LABEL: test_pr28312:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Lcfi12:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Lcfi13:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Lcfi14:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-32, %esp
+; KNL_32-NEXT: subl $32, %esp
+; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
+; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
 ; SKX-LABEL: test_pr28312:
 ; SKX: # BB#0:
 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
@@ -2094,6 +2296,27 @@
 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_pr28312:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: pushl %ebp
+; SKX_32-NEXT: .Lcfi13:
+; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: .Lcfi14:
+; SKX_32-NEXT: .cfi_offset %ebp, -8
+; SKX_32-NEXT: movl %esp, %ebp
+; SKX_32-NEXT: .Lcfi15:
+; SKX_32-NEXT: .cfi_def_cfa_register %ebp
+; SKX_32-NEXT: andl $-32, %esp
+; SKX_32-NEXT: subl $32, %esp
+; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
+; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
+; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; SKX_32-NEXT: movl %ebp, %esp
+; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: retl
   %g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
   %g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
   %g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
Index: test/CodeGen/X86/vector-shuffle-avx512.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-avx512.ll
+++ test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -126,7 +126,6 @@
 ;
 ; KNL64-LABEL: expand3:
 ; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; KNL64-NEXT: vpbroadcastq %xmm0, %ymm0
 ; KNL64-NEXT: vpxor %ymm1, %ymm1, %ymm1
 ; KNL64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
@@ -142,7 +141,6 @@
 ;
 ; KNL32-LABEL: expand3:
 ; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; KNL32-NEXT: vpbroadcastq %xmm0, %ymm0
 ; KNL32-NEXT: vpxor %ymm1, %ymm1, %ymm1
 ; KNL32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
Index: test/CodeGen/X86/widened-broadcast.ll
===================================================================
--- test/CodeGen/X86/widened-broadcast.ll
+++ test/CodeGen/X86/widened-broadcast.ll
@@ -51,14 +51,12 @@
 ;
 ; AVX2-LABEL: load_splat_8f32_4f32_01010101:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_8f32_4f32_01010101:
 ; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT: retq
 entry:
   %ld = load <4 x float>, <4 x float>* %ptr
@@ -131,14 +129,12 @@
 ;
 ; AVX2-LABEL: load_splat_8i32_4i32_01010101:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_8i32_4i32_01010101:
 ; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT: retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
@@ -242,14 +238,12 @@
 ;
 ; AVX2-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastss (%rdi), %ymm0
 ; AVX512-NEXT: retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -272,14 +266,12 @@
 ;
 ; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
 ; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT: retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -442,14 +434,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512-NEXT: vpbroadcastw (%rdi), %ymm0
 ; AVX512-NEXT: retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -472,14 +462,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastss (%rdi), %ymm0
 ; AVX512-NEXT: retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -502,14 +490,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
 ; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT: retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
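As an illustrative note (not part of the patch): the second X86ISelLowering.cpp hunk narrows any broadcast source wider than 128 bits before building the X86ISD::VBROADCAST node, and the first hunk lets the shuffle-mask decoder look through the resulting EXTRACT_SUBVECTOR from index 0. A minimal IR sketch that would hit the new path is below; the function name is invented for illustration and an AVX2-capable target is assumed. The splat source is a 256-bit register value, so with this change lowering should first extract its low 128 bits and then broadcast from an xmm source (e.g. vbroadcastss %xmm0, %ymm0).

define <8 x float> @splat_elt0_from_ymm(<8 x float> %v) {
  ; Splat lane 0 of a 256-bit source; SrcVT is 256 bits, so the broadcast
  ; input is reduced to the low 128-bit subvector before isel.
  %splat = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %splat
}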