Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -12982,6 +12982,12 @@
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
 
+  if (OpVT.getVectorElementType() == MVT::i1)
+    return insert1BitVector(Op, DAG, Subtarget);
+
+  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+         "Can only insert into 256-bit or 512-bit vectors");
+
   // Fold two 16-byte subvector loads into one 32-byte load:
   // (insert_subvector (insert_subvector undef, (load addr), 0),
   //                   (load addr + 16), Elts/2)
@@ -12990,9 +12996,10 @@
   // (insert_subvector (insert_subvector undef, (load addr), 0),
   //                   (load addr), Elts/2)
   // --> X86SubVBroadcast(load16 addr)
+  // Similarly for a 32-byte subvector into a 64-byte vector.
   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
-      OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
     if (Idx2 && Idx2->getZExtValue() == 0) {
       SDValue SubVec2 = Vec.getOperand(1);
@@ -13020,12 +13027,6 @@
     }
   }
 
-  if (OpVT.getVectorElementType() == MVT::i1)
-    return insert1BitVector(Op, DAG, Subtarget);
-
-  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
-         "Can only insert into 256-bit or 512-bit vectors");
-
   if (SubVecVT.is128BitVector())
     return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
 
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -1043,6 +1043,28 @@
                                v8f64_info, v4f64x_info>, VEX_W,
                                EVEX_V512, EVEX_CD8<64, CD8VT4>;
 
+let Predicates = [HasAVX512] in {
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+          (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+          (v8i32 VR256X:$src), 1)>;
+def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+          (v16i16 VR256X:$src), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+          (v32i8 VR256X:$src), 1)>;
+}
+
 let Predicates = [HasVLX] in {
 defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
                            v8i32x_info, v4i32x_info>,
@@ -1106,6 +1128,22 @@
                          (v2i64 VR128X:$src), 1)>;
 }
 
+let Predicates = [HasAVX512, NoDQI] in {
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+          (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+          (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+          (v8i32 VR256X:$src), 1)>;
+}
+
 let Predicates = [HasDQI] in {
 defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
                        v8i64_info, v2i64x_info>, VEX_W,
@@ -1119,6 +1157,15 @@
 defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
                        v16f32_info, v8f32x_info>,
                        EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+          (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+          (v8i32 VR256X:$src), 1)>;
 }
 
 multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
Index: test/CodeGen/X86/avx512-vbroadcasti256.ll
===================================================================
--- test/CodeGen/X86/avx512-vbroadcasti256.ll
+++ test/CodeGen/X86/avx512-vbroadcasti256.ll
@@ -6,8 +6,7 @@
 define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x double>, <4 x double> *%p
@@ -19,8 +18,7 @@
 define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovdqa64 (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64> *%p
@@ -32,22 +30,19 @@
 define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512VL-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512VL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512VL-NEXT:    retq
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vmovaps (%rdi), %ymm0
-; X64-AVX512DQVL-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQVL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512DQVL-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
@@ -59,22 +54,19 @@
 define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512VL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512VL-NEXT:    retq
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512DQVL-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQVL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512DQVL-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
@@ -93,8 +85,7 @@
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_16i16_32i16:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqu16 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddw {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
@@ -120,8 +111,7 @@
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_32i8_64i8:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqu8 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddb {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
Index: test/CodeGen/X86/subvector-broadcast.ll
===================================================================
--- test/CodeGen/X86/subvector-broadcast.ll
+++ test/CodeGen/X86/subvector-broadcast.ll
@@ -131,8 +131,7 @@
 ; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_4f64_8f64:
@@ -143,8 +142,7 @@
 ;
 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x double>, <4 x double> *%p
  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -281,8 +279,7 @@
 ; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vmovdqa64 (%eax), %ymm0
-; X32-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_4i64_8i64:
@@ -293,8 +290,7 @@
 ;
 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovdqa64 (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64> *%p
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -396,22 +392,19 @@
 ; X32-AVX512F-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512DQ:       ## BB#0:
 ; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
-; X32-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X32-AVX512DQ-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
@@ -422,20 +415,17 @@
 ;
 ; X64-AVX512F-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512DQ:       ## BB#0:
-; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
-; X64-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQ-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -550,22 +540,19 @@
 ; X32-AVX512F-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512DQ:       ## BB#0:
 ; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X32-AVX512DQ-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
@@ -576,20 +563,17 @@
 ;
 ; X64-AVX512F-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512DQ:       ## BB#0:
-; X64-AVX512DQ-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQ-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -711,8 +695,7 @@
 ; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqu16 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
@@ -736,8 +719,7 @@
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqu16 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
@@ -865,8 +847,7 @@
 ; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqu8 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
@@ -890,8 +871,7 @@
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqu8 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
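For reference, here is a minimal standalone reproducer (not part of the patch) of the shape this change targets: a 256-bit load reused as both halves of a 512-bit vector, which should now select a single vbroadcastf64x4 instead of vmovapd + vinsertf64x4. The function name and the llc invocation are illustrative assumptions that mirror the updated tests above.

; Assumed check, e.g.: llc -mtriple=x86_64-apple-darwin -mattr=+avx512f < repro.ll
define <8 x double> @repro_broadcast_4f64_8f64(<4 x double>* %p) nounwind {
  ; load a 256-bit vector, then splat it into both 256-bit halves of a 512-bit vector
  %v = load <4 x double>, <4 x double>* %p
  %b = shufflevector <4 x double> %v, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x double> %b
}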