Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -13024,6 +13024,12 @@ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); } } + // If this is subv_broadcast insert into both halves, use a larger + // subv_broadcast. + if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, + SubVec.getOperand(0)); + } } } Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -1063,6 +1063,51 @@ def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))), (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v32i8 VR256X:$src), 1)>; + +def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4rm addr:$src)>; +def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4rm addr:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. +def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))), + (VINSERTF64x4Zrr + (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))), + (VINSERTI64x4Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; + +def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))), + (VINSERTI64x4Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))), + (VINSERTI64x4Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; } let Predicates = [HasVLX] in { @@ -1129,6 +1174,30 @@ } let Predicates = [HasAVX512, NoDQI] in { +def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))), + (VBROADCASTF32X4rm addr:$src)>; +def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI32X4rm addr:$src)>; + +def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), + (VINSERTF64x4Zrr + (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), + (VINSERTI64x4Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; + def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))), (VBROADCASTF64X4rm addr:$src)>; def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))), @@ -1166,6 +1235,25 @@ def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v8i32 VR256X:$src), 1)>; + +def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), + (VINSERTF32x8Zrr + (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), + (VINSERTI32x8Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; } multiclass avx512_common_broadcast_32x2 opc, string OpcodeStr, Index: test/CodeGen/X86/avx512-vbroadcasti128.ll =================================================================== --- test/CodeGen/X86/avx512-vbroadcasti128.ll +++ test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -110,22 +110,19 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_2f64_8f64: ; X64-AVX512VL: ## BB#0: -; X64-AVX512VL-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512VL-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_2f64_8f64: ; X64-AVX512BWVL: ## BB#0: -; X64-AVX512BWVL-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BWVL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BWVL-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BWVL-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 ; X64-AVX512BWVL-NEXT: retq ; ; X64-AVX512DQVL-LABEL: test_broadcast_2f64_8f64: ; X64-AVX512DQVL: ## BB#0: -; X64-AVX512DQVL-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512DQVL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQVL-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1] ; X64-AVX512DQVL-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <2 x double>, <2 x double> *%p @@ -137,22 +134,19 @@ define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_2i64_8i64: ; X64-AVX512VL: ## BB#0: -; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512VL-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_2i64_8i64: ; X64-AVX512BWVL: ## BB#0: -; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BWVL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BWVL-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; X64-AVX512BWVL-NEXT: retq ; ; X64-AVX512DQVL-LABEL: test_broadcast_2i64_8i64: ; X64-AVX512DQVL: ## BB#0: -; X64-AVX512DQVL-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQVL-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1] ; X64-AVX512DQVL-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <2 x i64>, <2 x i64> *%p @@ -162,26 +156,11 @@ } define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind { -; X64-AVX512VL-LABEL: test_broadcast_4f32_16f32: -; X64-AVX512VL: ## BB#0: -; X64-AVX512VL-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512VL-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 -; X64-AVX512VL-NEXT: retq -; -; X64-AVX512BWVL-LABEL: test_broadcast_4f32_16f32: -; X64-AVX512BWVL: ## BB#0: -; X64-AVX512BWVL-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BWVL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BWVL-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 -; X64-AVX512BWVL-NEXT: retq -; -; X64-AVX512DQVL-LABEL: test_broadcast_4f32_16f32: -; X64-AVX512DQVL: ## BB#0: -; X64-AVX512DQVL-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512DQVL-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512DQVL-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 -; X64-AVX512DQVL-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_4f32_16f32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = load <4 x float>, <4 x float> *%p %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> %3 = fadd <16 x float> %2, @@ -189,26 +168,11 @@ } define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind { -; X64-AVX512VL-LABEL: test_broadcast_4i32_16i32: -; X64-AVX512VL: ## BB#0: -; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512VL-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 -; X64-AVX512VL-NEXT: retq -; -; X64-AVX512BWVL-LABEL: test_broadcast_4i32_16i32: -; X64-AVX512BWVL: ## BB#0: -; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BWVL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BWVL-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 -; X64-AVX512BWVL-NEXT: retq -; -; X64-AVX512DQVL-LABEL: test_broadcast_4i32_16i32: -; X64-AVX512DQVL: ## BB#0: -; X64-AVX512DQVL-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512DQVL-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512DQVL-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 -; X64-AVX512DQVL-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_4i32_16i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32> *%p %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> %3 = add <16 x i32> %2, @@ -225,8 +189,7 @@ ; ; X64-AVX512BWVL-LABEL: test_broadcast_8i16_32i16: ; X64-AVX512BWVL: ## BB#0: -; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BWVL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BWVL-NEXT: vpaddw {{.*}}(%rip), %zmm0, %zmm0 ; X64-AVX512BWVL-NEXT: retq ; @@ -252,8 +215,7 @@ ; ; X64-AVX512BWVL-LABEL: test_broadcast_16i8_64i8: ; X64-AVX512BWVL: ## BB#0: -; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BWVL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BWVL-NEXT: vpaddb {{.*}}(%rip), %zmm0, %zmm0 ; X64-AVX512BWVL-NEXT: retq ; Index: test/CodeGen/X86/subvector-broadcast.ll =================================================================== --- test/CodeGen/X86/subvector-broadcast.ll +++ test/CodeGen/X86/subvector-broadcast.ll @@ -74,22 +74,19 @@ ; X32-AVX512F-LABEL: test_broadcast_2f64_8f64: ; X32-AVX512F: ## BB#0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_2f64_8f64: ; X32-AVX512BW: ## BB#0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_2f64_8f64: ; X32-AVX512DQ: ## BB#0: ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1] ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_2f64_8f64: @@ -100,20 +97,17 @@ ; ; X64-AVX512F-LABEL: test_broadcast_2f64_8f64: ; X64-AVX512F: ## BB#0: -; X64-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_2f64_8f64: ; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_2f64_8f64: ; X64-AVX512DQ: ## BB#0: -; X64-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1] ; X64-AVX512DQ-NEXT: retq %1 = load <2 x double>, <2 x double> *%p %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> @@ -216,22 +210,19 @@ ; X32-AVX512F-LABEL: test_broadcast_2i64_8i64: ; X32-AVX512F: ## BB#0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_2i64_8i64: ; X32-AVX512BW: ## BB#0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_2i64_8i64: ; X32-AVX512DQ: ## BB#0: ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1] ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX1-LABEL: test_broadcast_2i64_8i64: @@ -248,20 +239,17 @@ ; ; X64-AVX512F-LABEL: test_broadcast_2i64_8i64: ; X64-AVX512F: ## BB#0: -; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_2i64_8i64: ; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_2i64_8i64: ; X64-AVX512DQ: ## BB#0: -; X64-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1] ; X64-AVX512DQ-NEXT: retq %1 = load <2 x i64>, <2 x i64> *%p %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> @@ -332,26 +320,11 @@ ; X32-AVX-NEXT: vmovdqa %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: test_broadcast_4f32_16f32: -; X32-AVX512F: ## BB#0: -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: test_broadcast_4f32_16f32: -; X32-AVX512BW: ## BB#0: -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: test_broadcast_4f32_16f32: -; X32-AVX512DQ: ## BB#0: -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: test_broadcast_4f32_16f32: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4f32_16f32: ; X64-AVX: ## BB#0: @@ -359,23 +332,10 @@ ; X64-AVX-NEXT: vmovdqa %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: test_broadcast_4f32_16f32: -; X64-AVX512F: ## BB#0: -; X64-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: test_broadcast_4f32_16f32: -; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: test_broadcast_4f32_16f32: -; X64-AVX512DQ: ## BB#0: -; X64-AVX512DQ-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_4f32_16f32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: retq %1 = load <4 x float>, <4 x float> *%p %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> ret <16 x float> %2 @@ -474,26 +434,11 @@ ; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX2-NEXT: retl ; -; X32-AVX512F-LABEL: test_broadcast_4i32_16i32: -; X32-AVX512F: ## BB#0: -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32: -; X32-AVX512BW: ## BB#0: -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32: -; X32-AVX512DQ: ## BB#0: -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: test_broadcast_4i32_16i32: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X32-AVX512-NEXT: retl ; ; X64-AVX1-LABEL: test_broadcast_4i32_16i32: ; X64-AVX1: ## BB#0: @@ -507,23 +452,10 @@ ; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX2-NEXT: retq ; -; X64-AVX512F-LABEL: test_broadcast_4i32_16i32: -; X64-AVX512F: ## BB#0: -; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32: -; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32: -; X64-AVX512DQ: ## BB#0: -; X64-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_4i32_16i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32> *%p %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> ret <16 x i32> %2 @@ -632,8 +564,7 @@ ; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16: ; X32-AVX512BW: ## BB#0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16: @@ -663,8 +594,7 @@ ; ; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16: ; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16: @@ -784,8 +714,7 @@ ; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8: ; X32-AVX512BW: ## BB#0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8: @@ -815,8 +744,7 @@ ; ; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8: ; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8: @@ -1345,7 +1273,7 @@ ; X32-AVX512F-NEXT: vmovdqa64 (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxord %xmm1, %xmm1, %xmm1 ; X32-AVX512F-NEXT: vmovdqa32 %xmm1, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0 ; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X32-AVX512F-NEXT: retl ; @@ -1356,7 +1284,7 @@ ; X32-AVX512BW-NEXT: vmovdqa64 (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxord %xmm1, %xmm1, %xmm1 ; X32-AVX512BW-NEXT: vmovdqa32 %xmm1, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0 ; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X32-AVX512BW-NEXT: retl ; @@ -1367,7 +1295,7 @@ ; X32-AVX512DQ-NEXT: vmovdqa64 (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) -; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0 ; X32-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 ; X32-AVX512DQ-NEXT: retl ; @@ -1385,7 +1313,7 @@ ; X64-AVX512F-NEXT: vmovdqa64 (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxord %xmm1, %xmm1, %xmm1 ; X64-AVX512F-NEXT: vmovdqa32 %xmm1, (%rsi) -; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0 ; X64-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512F-NEXT: retq ; @@ -1394,7 +1322,7 @@ ; X64-AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vpxord %xmm1, %xmm1, %xmm1 ; X64-AVX512BW-NEXT: vmovdqa32 %xmm1, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0 ; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512BW-NEXT: retq ; @@ -1403,7 +1331,7 @@ ; X64-AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0 ; X64-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0