Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -285,6 +285,28 @@ // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the // perserved vector elements come from a new dummy input operand tied to $dst. +// This version uses a separate dag for non-masking and masking. +multiclass AVX512_maskable_split O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskRHS, + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0, bit IsKCommutable = 0, + SDNode Select = vselect> : + AVX512_maskable_custom; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// preserved vector elements come from a new dummy input operand tied to $dst. multiclass AVX512_maskable O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, @@ -512,28 +534,45 @@ //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // -multiclass vinsert_for_size { + +// Supports two different pattern operators for mask and unmasked ops. Allows +// null_frag to be passed for one. +multiclass vinsert_for_size_split { let ExeDomain = To.ExeDomain in { - defm rr : AVX512_maskable, AVX512AIi8Base, EVEX_4V; + (iPTR imm)), + (vinsert_for_mask:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V; - defm rm : AVX512_maskable, AVX512AIi8Base, EVEX_4V, EVEX_CD8; } } +// Passes the same pattern operator for masked and unmasked ops. 
+multiclass vinsert_for_size : + vinsert_for_size_split; + multiclass vinsert_for_size_lowering p> { @@ -573,22 +612,24 @@ X86VectorVTInfo< 8, EltVT64, VR512>, vinsert256_insert>, VEX_W, EVEX_V512; + // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasVLX, HasDQI] in - defm NAME # "64x2Z256" : vinsert_for_size, X86VectorVTInfo< 4, EltVT64, VR256X>, - vinsert128_insert>, VEX_W, EVEX_V256; + null_frag, vinsert128_insert>, VEX_W, EVEX_V256; + // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasDQI] in { - defm NAME # "64x2Z" : vinsert_for_size, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert>, VEX_W, EVEX_V512; + null_frag, vinsert128_insert>, VEX_W, EVEX_V512; - defm NAME # "32x8Z" : vinsert_for_size, X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert>, EVEX_V512; + null_frag, vinsert256_insert>, EVEX_V512; } } @@ -596,21 +637,21 @@ defm VINSERTI : vinsert_for_type; // Codegen pattern with the alternative types, -// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. +// Even with AVX512DQ we'll still use these for unmasked operations. 
defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info, - vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info, - vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; // Codegen pattern with the alternative types insert VEC128 into VEC256 defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, @@ -647,16 +688,20 @@ // AVX-512 VECTOR EXTRACT //--- -multiclass vextract_for_size { +// Supports two different pattern operators for mask and unmasked ops. Allows +// null_frag to be passed for one. 
+multiclass vextract_for_size_split { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { - defm rr : AVX512_maskable, + (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)), + (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm))>, AVX512AIi8Base, EVEX; def mr : AVX512AIi8 : + vextract_for_size_split; + // Codegen pattern for the alternative types multiclass vextract_for_size_lowering, vextract128_extract>, EVEX_V256, EVEX_CD8<32, CD8VT4>; + + // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasVLX, HasDQI] in - defm NAME # "64x2Z256" : vextract_for_size, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract>, + null_frag, vextract128_extract>, VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + + // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasDQI] in { - defm NAME # "64x2Z" : vextract_for_size, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract>, + null_frag, vextract128_extract>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; - defm NAME # "32x8Z" : vextract_for_size, X86VectorVTInfo< 8, EltVT32, VR256X>, - vextract256_extract>, + null_frag, vextract256_extract>, EVEX_V512, EVEX_CD8<32, CD8VT8>; } } @@ -737,21 +792,21 @@ defm VEXTRACTI : vextract_for_type; // extract_subvector codegen patterns with the alternative types. -// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. +// Even with AVX512DQ we'll still use these for unmasked operations. 
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, - vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, - vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; // Codegen pattern with the alternative types extract VEC128 from VEC256 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info, Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -642,19 +642,12 @@ } define <16 x float> @fptrunc00(<16 x double> %b) nounwind { -; NODQ-LABEL: fptrunc00: -; NODQ: # BB#0: -; NODQ-NEXT: vcvtpd2ps %zmm0, %ymm0 -; NODQ-NEXT: vcvtpd2ps %zmm1, %ymm1 -; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; NODQ-NEXT: retq -; -; DQ-LABEL: fptrunc00: -; DQ: # BB#0: -; DQ-NEXT: vcvtpd2ps %zmm0, %ymm0 -; DQ-NEXT: vcvtpd2ps 
%zmm1, %ymm1 -; DQ-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 -; DQ-NEXT: retq +; ALL-LABEL: fptrunc00: +; ALL: # BB#0: +; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0 +; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq %a = fptrunc <16 x double> %b to <16 x float> ret <16 x float> %a } @@ -876,21 +869,13 @@ } define <16 x double> @uitof64(<16 x i32> %a) nounwind { -; NODQ-LABEL: uitof64: -; NODQ: # BB#0: -; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm2 -; NODQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm1 -; NODQ-NEXT: vmovaps %zmm2, %zmm0 -; NODQ-NEXT: retq -; -; DQ-LABEL: uitof64: -; DQ: # BB#0: -; DQ-NEXT: vcvtudq2pd %ymm0, %zmm2 -; DQ-NEXT: vextractf32x8 $1, %zmm0, %ymm0 -; DQ-NEXT: vcvtudq2pd %ymm0, %zmm1 -; DQ-NEXT: vmovaps %zmm2, %zmm0 -; DQ-NEXT: retq +; ALL-LABEL: uitof64: +; ALL: # BB#0: +; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1 +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq %b = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -20,25 +20,15 @@ } define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { -; KNL-LABEL: test2: -; KNL: ## BB#0: -; KNL-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 -; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; KNL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 -; KNL-NEXT: retq -; KNL-NEXT: ## -- End function -; -; SKX-LABEL: test2: -; SKX: ## BB#0: -; SKX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; SKX-NEXT: vinsertf64x2 $0, %xmm2, %zmm0, %zmm2 -; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm0 -; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; 
SKX-NEXT: vinsertf64x2 $3, %xmm0, %zmm2, %zmm0 -; SKX-NEXT: retq -; SKX-NEXT: ## -- End function +; CHECK-LABEL: test2: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: ## -- End function %rrr = load double, double* %br %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1 %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6 @@ -59,23 +49,14 @@ } define <8 x i64> @test4(<8 x i64> %x) nounwind { -; KNL-LABEL: test4: -; KNL: ## BB#0: -; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1 -; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; KNL-NEXT: ## -- End function -; -; SKX-LABEL: test4: -; SKX: ## BB#0: -; SKX-NEXT: vextracti64x2 $2, %zmm0, %xmm1 -; SKX-NEXT: vmovq %xmm1, %rax -; SKX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1 -; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0 -; SKX-NEXT: retq -; SKX-NEXT: ## -- End function +; CHECK-LABEL: test4: +; CHECK: ## BB#0: +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; CHECK-NEXT: vmovq %xmm1, %rax +; CHECK-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1 +; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: ## -- End function %eee = extractelement <8 x i64> %x, i32 4 %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1 ret <8 x i64> %rrr2 @@ -477,7 +458,7 @@ ; SKX-LABEL: extract_v8i64: ; SKX: ## BB#0: ; SKX-NEXT: vpextrq $1, %xmm0, %rax -; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0 +; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -693,23 +674,14 @@ } define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) { -; KNL-LABEL: insert_v8i64: -; KNL: ## BB#0: -; KNL-NEXT: vpinsrq 
$1, (%rsi), %xmm0, %xmm1 -; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 -; KNL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; KNL-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: insert_v8i64: -; SKX: ## BB#0: -; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 -; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm1 -; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0 -; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; SKX-NEXT: vinserti64x2 $1, %xmm0, %zmm1, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: insert_v8i64: +; CHECK: ## BB#0: +; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 +; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 +; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <8 x i64> %x, i64 %val, i32 1 %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3 @@ -888,17 +860,11 @@ } define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) { -; KNL-LABEL: test_insert_128_v8i64: -; KNL: ## BB#0: -; KNL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 -; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test_insert_128_v8i64: -; SKX: ## BB#0: -; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 -; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: test_insert_128_v8i64: +; CHECK: ## BB#0: +; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 +; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %r = insertelement <8 x i64> %x, i64 %y, i32 1 ret <8 x i64> %r } @@ -914,17 +880,11 @@ } define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) { -; KNL-LABEL: test_insert_128_v8f64: -; KNL: ## BB#0: -; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] -; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test_insert_128_v8f64: -; SKX: ## BB#0: -; SKX-NEXT: vunpcklpd {{.*#+}} 
xmm1 = xmm0[0],xmm1[0] -; SKX-NEXT: vinsertf64x2 $0, %xmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: test_insert_128_v8f64: +; CHECK: ## BB#0: +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %r = insertelement <8 x double> %x, double %y, i32 1 ret <8 x double> %r } Index: test/CodeGen/X86/avx512-trunc.ll =================================================================== --- test/CodeGen/X86/avx512-trunc.ll +++ test/CodeGen/X86/avx512-trunc.ll @@ -726,27 +726,16 @@ } define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) { -; KNL-LABEL: usat_trunc_qw_1024: -; KNL: ## BB#0: -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2 -; KNL-NEXT: vpminuq %zmm2, %zmm1, %zmm1 -; KNL-NEXT: vpminuq %zmm2, %zmm0, %zmm0 -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vpmovqd %zmm1, %ymm1 -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: retq -; -; SKX-LABEL: usat_trunc_qw_1024: -; SKX: ## BB#0: -; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2 -; SKX-NEXT: vpminuq %zmm2, %zmm1, %zmm1 -; SKX-NEXT: vpminuq %zmm2, %zmm0, %zmm0 -; SKX-NEXT: vpmovqd %zmm0, %ymm0 -; SKX-NEXT: vpmovqd %zmm1, %ymm1 -; SKX-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; SKX-NEXT: vpmovdw %zmm0, %ymm0 -; SKX-NEXT: retq +; ALL-LABEL: usat_trunc_qw_1024: +; ALL: ## BB#0: +; ALL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; ALL-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; ALL-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; ALL-NEXT: vpmovqd %zmm0, %ymm0 +; ALL-NEXT: vpmovqd %zmm1, %ymm1 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpmovdw %zmm0, %ymm0 +; ALL-NEXT: retq %x3 = icmp ult <16 x i64> %i, %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> %x6 = trunc <16 x i64> %x5 to <16 x i16> Index: test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll =================================================================== --- 
test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -6,7 +6,7 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 +; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm0 ; CHECK-NEXT: kmovw %edi, %k0 ; CHECK-NEXT: kshiftlb $7, %k0, %k1 ; CHECK-NEXT: kshiftrb $7, %k1, %k1 @@ -36,7 +36,7 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8: ; CHECK: ## BB#0: -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -56,7 +56,7 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} @@ -76,7 +76,7 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} @@ -96,7 +96,7 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) { ; CHECK-LABEL: 
test_int_x86_avx512_mask_inserti32x8_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} @@ -116,7 +116,7 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} @@ -162,7 +162,7 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512: ; CHECK: ## BB#0: ; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 -; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm2 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 @@ -231,7 +231,7 @@ ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512: ; CHECK: ## BB#0: ; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 -; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 Index: test/CodeGen/X86/compress_expand.ll =================================================================== --- test/CodeGen/X86/compress_expand.ll +++ test/CodeGen/X86/compress_expand.ll @@ -334,7 +334,7 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %trigger) { ; SKX-LABEL: test16: ; SKX: # BB#0: -; SKX-NEXT: vextracti32x8 $1, %zmm2, %ymm3 +; SKX-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; SKX-NEXT: vpcmpeqd 
%ymm4, %ymm3, %k1 ; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k2 Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -422,10 +422,10 @@ ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3} ; SKX-NEXT: kmovw %k1, %k3 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3} -; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4 +; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} -; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 +; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0 ; SKX-NEXT: retq ; @@ -750,7 +750,7 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k2} ; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} -; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0 +; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test14: @@ -1686,11 +1686,11 @@ ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 -; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2 +; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} -; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 +; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_gather_16i32: @@ -1772,7 +1772,7 @@ ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} -; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 +; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; SKX_32-NEXT: movl %ebp, %esp @@ -1809,11 +1809,11 @@ ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2 ; 
SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 -; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2 +; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2} ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} -; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0 +; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_gather_16f32: @@ -1895,7 +1895,7 @@ ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} -; SKX_32-NEXT: vextractf32x8 $1, %zmm0, %ymm0 +; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} ; SKX_32-NEXT: vmovapd %zmm2, %zmm0 ; SKX_32-NEXT: movl %ebp, %esp @@ -1934,7 +1934,7 @@ ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} -; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0 +; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -2016,7 +2016,7 @@ ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1} -; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 +; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp @@ -2055,7 +2055,7 @@ ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} -; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0 +; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -2138,7 +2138,7 @@ ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1} -; SKX_32-NEXT: vextractf32x8 $1, %zmm0, %ymm0 +; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; SKX_32-NEXT: 
vscatterdpd %zmm1, (,%ymm0) {%k2} ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp Index: test/CodeGen/X86/stack-folding-fp-avx512.ll =================================================================== --- test/CodeGen/X86/stack-folding-fp-avx512.ll +++ test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -458,7 +458,7 @@ define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0, <8 x double> %a1) { ;CHECK-LABEL: stack_fold_extractf64x2 - ;CHECK: vextractf64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextractf32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill %1 = shufflevector <8 x double> %a0, <8 x double> %a1, <2 x i32> %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <2 x double> %1 @@ -466,7 +466,7 @@ define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0, <16 x float> %a1) { ;CHECK-LABEL: stack_fold_extracti32x8 - ;CHECK: vextractf32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill + ;CHECK: vextractf64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill %1 = shufflevector <16 x float> %a0, <16 x float> %a1, <8 x i32> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <8 x float> %1 @@ -482,7 +482,7 @@ define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) { ;CHECK-LABEL: stack_fold_insertf32x8 - ;CHECK: vinsertf32x8 $1, {{-?[0-9]*}}(%rsp), 
{{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + ;CHECK: vinsertf64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> ret <16 x float> %2 Index: test/CodeGen/X86/stack-folding-int-avx512.ll =================================================================== --- test/CodeGen/X86/stack-folding-int-avx512.ll +++ test/CodeGen/X86/stack-folding-int-avx512.ll @@ -130,7 +130,7 @@ define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) { ;CHECK-LABEL: stack_fold_extracti64x2 - ;CHECK: vextracti64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill ; add forces execution domain %1 = add <8 x i64> %a0, %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> @@ -140,7 +140,7 @@ define <8 x i32> @stack_fold_extracti32x8(<16 x i32> %a0, <16 x i32> %a1) { ;CHECK-LABEL: stack_fold_extracti32x8 - ;CHECK: vextracti32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill + ;CHECK: vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill ; add forces execution domain %1 = add <16 x i32> %a0, %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> @@ -160,7 +160,7 @@ define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) { ;CHECK-LABEL: stack_fold_inserti32x8 - ;CHECK: vinserti32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + ;CHECK: vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> ; add forces execution domain Index: test/CodeGen/X86/subvector-broadcast.ll =================================================================== --- test/CodeGen/X86/subvector-broadcast.ll +++ test/CodeGen/X86/subvector-broadcast.ll @@ -1457,26 +1457,12 @@ ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: reg_broadcast_4f32_16f32: -; X32-AVX512F: # BB#0: -; X32-AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: reg_broadcast_4f32_16f32: -; X32-AVX512BW: # BB#0: -; X32-AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: reg_broadcast_4f32_16f32: -; X32-AVX512DQ: # BB#0: -; X32-AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: reg_broadcast_4f32_16f32: +; X32-AVX512: # BB#0: +; X32-AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: reg_broadcast_4f32_16f32: ; X64-AVX: # BB#0: @@ -1485,26 +1471,12 @@ ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: reg_broadcast_4f32_16f32: -; X64-AVX512F: # BB#0: -; X64-AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-AVX512F-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: reg_broadcast_4f32_16f32: -; X64-AVX512BW: # BB#0: -; X64-AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: reg_broadcast_4f32_16f32: -; X64-AVX512DQ: # BB#0: -; X64-AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: reg_broadcast_4f32_16f32: +; X64-AVX512: # BB#0: +; X64-AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> ret <16 x float> %1 } @@ -1515,46 +1487,22 @@ ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: reg_broadcast_8f32_16f32: -; X32-AVX512F: # BB#0: -; X32-AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: reg_broadcast_8f32_16f32: -; X32-AVX512BW: # BB#0: -; X32-AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: reg_broadcast_8f32_16f32: -; X32-AVX512DQ: # BB#0: -; X32-AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: reg_broadcast_8f32_16f32: +; X32-AVX512: # BB#0: +; X32-AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: reg_broadcast_8f32_16f32: ; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovaps 
%ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: reg_broadcast_8f32_16f32: -; X64-AVX512F: # BB#0: -; X64-AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: reg_broadcast_8f32_16f32: -; X64-AVX512BW: # BB#0: -; X64-AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: reg_broadcast_8f32_16f32: -; X64-AVX512DQ: # BB#0: -; X64-AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: reg_broadcast_8f32_16f32: +; X64-AVX512: # BB#0: +; X64-AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> ret <16 x float> %1 } @@ -1583,26 +1531,12 @@ ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: reg_broadcast_4i32_16i32: -; X32-AVX512F: # BB#0: -; X32-AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: reg_broadcast_4i32_16i32: -; X32-AVX512BW: # BB#0: -; X32-AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: reg_broadcast_4i32_16i32: -; X32-AVX512DQ: # BB#0: -; X32-AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: reg_broadcast_4i32_16i32: +; X32-AVX512: # BB#0: +; X32-AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, 
%ymm0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: reg_broadcast_4i32_16i32: ; X64-AVX: # BB#0: @@ -1611,26 +1545,12 @@ ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: reg_broadcast_4i32_16i32: -; X64-AVX512F: # BB#0: -; X64-AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: reg_broadcast_4i32_16i32: -; X64-AVX512BW: # BB#0: -; X64-AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: reg_broadcast_4i32_16i32: -; X64-AVX512DQ: # BB#0: -; X64-AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: reg_broadcast_4i32_16i32: +; X64-AVX512: # BB#0: +; X64-AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> ret <16 x i32> %1 } @@ -1641,46 +1561,22 @@ ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: reg_broadcast_8i32_16i32: -; X32-AVX512F: # BB#0: -; X32-AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: reg_broadcast_8i32_16i32: -; X32-AVX512BW: # BB#0: -; X32-AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: reg_broadcast_8i32_16i32: -; X32-AVX512DQ: # BB#0: -; X32-AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; 
X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: reg_broadcast_8i32_16i32: +; X32-AVX512: # BB#0: +; X32-AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: reg_broadcast_8i32_16i32: ; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: reg_broadcast_8i32_16i32: -; X64-AVX512F: # BB#0: -; X64-AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: reg_broadcast_8i32_16i32: -; X64-AVX512BW: # BB#0: -; X64-AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: reg_broadcast_8i32_16i32: -; X64-AVX512DQ: # BB#0: -; X64-AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: reg_broadcast_8i32_16i32: +; X64-AVX512: # BB#0: +; X64-AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> ret <16 x i32> %1 } Index: test/CodeGen/X86/vector-compare-results.ll =================================================================== --- test/CodeGen/X86/vector-compare-results.ll +++ test/CodeGen/X86/vector-compare-results.ll @@ -2073,353 +2073,121 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_cmp_v16f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4 -; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm5 -; AVX512F-NEXT: xorl %eax, %eax -; AVX512F-NEXT: vucomisd %xmm4, %xmm5 -; AVX512F-NEXT: movq $-1, %rcx -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm6 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = 
xmm4[1,0] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512F-NEXT: vucomisd %xmm4, %xmm5 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5 -; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512F-NEXT: vucomisd %xmm5, %xmm6 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm7 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512F-NEXT: vucomisd %xmm5, %xmm6 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5 -; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm6 -; AVX512F-NEXT: vucomisd %xmm5, %xmm6 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm7 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512F-NEXT: vucomisd %xmm5, %xmm6 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512F-NEXT: vucomisd %xmm2, %xmm0 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm6 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vucomisd %xmm2, %xmm0 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; 
AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2 -; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm4 -; AVX512F-NEXT: vucomisd %xmm2, %xmm4 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm5 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512F-NEXT: vucomisd %xmm2, %xmm4 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm5 -; AVX512F-NEXT: vucomisd %xmm4, %xmm5 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm6 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512F-NEXT: vucomisd %xmm4, %xmm5 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm4 -; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5 -; AVX512F-NEXT: vucomisd %xmm4, %xmm5 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm6 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512F-NEXT: vucomisd %xmm4, %xmm5 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512F-NEXT: vucomisd %xmm3, %xmm1 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovaq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm5 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512F-NEXT: vucomisd %xmm3, %xmm1 -; AVX512F-NEXT: cmovaq %rcx, %rax 
-; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512DQ-LABEL: test_cmp_v16f64: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm5 -; AVX512DQ-NEXT: xorl %eax, %eax -; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 -; AVX512DQ-NEXT: movq $-1, %rcx -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm6 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5 -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm6 -; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm7 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm5 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm6 -; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm7 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512DQ-NEXT: vpermilpd 
{{.*#+}} xmm6 = xmm6[1,0] -; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm6 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm4 -; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm5 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4 -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm5 -; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm6 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, 
%ymm2 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm5 -; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm6 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovaq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm5 -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1 -; AVX512DQ-NEXT: cmovaq %rcx, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_cmp_v16f64: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4 -; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm5 -; AVX512BW-NEXT: xorl %eax, %eax -; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 -; AVX512BW-NEXT: movq $-1, %rcx -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm6 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512BW-NEXT: vextractf32x4 
$2, %zmm2, %xmm5 -; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm7 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5 -; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm6 -; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm7 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512BW-NEXT: vucomisd %xmm2, %xmm0 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm6 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512BW-NEXT: vucomisd %xmm2, %xmm0 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2 -; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm4 -; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm5 -; AVX512BW-NEXT: 
vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm5 -; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm6 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm4 -; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5 -; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm6 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512BW-NEXT: vucomisd %xmm3, %xmm1 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovaq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm5 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512BW-NEXT: vucomisd %xmm3, %xmm1 -; AVX512BW-NEXT: cmovaq %rcx, %rax -; AVX512BW-NEXT: vmovq %rax, %xmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; 
AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_cmp_v16f64: +; AVX512: # BB#0: +; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm5 +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: vucomisd %xmm4, %xmm5 +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm6 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512-NEXT: vucomisd %xmm4, %xmm5 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512-NEXT: vucomisd %xmm5, %xmm6 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm7 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512-NEXT: vucomisd %xmm5, %xmm6 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5 +; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm6 +; AVX512-NEXT: vucomisd %xmm5, %xmm6 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm7 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512-NEXT: vucomisd %xmm5, %xmm6 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512-NEXT: vucomisd %xmm2, %xmm0 +; AVX512-NEXT: movl 
$0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm6 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vucomisd %xmm2, %xmm0 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm4 +; AVX512-NEXT: vucomisd %xmm2, %xmm4 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512-NEXT: vucomisd %xmm2, %xmm4 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5 +; AVX512-NEXT: vucomisd %xmm4, %xmm5 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm6 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512-NEXT: vucomisd %xmm4, %xmm5 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5 +; AVX512-NEXT: vucomisd %xmm4, %xmm5 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm6 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512-NEXT: vucomisd %xmm4, 
%xmm5 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512-NEXT: vucomisd %xmm3, %xmm1 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vucomisd %xmm3, %xmm1 +; AVX512-NEXT: cmovaq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = fcmp ogt <16 x double> %a0, %a1 ret <16 x i1> %1 } @@ -3060,7 +2828,7 @@ ; AVX512DQ-NEXT: cmoval %ecx, %edx ; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8 ; AVX512DQ-NEXT: vextractf32x4 $3, %zmm3, %xmm2 ; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] @@ -3157,7 +2925,7 @@ ; AVX512DQ-NEXT: cmoval %ecx, %eax ; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 ; AVX512DQ-NEXT: retq @@ -3319,614 +3087,350 @@ ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] ; AVX512BW-NEXT: vucomiss %xmm4, %xmm6 ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmoval %ecx, %edx -; AVX512BW-NEXT: vucomiss %xmm0, %xmm5 -; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmoval %ecx, %esi -; 
AVX512BW-NEXT: vmovd %esi, %xmm4 -; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] -; AVX512BW-NEXT: vucomiss %xmm6, %xmm7 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmoval %ecx, %edx -; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX512BW-NEXT: vucomiss %xmm0, %xmm5 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmoval %ecx, %edx -; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0 -; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; AVX512BW-NEXT: vucomiss %xmm4, %xmm5 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmoval %ecx, %edx -; AVX512BW-NEXT: vucomiss %xmm3, %xmm1 -; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmoval %ecx, %esi -; AVX512BW-NEXT: vmovd %esi, %xmm4 -; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0] -; AVX512BW-NEXT: vucomiss %xmm5, %xmm6 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmoval %ecx, %edx -; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BW-NEXT: vucomiss %xmm3, %xmm1 -; AVX512BW-NEXT: cmoval %ecx, %eax -; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: retq - %1 = fcmp ogt <32 x float> %a0, %a1 - ret <32 x i1> %1 -} - -define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { -; SSE2-LABEL: test_cmp_v16i64: -; SSE2: # BB#0: -; 
SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0] -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm10 -; SSE2-NEXT: packsswb %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: packsswb %xmm6, %xmm4 -; SSE2-NEXT: packsswb %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: pxor 
%xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: packsswb %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pxor {{[0-9]+}}(%rsp), %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: packsswb %xmm3, %xmm0 -; SSE2-NEXT: packsswb %xmm2, %xmm0 -; SSE2-NEXT: packsswb %xmm4, %xmm0 -; SSE2-NEXT: retq -; -; SSE42-LABEL: test_cmp_v16i64: -; SSE42: # BB#0: -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6 -; SSE42-NEXT: packsswb %xmm7, %xmm6 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm5 -; SSE42-NEXT: 
pcmpgtq {{[0-9]+}}(%rsp), %xmm4 -; SSE42-NEXT: packsswb %xmm5, %xmm4 -; SSE42-NEXT: packsswb %xmm6, %xmm4 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2 -; SSE42-NEXT: packsswb %xmm3, %xmm2 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1 -; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm0 -; SSE42-NEXT: packsswb %xmm1, %xmm0 -; SSE42-NEXT: packsswb %xmm2, %xmm0 -; SSE42-NEXT: packsswb %xmm4, %xmm0 -; SSE42-NEXT: retq -; -; AVX1-LABEL: test_cmp_v16i64: -; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpacksswb %xmm8, %xmm3, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_cmp_v16i64: -; AVX2: # BB#0: -; AVX2-NEXT: vpcmpgtq %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpacksswb 
%ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: test_cmp_v16i64: -; AVX512F: # BB#0: -; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4 -; AVX512F-NEXT: vpextrq $1, %xmm4, %rcx -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5 -; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512F-NEXT: xorl %eax, %eax -; AVX512F-NEXT: cmpq %rcx, %rdx -; AVX512F-NEXT: movq $-1, %rcx -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm6 -; AVX512F-NEXT: vmovq %xmm4, %rdx -; AVX512F-NEXT: vmovq %xmm5, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5 -; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6 -; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm7 -; AVX512F-NEXT: vmovq %xmm5, %rdx -; AVX512F-NEXT: vmovq %xmm6, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5 -; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6 -; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm7 -; AVX512F-NEXT: vmovq %xmm5, %rdx -; AVX512F-NEXT: vmovq %xmm6, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: 
cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm6 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: vmovq %xmm0, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2 -; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm5 -; AVX512F-NEXT: vmovq %xmm2, %rdx -; AVX512F-NEXT: vmovq %xmm4, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4 -; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm6 -; AVX512F-NEXT: vmovq %xmm4, %rdx -; AVX512F-NEXT: vmovq %xmm5, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4 -; 
AVX512F-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5 -; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm6 -; AVX512F-NEXT: vmovq %xmm4, %rdx -; AVX512F-NEXT: vmovq %xmm5, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovgq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm5 -; AVX512F-NEXT: vmovq %xmm3, %rdx -; AVX512F-NEXT: vmovq %xmm1, %rsi -; AVX512F-NEXT: cmpq %rdx, %rsi -; AVX512F-NEXT: cmovgq %rcx, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512DQ-LABEL: test_cmp_v16i64: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4 -; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rcx -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm5 -; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512DQ-NEXT: xorl %eax, %eax -; AVX512DQ-NEXT: cmpq %rcx, %rdx -; AVX512DQ-NEXT: movq $-1, %rcx -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm6 -; AVX512DQ-NEXT: vmovq %xmm4, %rdx -; AVX512DQ-NEXT: vmovq %xmm5, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512DQ-NEXT: 
vextracti64x2 $2, %zmm2, %xmm5 -; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm6 -; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm7 -; AVX512DQ-NEXT: vmovq %xmm5, %rdx -; AVX512DQ-NEXT: vmovq %xmm6, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm5 -; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm6 -; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm7 -; AVX512DQ-NEXT: vmovq %xmm5, %rdx -; AVX512DQ-NEXT: vmovq %xmm6, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm6 -; AVX512DQ-NEXT: vmovq %xmm2, %rdx -; AVX512DQ-NEXT: vmovq %xmm0, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2 -; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, 
%xmm4 -; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm5 -; AVX512DQ-NEXT: vmovq %xmm2, %rdx -; AVX512DQ-NEXT: vmovq %xmm4, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4 -; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm5 -; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm6 -; AVX512DQ-NEXT: vmovq %xmm4, %rdx -; AVX512DQ-NEXT: vmovq %xmm5, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4 -; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm5 -; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm6 -; AVX512DQ-NEXT: vmovq %xmm4, %rdx -; AVX512DQ-NEXT: vmovq %xmm5, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512DQ-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovgq %rcx, %rdx -; AVX512DQ-NEXT: vmovq %rdx, %xmm5 -; AVX512DQ-NEXT: vmovq %xmm3, %rdx -; AVX512DQ-NEXT: vmovq %xmm1, %rsi -; 
AVX512DQ-NEXT: cmpq %rdx, %rsi -; AVX512DQ-NEXT: cmovgq %rcx, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_cmp_v16i64: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4 -; AVX512BW-NEXT: vpextrq $1, %xmm4, %rcx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5 -; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512BW-NEXT: xorl %eax, %eax -; AVX512BW-NEXT: cmpq %rcx, %rdx -; AVX512BW-NEXT: movq $-1, %rcx -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm6 -; AVX512BW-NEXT: vmovq %xmm4, %rdx -; AVX512BW-NEXT: vmovq %xmm5, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 -; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6 -; AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm7 -; AVX512BW-NEXT: vmovq %xmm5, %rdx -; AVX512BW-NEXT: vmovq %xmm6, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5 -; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6 -; AVX512BW-NEXT: vpextrq $1, 
%xmm6, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm7 -; AVX512BW-NEXT: vmovq %xmm5, %rdx -; AVX512BW-NEXT: vmovq %xmm6, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm6 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: vmovq %xmm0, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm5 -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: vmovq %xmm4, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 -; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm6 -; AVX512BW-NEXT: vmovq 
%xmm4, %rdx -; AVX512BW-NEXT: vmovq %xmm5, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm0, %xmm5 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0] +; AVX512BW-NEXT: vucomiss %xmm6, %xmm7 ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4 -; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5 -; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm0, %xmm5 ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm6 -; AVX512BW-NEXT: vmovq %xmm4, %rdx -; AVX512BW-NEXT: vmovq %xmm5, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] +; AVX512BW-NEXT: vucomiss %xmm4, %xmm5 ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vucomiss %xmm3, %xmm1 +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: 
cmoval %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0] +; AVX512BW-NEXT: vucomiss %xmm5, %xmm6 ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgq %rcx, %rdx -; AVX512BW-NEXT: vmovq %rdx, %xmm5 -; AVX512BW-NEXT: vmovq %xmm3, %rdx -; AVX512BW-NEXT: vmovq %xmm1, %rsi -; AVX512BW-NEXT: cmpq %rdx, %rsi -; AVX512BW-NEXT: cmovgq %rcx, %rax -; AVX512BW-NEXT: vmovq %rax, %xmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: cmoval %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512BW-NEXT: vucomiss %xmm3, %xmm1 +; AVX512BW-NEXT: cmoval %ecx, %eax +; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq + %1 = fcmp ogt <32 x float> %a0, %a1 + ret <32 x i1> %1 +} + +define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE2-LABEL: test_cmp_v16i64: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0] +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm7 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm10 +; SSE2-NEXT: packsswb %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: packsswb %xmm6, %xmm4 +; SSE2-NEXT: packsswb %xmm10, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pxor 
%xmm8, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: packsswb %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pxor {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: packsswb %xmm3, %xmm0 +; SSE2-NEXT: packsswb %xmm2, %xmm0 +; SSE2-NEXT: packsswb %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: test_cmp_v16i64: +; SSE42: # BB#0: +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6 +; SSE42-NEXT: packsswb %xmm7, %xmm6 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm5 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm4 +; SSE42-NEXT: packsswb %xmm5, %xmm4 +; SSE42-NEXT: packsswb %xmm6, %xmm4 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2 +; SSE42-NEXT: packsswb %xmm3, %xmm2 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1 +; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm0 +; SSE42-NEXT: packsswb 
%xmm1, %xmm0 +; SSE42-NEXT: packsswb %xmm2, %xmm0 +; SSE42-NEXT: packsswb %xmm4, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: test_cmp_v16i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpacksswb %xmm8, %xmm3, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_cmp_v16i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpcmpgtq %ymm7, %ymm3, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cmp_v16i64: +; AVX512: # BB#0: +; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512-NEXT: vpextrq $1, %xmm4, %rcx +; 
AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpq %rcx, %rdx +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm6 +; AVX512-NEXT: vmovq %xmm4, %rdx +; AVX512-NEXT: vmovq %xmm5, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512-NEXT: vpextrq $1, %xmm6, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm7 +; AVX512-NEXT: vmovq %xmm5, %rdx +; AVX512-NEXT: vmovq %xmm6, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512-NEXT: vpextrq $1, %xmm6, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm7 +; AVX512-NEXT: vmovq %xmm5, %rdx +; AVX512-NEXT: vmovq %xmm6, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; AVX512-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm6 +; AVX512-NEXT: vmovq %xmm2, %rdx +; AVX512-NEXT: vmovq %xmm0, %rsi +; 
AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vmovq %xmm2, %rdx +; AVX512-NEXT: vmovq %xmm4, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm6 +; AVX512-NEXT: vmovq %xmm4, %rdx +; AVX512-NEXT: vmovq %xmm5, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm6 +; AVX512-NEXT: vmovq %xmm4, %rdx +; AVX512-NEXT: vmovq %xmm5, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq 
%rdx, %xmm4 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; AVX512-NEXT: vpextrq $1, %xmm3, %rdx +; AVX512-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: cmovgq %rcx, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vmovq %xmm3, %rdx +; AVX512-NEXT: vmovq %xmm1, %rsi +; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: cmovgq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = icmp sgt <16 x i64> %a0, %a1 ret <16 x i1> %1 } @@ -4583,7 +4087,7 @@ ; AVX512DQ-NEXT: cmovgl %ecx, %edx ; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm3, %xmm2 ; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx @@ -4688,7 +4192,7 @@ ; AVX512DQ-NEXT: cmovgl %ecx, %eax ; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq @@ -9176,8 +8680,8 @@ ; ; AVX512DQ-LABEL: test_cmp_v32f64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm4, %xmm8 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm9 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm4, %xmm8 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm0, %xmm9 ; AVX512DQ-NEXT: xorl %eax, %eax ; AVX512DQ-NEXT: vucomisd %xmm8, %xmm9 ; AVX512DQ-NEXT: movq $-1, %rcx 
@@ -9191,8 +8695,8 @@ ; AVX512DQ-NEXT: cmovaq %rcx, %rdx ; AVX512DQ-NEXT: vmovq %rdx, %xmm8 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm4, %xmm9 -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm10 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm4, %xmm9 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, %xmm10 ; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9205,8 +8709,8 @@ ; AVX512DQ-NEXT: vmovq %rdx, %xmm9 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm4, %xmm9 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm10 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm4, %xmm9 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm0, %xmm10 ; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9232,8 +8736,8 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm8 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm5, %xmm4 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm0 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm1, %xmm0 ; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9245,8 +8749,8 @@ ; AVX512DQ-NEXT: cmovaq %rcx, %rdx ; AVX512DQ-NEXT: vmovq %rdx, %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm5, %xmm4 -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm0 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm1, %xmm0 ; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9259,8 +8763,8 @@ ; AVX512DQ-NEXT: vmovq %rdx, %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] ; AVX512DQ-NEXT: 
vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm5, %xmm4 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm0 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm1, %xmm0 ; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9286,10 +8790,10 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm6, %xmm1 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm6, %xmm1 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm2, %xmm4 ; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9301,8 +8805,8 @@ ; AVX512DQ-NEXT: cmovaq %rcx, %rdx ; AVX512DQ-NEXT: vmovq %rdx, %xmm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm6, %xmm4 -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm6, %xmm4 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm2, %xmm5 ; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9315,8 +8819,8 @@ ; AVX512DQ-NEXT: vmovq %rdx, %xmm4 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm6, %xmm1 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm4 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm6, %xmm1 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm2, %xmm4 ; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9342,8 +8846,8 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, 
%zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm7, %xmm1 -; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm7, %xmm1 +; AVX512DQ-NEXT: vextractf32x4 $3, %zmm3, %xmm2 ; AVX512DQ-NEXT: vucomisd %xmm1, %xmm2 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9355,8 +8859,8 @@ ; AVX512DQ-NEXT: cmovaq %rcx, %rdx ; AVX512DQ-NEXT: vmovq %rdx, %xmm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm7, %xmm2 -; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm7, %xmm2 +; AVX512DQ-NEXT: vextractf32x4 $2, %zmm3, %xmm4 ; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9369,8 +8873,8 @@ ; AVX512DQ-NEXT: vmovq %rdx, %xmm2 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm7, %xmm2 -; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm7, %xmm2 +; AVX512DQ-NEXT: vextractf32x4 $1, %zmm3, %xmm4 ; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 ; AVX512DQ-NEXT: movl $0, %edx ; AVX512DQ-NEXT: cmovaq %rcx, %rdx @@ -9395,7 +8899,7 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 ; AVX512DQ-NEXT: retq @@ -10512,9 +10016,9 @@ ; ; AVX512DQ-LABEL: test_cmp_v32i64: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm4, %xmm8 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm4, %xmm8 ; AVX512DQ-NEXT: vpextrq $1, %xmm8, %rcx -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm9 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm9 ; 
AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx ; AVX512DQ-NEXT: xorl %eax, %eax ; AVX512DQ-NEXT: cmpq %rcx, %rdx @@ -10529,9 +10033,9 @@ ; AVX512DQ-NEXT: cmovgq %rcx, %rdx ; AVX512DQ-NEXT: vmovq %rdx, %xmm8 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm4, %xmm9 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm4, %xmm9 ; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm10 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm10 ; AVX512DQ-NEXT: vpextrq $1, %xmm10, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10545,9 +10049,9 @@ ; AVX512DQ-NEXT: vmovq %rdx, %xmm9 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm4, %xmm9 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm4, %xmm9 ; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm10 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm0, %xmm10 ; AVX512DQ-NEXT: vpextrq $1, %xmm10, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10576,9 +10080,9 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm8 -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm5, %xmm4 ; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, %xmm0 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10591,9 +10095,9 @@ ; AVX512DQ-NEXT: cmovgq %rcx, %rdx ; AVX512DQ-NEXT: vmovq %rdx, %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm5, %xmm4 ; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm0 +; AVX512DQ-NEXT: 
vextracti32x4 $2, %zmm1, %xmm0 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10607,9 +10111,9 @@ ; AVX512DQ-NEXT: vmovq %rdx, %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm5, %xmm0 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm5, %xmm0 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10638,11 +10142,11 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8 -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm6, %xmm1 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm6, %xmm1 ; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm2, %xmm4 ; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10655,9 +10159,9 @@ ; AVX512DQ-NEXT: cmovgq %rcx, %rdx ; AVX512DQ-NEXT: vmovq %rdx, %xmm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm6, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm6, %xmm4 ; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm2, %xmm5 ; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10671,9 +10175,9 @@ ; AVX512DQ-NEXT: vmovq %rdx, %xmm4 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; 
AVX512DQ-NEXT: vextracti64x2 $1, %zmm6, %xmm0 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm6, %xmm0 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm2, %xmm4 ; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10702,9 +10206,9 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm1 -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm7, %xmm0 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm7, %xmm0 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm3, %xmm2 ; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10717,9 +10221,9 @@ ; AVX512DQ-NEXT: cmovgq %rcx, %rdx ; AVX512DQ-NEXT: vmovq %rdx, %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm7, %xmm2 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm7, %xmm2 ; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, %xmm4 ; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10733,9 +10237,9 @@ ; AVX512DQ-NEXT: vmovq %rdx, %xmm2 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm7, %xmm0 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm7, %xmm0 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm3, %xmm4 ; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi ; AVX512DQ-NEXT: cmpq %rdx, %rsi ; AVX512DQ-NEXT: movl $0, %edx @@ -10763,7 +10267,7 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; 
AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 ; AVX512DQ-NEXT: retq Index: test/CodeGen/X86/vector-shuffle-512-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v16.ll +++ test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -274,7 +274,7 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) { ; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15: ; ALL: # BB#0: -; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm1 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] ; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; ALL-NEXT: retq @@ -286,7 +286,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) { ; ALL-LABEL: test_v16i32_0_1_2_12: ; ALL: # BB#0: -; ALL-NEXT: vextracti32x8 $1, %zmm0, %ymm1 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; ALL-NEXT: vextracti128 $1, %ymm1, %xmm1 ; ALL-NEXT: vpbroadcastd %xmm1, %xmm1 ; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] @@ -311,7 +311,7 @@ define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) { ; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10: ; ALL: # BB#0: -; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm1 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,u] @@ -681,7 +681,7 @@ ; ALL: # BB#0: ; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: retq %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -692,7 +692,7 @@ ; ALL: # BB#0: ; ALL-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; ALL-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: retq %res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> ret <16 x float> %res Index: test/CodeGen/X86/vector-shuffle-avx512.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-avx512.ll +++ test/CodeGen/X86/vector-shuffle-avx512.ll @@ -474,7 +474,7 @@ ; SKX64-LABEL: expand13: ; SKX64: # BB#0: ; SKX64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; SKX64-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0 +; SKX64-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; SKX64-NEXT: retq ; ; KNL64-LABEL: expand13: @@ -486,7 +486,7 @@ ; SKX32-LABEL: expand13: ; SKX32: # BB#0: ; SKX32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; SKX32-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0 +; SKX32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; SKX32-NEXT: retl ; ; KNL32-LABEL: expand13: Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -171,7 +171,7 @@ ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm0 +; VL_BW_DQ-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 Index: test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- test/CodeGen/X86/vector-trunc-math.ll +++ test/CodeGen/X86/vector-trunc-math.ll @@ -257,38 +257,16 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_add_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 
$1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_add_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpaddq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_add_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpaddq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_add_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = add <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -683,35 +661,15 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_add_const_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpaddb 
{{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_add_const_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = add <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -1084,38 +1042,16 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_sub_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpsubq %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpsubq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_sub_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsubq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsubq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpsubq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpsubq %zmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: 
vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_sub_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = sub <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -1510,38 +1446,16 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_sub_const_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: 
vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = sub <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -2186,7 +2100,7 @@ ; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -2732,38 +2646,16 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 
+; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = mul <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -3157,38 +3049,16 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_and_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_and_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_and_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_and_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = and <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -3528,35 +3398,15 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: 
trunc_and_const_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_and_const_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = and <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -3913,38 +3763,16 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_xor_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpxorq %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpxorq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_xor_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpxorq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpxorq 
%zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxorq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpxorq %zmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_xor_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = xor <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -4284,35 +4112,15 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: 
vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = xor <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -4669,38 +4477,16 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_or_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vporq %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_or_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vporq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_or_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vporq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_or_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; 
AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = or <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2 @@ -5040,35 +4826,15 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_or_const_v16i64_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_or_const_v16i64_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = or <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> ret <16 x i8> %2