Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -5288,6 +5288,550 @@ llvm_i8_ty], [IntrReadArgMem]>; } + +// truncate +let TargetPrefix = "x86" in { + def int_x86_avx512_mask_pmov_qb_128 : + GCCBuiltin<"__builtin_ia32_pmovqb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovqb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qb_128 : + GCCBuiltin<"__builtin_ia32_pmovsqb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsqb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qb_128 : + GCCBuiltin<"__builtin_ia32_pmovusqb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusqb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qb_256 : + GCCBuiltin<"__builtin_ia32_pmovqb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovqb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qb_256 : + GCCBuiltin<"__builtin_ia32_pmovsqb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qb_mem_256 : + 
GCCBuiltin<"__builtin_ia32_pmovsqb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qb_256 : + GCCBuiltin<"__builtin_ia32_pmovusqb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusqb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qb_512 : + GCCBuiltin<"__builtin_ia32_pmovqb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovqb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qb_512 : + GCCBuiltin<"__builtin_ia32_pmovsqb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsqb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qb_512 : + GCCBuiltin<"__builtin_ia32_pmovusqb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusqb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qw_128 : + GCCBuiltin<"__builtin_ia32_pmovqw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovqw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qw_128 : + 
GCCBuiltin<"__builtin_ia32_pmovsqw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsqw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qw_128 : + GCCBuiltin<"__builtin_ia32_pmovusqw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusqw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qw_256 : + GCCBuiltin<"__builtin_ia32_pmovqw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovqw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qw_256 : + GCCBuiltin<"__builtin_ia32_pmovsqw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovsqw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qw_256 : + GCCBuiltin<"__builtin_ia32_pmovusqw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusqw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qw_512 : + GCCBuiltin<"__builtin_ia32_pmovqw512_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qw_mem_512 : + 
GCCBuiltin<"__builtin_ia32_pmovqw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qw_512 : + GCCBuiltin<"__builtin_ia32_pmovsqw512_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsqw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qw_512 : + GCCBuiltin<"__builtin_ia32_pmovusqw512_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusqw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qd_128 : + GCCBuiltin<"__builtin_ia32_pmovqd128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qd_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovqd128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qd_128 : + GCCBuiltin<"__builtin_ia32_pmovsqd128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qd_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsqd128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qd_128 : + GCCBuiltin<"__builtin_ia32_pmovusqd128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qd_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusqd128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qd_256 : + 
GCCBuiltin<"__builtin_ia32_pmovqd256_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qd_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovqd256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qd_256 : + GCCBuiltin<"__builtin_ia32_pmovsqd256_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qd_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovsqd256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qd_256 : + GCCBuiltin<"__builtin_ia32_pmovusqd256_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qd_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusqd256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qd_512 : + GCCBuiltin<"__builtin_ia32_pmovqd512_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qd_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovqd512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qd_512 : + GCCBuiltin<"__builtin_ia32_pmovsqd512_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qd_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsqd512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qd_512 : + GCCBuiltin<"__builtin_ia32_pmovusqd512_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qd_mem_512 : + 
GCCBuiltin<"__builtin_ia32_pmovusqd512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_db_128 : + GCCBuiltin<"__builtin_ia32_pmovdb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_db_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovdb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_db_128 : + GCCBuiltin<"__builtin_ia32_pmovsdb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_db_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsdb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_db_128 : + GCCBuiltin<"__builtin_ia32_pmovusdb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_db_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusdb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_db_256 : + GCCBuiltin<"__builtin_ia32_pmovdb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_db_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovdb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_db_256 : + GCCBuiltin<"__builtin_ia32_pmovsdb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_db_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovsdb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_db_256 : + 
GCCBuiltin<"__builtin_ia32_pmovusdb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_db_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusdb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_db_512 : + GCCBuiltin<"__builtin_ia32_pmovdb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_db_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovdb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_db_512 : + GCCBuiltin<"__builtin_ia32_pmovsdb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_db_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsdb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_db_512 : + GCCBuiltin<"__builtin_ia32_pmovusdb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_db_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusdb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_dw_128 : + GCCBuiltin<"__builtin_ia32_pmovdw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_dw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovdw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_dw_128 : + GCCBuiltin<"__builtin_ia32_pmovsdw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def 
int_x86_avx512_mask_pmovs_dw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsdw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_dw_128 : + GCCBuiltin<"__builtin_ia32_pmovusdw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_dw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusdw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_dw_256 : + GCCBuiltin<"__builtin_ia32_pmovdw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_dw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovdw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_dw_256 : + GCCBuiltin<"__builtin_ia32_pmovsdw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_dw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovsdw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_dw_256 : + GCCBuiltin<"__builtin_ia32_pmovusdw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_dw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusdw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_dw_512 : + GCCBuiltin<"__builtin_ia32_pmovdw512_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_dw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovdw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def 
int_x86_avx512_mask_pmovs_dw_512 : + GCCBuiltin<"__builtin_ia32_pmovsdw512_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_dw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsdw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_dw_512 : + GCCBuiltin<"__builtin_ia32_pmovusdw512_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_dw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusdw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_wb_128 : + GCCBuiltin<"__builtin_ia32_pmovwb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_wb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovwb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_wb_128 : + GCCBuiltin<"__builtin_ia32_pmovswb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_wb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovswb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_wb_128 : + GCCBuiltin<"__builtin_ia32_pmovuswb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_wb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovuswb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_wb_256 : + GCCBuiltin<"__builtin_ia32_pmovwb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; 
+ def int_x86_avx512_mask_pmov_wb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovwb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_wb_256 : + GCCBuiltin<"__builtin_ia32_pmovswb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_wb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovswb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_wb_256 : + GCCBuiltin<"__builtin_ia32_pmovuswb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_wb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovuswb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_wb_512 : + GCCBuiltin<"__builtin_ia32_pmovwb512_mask">, + Intrinsic<[llvm_v32i8_ty], + [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_wb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovwb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_wb_512 : + GCCBuiltin<"__builtin_ia32_pmovswb512_mask">, + Intrinsic<[llvm_v32i8_ty], + [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_wb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovswb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_wb_512 : + GCCBuiltin<"__builtin_ia32_pmovuswb512_mask">, + Intrinsic<[llvm_v32i8_ty], + [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_wb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovuswb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], + 
[IntrReadWriteArgMem]>; +} // Misc. let TargetPrefix = "x86" in { def int_x86_avx512_mask_cmp_ps_512 : Index: include/llvm/Target/TargetSelectionDAG.td =================================================================== --- include/llvm/Target/TargetSelectionDAG.td +++ include/llvm/Target/TargetSelectionDAG.td @@ -491,9 +491,10 @@ def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def masked_store : SDNode<"ISD::MSTORE", SDTMaskedStore, +// Don't use mld, mst directly. Use masked_store, masked_load, masked_truncstore +def mst : SDNode<"ISD::MSTORE", SDTMaskedStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def masked_load : SDNode<"ISD::MLOAD", SDTMaskedLoad, +def mld : SDNode<"ISD::MLOAD", SDTMaskedLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -678,6 +679,12 @@ return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; }]>; +// masked load fragments. +def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (mld node:$src1, node:$src2, node:$src3), [{ + return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + // extending load fragments. def extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{ return cast(N)->getExtensionType() == ISD::EXTLOAD; @@ -789,6 +796,12 @@ return !cast(N)->isTruncatingStore(); }]>; +// masked store fragments. +def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (mst node:$src1, node:$src2, node:$src3), [{ + return !cast(N)->isTruncatingStore(); +}]>; + // truncstore fragments. 
def truncstore : PatFrag<(ops node:$val, node:$ptr), (unindexedstore node:$val, node:$ptr), [{ @@ -815,6 +828,21 @@ return cast(N)->getMemoryVT() == MVT::f64; }]>; +def truncstorevi8 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def truncstorevi16 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def truncstorevi32 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + // indexed store fragments. def istore : PatFrag<(ops node:$val, node:$base, node:$offset), (ist node:$val, node:$base, node:$offset), [{ @@ -889,6 +917,27 @@ return cast(N)->getMemoryVT() == MVT::f32; }]>; +// masked truncstore fragments +def masked_truncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (mst node:$src1, node:$src2, node:$src3), [{ + return cast(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_truncstore node:$src1, node:$src2, node:$src3), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_truncstore node:$src1, node:$src2, node:$src3), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_truncstore node:$src1, node:$src2, node:$src3), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + // setcc convenience fragments. 
def setoeq : PatFrag<(ops node:$lhs, node:$rhs), (setcc node:$lhs, node:$rhs, SETOEQ)>; Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -282,9 +282,8 @@ // Vector integer truncate. VTRUNC, - - // Vector integer truncate with mask. - VTRUNCM, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, VTRUNCS, // Vector FP extend. VFPEXT, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1348,6 +1348,24 @@ setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); @@ -1531,6 +1549,7 @@ setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); 
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1541,6 +1560,11 @@ setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; @@ -12460,10 +12484,8 @@ Subtarget->hasDQI() && Subtarget->hasVLX()) return Op; // legal, will go to VPMOVB2M, VPMOVQ2M } - if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { - if (VT.getVectorElementType().getSizeInBits() >=8) - return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + if (VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); unsigned NumElts = InVT.getVectorNumElements(); assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); @@ -12479,6 +12501,11 @@ return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); } + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (((!InVT.is512BitVector() && Subtarget->hasVLX()) || InVT.is512BitVector()) && + (InVT.getVectorElementType() != MVT::i16 || Subtarget->hasBWI())) + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { @@ -15195,7 +15222,7 @@ /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the -/// necessary casting for \p Mask when lowering masking intrinsics. 
+/// necessary casting or extending for \p Mask when lowering masking intrinsics. static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, @@ -15203,8 +15230,8 @@ EVT VT = Op.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + SDValue VMask = SDValue(); + unsigned OpcodeSelect = ISD::VSELECT; SDLoc dl(Op); assert(MaskVT.isSimple() && "invalid mask type"); @@ -15212,11 +15239,20 @@ if (isAllOnes(Mask)) return Op; - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + if (MaskVT.bitsGT(Mask.getValueType())){ + EVT newMaskVT = EVT::getIntegerVT(*DAG.getContext(), + MaskVT.getSizeInBits()); + VMask = DAG.getBitcast(MaskVT, + DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask)); + } else { + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
+ VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } switch (Op.getOpcode()) { default: break; @@ -15225,10 +15261,15 @@ case X86ISD::CMPM: case X86ISD::CMPMU: return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + OpcodeSelect = X86ISD::SELECT; + break; } if (PreservedSrc.getOpcode() == ISD::UNDEF) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. @@ -16051,6 +16092,47 @@ return Chain; } +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, + const IntrinsicData *IntrData, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + EVT VT = DataToTruncate.getValueType(); + EVT SVT = EVT::getVectorVT(*DAG.getContext(), + ElementType, VT.getVectorNumElements()); + + + if (isAllOnes(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); + + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), + MVT::i1, VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + + MachineMemOperand *MMO = + DAG.getMachineFunction(). 
+ getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); + + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); +} + static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); @@ -16184,6 +16266,15 @@ MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); } + case TRUNCATE_TO_MEM_VI8: { + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, Subtarget, DAG, IntrData, MVT::i8); + } + case TRUNCATE_TO_MEM_VI16: { + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, Subtarget, DAG, IntrData, MVT::i16); + } + case TRUNCATE_TO_MEM_VI32: { + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, Subtarget, DAG, IntrData, MVT::i32); + } case EXPAND_FROM_MEM: { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); @@ -18894,7 +18985,8 @@ case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; - case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; + case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; + case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; @@ -24030,6 +24122,13 @@ unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + if (TLI.isTruncStoreLegal(VT, StVT)) { + return SDValue(); + } + // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); @@ -24141,6 +24240,11 @@ unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + // vpmovqb, vpmovqw, vpmovqd, 
vpmovdb, vpmovdw + if (TLI.isTruncStoreLegal(VT, StVT)) { + return SDValue(); + } + // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -5595,82 +5595,217 @@ // Integer truncate and extend operations //------------------------------------------------- -multiclass avx512_trunc_sat opc, string OpcodeStr, - RegisterClass dstRC, RegisterClass srcRC, - RegisterClass KRC, X86MemOperand x86memop> { - def rr : AVX512XS8I opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, + X86MemOperand x86memop> { + + defm rr : AVX512_maskable, + EVEX, T8XS; + + // for intrinsic patter match + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + undef)), + (!cast(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.ImmAllZerosV)), + (!cast(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.RC:$src0)), + (!cast(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0, + DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + let mayStore = 1 in { + def mr : AVX512XS8I, EVEX; - def rrk : AVX512XS8I, EVEX, EVEX_K; + }//mayStore = 1 +} - def rrkz : AVX512XS8I, EVEX, EVEX_KZ; +multiclass avx512_trunc_mr_lowering { + + def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), + (!cast(NAME#SrcInfo.ZSuffix##mr) + addr:$dst, SrcInfo.RC:$src)>; + + def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask, + (SrcInfo.VT 
SrcInfo.RC:$src)), + (!cast(NAME#SrcInfo.ZSuffix##mrk) + addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; +} + +multiclass avx512_trunc_sat_mr_lowering { + + def: Pat<(!cast("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask), + (!cast(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr, + (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM), + (SrcInfo.VT SrcInfo.RC:$src))>; + + def: Pat<(!cast("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), + (!cast(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, + (SrcInfo.VT SrcInfo.RC:$src))>; +} + +multiclass avx512_trunc opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, + Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common, + avx512_trunc_mr_lowering, EVEX_V128; + + defm Z256: avx512_trunc_common, + avx512_trunc_mr_lowering, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common, + avx512_trunc_mr_lowering, EVEX_V512; +} + +multiclass avx512_trunc_sat opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common, + avx512_trunc_sat_mr_lowering, EVEX_V128; + + defm Z256: avx512_trunc_common, + avx512_trunc_sat_mr_lowering, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common, + avx512_trunc_sat_mr_lowering, EVEX_V512; +} - def mr : AVX512XS8I, EVEX; 
+multiclass avx512_trunc_qb opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<8, CD8VO>; +} +multiclass avx512_trunc_sat_qb opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<8, CD8VO>; +} - def mrk : AVX512XS8I, EVEX, EVEX_K; +multiclass avx512_trunc_qw opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<16, CD8VQ>; +} +multiclass avx512_trunc_sat_qw opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_sat_qd opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_db opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<8, CD8VQ>; +} +multiclass avx512_trunc_sat_db opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_dw opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<16, CD8VH>; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, 
VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; - -def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>; -def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>; -def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>; -def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; -def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; - -def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>; -def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; +multiclass avx512_trunc_wb opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_wb opc, string sat, 
SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; +defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; +defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; +defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; +defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; +defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; +defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; +defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; +defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; +defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; +defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; +defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; +defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; multiclass avx512_extend_common opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -114,19 +114,17 @@ SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>]>>; -def X86vtrunc : SDNode<"X86ISD::VTRUNC", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<0, 1>]>>; +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def 
X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + def X86trunc : SDNode<"X86ISD::TRUNC", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>]>>; - -def X86vtruncm : SDNode<"X86ISD::VTRUNCM", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisVec<2>, SDTCisInt<2>, - SDTCisOpSmallerThanOp<0, 2>]>>; def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -21,10 +21,12 @@ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, + INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, + TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, BLEND }; @@ -138,6 +140,42 @@ EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16, + 
X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -644,6 +682,114 @@ X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, 
INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, Index: test/CodeGen/X86/avx512-ext.ll =================================================================== --- test/CodeGen/X86/avx512-ext.ll +++ test/CodeGen/X86/avx512-ext.ll @@ -1,24 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX - -; KNL-LABEL: trunc_16x32_to_16x8 -; KNL: vpmovdb -; KNL: ret -define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) nounwind readnone { - %x = trunc <16 x i32> %i to <16 x i8> - ret <16 x i8> %x -} - -; KNL-LABEL: trunc_8x64_to_8x16 -; KNL: vpmovqw -; KNL: ret -define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) nounwind readnone { - %x = trunc <8 x i64> %i to <8 x i16> - ret <8 x i16> %x -} - -;SKX-LABEL: zext_8x8mem_to_8x16: + ;SKX-LABEL: zext_8x8mem_to_8x16: ;SKX: ## BB#0: ;SKX-NEXT: vpmovw2m %xmm0, %k1 ;SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z} @@ -895,13 +878,6 @@ ret <8 x i32> %y } -; KNL-LABEL: trunc_v16i32_to_v16i16 -; KNL: vpmovdw -; KNL: ret -define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) { - %1 = trunc <16 x i32> %x to <16 x i16> - ret <16 x i16> %1 -} ; KNL-LABEL: trunc_i32_to_i1 ; KNL: movw $-4, %ax Index: test/CodeGen/X86/avx512-intrinsics.ll 
=================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -3118,3 +3118,394 @@ %res2 = fadd <16 x float> %res, %res1 ret <16 x float> %res2 } + + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512: +; CHECK: vpmovqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512: +; CHECK: vpmovqb %zmm0, (%rdi) +; CHECK: vpmovqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512: +; CHECK: vpmovsqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, 
<16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512: +; CHECK: vpmovsqb %zmm0, (%rdi) +; CHECK: vpmovsqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512: +; CHECK: vpmovusqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512: +; CHECK: vpmovusqb %zmm0, (%rdi) +; CHECK: vpmovusqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> 
@llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512: +; CHECK: vpmovqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512: +; CHECK: vpmovqw %zmm0, (%rdi) +; CHECK: vpmovqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512: +; CHECK: vpmovsqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void 
@llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512: +; CHECK: vpmovsqw %zmm0, (%rdi) +; CHECK: vpmovsqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512: +; CHECK: vpmovusqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512: +; CHECK: vpmovusqw %zmm0, (%rdi) +; CHECK: vpmovusqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512: +; CHECK: vpmovqd %zmm0, %ymm1 {%k1} 
+; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512: +; CHECK: vpmovqd %zmm0, (%rdi) +; CHECK: vpmovqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512: +; CHECK: vpmovsqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512: +; CHECK: vpmovsqd %zmm0, (%rdi) +; CHECK: 
vpmovsqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512: +; CHECK: vpmovusqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512: +; CHECK: vpmovusqd %zmm0, (%rdi) +; CHECK: vpmovusqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512: +; CHECK: vpmovdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, 
<16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512: +; CHECK: vpmovdb %zmm0, (%rdi) +; CHECK: vpmovdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512: +; CHECK: vpmovsdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512: +; CHECK: vpmovsdb %zmm0, (%rdi) +; CHECK: vpmovsdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 
x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512: +; CHECK: vpmovusdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512: +; CHECK: vpmovusdb %zmm0, (%rdi) +; CHECK: vpmovusdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512: +; CHECK: vpmovdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x 
i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512: +; CHECK: vpmovdw %zmm0, (%rdi) +; CHECK: vpmovdw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512: +; CHECK: vpmovsdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovsdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %zmm0, %ymm0 + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512: +; CHECK: vpmovsdw %zmm0, (%rdi) +; CHECK: vpmovsdw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512: +; CHECK: vpmovusdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovusdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %zmm0, %ymm0 + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512: +; CHECK: vpmovusdw %zmm0, (%rdi) +; CHECK: vpmovusdw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} Index: test/CodeGen/X86/avx512-trunc-ext.ll =================================================================== --- test/CodeGen/X86/avx512-trunc-ext.ll +++ test/CodeGen/X86/avx512-trunc-ext.ll @@ -1,961 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX - - -; KNL-LABEL: trunc_16x32_to_16x8 -; KNL: vpmovdb -; KNL: ret -define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) nounwind readnone { - %x = trunc <16 x i32> %i to <16 x i8> - ret <16 x i8> %x -} - -; KNL-LABEL: trunc_8x64_to_8x16 -; KNL: vpmovqw -; KNL: ret -define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) nounwind readnone { - %x = trunc <8 x i64> %i to <8 x i16> - ret <8 x i16> %x -} - -;SKX-LABEL: zext_8x8mem_to_8x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbw 
(%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = zext <8 x i8> %a to <8 x i16> - %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer - ret <8 x i16> %ret -} - -;SKX-LABEL: sext_8x8mem_to_8x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = sext <8 x i8> %a to <8 x i16> - %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer - ret <8 x i16> %ret -} - -;SKX-LABEL: zext_16x8mem_to_16x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbw (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { - %a = load <16 x i8>,<16 x i8> *%i,align 1 - %x = zext <16 x i8> %a to <16 x i16> - %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer - ret <16 x i16> %ret -} - -;SKX-LABEL: sext_16x8mem_to_16x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { - %a = load <16 x i8>,<16 x i8> *%i,align 1 - %x = sext <16 x i8> %a to <16 x i16> - %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer - ret <16 x i16> %ret -} - -;SKX-LABEL: zext_16x8_to_16x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxbw %xmm0, %ymm0 -;SKX-NEXT: retq -define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { - %x = zext <16 x i8> %a to <16 x i16> - ret <16 x i16> %x -} - -;SKX-LABEL: zext_16x8_to_16x16_mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm1, %k1 -;SKX-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i16> 
@zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { - %x = zext <16 x i8> %a to <16 x i16> - %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer - ret <16 x i16> %ret -} - -;SKX-LABEL: sext_16x8_to_16x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxbw %xmm0, %ymm0 -;SKX-NEXT: retq -define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { - %x = sext <16 x i8> %a to <16 x i16> - ret <16 x i16> %x -} - -;SKX-LABEL: sext_16x8_to_16x16_mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm1, %k1 -;SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { - %x = sext <16 x i8> %a to <16 x i16> - %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer - ret <16 x i16> %ret -} - -;SKX-LABEL: zext_32x8mem_to_32x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %ymm0, %k1 -;SKX-NEXT: vpmovzxbw (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { - %a = load <32 x i8>,<32 x i8> *%i,align 1 - %x = zext <32 x i8> %a to <32 x i16> - %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer - ret <32 x i16> %ret -} - -;SKX-LABEL: sext_32x8mem_to_32x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %ymm0, %k1 -;SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { - %a = load <32 x i8>,<32 x i8> *%i,align 1 - %x = sext <32 x i8> %a to <32 x i16> - %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer - ret <32 x i16> %ret -} - -;SKX-LABEL: zext_32x8_to_32x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxbw %ymm0, %zmm0 -;SKX-NEXT: retq -define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { - %x = zext <32 x i8> %a to <32 x i16> - ret <32 x i16> %x -} - -;SKX-LABEL: zext_32x8_to_32x16_mask: -;SKX: ## BB#0: -;SKX-NEXT: 
vpmovb2m %ymm1, %k1 -;SKX-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { - %x = zext <32 x i8> %a to <32 x i16> - %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer - ret <32 x i16> %ret -} - -;SKX-LABEL: sext_32x8_to_32x16: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxbw %ymm0, %zmm0 -;SKX-NEXT: retq -define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { - %x = sext <32 x i8> %a to <32 x i16> - ret <32 x i16> %x -} - -;SKX-LABEL: sext_32x8_to_32x16_mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %ymm1, %k1 -;SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { - %x = sext <32 x i8> %a to <32 x i16> - %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer - ret <32 x i16> %ret -} - -;SKX-LABEL: zext_4x8mem_to_4x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbd (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i8>,<4 x i8> *%i,align 1 - %x = zext <4 x i8> %a to <4 x i32> - %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer - ret <4 x i32> %ret -} - -;SKX-LABEL: sext_4x8mem_to_4x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i8>,<4 x i8> *%i,align 1 - %x = sext <4 x i8> %a to <4 x i32> - %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer - ret <4 x i32> %ret -} - -;SKX-LABEL: zext_8x8mem_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbd (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { 
- %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = zext <8 x i8> %a to <8 x i32> - %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer - ret <8 x i32> %ret -} - -;SKX-LABEL: sext_8x8mem_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = sext <8 x i8> %a to <8 x i32> - %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer - ret <8 x i32> %ret -} - -;KNL-LABEL: zext_16x8mem_to_16x32: -;KNL: vpmovzxbd (%rdi), %zmm0 {%k1} {z} -;KNL-NEXT: retq -define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { - %a = load <16 x i8>,<16 x i8> *%i,align 1 - %x = zext <16 x i8> %a to <16 x i32> - %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer - ret <16 x i32> %ret -} - -;KNL-LABEL: sext_16x8mem_to_16x32: -;KNL: vpmovsxbd (%rdi), %zmm0 {%k1} {z} -;KNL-NEXT: retq -define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { - %a = load <16 x i8>,<16 x i8> *%i,align 1 - %x = sext <16 x i8> %a to <16 x i32> - %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer - ret <16 x i32> %ret -} - -;KNL-LABEL: zext_16x8_to_16x32_mask: -;KNL: vpmovzxbd %xmm0, %zmm0 {%k1} {z} -;KNL-NEXT: retq -define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone { - %x = zext <16 x i8> %a to <16 x i32> - %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer - ret <16 x i32> %ret -} - -;KNL-LABEL: sext_16x8_to_16x32_mask: -;KNL: vpmovsxbd %xmm0, %zmm0 {%k1} {z} -;KNL-NEXT: retq -define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone { - %x = sext <16 x i8> %a to <16 x i32> - %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer - ret <16 x i32> %ret -} 
- -; KNL-LABEL: zext_16x8_to_16x32 -; KNL: vpmovzxbd {{.*}}%zmm -; KNL: ret -define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone { - %x = zext <16 x i8> %i to <16 x i32> - ret <16 x i32> %x -} - -; KNL-LABEL: sext_16x8_to_16x32 -; KNL: vpmovsxbd {{.*}}%zmm -; KNL: ret -define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone { - %x = sext <16 x i8> %i to <16 x i32> - ret <16 x i32> %x -} - -;SKX-LABEL: zext_2x8mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone { - %a = load <2 x i8>,<2 x i8> *%i,align 1 - %x = zext <2 x i8> %a to <2 x i64> - %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer - ret <2 x i64> %ret -} -;SKX-LABEL: sext_2x8mem_to_2x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone { - %a = load <2 x i8>,<2 x i8> *%i,align 1 - %x = sext <2 x i8> %a to <2 x i64> - %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer - ret <2 x i64> %ret -} -;SKX-LABEL: sext_2x8mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxbq (%rdi), %xmm0 -;SKX-NEXT: retq -define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone { - %a = load <2 x i8>,<2 x i8> *%i,align 1 - %x = sext <2 x i8> %a to <2 x i64> - ret <2 x i64> %x -} - -;SKX-LABEL: zext_4x8mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxbq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i8>,<4 x i8> *%i,align 1 - %x = zext <4 x i8> %a to <4 x i64> - %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer - ret <4 x i64> %ret -} - -;SKX-LABEL: sext_4x8mem_to_4x64mask: -;SKX: ## BB#0: 
-;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i8>,<4 x i8> *%i,align 1 - %x = sext <4 x i8> %a to <4 x i64> - %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer - ret <4 x i64> %ret -} - -;SKX-LABEL: sext_4x8mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxbq (%rdi), %ymm0 -;SKX-NEXT: retq -define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone { - %a = load <4 x i8>,<4 x i8> *%i,align 1 - %x = sext <4 x i8> %a to <4 x i64> - ret <4 x i64> %x -} - -;KNL-LABEL: zext_8x8mem_to_8x64: -;KNL: vpmovzxbq (%rdi), %zmm0 {%k1} {z} -;KNL-NEXT: retq -define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = zext <8 x i8> %a to <8 x i64> - %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer - ret <8 x i64> %ret -} - -;KNL-LABEL: sext_8x8mem_to_8x64mask: -;KNL: vpmovsxbq (%rdi), %zmm0 {%k1} {z} -;KNL-NEXT: retq -define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = sext <8 x i8> %a to <8 x i64> - %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer - ret <8 x i64> %ret -} - -;KNL-LABEL: sext_8x8mem_to_8x64: -;KNL: vpmovsxbq (%rdi), %zmm0 -;KNL-NEXT: retq -define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone { - %a = load <8 x i8>,<8 x i8> *%i,align 1 - %x = sext <8 x i8> %a to <8 x i64> - ret <8 x i64> %x -} - -;SKX-LABEL: zext_4x16mem_to_4x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwd (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i16>,<4 x i16> *%i,align 1 - %x = zext <4 x i16> %a to <4 x i32> - %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x 
i32> zeroinitializer - ret <4 x i32> %ret -} - -;SKX-LABEL: sext_4x16mem_to_4x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i16>,<4 x i16> *%i,align 1 - %x = sext <4 x i16> %a to <4 x i32> - %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer - ret <4 x i32> %ret -} - -;SKX-LABEL: sext_4x16mem_to_4x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwd (%rdi), %xmm0 -;SKX-NEXT: retq -define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone { - %a = load <4 x i16>,<4 x i16> *%i,align 1 - %x = sext <4 x i16> %a to <4 x i32> - ret <4 x i32> %x -} - - -;SKX-LABEL: zext_8x16mem_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwd (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i16>,<8 x i16> *%i,align 1 - %x = zext <8 x i16> %a to <8 x i32> - %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer - ret <8 x i32> %ret -} - -;SKX-LABEL: sext_8x16mem_to_8x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i16>,<8 x i16> *%i,align 1 - %x = sext <8 x i16> %a to <8 x i32> - %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer - ret <8 x i32> %ret -} - -;SKX-LABEL: sext_8x16mem_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwd (%rdi), %ymm0 -;SKX-NEXT: retq -define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone { - %a = load <8 x i16>,<8 x i16> *%i,align 1 - %x = sext <8 x i16> %a to <8 x i32> - ret <8 x i32> %x -} - -;SKX-LABEL: zext_8x16_to_8x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm1, %k1 -;SKX-NEXT: vpmovzxwd 
%xmm0, %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone { - %x = zext <8 x i16> %a to <8 x i32> - %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer - ret <8 x i32> %ret -} - -;SKX-LABEL: zext_8x16_to_8x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxwd %xmm0, %ymm0 -;SKX-NEXT: retq -define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone { - %x = zext <8 x i16> %a to <8 x i32> - ret <8 x i32> %x -} - -;SKX-LABEL: zext_16x16mem_to_16x32: -;KNL-LABEL: zext_16x16mem_to_16x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z} -;KNL: vpmovzxwd (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone { - %a = load <16 x i16>,<16 x i16> *%i,align 1 - %x = zext <16 x i16> %a to <16 x i32> - %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer - ret <16 x i32> %ret -} - -;SKX-LABEL: sext_16x16mem_to_16x32mask: -;KNL-LABEL: sext_16x16mem_to_16x32mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovb2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} -;KNL: vpmovsxwd (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone { - %a = load <16 x i16>,<16 x i16> *%i,align 1 - %x = sext <16 x i16> %a to <16 x i32> - %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer - ret <16 x i32> %ret -} - -;SKX-LABEL: sext_16x16mem_to_16x32: -;KNL-LABEL: sext_16x16mem_to_16x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwd (%rdi), %zmm0 -;KNL: vpmovsxwd (%rdi), %zmm0 -;SKX-NEXT: retq -define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone { - %a = load <16 x i16>,<16 x i16> *%i,align 1 - %x = sext <16 x i16> %a to <16 x i32> - ret <16 x i32> %x -} -;SKX-LABEL: zext_16x16_to_16x32mask: -;KNL-LABEL: zext_16x16_to_16x32mask: -;SKX: ## BB#0: 
-;SKX-NEXT: vpmovb2m %xmm1, %k1 -;SKX-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} -;KNL: vpmovzxwd %ymm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone { - %x = zext <16 x i16> %a to <16 x i32> - %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer - ret <16 x i32> %ret -} - -;SKX-LABEL: zext_16x16_to_16x32: -;KNL-LABEL: zext_16x16_to_16x32: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxwd %ymm0, %zmm0 -;KNL: vpmovzxwd %ymm0, %zmm0 -;SKX-NEXT: retq -define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone { - %x = zext <16 x i16> %a to <16 x i32> - ret <16 x i32> %x -} - -;SKX-LABEL: zext_2x16mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone { - %a = load <2 x i16>,<2 x i16> *%i,align 1 - %x = zext <2 x i16> %a to <2 x i64> - %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer - ret <2 x i64> %ret -} - -;SKX-LABEL: sext_2x16mem_to_2x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone { - %a = load <2 x i16>,<2 x i16> *%i,align 1 - %x = sext <2 x i16> %a to <2 x i64> - %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer - ret <2 x i64> %ret -} - -;SKX-LABEL: sext_2x16mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwq (%rdi), %xmm0 -;SKX-NEXT: retq -define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone { - %a = load <2 x i16>,<2 x i16> *%i,align 1 - %x = sext <2 x i16> %a to <2 x i64> - ret <2 x i64> %x -} - -;SKX-LABEL: zext_4x16mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i64> @zext_4x16mem_to_4x64(<4 x 
i16> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i16>,<4 x i16> *%i,align 1 - %x = zext <4 x i16> %a to <4 x i64> - %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer - ret <4 x i64> %ret -} - -;SKX-LABEL: sext_4x16mem_to_4x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i16>,<4 x i16> *%i,align 1 - %x = sext <4 x i16> %a to <4 x i64> - %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer - ret <4 x i64> %ret -} - -;SKX-LABEL: sext_4x16mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwq (%rdi), %ymm0 -;SKX-NEXT: retq -define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone { - %a = load <4 x i16>,<4 x i16> *%i,align 1 - %x = sext <4 x i16> %a to <4 x i64> - ret <4 x i64> %x -} - -;SKX-LABEL: zext_8x16mem_to_8x64: -;KNL-LABEL: zext_8x16mem_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z} -;KNL: vpmovzxwq (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i16>,<8 x i16> *%i,align 1 - %x = zext <8 x i16> %a to <8 x i64> - %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer - ret <8 x i64> %ret -} - -;SKX-LABEL: sext_8x16mem_to_8x64mask: -;KNL-LABEL: sext_8x16mem_to_8x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} -;KNL: vpmovsxwq (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i16>,<8 x i16> *%i,align 1 - %x = sext <8 x i16> %a to <8 x i64> - %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer - ret <8 x i64> %ret -} - -;SKX-LABEL: sext_8x16mem_to_8x64: -;KNL-LABEL: 
sext_8x16mem_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxwq (%rdi), %zmm0 -;KNL: vpmovsxwq (%rdi), %zmm0 -;SKX-NEXT: retq -define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone { - %a = load <8 x i16>,<8 x i16> *%i,align 1 - %x = sext <8 x i16> %a to <8 x i64> - ret <8 x i64> %x -} - -;SKX-LABEL: zext_8x16_to_8x64mask: -;KNL-LABEL: zext_8x16_to_8x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm1, %k1 -;SKX-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} -;KNL: vpmovzxwq %xmm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone { - %x = zext <8 x i16> %a to <8 x i64> - %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer - ret <8 x i64> %ret -} - -;SKX-LABEL: zext_8x16_to_8x64: -;KNL-LABEL: zext_8x16_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovzxwq %xmm0, %zmm0 -;KNL: vpmovzxwq %xmm0, %zmm0 -;SKX-NEXT: retq -; KNL: ret -define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone { - %ret = zext <8 x i16> %a to <8 x i64> - ret <8 x i64> %ret -} - -;SKX-LABEL: zext_2x32mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovzxdq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone { - %a = load <2 x i32>,<2 x i32> *%i,align 1 - %x = zext <2 x i32> %a to <2 x i64> - %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer - ret <2 x i64> %ret -} - -;SKX-LABEL: sext_2x32mem_to_2x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovq2m %xmm0, %k1 -;SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} -;SKX-NEXT: retq -define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone { - %a = load <2 x i32>,<2 x i32> *%i,align 1 - %x = sext <2 x i32> %a to <2 x i64> - %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer - ret <2 x i64> %ret -} - -;SKX-LABEL: sext_2x32mem_to_2x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq (%rdi), %xmm0 
-;SKX-NEXT: retq -define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone { - %a = load <2 x i32>,<2 x i32> *%i,align 1 - %x = sext <2 x i32> %a to <2 x i64> - ret <2 x i64> %x -} - -;SKX-LABEL: zext_4x32mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovzxdq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i32>,<4 x i32> *%i,align 1 - %x = zext <4 x i32> %a to <4 x i64> - %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer - ret <4 x i64> %ret -} - -;SKX-LABEL: sext_4x32mem_to_4x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm0, %k1 -;SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone { - %a = load <4 x i32>,<4 x i32> *%i,align 1 - %x = sext <4 x i32> %a to <4 x i64> - %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer - ret <4 x i64> %ret -} - -;SKX-LABEL: sext_4x32mem_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq (%rdi), %ymm0 -;SKX-NEXT: retq -define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone { - %a = load <4 x i32>,<4 x i32> *%i,align 1 - %x = sext <4 x i32> %a to <4 x i64> - ret <4 x i64> %x -} - -;SKX-LABEL: sext_4x32_to_4x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq %xmm0, %ymm0 -;SKX-NEXT: retq -define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone { - %x = sext <4 x i32> %a to <4 x i64> - ret <4 x i64> %x -} - -;SKX-LABEL: zext_4x32_to_4x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovd2m %xmm1, %k1 -;SKX-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} -;SKX-NEXT: retq -define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind readnone { - %x = zext <4 x i32> %a to <4 x i64> - %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer - ret <4 x i64> %ret -} - -;SKX-LABEL: zext_8x32mem_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: 
vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i32>,<8 x i32> *%i,align 1 - %x = zext <8 x i32> %a to <8 x i64> - %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer - ret <8 x i64> %ret -} - -;SKX-LABEL: sext_8x32mem_to_8x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm0, %k1 -;SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone { - %a = load <8 x i32>,<8 x i32> *%i,align 1 - %x = sext <8 x i32> %a to <8 x i64> - %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer - ret <8 x i64> %ret -} - -;SKX-LABEL: sext_8x32mem_to_8x64: -;KNL-LABEL: sext_8x32mem_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq (%rdi), %zmm0 -;KNL: vpmovsxdq (%rdi), %zmm0 -;SKX-NEXT: retq -define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone { - %a = load <8 x i32>,<8 x i32> *%i,align 1 - %x = sext <8 x i32> %a to <8 x i64> - ret <8 x i64> %x -} - -;SKX-LABEL: sext_8x32_to_8x64: -;KNL-LABEL: sext_8x32_to_8x64: -;SKX: ## BB#0: -;SKX-NEXT: vpmovsxdq %ymm0, %zmm0 -;KNL: vpmovsxdq %ymm0, %zmm0 -;SKX-NEXT: retq -define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone { - %x = sext <8 x i32> %a to <8 x i64> - ret <8 x i64> %x -} - -;SKX-LABEL: zext_8x32_to_8x64mask: -;KNL-LABEL: zext_8x32_to_8x64mask: -;SKX: ## BB#0: -;SKX-NEXT: vpmovw2m %xmm1, %k1 -;SKX-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} -;KNL: vpmovzxdq %ymm0, %zmm0 {%k1} {z} -;SKX-NEXT: retq -define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone { - %x = zext <8 x i32> %a to <8 x i64> - %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer - ret <8 x i64> %ret -} -;KNL-LABEL: fptrunc_test -;KNL: vcvtpd2ps {{.*}}%zmm -;KNL: ret -define <8 x float> @fptrunc_test(<8 x double> %a) nounwind 
readnone { - %b = fptrunc <8 x double> %a to <8 x float> - ret <8 x float> %b -} - -;KNL-LABEL: fpext_test -;KNL: vcvtps2pd {{.*}}%zmm -;KNL: ret -define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone { - %b = fpext <8 x float> %a to <8 x double> - ret <8 x double> %b -} - -; KNL-LABEL: zext_16i1_to_16xi32 -; KNL: vpbroadcastd LCP{{.*}}(%rip), %zmm0 {%k1} {z} -; KNL: ret -define <16 x i32> @zext_16i1_to_16xi32(i16 %b) { - %a = bitcast i16 %b to <16 x i1> - %c = zext <16 x i1> %a to <16 x i32> - ret <16 x i32> %c -} - -; KNL-LABEL: zext_8i1_to_8xi64 -; KNL: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z} -; KNL: ret -define <8 x i64> @zext_8i1_to_8xi64(i8 %b) { - %a = bitcast i8 %b to <8 x i1> - %c = zext <8 x i1> %a to <8 x i64> - ret <8 x i64> %c -} - -; KNL-LABEL: trunc_16i8_to_16i1 -; KNL: vpmovsxbd -; KNL: vpandd -; KNL: vptestmd -; KNL: ret -; SKX-LABEL: trunc_16i8_to_16i1 -; SKX: vpmovb2m %xmm -define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { - %mask_b = trunc <16 x i8>%a to <16 x i1> - %mask = bitcast <16 x i1> %mask_b to i16 - ret i16 %mask -} - -; KNL-LABEL: trunc_16i32_to_16i1 -; KNL: vpandd -; KNL: vptestmd -; KNL: ret -; SKX-LABEL: trunc_16i32_to_16i1 -; SKX: vpmovd2m %zmm -define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { - %mask_b = trunc <16 x i32>%a to <16 x i1> - %mask = bitcast <16 x i1> %mask_b to i16 - ret i16 %mask -} - -; SKX-LABEL: trunc_4i32_to_4i1 -; SKX: vpmovd2m %xmm -; SKX: kandw -; SKX: vpmovm2d -define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { - %mask_a = trunc <4 x i32>%a to <4 x i1> - %mask_b = trunc <4 x i32>%b to <4 x i1> - %a_and_b = and <4 x i1>%mask_a, %mask_b - %res = sext <4 x i1>%a_and_b to <4 x i32> - ret <4 x i32>%res -} - -; KNL-LABEL: trunc_8i16_to_8i1 -; KNL: vpmovsxwq -; KNL: vpandq LCP{{.*}}(%rip){1to8} -; KNL: vptestmq -; KNL: ret - -; SKX-LABEL: trunc_8i16_to_8i1 -; SKX: vpmovw2m %xmm -define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { - %mask_b = trunc <8 x i16>%a to <8 x i1> - %mask = bitcast <8 x 
i1> %mask_b to i8 - ret i8 %mask -} - -; KNL-LABEL: sext_8i1_8i32 -; KNL: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z} -; SKX: vpmovm2d -; KNL: ret -define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { - %x = icmp slt <8 x i32> %a1, %a2 - %x1 = xor <8 x i1>%x, - %y = sext <8 x i1> %x1 to <8 x i32> - ret <8 x i32> %y -} - -; KNL-LABEL: trunc_v16i32_to_v16i16 -; KNL: vpmovdw -; KNL: ret -define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) { - %1 = trunc <16 x i32> %x to <16 x i16> - ret <16 x i16> %1 -} - -; KNL-LABEL: trunc_i32_to_i1 -; KNL: movw $-4, %ax -; KNL: kmovw %eax, %k1 -; KNL: korw -define i16 @trunc_i32_to_i1(i32 %a) { - %a_i = trunc i32 %a to i1 - %maskv = insertelement <16 x i1> , i1 %a_i, i32 0 - %res = bitcast <16 x i1> %maskv to i16 - ret i16 %res -} - -; KNL-LABEL: sext_8i1_8i16 -; SKX: vpmovm2w -; KNL: ret -define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind { - %x = icmp slt <8 x i32> %a1, %a2 - %y = sext <8 x i1> %x to <8 x i16> - ret <8 x i16> %y -} - -; KNL-LABEL: sext_16i1_16i32 -; SKX: vpmovm2d -; KNL: ret -define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { - %x = icmp slt <16 x i32> %a1, %a2 - %y = sext <16 x i1> %x to <16 x i32> - ret <16 x i32> %y -} - -; KNL-LABEL: sext_8i1_8i64 -; SKX: vpmovm2q -; KNL: ret -define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind { - %x = icmp slt <8 x i32> %a1, %a2 - %y = sext <8 x i1> %x to <8 x i64> - ret <8 x i64> %y -} - -; KNL-LABEL: @extload_v8i64 -; KNL: vpmovsxbq -define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { - %sign_load = load <8 x i8>, <8 x i8>* %a - %c = sext <8 x i8> %sign_load to <8 x i64> - store <8 x i64> %c, <8 x i64>* %res - ret void -} - -;SKX-LABEL: test21: -;SKX: vmovdqu16 %zmm0, %zmm3 {%k1} -;SKX-NEXT: kshiftrq $32, %k1, %k1 -;SKX-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} -define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { - %ret = select <64 x i1> %mask, <64 x 
i16> %x, <64 x i16> zeroinitializer - ret <64 x i16> %ret -} - Index: test/CodeGen/X86/avx512-trunc.ll =================================================================== --- test/CodeGen/X86/avx512-trunc.ll +++ test/CodeGen/X86/avx512-trunc.ll @@ -0,0 +1,364 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX + + attributes #0 = { nounwind } + +; KNL-LABEL: trunc_16x32_to_16x8 +; KNL: vpmovdb +; KNL: ret +define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 { + %x = trunc <16 x i32> %i to <16 x i8> + ret <16 x i8> %x +} + +; KNL-LABEL: trunc_8x64_to_8x16 +; KNL: vpmovqw +; KNL: ret +define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 { + %x = trunc <8 x i64> %i to <8 x i16> + ret <8 x i16> %x +} + +; KNL-LABEL: trunc_v16i32_to_v16i16 +; KNL: vpmovdw +; KNL: ret +define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 { + %1 = trunc <16 x i32> %x to <16 x i16> + ret <16 x i16> %1 +} + +define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + +define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i8> + ret <4 x i8> %x +} + +define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i8> + store <4 x i8> %x, <4 x i8>* %res + ret void +} + +define <2 x i8> 
@trunc_qb_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i8> + ret <2 x i8> %x +} + +define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i8> + store <2 x i8> %x, <2 x i8>* %res + ret void +} + +define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + ret <8 x i16> %x +} + +define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + store <8 x i16> %x, <8 x i16>* %res + ret void +} + +define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i16> + ret <4 x i16> %x +} + +define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i16> + store <4 x i16> %x, <4 x i16>* %res + ret void +} + +define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i16> + ret <2 x i16> %x +} + +define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i16> + store <2 x i16> %x, <2 x i16>* %res + ret void +} + +define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i32> + ret <8 x 
i32> %x +} + +define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i32> + store <8 x i32> %x, <8 x i32>* %res + ret void +} + +define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i32> + ret <4 x i32> %x +} + +define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i32> + store <4 x i32> %x, <4 x i32>* %res + ret void +} + +define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i32> + ret <2 x i32> %x +} + +define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i32> + store <2 x i32> %x, <2 x i32>* %res + ret void +} + +define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 { +; SKX-LABEL: trunc_db_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + ret <16 x i8> %x +} + +define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + store <16 x i8> %x, <16 x i8>* %res + ret void +} + +define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 { +; SKX-LABEL: trunc_db_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: 
vpmovdb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + +define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 { +; SKX-LABEL: trunc_db_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i8> + ret <4 x i8> %x +} + +define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i8> + store <4 x i8> %x, <4 x i8>* %res + ret void +} + +define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i16> + ret <16 x i16> %x +} + +define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i16> + store <16 x i16> %x, <16 x i16>* %res + ret void +} + +define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i16> + ret <8 x i16> %x +} + +define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i16> + store <8 x i16> %x, <8 x i16>* %res + ret void +} + +define <4 x i16> @trunc_dw_128(<4 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i16> + ret <4 x i16> %x +} + +define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i16> + store <4 x i16> %x, <4 x i16>* %res + ret void +} + +define <32 x i8> @trunc_wb_512(<32 x 
i16> %i) #0 { +; SKX-LABEL: trunc_wb_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <32 x i16> %i to <32 x i8> + ret <32 x i8> %x +} + +define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <32 x i16> %i to <32 x i8> + store <32 x i8> %x, <32 x i8>* %res + ret void +} + +define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 { +; SKX-LABEL: trunc_wb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <16 x i16> %i to <16 x i8> + ret <16 x i8> %x +} + +define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i16> %i to <16 x i8> + store <16 x i8> %x, <16 x i8>* %res + ret void +} + +define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 { +; SKX-LABEL: trunc_wb_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <8 x i16> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i16> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1038,3 +1038,83 @@ %res2 = add <32 x i16> %res, %res1 ret <32 x i16> %res2 } + + +declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_512: +; CHECK: vpmovwb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 + %res0 = 
call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512: +; CHECK: vpmovwb %zmm0, (%rdi) +; CHECK: vpmovwb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: +; CHECK: vpmovswb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %zmm0, %ymm0 + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512: +; CHECK: vpmovswb %zmm0, (%rdi) +; CHECK: vpmovswb %zmm0, (%rdi) {%k1} + call void 
@llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: +; CHECK: vpmovuswb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %zmm0, %ymm0 + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512: +; CHECK: vpmovuswb %zmm0, (%rdi) +; CHECK: vpmovuswb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -3843,3 +3843,161 @@ %res2 = add <16 x i16> %res, %res1 ret <16 x i16> %res2 } + + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128: +; CHECK: vpmovwb 
%xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128: +; CHECK: vpmovwb %xmm0, (%rdi) +; CHECK: vpmovwb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128: +; CHECK: vpmovswb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128: +; CHECK: vpmovswb %xmm0, 
(%rdi) +; CHECK: vpmovswb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128: +; CHECK: vpmovuswb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128: +; CHECK: vpmovuswb %xmm0, (%rdi) +; CHECK: vpmovuswb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256: +; CHECK: vpmovwb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> 
@llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256: +; CHECK: vpmovwb %ymm0, (%rdi) +; CHECK: vpmovwb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256: +; CHECK: vpmovswb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256: +; CHECK: vpmovswb %ymm0, (%rdi) +; CHECK: vpmovswb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 
x i16> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256: +; CHECK: vpmovuswb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256: +; CHECK: vpmovuswb %ymm0, (%rdi) +; CHECK: vpmovuswb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) + ret void +} + Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3003,4 +3003,786 @@ %res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) %res2 = fadd <8 x float> %res, %res1 ret <8 x float> %res2 -} \ No newline at end of file +} + + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_128: +; CHECK: 
vpmovqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_128: +; CHECK: vpmovqb %xmm0, (%rdi) +; CHECK: vpmovqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_128: +; CHECK: vpmovsqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_128: +; CHECK: vpmovsqb 
%xmm0, (%rdi) +; CHECK: vpmovsqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_128: +; CHECK: vpmovusqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_128: +; CHECK: vpmovusqb %xmm0, (%rdi) +; CHECK: vpmovusqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_256: +; CHECK: vpmovqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> 
@llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_256: +; CHECK: vpmovqb %ymm0, (%rdi) +; CHECK: vpmovqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_256: +; CHECK: vpmovsqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_256: +; CHECK: vpmovsqb %ymm0, (%rdi) +; CHECK: vpmovsqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret 
void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_256: +; CHECK: vpmovusqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_256: +; CHECK: vpmovusqb %ymm0, (%rdi) +; CHECK: vpmovusqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_128: +; CHECK: vpmovqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + 
ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_128: +; CHECK: vpmovqw %xmm0, (%rdi) +; CHECK: vpmovqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_128: +; CHECK: vpmovsqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_128: +; CHECK: vpmovsqw %xmm0, (%rdi) +; CHECK: vpmovsqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_128: +; CHECK: 
vpmovusqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_128: +; CHECK: vpmovusqw %xmm0, (%rdi) +; CHECK: vpmovusqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_256: +; CHECK: vpmovqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_256: +; 
CHECK: vpmovqw %ymm0, (%rdi) +; CHECK: vpmovqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_256: +; CHECK: vpmovsqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_256: +; CHECK: vpmovsqw %ymm0, (%rdi) +; CHECK: vpmovsqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_256: +; CHECK: vpmovusqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> 
@llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_256: +; CHECK: vpmovusqw %ymm0, (%rdi) +; CHECK: vpmovusqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_128: +; CHECK: vpmovqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_128: +; CHECK: vpmovqd %xmm0, (%rdi) +; CHECK: vpmovqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret 
void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_128: +; CHECK: vpmovsqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_128: +; CHECK: vpmovsqd %xmm0, (%rdi) +; CHECK: vpmovsqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_128: +; CHECK: vpmovusqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 
+ ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_128: +; CHECK: vpmovusqd %xmm0, (%rdi) +; CHECK: vpmovusqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_256: +; CHECK: vpmovqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_256: +; CHECK: vpmovqd %ymm0, (%rdi) +; CHECK: vpmovqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_256: +; CHECK: 
vpmovsqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_256: +; CHECK: vpmovsqd %ymm0, (%rdi) +; CHECK: vpmovsqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_256: +; CHECK: vpmovusqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_pmovus_qd_mem_256: +; CHECK: vpmovusqd %ymm0, (%rdi) +; CHECK: vpmovusqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_128: +; CHECK: vpmovdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_128: +; CHECK: vpmovdb %xmm0, (%rdi) +; CHECK: vpmovdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_128: +; CHECK: vpmovsdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call 
<16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_128: +; CHECK: vpmovsdb %xmm0, (%rdi) +; CHECK: vpmovsdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_128: +; CHECK: vpmovusdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_128: +; CHECK: vpmovusdb %xmm0, (%rdi) +; CHECK: vpmovusdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* 
%ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_256: +; CHECK: vpmovdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_256: +; CHECK: vpmovdb %ymm0, (%rdi) +; CHECK: vpmovdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_256: +; CHECK: vpmovsdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> 
%res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_256: +; CHECK: vpmovsdb %ymm0, (%rdi) +; CHECK: vpmovsdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_256: +; CHECK: vpmovusdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_256: +; CHECK: vpmovusdb %ymm0, (%rdi) +; CHECK: vpmovusdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_pmov_dw_128: +; CHECK: vpmovdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_128: +; CHECK: vpmovdw %xmm0, (%rdi) +; CHECK: vpmovdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_128: +; CHECK: vpmovsdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_pmovs_dw_mem_128: +; CHECK: vpmovsdw %xmm0, (%rdi) +; CHECK: vpmovsdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_128: +; CHECK: vpmovusdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_128: +; CHECK: vpmovusdw %xmm0, (%rdi) +; CHECK: vpmovusdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_256: +; CHECK: vpmovdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, 
i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_256: +; CHECK: vpmovdw %ymm0, (%rdi) +; CHECK: vpmovdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_256: +; CHECK: vpmovsdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_256: +; CHECK: vpmovsdw %ymm0, (%rdi) +; CHECK: vpmovsdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, 
<8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_256: +; CHECK: vpmovusdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_256: +; CHECK: vpmovusdw %ymm0, (%rdi) +; CHECK: vpmovusdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -190,10 +190,13 @@ ; AVX2-LABEL: test15 ; AVX2: vpmaskmovd -; SKX-LABEL: test15 -; SKX: kshiftl -; SKX: kshiftr -; SKX: vmovdqu32 {{.*}}{%k1} +; SKX-LABEL: test15: +; SKX: ## BB#0: +; SKX-NEXT: vpandq {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} +; SKX-NEXT: retq define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void 
@llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) Index: test/MC/X86/x86-64-avx512bw.s =================================================================== --- test/MC/X86/x86-64-avx512bw.s +++ test/MC/X86/x86-64-avx512bw.s @@ -3668,6 +3668,126 @@ // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x1d,0xb2,0xc0,0xdf,0xff,0xff] vpabsw -8256(%rdx), %zmm30 +// CHECK: vpmovwb %zmm27, %ymm22 +// CHECK: encoding: [0x62,0x22,0x7e,0x48,0x30,0xde] + vpmovwb %zmm27, %ymm22 + +// CHECK: vpmovwb %zmm27, %ymm22 {%k1} +// CHECK: encoding: [0x62,0x22,0x7e,0x49,0x30,0xde] + vpmovwb %zmm27, %ymm22 {%k1} + +// CHECK: vpmovwb %zmm27, %ymm22 {%k1} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0xc9,0x30,0xde] + vpmovwb %zmm27, %ymm22 {%k1} {z} + +// CHECK: vpmovwb %zmm22, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0x31] + vpmovwb %zmm22, (%rcx) + +// CHECK: vpmovwb %zmm22, (%rcx) {%k4} +// CHECK: encoding: [0x62,0xe2,0x7e,0x4c,0x30,0x31] + vpmovwb %zmm22, (%rcx) {%k4} + +// CHECK: vpmovwb %zmm22, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x30,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovwb %zmm22, 291(%rax,%r14,8) + +// CHECK: vpmovwb %zmm22, 4064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0x72,0x7f] + vpmovwb %zmm22, 4064(%rdx) + +// CHECK: vpmovwb %zmm22, 4096(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0xb2,0x00,0x10,0x00,0x00] + vpmovwb %zmm22, 4096(%rdx) + +// CHECK: vpmovwb %zmm22, -4096(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0x72,0x80] + vpmovwb %zmm22, -4096(%rdx) + +// CHECK: vpmovwb %zmm22, -4128(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0xb2,0xe0,0xef,0xff,0xff] + vpmovwb %zmm22, -4128(%rdx) + +// CHECK: vpmovswb %zmm18, %ymm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x20,0xd7] + vpmovswb %zmm18, %ymm23 + +// CHECK: vpmovswb %zmm18, %ymm23 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x4a,0x20,0xd7] + vpmovswb %zmm18, %ymm23 {%k2} + +// CHECK: vpmovswb %zmm18, %ymm23 {%k2} {z} +// CHECK: 
encoding: [0x62,0xa2,0x7e,0xca,0x20,0xd7] + vpmovswb %zmm18, %ymm23 {%k2} {z} + +// CHECK: vpmovswb %zmm24, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x01] + vpmovswb %zmm24, (%rcx) + +// CHECK: vpmovswb %zmm24, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x4f,0x20,0x01] + vpmovswb %zmm24, (%rcx) {%k7} + +// CHECK: vpmovswb %zmm24, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x48,0x20,0x84,0xf0,0x23,0x01,0x00,0x00] + vpmovswb %zmm24, 291(%rax,%r14,8) + +// CHECK: vpmovswb %zmm24, 4064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x42,0x7f] + vpmovswb %zmm24, 4064(%rdx) + +// CHECK: vpmovswb %zmm24, 4096(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x82,0x00,0x10,0x00,0x00] + vpmovswb %zmm24, 4096(%rdx) + +// CHECK: vpmovswb %zmm24, -4096(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x42,0x80] + vpmovswb %zmm24, -4096(%rdx) + +// CHECK: vpmovswb %zmm24, -4128(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x82,0xe0,0xef,0xff,0xff] + vpmovswb %zmm24, -4128(%rdx) + +// CHECK: vpmovuswb %zmm22, %ymm28 +// CHECK: encoding: [0x62,0x82,0x7e,0x48,0x10,0xf4] + vpmovuswb %zmm22, %ymm28 + +// CHECK: vpmovuswb %zmm22, %ymm28 {%k3} +// CHECK: encoding: [0x62,0x82,0x7e,0x4b,0x10,0xf4] + vpmovuswb %zmm22, %ymm28 {%k3} + +// CHECK: vpmovuswb %zmm22, %ymm28 {%k3} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xcb,0x10,0xf4] + vpmovuswb %zmm22, %ymm28 {%k3} {z} + +// CHECK: vpmovuswb %zmm27, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x19] + vpmovuswb %zmm27, (%rcx) + +// CHECK: vpmovuswb %zmm27, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x4a,0x10,0x19] + vpmovuswb %zmm27, (%rcx) {%k2} + +// CHECK: vpmovuswb %zmm27, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x48,0x10,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmovuswb %zmm27, 291(%rax,%r14,8) + +// CHECK: vpmovuswb %zmm27, 4064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x5a,0x7f] + vpmovuswb %zmm27, 4064(%rdx) + +// CHECK: vpmovuswb %zmm27, 
4096(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x9a,0x00,0x10,0x00,0x00] + vpmovuswb %zmm27, 4096(%rdx) + +// CHECK: vpmovuswb %zmm27, -4096(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x5a,0x80] + vpmovuswb %zmm27, -4096(%rdx) + +// CHECK: vpmovuswb %zmm27, -4128(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x9a,0xe0,0xef,0xff,0xff] + vpmovuswb %zmm27, -4128(%rdx) + // CHECK: vpmulhuw %zmm21, %zmm24, %zmm21 // CHECK: encoding: [0x62,0xa1,0x3d,0x40,0xe4,0xed] vpmulhuw %zmm21, %zmm24, %zmm21 Index: test/MC/X86/x86-64-avx512bw_vl.s =================================================================== --- test/MC/X86/x86-64-avx512bw_vl.s +++ test/MC/X86/x86-64-avx512bw_vl.s @@ -6583,6 +6583,486 @@ // CHECK: encoding: [0x62,0xe2,0x6d,0x20,0x00,0x9a,0xe0,0xef,0xff,0xff] vpshufb -4128(%rdx), %ymm18, %ymm19 +// CHECK: vpmovwb %xmm28, %xmm27 +// CHECK: encoding: [0x62,0x02,0x7e,0x08,0x30,0xe3] + vpmovwb %xmm28, %xmm27 + +// CHECK: vpmovwb %xmm28, %xmm27 {%k2} +// CHECK: encoding: [0x62,0x02,0x7e,0x0a,0x30,0xe3] + vpmovwb %xmm28, %xmm27 {%k2} + +// CHECK: vpmovwb %xmm28, %xmm27 {%k2} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0x8a,0x30,0xe3] + vpmovwb %xmm28, %xmm27 {%k2} {z} + +// CHECK: vpmovwb %ymm26, %xmm26 +// CHECK: encoding: [0x62,0x02,0x7e,0x28,0x30,0xd2] + vpmovwb %ymm26, %xmm26 + +// CHECK: vpmovwb %ymm26, %xmm26 {%k4} +// CHECK: encoding: [0x62,0x02,0x7e,0x2c,0x30,0xd2] + vpmovwb %ymm26, %xmm26 {%k4} + +// CHECK: vpmovwb %ymm26, %xmm26 {%k4} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0xac,0x30,0xd2] + vpmovwb %ymm26, %xmm26 {%k4} {z} + +// CHECK: vpmovwb %xmm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x39] + vpmovwb %xmm23,(%rcx) + +// CHECK: vpmovwb %xmm23, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0e,0x30,0x39] + vpmovwb %xmm23,(%rcx) {%k6} + +// CHECK: vpmovwb %xmm23, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x30,0xbc,0xf0,0x34,0x12,0x00,0x00] + vpmovwb %xmm23,4660(%rax,%r14,8) + +// CHECK: 
vpmovwb %xmm23, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x7a,0x7f] + vpmovwb %xmm23, 1016(%rdx) + +// CHECK: vpmovwb %xmm23, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0xba,0x00,0x04,0x00,0x00] + vpmovwb %xmm23, 1024(%rdx) + +// CHECK: vpmovwb %xmm23, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x7a,0x80] + vpmovwb %xmm23,-1024(%rdx) + +// CHECK: vpmovwb %xmm23, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0xba,0xf8,0xfb,0xff,0xff] + vpmovwb %xmm23,-1032(%rdx) + +// CHECK: vpmovwb %ymm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x29] + vpmovwb %ymm21,(%rcx) + +// CHECK: vpmovwb %ymm21, (%rcx) {%k5} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2d,0x30,0x29] + vpmovwb %ymm21,(%rcx) {%k5} + +// CHECK: vpmovwb %ymm21, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x30,0xac,0xf0,0x34,0x12,0x00,0x00] + vpmovwb %ymm21, 4660(%rax,%r14,8) + +// CHECK: vpmovwb %ymm21, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x6a,0x7f] + vpmovwb %ymm21, 2032(%rdx) + +// CHECK: vpmovwb %ymm21, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0xaa,0x00,0x08,0x00,0x00] + vpmovwb %ymm21, 2048(%rdx) + +// CHECK: vpmovwb %ymm21, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x6a,0x80] + vpmovwb %ymm21,-2048(%rdx) + +// CHECK: vpmovwb %ymm21, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0xaa,0xf0,0xf7,0xff,0xff] + vpmovwb %ymm21, -2064(%rdx) + +// CHECK: vpmovswb %xmm19, %xmm17 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x20,0xd9] + vpmovswb %xmm19, %xmm17 + +// CHECK: vpmovswb %xmm19, %xmm17 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7e,0x09,0x20,0xd9] + vpmovswb %xmm19, %xmm17 {%k1} + +// CHECK: vpmovswb %xmm19, %xmm17 {%k1} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x89,0x20,0xd9] + vpmovswb %xmm19, %xmm17 {%k1} {z} + +// CHECK: vpmovswb %ymm19, %xmm21 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x20,0xdd] + vpmovswb %ymm19, %xmm21 + +// CHECK: vpmovswb %ymm19, %xmm21 
{%k4} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2c,0x20,0xdd] + vpmovswb %ymm19, %xmm21 {%k4} + +// CHECK: vpmovswb %ymm19, %xmm21 {%k4} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xac,0x20,0xdd] + vpmovswb %ymm19, %xmm21 {%k4} {z} + +// CHECK: vpmovswb %xmm18, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x11] + vpmovswb %xmm18,(%rcx) + +// CHECK: vpmovswb %xmm18, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0a,0x20,0x11] + vpmovswb %xmm18,(%rcx) {%k2} + +// CHECK: vpmovswb %xmm18, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x20,0x94,0xf0,0x34,0x12,0x00,0x00] + vpmovswb %xmm18, 4660(%rax,%r14,8) + +// CHECK: vpmovswb %xmm18, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x52,0x7f] + vpmovswb %xmm18, 1016(%rdx) + +// CHECK: vpmovswb %xmm18, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x92,0x00,0x04,0x00,0x00] + vpmovswb %xmm18, 1024(%rdx) + +// CHECK: vpmovswb %xmm18, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x52,0x80] + vpmovswb %xmm18, -1024(%rdx) + +// CHECK: vpmovswb %xmm18, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x92,0xf8,0xfb,0xff,0xff] + vpmovswb %xmm18, -1032(%rdx) + +// CHECK: vpmovswb %ymm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0x39] + vpmovswb %ymm23,(%rcx) + +// CHECK: vpmovswb %ymm23, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2a,0x20,0x39] + vpmovswb %ymm23,(%rcx) {%k2} + +// CHECK: vpmovswb %ymm23, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x20,0xbc,0xf0,0x34,0x12,0x00,0x00] + vpmovswb %ymm23, 4660(%rax,%r14,8) + +// CHECK: vpmovswb %ymm23, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0x7a,0x7f] + vpmovswb %ymm23, 2032(%rdx) + +// CHECK: vpmovswb %ymm23, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0xba,0x00,0x08,0x00,0x00] + vpmovswb %ymm23, 2048(%rdx) + +// CHECK: vpmovswb %ymm23, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0x7a,0x80] + vpmovswb %ymm23, -2048(%rdx) + +// 
CHECK: vpmovswb %ymm23, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0xba,0xf0,0xf7,0xff,0xff] + vpmovswb %ymm23, -2064(%rdx) + +// CHECK: vpmovuswb %xmm17, %xmm26 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x10,0xca] + vpmovuswb %xmm17, %xmm26 + +// CHECK: vpmovuswb %xmm17, %xmm26 {%k6} +// CHECK: encoding: [0x62,0x82,0x7e,0x0e,0x10,0xca] + vpmovuswb %xmm17, %xmm26 {%k6} + +// CHECK: vpmovuswb %xmm17, %xmm26 {%k6} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8e,0x10,0xca] + vpmovuswb %xmm17, %xmm26 {%k6} {z} + +// CHECK: vpmovuswb %ymm26, %xmm17 +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x10,0xd1] + vpmovuswb %ymm26, %xmm17 + +// CHECK: vpmovuswb %ymm26, %xmm17 {%k2} +// CHECK: encoding: [0x62,0x22,0x7e,0x2a,0x10,0xd1] + vpmovuswb %ymm26, %xmm17 {%k2} + +// CHECK: vpmovuswb %ymm26, %xmm17 {%k2} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0xaa,0x10,0xd1] + vpmovuswb %ymm26, %xmm17 {%k2} {z} + +// CHECK: vpmovuswb %xmm19, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x19] + vpmovuswb %xmm19,(%rcx) + +// CHECK: vpmovuswb %xmm19, (%rcx) {%k1} +// CHECK: encoding: [0x62,0xe2,0x7e,0x09,0x10,0x19] + vpmovuswb %xmm19,(%rcx) {%k1} + +// CHECK: vpmovuswb %xmm19, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x10,0x9c,0xf0,0x34,0x12,0x00,0x00] + vpmovuswb %xmm19, 4660(%rax,%r14,8) + +// CHECK: vpmovuswb %xmm19, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x5a,0x7f] + vpmovuswb %xmm19, 1016(%rdx) + +// CHECK: vpmovuswb %xmm19, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x9a,0x00,0x04,0x00,0x00] + vpmovuswb %xmm19, 1024(%rdx) + +// CHECK: vpmovuswb %xmm19, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x5a,0x80] + vpmovuswb %xmm19, -1024(%rdx) + +// CHECK: vpmovuswb %xmm19, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x9a,0xf8,0xfb,0xff,0xff] + vpmovuswb %xmm19, -1032(%rdx) + +// CHECK: vpmovuswb %ymm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0x39] + vpmovuswb %ymm23,(%rcx) + 
+// CHECK: vpmovuswb %ymm23, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2e,0x10,0x39] + vpmovuswb %ymm23,(%rcx) {%k6} + +// CHECK: vpmovuswb %ymm23, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x10,0xbc,0xf0,0x34,0x12,0x00,0x00] + vpmovuswb %ymm23, 4660(%rax,%r14,8) + +// CHECK: vpmovuswb %ymm23, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0x7a,0x7f] + vpmovuswb %ymm23, 2032(%rdx) + +// CHECK: vpmovuswb %ymm23, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0xba,0x00,0x08,0x00,0x00] + vpmovuswb %ymm23, 2048(%rdx) + +// CHECK: vpmovuswb %ymm23, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0x7a,0x80] + vpmovuswb %ymm23, -2048(%rdx) + +// CHECK: vpmovuswb %ymm23, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0xba,0xf0,0xf7,0xff,0xff] + vpmovuswb %ymm23, -2064(%rdx) + +// CHECK: vpmovwb %xmm17, %xmm21 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x30,0xcd] + vpmovwb %xmm17, %xmm21 + +// CHECK: vpmovwb %xmm17, %xmm21 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7e,0x09,0x30,0xcd] + vpmovwb %xmm17, %xmm21 {%k1} + +// CHECK: vpmovwb %xmm17, %xmm21 {%k1} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x89,0x30,0xcd] + vpmovwb %xmm17, %xmm21 {%k1} {z} + +// CHECK: vpmovwb %ymm23, %xmm26 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x30,0xfa] + vpmovwb %ymm23, %xmm26 + +// CHECK: vpmovwb %ymm23, %xmm26 {%k7} +// CHECK: encoding: [0x62,0x82,0x7e,0x2f,0x30,0xfa] + vpmovwb %ymm23, %xmm26 {%k7} + +// CHECK: vpmovwb %ymm23, %xmm26 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xaf,0x30,0xfa] + vpmovwb %ymm23, %xmm26 {%k7} {z} + +// CHECK: vpmovwb %xmm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x29] + vpmovwb %xmm21, (%rcx) + +// CHECK: vpmovwb %xmm21, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0a,0x30,0x29] + vpmovwb %xmm21, (%rcx) {%k2} + +// CHECK: vpmovwb %xmm21, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x30,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovwb %xmm21, 291(%rax,%r14,8) + 
+// CHECK: vpmovwb %xmm21, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x6a,0x7f] + vpmovwb %xmm21, 1016(%rdx) + +// CHECK: vpmovwb %xmm21, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0xaa,0x00,0x04,0x00,0x00] + vpmovwb %xmm21, 1024(%rdx) + +// CHECK: vpmovwb %xmm21, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x6a,0x80] + vpmovwb %xmm21, -1024(%rdx) + +// CHECK: vpmovwb %xmm21, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0xaa,0xf8,0xfb,0xff,0xff] + vpmovwb %xmm21, -1032(%rdx) + +// CHECK: vpmovwb %ymm20, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x21] + vpmovwb %ymm20, (%rcx) + +// CHECK: vpmovwb %ymm20, (%rcx) {%k4} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2c,0x30,0x21] + vpmovwb %ymm20, (%rcx) {%k4} + +// CHECK: vpmovwb %ymm20, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x30,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovwb %ymm20, 291(%rax,%r14,8) + +// CHECK: vpmovwb %ymm20, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x62,0x7f] + vpmovwb %ymm20, 2032(%rdx) + +// CHECK: vpmovwb %ymm20, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0xa2,0x00,0x08,0x00,0x00] + vpmovwb %ymm20, 2048(%rdx) + +// CHECK: vpmovwb %ymm20, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x62,0x80] + vpmovwb %ymm20, -2048(%rdx) + +// CHECK: vpmovwb %ymm20, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0xa2,0xf0,0xf7,0xff,0xff] + vpmovwb %ymm20, -2064(%rdx) + +// CHECK: vpmovswb %xmm20, %xmm24 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x20,0xe0] + vpmovswb %xmm20, %xmm24 + +// CHECK: vpmovswb %xmm20, %xmm24 {%k4} +// CHECK: encoding: [0x62,0x82,0x7e,0x0c,0x20,0xe0] + vpmovswb %xmm20, %xmm24 {%k4} + +// CHECK: vpmovswb %xmm20, %xmm24 {%k4} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8c,0x20,0xe0] + vpmovswb %xmm20, %xmm24 {%k4} {z} + +// CHECK: vpmovswb %ymm18, %xmm27 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x20,0xd3] + vpmovswb %ymm18, %xmm27 + +// CHECK: vpmovswb 
%ymm18, %xmm27 {%k1} +// CHECK: encoding: [0x62,0x82,0x7e,0x29,0x20,0xd3] + vpmovswb %ymm18, %xmm27 {%k1} + +// CHECK: vpmovswb %ymm18, %xmm27 {%k1} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xa9,0x20,0xd3] + vpmovswb %ymm18, %xmm27 {%k1} {z} + +// CHECK: vpmovswb %xmm24, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x01] + vpmovswb %xmm24, (%rcx) + +// CHECK: vpmovswb %xmm24, (%rcx) {%k3} +// CHECK: encoding: [0x62,0x62,0x7e,0x0b,0x20,0x01] + vpmovswb %xmm24, (%rcx) {%k3} + +// CHECK: vpmovswb %xmm24, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x20,0x84,0xf0,0x23,0x01,0x00,0x00] + vpmovswb %xmm24, 291(%rax,%r14,8) + +// CHECK: vpmovswb %xmm24, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x42,0x7f] + vpmovswb %xmm24, 1016(%rdx) + +// CHECK: vpmovswb %xmm24, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x82,0x00,0x04,0x00,0x00] + vpmovswb %xmm24, 1024(%rdx) + +// CHECK: vpmovswb %xmm24, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x42,0x80] + vpmovswb %xmm24, -1024(%rdx) + +// CHECK: vpmovswb %xmm24, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x82,0xf8,0xfb,0xff,0xff] + vpmovswb %xmm24, -1032(%rdx) + +// CHECK: vpmovswb %ymm27, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x19] + vpmovswb %ymm27, (%rcx) + +// CHECK: vpmovswb %ymm27, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x2f,0x20,0x19] + vpmovswb %ymm27, (%rcx) {%k7} + +// CHECK: vpmovswb %ymm27, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x20,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmovswb %ymm27, 291(%rax,%r14,8) + +// CHECK: vpmovswb %ymm27, 2032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x5a,0x7f] + vpmovswb %ymm27, 2032(%rdx) + +// CHECK: vpmovswb %ymm27, 2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x9a,0x00,0x08,0x00,0x00] + vpmovswb %ymm27, 2048(%rdx) + +// CHECK: vpmovswb %ymm27, -2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x5a,0x80] + vpmovswb %ymm27, 
-2048(%rdx) + +// CHECK: vpmovswb %ymm27, -2064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x9a,0xf0,0xf7,0xff,0xff] + vpmovswb %ymm27, -2064(%rdx) + +// CHECK: vpmovuswb %xmm19, %xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x10,0xdf] + vpmovuswb %xmm19, %xmm23 + +// CHECK: vpmovuswb %xmm19, %xmm23 {%k4} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0c,0x10,0xdf] + vpmovuswb %xmm19, %xmm23 {%k4} + +// CHECK: vpmovuswb %xmm19, %xmm23 {%k4} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8c,0x10,0xdf] + vpmovuswb %xmm19, %xmm23 {%k4} {z} + +// CHECK: vpmovuswb %ymm23, %xmm28 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x10,0xfc] + vpmovuswb %ymm23, %xmm28 + +// CHECK: vpmovuswb %ymm23, %xmm28 {%k6} +// CHECK: encoding: [0x62,0x82,0x7e,0x2e,0x10,0xfc] + vpmovuswb %ymm23, %xmm28 {%k6} + +// CHECK: vpmovuswb %ymm23, %xmm28 {%k6} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xae,0x10,0xfc] + vpmovuswb %ymm23, %xmm28 {%k6} {z} + +// CHECK: vpmovuswb %xmm25, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x09] + vpmovuswb %xmm25, (%rcx) + +// CHECK: vpmovuswb %xmm25, (%rcx) {%k3} +// CHECK: encoding: [0x62,0x62,0x7e,0x0b,0x10,0x09] + vpmovuswb %xmm25, (%rcx) {%k3} + +// CHECK: vpmovuswb %xmm25, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x10,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovuswb %xmm25, 291(%rax,%r14,8) + +// CHECK: vpmovuswb %xmm25, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x4a,0x7f] + vpmovuswb %xmm25, 1016(%rdx) + +// CHECK: vpmovuswb %xmm25, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x8a,0x00,0x04,0x00,0x00] + vpmovuswb %xmm25, 1024(%rdx) + +// CHECK: vpmovuswb %xmm25, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x4a,0x80] + vpmovuswb %xmm25, -1024(%rdx) + +// CHECK: vpmovuswb %xmm25, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x8a,0xf8,0xfb,0xff,0xff] + vpmovuswb %xmm25, -1032(%rdx) + +// CHECK: vpmovuswb %ymm28, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0x21] + 
vpmovuswb %ymm28, (%rcx) + +// CHECK: vpmovuswb %ymm28, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x2a,0x10,0x21] + vpmovuswb %ymm28, (%rcx) {%k2} + +// CHECK: vpmovuswb %ymm28, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x10,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovuswb %ymm28, 291(%rax,%r14,8) + +// CHECK: vpmovuswb %ymm28, 2032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0x62,0x7f] + vpmovuswb %ymm28, 2032(%rdx) + +// CHECK: vpmovuswb %ymm28, 2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0xa2,0x00,0x08,0x00,0x00] + vpmovuswb %ymm28, 2048(%rdx) + +// CHECK: vpmovuswb %ymm28, -2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0x62,0x80] + vpmovuswb %ymm28, -2048(%rdx) + +// CHECK: vpmovuswb %ymm28, -2064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0xa2,0xf0,0xf7,0xff,0xff] + vpmovuswb %ymm28, -2064(%rdx) + // CHECK: vpmulhuw %xmm18, %xmm21, %xmm24 // CHECK: encoding: [0x62,0x21,0x55,0x00,0xe4,0xc2] vpmulhuw %xmm18, %xmm21, %xmm24 Index: test/MC/X86/x86-64-avx512f_vl.s =================================================================== --- test/MC/X86/x86-64-avx512f_vl.s +++ test/MC/X86/x86-64-avx512f_vl.s @@ -16285,6 +16285,1206 @@ // CHECK: encoding: [0x62,0x62,0x4d,0x30,0x2c,0x8a,0xfc,0xfd,0xff,0xff] vscalefps -516(%rdx){1to8}, %ymm22, %ymm25 +// CHECK: vpmovqb %xmm29, %xmm24 +// CHECK: encoding: [0x62,0x02,0x7e,0x08,0x32,0xe8] + vpmovqb %xmm29, %xmm24 + +// CHECK: vpmovqb %xmm29, %xmm24 {%k4} +// CHECK: encoding: [0x62,0x02,0x7e,0x0c,0x32,0xe8] + vpmovqb %xmm29, %xmm24 {%k4} + +// CHECK: vpmovqb %xmm29, %xmm24 {%k4} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0x8c,0x32,0xe8] + vpmovqb %xmm29, %xmm24 {%k4} {z} + +// CHECK: vpmovqb %ymm29, %xmm17 +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x32,0xe9] + vpmovqb %ymm29, %xmm17 + +// CHECK: vpmovqb %ymm29, %xmm17 {%k3} +// CHECK: encoding: [0x62,0x22,0x7e,0x2b,0x32,0xe9] + vpmovqb %ymm29, %xmm17 {%k3} + +// CHECK: vpmovqb %ymm29, %xmm17 {%k3} {z} +// CHECK: 
encoding: [0x62,0x22,0x7e,0xab,0x32,0xe9] + vpmovqb %ymm29, %xmm17 {%k3} {z} + +// CHECK: vpmovqb %xmm27, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x19] + vpmovqb %xmm27, (%rcx) + +// CHECK: vpmovqb %xmm27, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x0a,0x32,0x19] + vpmovqb %xmm27, (%rcx) {%k2} + +// CHECK: vpmovqb %xmm27, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x32,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmovqb %xmm27, 291(%rax,%r14,8) + +// CHECK: vpmovqb %xmm27, 254(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x5a,0x7f] + vpmovqb %xmm27, 254(%rdx) + +// CHECK: vpmovqb %xmm27, 256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x9a,0x00,0x01,0x00,0x00] + vpmovqb %xmm27, 256(%rdx) + +// CHECK: vpmovqb %xmm27, -256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x5a,0x80] + vpmovqb %xmm27, -256(%rdx) + +// CHECK: vpmovqb %xmm27, -258(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x9a,0xfe,0xfe,0xff,0xff] + vpmovqb %xmm27, -258(%rdx) + +// CHECK: vpmovqb %ymm28, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0x21] + vpmovqb %ymm28, (%rcx) + +// CHECK: vpmovqb %ymm28, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x2f,0x32,0x21] + vpmovqb %ymm28, (%rcx) {%k7} + +// CHECK: vpmovqb %ymm28, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x32,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovqb %ymm28, 291(%rax,%r14,8) + +// CHECK: vpmovqb %ymm28, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0x62,0x7f] + vpmovqb %ymm28, 508(%rdx) + +// CHECK: vpmovqb %ymm28, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0xa2,0x00,0x02,0x00,0x00] + vpmovqb %ymm28, 512(%rdx) + +// CHECK: vpmovqb %ymm28, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0x62,0x80] + vpmovqb %ymm28, -512(%rdx) + +// CHECK: vpmovqb %ymm28, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0xa2,0xfc,0xfd,0xff,0xff] + vpmovqb %ymm28, -516(%rdx) + +// CHECK: vpmovsqb %xmm19, %xmm26 +// CHECK: 
encoding: [0x62,0x82,0x7e,0x08,0x22,0xda] + vpmovsqb %xmm19, %xmm26 + +// CHECK: vpmovsqb %xmm19, %xmm26 {%k1} +// CHECK: encoding: [0x62,0x82,0x7e,0x09,0x22,0xda] + vpmovsqb %xmm19, %xmm26 {%k1} + +// CHECK: vpmovsqb %xmm19, %xmm26 {%k1} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x89,0x22,0xda] + vpmovsqb %xmm19, %xmm26 {%k1} {z} + +// CHECK: vpmovsqb %ymm20, %xmm20 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x22,0xe4] + vpmovsqb %ymm20, %xmm20 + +// CHECK: vpmovsqb %ymm20, %xmm20 {%k6} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2e,0x22,0xe4] + vpmovsqb %ymm20, %xmm20 {%k6} + +// CHECK: vpmovsqb %ymm20, %xmm20 {%k6} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xae,0x22,0xe4] + vpmovsqb %ymm20, %xmm20 {%k6} {z} + +// CHECK: vpmovsqb %xmm25, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x09] + vpmovsqb %xmm25, (%rcx) + +// CHECK: vpmovsqb %xmm25, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x0f,0x22,0x09] + vpmovsqb %xmm25, (%rcx) {%k7} + +// CHECK: vpmovsqb %xmm25, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x22,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovsqb %xmm25, 291(%rax,%r14,8) + +// CHECK: vpmovsqb %xmm25, 254(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x4a,0x7f] + vpmovsqb %xmm25, 254(%rdx) + +// CHECK: vpmovsqb %xmm25, 256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x8a,0x00,0x01,0x00,0x00] + vpmovsqb %xmm25, 256(%rdx) + +// CHECK: vpmovsqb %xmm25, -256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x4a,0x80] + vpmovsqb %xmm25, -256(%rdx) + +// CHECK: vpmovsqb %xmm25, -258(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x8a,0xfe,0xfe,0xff,0xff] + vpmovsqb %xmm25, -258(%rdx) + +// CHECK: vpmovsqb %ymm17, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x09] + vpmovsqb %ymm17, (%rcx) + +// CHECK: vpmovsqb %ymm17, (%rcx) {%k4} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2c,0x22,0x09] + vpmovsqb %ymm17, (%rcx) {%k4} + +// CHECK: vpmovsqb %ymm17, 291(%rax,%r14,8) +// CHECK: encoding: 
[0x62,0xa2,0x7e,0x28,0x22,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovsqb %ymm17, 291(%rax,%r14,8) + +// CHECK: vpmovsqb %ymm17, 508(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x4a,0x7f] + vpmovsqb %ymm17, 508(%rdx) + +// CHECK: vpmovsqb %ymm17, 512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x8a,0x00,0x02,0x00,0x00] + vpmovsqb %ymm17, 512(%rdx) + +// CHECK: vpmovsqb %ymm17, -512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x4a,0x80] + vpmovsqb %ymm17, -512(%rdx) + +// CHECK: vpmovsqb %ymm17, -516(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x8a,0xfc,0xfd,0xff,0xff] + vpmovsqb %ymm17, -516(%rdx) + +// CHECK: vpmovusqb %xmm22, %xmm28 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x12,0xf4] + vpmovusqb %xmm22, %xmm28 + +// CHECK: vpmovusqb %xmm22, %xmm28 {%k2} +// CHECK: encoding: [0x62,0x82,0x7e,0x0a,0x12,0xf4] + vpmovusqb %xmm22, %xmm28 {%k2} + +// CHECK: vpmovusqb %xmm22, %xmm28 {%k2} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8a,0x12,0xf4] + vpmovusqb %xmm22, %xmm28 {%k2} {z} + +// CHECK: vpmovusqb %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x12,0xfe] + vpmovusqb %ymm23, %xmm22 + +// CHECK: vpmovusqb %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2f,0x12,0xfe] + vpmovusqb %ymm23, %xmm22 {%k7} + +// CHECK: vpmovusqb %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xaf,0x12,0xfe] + vpmovusqb %ymm23, %xmm22 {%k7} {z} + +// CHECK: vpmovusqb %xmm26, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x11] + vpmovusqb %xmm26, (%rcx) + +// CHECK: vpmovusqb %xmm26, (%rcx) {%k5} +// CHECK: encoding: [0x62,0x62,0x7e,0x0d,0x12,0x11] + vpmovusqb %xmm26, (%rcx) {%k5} + +// CHECK: vpmovusqb %xmm26, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x12,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusqb %xmm26, 291(%rax,%r14,8) + +// CHECK: vpmovusqb %xmm26, 254(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x52,0x7f] + vpmovusqb %xmm26, 254(%rdx) + +// CHECK: vpmovusqb %xmm26, 256(%rdx) +// CHECK: 
encoding: [0x62,0x62,0x7e,0x08,0x12,0x92,0x00,0x01,0x00,0x00] + vpmovusqb %xmm26, 256(%rdx) + +// CHECK: vpmovusqb %xmm26, -256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x52,0x80] + vpmovusqb %xmm26, -256(%rdx) + +// CHECK: vpmovusqb %xmm26, -258(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x92,0xfe,0xfe,0xff,0xff] + vpmovusqb %xmm26, -258(%rdx) + +// CHECK: vpmovusqb %ymm30, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0x31] + vpmovusqb %ymm30, (%rcx) + +// CHECK: vpmovusqb %ymm30, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x2a,0x12,0x31] + vpmovusqb %ymm30, (%rcx) {%k2} + +// CHECK: vpmovusqb %ymm30, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x12,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovusqb %ymm30, 291(%rax,%r14,8) + +// CHECK: vpmovusqb %ymm30, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0x72,0x7f] + vpmovusqb %ymm30, 508(%rdx) + +// CHECK: vpmovusqb %ymm30, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0xb2,0x00,0x02,0x00,0x00] + vpmovusqb %ymm30, 512(%rdx) + +// CHECK: vpmovusqb %ymm30, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0x72,0x80] + vpmovusqb %ymm30, -512(%rdx) + +// CHECK: vpmovusqb %ymm30, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0xb2,0xfc,0xfd,0xff,0xff] + vpmovusqb %ymm30, -516(%rdx) + +// CHECK: vpmovqw %xmm18, %xmm19 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x34,0xd3] + vpmovqw %xmm18, %xmm19 + +// CHECK: vpmovqw %xmm18, %xmm19 {%k4} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0c,0x34,0xd3] + vpmovqw %xmm18, %xmm19 {%k4} + +// CHECK: vpmovqw %xmm18, %xmm19 {%k4} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8c,0x34,0xd3] + vpmovqw %xmm18, %xmm19 {%k4} {z} + +// CHECK: vpmovqw %ymm22, %xmm19 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x34,0xf3] + vpmovqw %ymm22, %xmm19 + +// CHECK: vpmovqw %ymm22, %xmm19 {%k5} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2d,0x34,0xf3] + vpmovqw %ymm22, %xmm19 {%k5} + +// CHECK: vpmovqw %ymm22, %xmm19 {%k5} {z} +// 
CHECK: encoding: [0x62,0xa2,0x7e,0xad,0x34,0xf3] + vpmovqw %ymm22, %xmm19 {%k5} {z} + +// CHECK: vpmovqw %xmm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0x29] + vpmovqw %xmm21, (%rcx) + +// CHECK: vpmovqw %xmm21, (%rcx) {%k3} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0b,0x34,0x29] + vpmovqw %xmm21, (%rcx) {%k3} + +// CHECK: vpmovqw %xmm21, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x34,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovqw %xmm21, 291(%rax,%r14,8) + +// CHECK: vpmovqw %xmm21, 508(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0x6a,0x7f] + vpmovqw %xmm21, 508(%rdx) + +// CHECK: vpmovqw %xmm21, 512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0xaa,0x00,0x02,0x00,0x00] + vpmovqw %xmm21, 512(%rdx) + +// CHECK: vpmovqw %xmm21, -512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0x6a,0x80] + vpmovqw %xmm21, -512(%rdx) + +// CHECK: vpmovqw %xmm21, -516(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0xaa,0xfc,0xfd,0xff,0xff] + vpmovqw %xmm21, -516(%rdx) + +// CHECK: vpmovqw %ymm28, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0x21] + vpmovqw %ymm28, (%rcx) + +// CHECK: vpmovqw %ymm28, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x2e,0x34,0x21] + vpmovqw %ymm28, (%rcx) {%k6} + +// CHECK: vpmovqw %ymm28, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x34,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovqw %ymm28, 291(%rax,%r14,8) + +// CHECK: vpmovqw %ymm28, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0x62,0x7f] + vpmovqw %ymm28, 1016(%rdx) + +// CHECK: vpmovqw %ymm28, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0xa2,0x00,0x04,0x00,0x00] + vpmovqw %ymm28, 1024(%rdx) + +// CHECK: vpmovqw %ymm28, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0x62,0x80] + vpmovqw %ymm28, -1024(%rdx) + +// CHECK: vpmovqw %ymm28, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0xa2,0xf8,0xfb,0xff,0xff] + vpmovqw %ymm28, -1032(%rdx) + +// CHECK: vpmovsqw %xmm18, %xmm26 +// 
CHECK: encoding: [0x62,0x82,0x7e,0x08,0x24,0xd2] + vpmovsqw %xmm18, %xmm26 + +// CHECK: vpmovsqw %xmm18, %xmm26 {%k7} +// CHECK: encoding: [0x62,0x82,0x7e,0x0f,0x24,0xd2] + vpmovsqw %xmm18, %xmm26 {%k7} + +// CHECK: vpmovsqw %xmm18, %xmm26 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8f,0x24,0xd2] + vpmovsqw %xmm18, %xmm26 {%k7} {z} + +// CHECK: vpmovsqw %ymm20, %xmm28 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x24,0xe4] + vpmovsqw %ymm20, %xmm28 + +// CHECK: vpmovsqw %ymm20, %xmm28 {%k4} +// CHECK: encoding: [0x62,0x82,0x7e,0x2c,0x24,0xe4] + vpmovsqw %ymm20, %xmm28 {%k4} + +// CHECK: vpmovsqw %ymm20, %xmm28 {%k4} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xac,0x24,0xe4] + vpmovsqw %ymm20, %xmm28 {%k4} {z} + +// CHECK: vpmovsqw %xmm30, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0x31] + vpmovsqw %xmm30, (%rcx) + +// CHECK: vpmovsqw %xmm30, (%rcx) {%k4} +// CHECK: encoding: [0x62,0x62,0x7e,0x0c,0x24,0x31] + vpmovsqw %xmm30, (%rcx) {%k4} + +// CHECK: vpmovsqw %xmm30, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x24,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovsqw %xmm30, 291(%rax,%r14,8) + +// CHECK: vpmovsqw %xmm30, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0x72,0x7f] + vpmovsqw %xmm30, 508(%rdx) + +// CHECK: vpmovsqw %xmm30, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0xb2,0x00,0x02,0x00,0x00] + vpmovsqw %xmm30, 512(%rdx) + +// CHECK: vpmovsqw %xmm30, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0x72,0x80] + vpmovsqw %xmm30, -512(%rdx) + +// CHECK: vpmovsqw %xmm30, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0xb2,0xfc,0xfd,0xff,0xff] + vpmovsqw %xmm30, -516(%rdx) + +// CHECK: vpmovsqw %ymm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0x29] + vpmovsqw %ymm21, (%rcx) + +// CHECK: vpmovsqw %ymm21, (%rcx) {%k5} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2d,0x24,0x29] + vpmovsqw %ymm21, (%rcx) {%k5} + +// CHECK: vpmovsqw %ymm21, 291(%rax,%r14,8) +// CHECK: encoding: 
[0x62,0xa2,0x7e,0x28,0x24,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovsqw %ymm21, 291(%rax,%r14,8) + +// CHECK: vpmovsqw %ymm21, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0x6a,0x7f] + vpmovsqw %ymm21, 1016(%rdx) + +// CHECK: vpmovsqw %ymm21, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0xaa,0x00,0x04,0x00,0x00] + vpmovsqw %ymm21, 1024(%rdx) + +// CHECK: vpmovsqw %ymm21, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0x6a,0x80] + vpmovsqw %ymm21, -1024(%rdx) + +// CHECK: vpmovsqw %ymm21, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0xaa,0xf8,0xfb,0xff,0xff] + vpmovsqw %ymm21, -1032(%rdx) + +// CHECK: vpmovusqw %xmm20, %xmm29 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x14,0xe5] + vpmovusqw %xmm20, %xmm29 + +// CHECK: vpmovusqw %xmm20, %xmm29 {%k1} +// CHECK: encoding: [0x62,0x82,0x7e,0x09,0x14,0xe5] + vpmovusqw %xmm20, %xmm29 {%k1} + +// CHECK: vpmovusqw %xmm20, %xmm29 {%k1} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x89,0x14,0xe5] + vpmovusqw %xmm20, %xmm29 {%k1} {z} + +// CHECK: vpmovusqw %ymm21, %xmm20 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x14,0xec] + vpmovusqw %ymm21, %xmm20 + +// CHECK: vpmovusqw %ymm21, %xmm20 {%k5} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2d,0x14,0xec] + vpmovusqw %ymm21, %xmm20 {%k5} + +// CHECK: vpmovusqw %ymm21, %xmm20 {%k5} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xad,0x14,0xec] + vpmovusqw %ymm21, %xmm20 {%k5} {z} + +// CHECK: vpmovusqw %xmm18, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x11] + vpmovusqw %xmm18, (%rcx) + +// CHECK: vpmovusqw %xmm18, (%rcx) {%k1} +// CHECK: encoding: [0x62,0xe2,0x7e,0x09,0x14,0x11] + vpmovusqw %xmm18, (%rcx) {%k1} + +// CHECK: vpmovusqw %xmm18, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x14,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusqw %xmm18, 291(%rax,%r14,8) + +// CHECK: vpmovusqw %xmm18, 508(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x52,0x7f] + vpmovusqw %xmm18, 508(%rdx) + +// CHECK: vpmovusqw %xmm18, 512(%rdx) +// 
CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x92,0x00,0x02,0x00,0x00] + vpmovusqw %xmm18, 512(%rdx) + +// CHECK: vpmovusqw %xmm18, -512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x52,0x80] + vpmovusqw %xmm18, -512(%rdx) + +// CHECK: vpmovusqw %xmm18, -516(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x92,0xfc,0xfd,0xff,0xff] + vpmovusqw %xmm18, -516(%rdx) + +// CHECK: vpmovusqw %ymm18, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x11] + vpmovusqw %ymm18, (%rcx) + +// CHECK: vpmovusqw %ymm18, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2a,0x14,0x11] + vpmovusqw %ymm18, (%rcx) {%k2} + +// CHECK: vpmovusqw %ymm18, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x14,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusqw %ymm18, 291(%rax,%r14,8) + +// CHECK: vpmovusqw %ymm18, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x52,0x7f] + vpmovusqw %ymm18, 1016(%rdx) + +// CHECK: vpmovusqw %ymm18, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x92,0x00,0x04,0x00,0x00] + vpmovusqw %ymm18, 1024(%rdx) + +// CHECK: vpmovusqw %ymm18, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x52,0x80] + vpmovusqw %ymm18, -1024(%rdx) + +// CHECK: vpmovusqw %ymm18, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x92,0xf8,0xfb,0xff,0xff] + vpmovusqw %ymm18, -1032(%rdx) + +// CHECK: vpmovqd %xmm25, %xmm21 +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x35,0xcd] + vpmovqd %xmm25, %xmm21 + +// CHECK: vpmovqd %xmm25, %xmm21 {%k5} +// CHECK: encoding: [0x62,0x22,0x7e,0x0d,0x35,0xcd] + vpmovqd %xmm25, %xmm21 {%k5} + +// CHECK: vpmovqd %xmm25, %xmm21 {%k5} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0x8d,0x35,0xcd] + vpmovqd %xmm25, %xmm21 {%k5} {z} + +// CHECK: vpmovqd %ymm22, %xmm21 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x35,0xf5] + vpmovqd %ymm22, %xmm21 + +// CHECK: vpmovqd %ymm22, %xmm21 {%k6} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2e,0x35,0xf5] + vpmovqd %ymm22, %xmm21 {%k6} + +// CHECK: vpmovqd %ymm22, %xmm21 
{%k6} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xae,0x35,0xf5] + vpmovqd %ymm22, %xmm21 {%k6} {z} + +// CHECK: vpmovqd %xmm29, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0x29] + vpmovqd %xmm29, (%rcx) + +// CHECK: vpmovqd %xmm29, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x0e,0x35,0x29] + vpmovqd %xmm29, (%rcx) {%k6} + +// CHECK: vpmovqd %xmm29, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x35,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovqd %xmm29, 291(%rax,%r14,8) + +// CHECK: vpmovqd %xmm29, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0x6a,0x7f] + vpmovqd %xmm29, 1016(%rdx) + +// CHECK: vpmovqd %xmm29, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0xaa,0x00,0x04,0x00,0x00] + vpmovqd %xmm29, 1024(%rdx) + +// CHECK: vpmovqd %xmm29, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0x6a,0x80] + vpmovqd %xmm29, -1024(%rdx) + +// CHECK: vpmovqd %xmm29, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0xaa,0xf8,0xfb,0xff,0xff] + vpmovqd %xmm29, -1032(%rdx) + +// CHECK: vpmovqd %ymm30, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0x31] + vpmovqd %ymm30, (%rcx) + +// CHECK: vpmovqd %ymm30, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x2a,0x35,0x31] + vpmovqd %ymm30, (%rcx) {%k2} + +// CHECK: vpmovqd %ymm30, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x35,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovqd %ymm30, 291(%rax,%r14,8) + +// CHECK: vpmovqd %ymm30, 2032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0x72,0x7f] + vpmovqd %ymm30, 2032(%rdx) + +// CHECK: vpmovqd %ymm30, 2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0xb2,0x00,0x08,0x00,0x00] + vpmovqd %ymm30, 2048(%rdx) + +// CHECK: vpmovqd %ymm30, -2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0x72,0x80] + vpmovqd %ymm30, -2048(%rdx) + +// CHECK: vpmovqd %ymm30, -2064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0xb2,0xf0,0xf7,0xff,0xff] + vpmovqd %ymm30, -2064(%rdx) + +// CHECK: 
vpmovsqd %xmm21, %xmm21 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x25,0xed] + vpmovsqd %xmm21, %xmm21 + +// CHECK: vpmovsqd %xmm21, %xmm21 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0a,0x25,0xed] + vpmovsqd %xmm21, %xmm21 {%k2} + +// CHECK: vpmovsqd %xmm21, %xmm21 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8a,0x25,0xed] + vpmovsqd %xmm21, %xmm21 {%k2} {z} + +// CHECK: vpmovsqd %ymm29, %xmm29 +// CHECK: encoding: [0x62,0x02,0x7e,0x28,0x25,0xed] + vpmovsqd %ymm29, %xmm29 + +// CHECK: vpmovsqd %ymm29, %xmm29 {%k4} +// CHECK: encoding: [0x62,0x02,0x7e,0x2c,0x25,0xed] + vpmovsqd %ymm29, %xmm29 {%k4} + +// CHECK: vpmovsqd %ymm29, %xmm29 {%k4} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0xac,0x25,0xed] + vpmovsqd %ymm29, %xmm29 {%k4} {z} + +// CHECK: vpmovsqd %xmm17, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x09] + vpmovsqd %xmm17, (%rcx) + +// CHECK: vpmovsqd %xmm17, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0a,0x25,0x09] + vpmovsqd %xmm17, (%rcx) {%k2} + +// CHECK: vpmovsqd %xmm17, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x25,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovsqd %xmm17, 291(%rax,%r14,8) + +// CHECK: vpmovsqd %xmm17, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x4a,0x7f] + vpmovsqd %xmm17, 1016(%rdx) + +// CHECK: vpmovsqd %xmm17, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x8a,0x00,0x04,0x00,0x00] + vpmovsqd %xmm17, 1024(%rdx) + +// CHECK: vpmovsqd %xmm17, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x4a,0x80] + vpmovsqd %xmm17, -1024(%rdx) + +// CHECK: vpmovsqd %xmm17, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x8a,0xf8,0xfb,0xff,0xff] + vpmovsqd %xmm17, -1032(%rdx) + +// CHECK: vpmovsqd %ymm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0x39] + vpmovsqd %ymm23, (%rcx) + +// CHECK: vpmovsqd %ymm23, (%rcx) {%k5} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2d,0x25,0x39] + vpmovsqd %ymm23, (%rcx) {%k5} + +// CHECK: vpmovsqd %ymm23, 291(%rax,%r14,8) +// 
CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x25,0xbc,0xf0,0x23,0x01,0x00,0x00] + vpmovsqd %ymm23, 291(%rax,%r14,8) + +// CHECK: vpmovsqd %ymm23, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0x7a,0x7f] + vpmovsqd %ymm23, 2032(%rdx) + +// CHECK: vpmovsqd %ymm23, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0xba,0x00,0x08,0x00,0x00] + vpmovsqd %ymm23, 2048(%rdx) + +// CHECK: vpmovsqd %ymm23, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0x7a,0x80] + vpmovsqd %ymm23, -2048(%rdx) + +// CHECK: vpmovsqd %ymm23, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0xba,0xf0,0xf7,0xff,0xff] + vpmovsqd %ymm23, -2064(%rdx) + +// CHECK: vpmovusqd %xmm21, %xmm25 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x15,0xe9] + vpmovusqd %xmm21, %xmm25 + +// CHECK: vpmovusqd %xmm21, %xmm25 {%k5} +// CHECK: encoding: [0x62,0x82,0x7e,0x0d,0x15,0xe9] + vpmovusqd %xmm21, %xmm25 {%k5} + +// CHECK: vpmovusqd %xmm21, %xmm25 {%k5} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8d,0x15,0xe9] + vpmovusqd %xmm21, %xmm25 {%k5} {z} + +// CHECK: vpmovusqd %ymm21, %xmm20 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x15,0xec] + vpmovusqd %ymm21, %xmm20 + +// CHECK: vpmovusqd %ymm21, %xmm20 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2a,0x15,0xec] + vpmovusqd %ymm21, %xmm20 {%k2} + +// CHECK: vpmovusqd %ymm21, %xmm20 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xaa,0x15,0xec] + vpmovusqd %ymm21, %xmm20 {%k2} {z} + +// CHECK: vpmovusqd %xmm18, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x11] + vpmovusqd %xmm18, (%rcx) + +// CHECK: vpmovusqd %xmm18, (%rcx) {%k1} +// CHECK: encoding: [0x62,0xe2,0x7e,0x09,0x15,0x11] + vpmovusqd %xmm18, (%rcx) {%k1} + +// CHECK: vpmovusqd %xmm18, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x15,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusqd %xmm18, 291(%rax,%r14,8) + +// CHECK: vpmovusqd %xmm18, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x52,0x7f] + vpmovusqd %xmm18, 1016(%rdx) + +// CHECK: vpmovusqd 
%xmm18, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x92,0x00,0x04,0x00,0x00] + vpmovusqd %xmm18, 1024(%rdx) + +// CHECK: vpmovusqd %xmm18, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x52,0x80] + vpmovusqd %xmm18, -1024(%rdx) + +// CHECK: vpmovusqd %xmm18, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x92,0xf8,0xfb,0xff,0xff] + vpmovusqd %xmm18, -1032(%rdx) + +// CHECK: vpmovusqd %ymm29, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0x29] + vpmovusqd %ymm29, (%rcx) + +// CHECK: vpmovusqd %ymm29, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x2e,0x15,0x29] + vpmovusqd %ymm29, (%rcx) {%k6} + +// CHECK: vpmovusqd %ymm29, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x15,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovusqd %ymm29, 291(%rax,%r14,8) + +// CHECK: vpmovusqd %ymm29, 2032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0x6a,0x7f] + vpmovusqd %ymm29, 2032(%rdx) + +// CHECK: vpmovusqd %ymm29, 2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0xaa,0x00,0x08,0x00,0x00] + vpmovusqd %ymm29, 2048(%rdx) + +// CHECK: vpmovusqd %ymm29, -2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0x6a,0x80] + vpmovusqd %ymm29, -2048(%rdx) + +// CHECK: vpmovusqd %ymm29, -2064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0xaa,0xf0,0xf7,0xff,0xff] + vpmovusqd %ymm29, -2064(%rdx) + +// CHECK: vpmovdb %xmm21, %xmm30 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x31,0xee] + vpmovdb %xmm21, %xmm30 + +// CHECK: vpmovdb %xmm21, %xmm30 {%k3} +// CHECK: encoding: [0x62,0x82,0x7e,0x0b,0x31,0xee] + vpmovdb %xmm21, %xmm30 {%k3} + +// CHECK: vpmovdb %xmm21, %xmm30 {%k3} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8b,0x31,0xee] + vpmovdb %xmm21, %xmm30 {%k3} {z} + +// CHECK: vpmovdb %ymm21, %xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x31,0xef] + vpmovdb %ymm21, %xmm23 + +// CHECK: vpmovdb %ymm21, %xmm23 {%k4} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2c,0x31,0xef] + vpmovdb %ymm21, %xmm23 {%k4} + +// CHECK: 
vpmovdb %ymm21, %xmm23 {%k4} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xac,0x31,0xef] + vpmovdb %ymm21, %xmm23 {%k4} {z} + +// CHECK: vpmovdb %xmm29, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0x29] + vpmovdb %xmm29, (%rcx) + +// CHECK: vpmovdb %xmm29, (%rcx) {%k3} +// CHECK: encoding: [0x62,0x62,0x7e,0x0b,0x31,0x29] + vpmovdb %xmm29, (%rcx) {%k3} + +// CHECK: vpmovdb %xmm29, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x31,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovdb %xmm29, 291(%rax,%r14,8) + +// CHECK: vpmovdb %xmm29, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0x6a,0x7f] + vpmovdb %xmm29, 508(%rdx) + +// CHECK: vpmovdb %xmm29, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0xaa,0x00,0x02,0x00,0x00] + vpmovdb %xmm29, 512(%rdx) + +// CHECK: vpmovdb %xmm29, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0x6a,0x80] + vpmovdb %xmm29, -512(%rdx) + +// CHECK: vpmovdb %xmm29, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0xaa,0xfc,0xfd,0xff,0xff] + vpmovdb %xmm29, -516(%rdx) + +// CHECK: vpmovdb %ymm26, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x11] + vpmovdb %ymm26, (%rcx) + +// CHECK: vpmovdb %ymm26, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x2e,0x31,0x11] + vpmovdb %ymm26, (%rcx) {%k6} + +// CHECK: vpmovdb %ymm26, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x31,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovdb %ymm26, 291(%rax,%r14,8) + +// CHECK: vpmovdb %ymm26, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x52,0x7f] + vpmovdb %ymm26, 1016(%rdx) + +// CHECK: vpmovdb %ymm26, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x92,0x00,0x04,0x00,0x00] + vpmovdb %ymm26, 1024(%rdx) + +// CHECK: vpmovdb %ymm26, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x52,0x80] + vpmovdb %ymm26, -1024(%rdx) + +// CHECK: vpmovdb %ymm26, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x92,0xf8,0xfb,0xff,0xff] + vpmovdb %ymm26, -1032(%rdx) + 
+// CHECK: vpmovsdb %xmm27, %xmm30 +// CHECK: encoding: [0x62,0x02,0x7e,0x08,0x21,0xde] + vpmovsdb %xmm27, %xmm30 + +// CHECK: vpmovsdb %xmm27, %xmm30 {%k1} +// CHECK: encoding: [0x62,0x02,0x7e,0x09,0x21,0xde] + vpmovsdb %xmm27, %xmm30 {%k1} + +// CHECK: vpmovsdb %xmm27, %xmm30 {%k1} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0x89,0x21,0xde] + vpmovsdb %xmm27, %xmm30 {%k1} {z} + +// CHECK: vpmovsdb %ymm27, %xmm26 +// CHECK: encoding: [0x62,0x02,0x7e,0x28,0x21,0xda] + vpmovsdb %ymm27, %xmm26 + +// CHECK: vpmovsdb %ymm27, %xmm26 {%k3} +// CHECK: encoding: [0x62,0x02,0x7e,0x2b,0x21,0xda] + vpmovsdb %ymm27, %xmm26 {%k3} + +// CHECK: vpmovsdb %ymm27, %xmm26 {%k3} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0xab,0x21,0xda] + vpmovsdb %ymm27, %xmm26 {%k3} {z} + +// CHECK: vpmovsdb %xmm30, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0x31] + vpmovsdb %xmm30, (%rcx) + +// CHECK: vpmovsdb %xmm30, (%rcx) {%k3} +// CHECK: encoding: [0x62,0x62,0x7e,0x0b,0x21,0x31] + vpmovsdb %xmm30, (%rcx) {%k3} + +// CHECK: vpmovsdb %xmm30, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x21,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovsdb %xmm30, 291(%rax,%r14,8) + +// CHECK: vpmovsdb %xmm30, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0x72,0x7f] + vpmovsdb %xmm30, 508(%rdx) + +// CHECK: vpmovsdb %xmm30, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0xb2,0x00,0x02,0x00,0x00] + vpmovsdb %xmm30, 512(%rdx) + +// CHECK: vpmovsdb %xmm30, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0x72,0x80] + vpmovsdb %xmm30, -512(%rdx) + +// CHECK: vpmovsdb %xmm30, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0xb2,0xfc,0xfd,0xff,0xff] + vpmovsdb %xmm30, -516(%rdx) + +// CHECK: vpmovsdb %ymm25, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x09] + vpmovsdb %ymm25, (%rcx) + +// CHECK: vpmovsdb %ymm25, (%rcx) {%k5} +// CHECK: encoding: [0x62,0x62,0x7e,0x2d,0x21,0x09] + vpmovsdb %ymm25, (%rcx) {%k5} + +// CHECK: vpmovsdb %ymm25, 291(%rax,%r14,8) +// 
CHECK: encoding: [0x62,0x22,0x7e,0x28,0x21,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovsdb %ymm25, 291(%rax,%r14,8) + +// CHECK: vpmovsdb %ymm25, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x4a,0x7f] + vpmovsdb %ymm25, 1016(%rdx) + +// CHECK: vpmovsdb %ymm25, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x8a,0x00,0x04,0x00,0x00] + vpmovsdb %ymm25, 1024(%rdx) + +// CHECK: vpmovsdb %ymm25, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x4a,0x80] + vpmovsdb %ymm25, -1024(%rdx) + +// CHECK: vpmovsdb %ymm25, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x8a,0xf8,0xfb,0xff,0xff] + vpmovsdb %ymm25, -1032(%rdx) + +// CHECK: vpmovusdb %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x02,0x7e,0x08,0x11,0xee] + vpmovusdb %xmm29, %xmm30 + +// CHECK: vpmovusdb %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x02,0x7e,0x0f,0x11,0xee] + vpmovusdb %xmm29, %xmm30 {%k7} + +// CHECK: vpmovusdb %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0x8f,0x11,0xee] + vpmovusdb %xmm29, %xmm30 {%k7} {z} + +// CHECK: vpmovusdb %ymm17, %xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x11,0xcf] + vpmovusdb %ymm17, %xmm23 + +// CHECK: vpmovusdb %ymm17, %xmm23 {%k6} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2e,0x11,0xcf] + vpmovusdb %ymm17, %xmm23 {%k6} + +// CHECK: vpmovusdb %ymm17, %xmm23 {%k6} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xae,0x11,0xcf] + vpmovusdb %ymm17, %xmm23 {%k6} {z} + +// CHECK: vpmovusdb %xmm26, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x11] + vpmovusdb %xmm26, (%rcx) + +// CHECK: vpmovusdb %xmm26, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x0f,0x11,0x11] + vpmovusdb %xmm26, (%rcx) {%k7} + +// CHECK: vpmovusdb %xmm26, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x11,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusdb %xmm26, 291(%rax,%r14,8) + +// CHECK: vpmovusdb %xmm26, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x52,0x7f] + vpmovusdb %xmm26, 508(%rdx) + +// CHECK: vpmovusdb 
%xmm26, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x92,0x00,0x02,0x00,0x00] + vpmovusdb %xmm26, 512(%rdx) + +// CHECK: vpmovusdb %xmm26, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x52,0x80] + vpmovusdb %xmm26, -512(%rdx) + +// CHECK: vpmovusdb %xmm26, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x92,0xfc,0xfd,0xff,0xff] + vpmovusdb %xmm26, -516(%rdx) + +// CHECK: vpmovusdb %ymm25, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x09] + vpmovusdb %ymm25, (%rcx) + +// CHECK: vpmovusdb %ymm25, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x2e,0x11,0x09] + vpmovusdb %ymm25, (%rcx) {%k6} + +// CHECK: vpmovusdb %ymm25, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x11,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovusdb %ymm25, 291(%rax,%r14,8) + +// CHECK: vpmovusdb %ymm25, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x4a,0x7f] + vpmovusdb %ymm25, 1016(%rdx) + +// CHECK: vpmovusdb %ymm25, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x8a,0x00,0x04,0x00,0x00] + vpmovusdb %ymm25, 1024(%rdx) + +// CHECK: vpmovusdb %ymm25, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x4a,0x80] + vpmovusdb %ymm25, -1024(%rdx) + +// CHECK: vpmovusdb %ymm25, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x8a,0xf8,0xfb,0xff,0xff] + vpmovusdb %ymm25, -1032(%rdx) + +// CHECK: vpmovdw %xmm25, %xmm17 +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x33,0xc9] + vpmovdw %xmm25, %xmm17 + +// CHECK: vpmovdw %xmm25, %xmm17 {%k5} +// CHECK: encoding: [0x62,0x22,0x7e,0x0d,0x33,0xc9] + vpmovdw %xmm25, %xmm17 {%k5} + +// CHECK: vpmovdw %xmm25, %xmm17 {%k5} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0x8d,0x33,0xc9] + vpmovdw %xmm25, %xmm17 {%k5} {z} + +// CHECK: vpmovdw %ymm19, %xmm25 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x33,0xd9] + vpmovdw %ymm19, %xmm25 + +// CHECK: vpmovdw %ymm19, %xmm25 {%k4} +// CHECK: encoding: [0x62,0x82,0x7e,0x2c,0x33,0xd9] + vpmovdw %ymm19, %xmm25 {%k4} + +// CHECK: 
vpmovdw %ymm19, %xmm25 {%k4} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xac,0x33,0xd9] + vpmovdw %ymm19, %xmm25 {%k4} {z} + +// CHECK: vpmovdw %xmm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0x29] + vpmovdw %xmm21, (%rcx) + +// CHECK: vpmovdw %xmm21, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0a,0x33,0x29] + vpmovdw %xmm21, (%rcx) {%k2} + +// CHECK: vpmovdw %xmm21, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x33,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovdw %xmm21, 291(%rax,%r14,8) + +// CHECK: vpmovdw %xmm21, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0x6a,0x7f] + vpmovdw %xmm21, 1016(%rdx) + +// CHECK: vpmovdw %xmm21, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0xaa,0x00,0x04,0x00,0x00] + vpmovdw %xmm21, 1024(%rdx) + +// CHECK: vpmovdw %xmm21, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0x6a,0x80] + vpmovdw %xmm21, -1024(%rdx) + +// CHECK: vpmovdw %xmm21, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0xaa,0xf8,0xfb,0xff,0xff] + vpmovdw %xmm21, -1032(%rdx) + +// CHECK: vpmovdw %ymm22, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0x31] + vpmovdw %ymm22, (%rcx) + +// CHECK: vpmovdw %ymm22, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2e,0x33,0x31] + vpmovdw %ymm22, (%rcx) {%k6} + +// CHECK: vpmovdw %ymm22, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x33,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovdw %ymm22, 291(%rax,%r14,8) + +// CHECK: vpmovdw %ymm22, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0x72,0x7f] + vpmovdw %ymm22, 2032(%rdx) + +// CHECK: vpmovdw %ymm22, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0xb2,0x00,0x08,0x00,0x00] + vpmovdw %ymm22, 2048(%rdx) + +// CHECK: vpmovdw %ymm22, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0x72,0x80] + vpmovdw %ymm22, -2048(%rdx) + +// CHECK: vpmovdw %ymm22, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0xb2,0xf0,0xf7,0xff,0xff] + vpmovdw %ymm22, 
-2064(%rdx) + +// CHECK: vpmovsdw %xmm18, %xmm18 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x23,0xd2] + vpmovsdw %xmm18, %xmm18 + +// CHECK: vpmovsdw %xmm18, %xmm18 {%k6} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0e,0x23,0xd2] + vpmovsdw %xmm18, %xmm18 {%k6} + +// CHECK: vpmovsdw %xmm18, %xmm18 {%k6} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8e,0x23,0xd2] + vpmovsdw %xmm18, %xmm18 {%k6} {z} + +// CHECK: vpmovsdw %ymm18, %xmm20 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x23,0xd4] + vpmovsdw %ymm18, %xmm20 + +// CHECK: vpmovsdw %ymm18, %xmm20 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2a,0x23,0xd4] + vpmovsdw %ymm18, %xmm20 {%k2} + +// CHECK: vpmovsdw %ymm18, %xmm20 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xaa,0x23,0xd4] + vpmovsdw %ymm18, %xmm20 {%k2} {z} + +// CHECK: vpmovsdw %xmm29, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0x29] + vpmovsdw %xmm29, (%rcx) + +// CHECK: vpmovsdw %xmm29, (%rcx) {%k1} +// CHECK: encoding: [0x62,0x62,0x7e,0x09,0x23,0x29] + vpmovsdw %xmm29, (%rcx) {%k1} + +// CHECK: vpmovsdw %xmm29, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x23,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovsdw %xmm29, 291(%rax,%r14,8) + +// CHECK: vpmovsdw %xmm29, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0x6a,0x7f] + vpmovsdw %xmm29, 1016(%rdx) + +// CHECK: vpmovsdw %xmm29, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0xaa,0x00,0x04,0x00,0x00] + vpmovsdw %xmm29, 1024(%rdx) + +// CHECK: vpmovsdw %xmm29, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0x6a,0x80] + vpmovsdw %xmm29, -1024(%rdx) + +// CHECK: vpmovsdw %xmm29, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0xaa,0xf8,0xfb,0xff,0xff] + vpmovsdw %xmm29, -1032(%rdx) + +// CHECK: vpmovsdw %ymm19, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x19] + vpmovsdw %ymm19, (%rcx) + +// CHECK: vpmovsdw %ymm19, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2e,0x23,0x19] + vpmovsdw %ymm19, (%rcx) {%k6} + +// CHECK: vpmovsdw 
%ymm19, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x23,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmovsdw %ymm19, 291(%rax,%r14,8) + +// CHECK: vpmovsdw %ymm19, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x5a,0x7f] + vpmovsdw %ymm19, 2032(%rdx) + +// CHECK: vpmovsdw %ymm19, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x9a,0x00,0x08,0x00,0x00] + vpmovsdw %ymm19, 2048(%rdx) + +// CHECK: vpmovsdw %ymm19, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x5a,0x80] + vpmovsdw %ymm19, -2048(%rdx) + +// CHECK: vpmovsdw %ymm19, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x9a,0xf0,0xf7,0xff,0xff] + vpmovsdw %ymm19, -2064(%rdx) + +// CHECK: vpmovusdw %xmm18, %xmm18 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x13,0xd2] + vpmovusdw %xmm18, %xmm18 + +// CHECK: vpmovusdw %xmm18, %xmm18 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0a,0x13,0xd2] + vpmovusdw %xmm18, %xmm18 {%k2} + +// CHECK: vpmovusdw %xmm18, %xmm18 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8a,0x13,0xd2] + vpmovusdw %xmm18, %xmm18 {%k2} {z} + +// CHECK: vpmovusdw %ymm25, %xmm28 +// CHECK: encoding: [0x62,0x02,0x7e,0x28,0x13,0xcc] + vpmovusdw %ymm25, %xmm28 + +// CHECK: vpmovusdw %ymm25, %xmm28 {%k4} +// CHECK: encoding: [0x62,0x02,0x7e,0x2c,0x13,0xcc] + vpmovusdw %ymm25, %xmm28 {%k4} + +// CHECK: vpmovusdw %ymm25, %xmm28 {%k4} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0xac,0x13,0xcc] + vpmovusdw %ymm25, %xmm28 {%k4} {z} + +// CHECK: vpmovusdw %xmm20, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0x21] + vpmovusdw %xmm20, (%rcx) + +// CHECK: vpmovusdw %xmm20, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0e,0x13,0x21] + vpmovusdw %xmm20, (%rcx) {%k6} + +// CHECK: vpmovusdw %xmm20, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x13,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovusdw %xmm20, 291(%rax,%r14,8) + +// CHECK: vpmovusdw %xmm20, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0x62,0x7f] + vpmovusdw %xmm20, 
1016(%rdx) + +// CHECK: vpmovusdw %xmm20, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0xa2,0x00,0x04,0x00,0x00] + vpmovusdw %xmm20, 1024(%rdx) + +// CHECK: vpmovusdw %xmm20, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0x62,0x80] + vpmovusdw %xmm20, -1024(%rdx) + +// CHECK: vpmovusdw %xmm20, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0xa2,0xf8,0xfb,0xff,0xff] + vpmovusdw %xmm20, -1032(%rdx) + +// CHECK: vpmovusdw %ymm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0x39] + vpmovusdw %ymm23, (%rcx) + +// CHECK: vpmovusdw %ymm23, (%rcx) {%k1} +// CHECK: encoding: [0x62,0xe2,0x7e,0x29,0x13,0x39] + vpmovusdw %ymm23, (%rcx) {%k1} + +// CHECK: vpmovusdw %ymm23, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x13,0xbc,0xf0,0x23,0x01,0x00,0x00] + vpmovusdw %ymm23, 291(%rax,%r14,8) + +// CHECK: vpmovusdw %ymm23, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0x7a,0x7f] + vpmovusdw %ymm23, 2032(%rdx) + +// CHECK: vpmovusdw %ymm23, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0xba,0x00,0x08,0x00,0x00] + vpmovusdw %ymm23, 2048(%rdx) + +// CHECK: vpmovusdw %ymm23, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0x7a,0x80] + vpmovusdw %ymm23, -2048(%rdx) + +// CHECK: vpmovusdw %ymm23, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0xba,0xf0,0xf7,0xff,0xff] + vpmovusdw %ymm23, -2064(%rdx) + // CHECK: vcvtps2pd %xmm27, %xmm20 // CHECK: encoding: [0x62,0x81,0x7c,0x08,0x5a,0xe3] vcvtps2pd %xmm27, %xmm20