diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1876,6 +1876,84 @@ TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8x*V8xUc", "nV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph128_mask, "V8xV2dV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_mask, "V8xV4dV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph512_mask, "V8xV8dV8xUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2pd128_mask, "V2dV8xV2dUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_mask, "V4dV8xV4dUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2pd512_mask, "V8dV8xV8dUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2ss_round_mask, "V4fV4fV8xV4fUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtss2sh_round_mask, "V8xV8xV4fV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsd2sh_round_mask, "V8xV8xV2dV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2sd_round_mask, "V2dV2dV8xV2dUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtw2ph128_mask, "V8xV8sV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_mask, "V16xV16sV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtw2ph512_mask, "V32xV32sV32xUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph128_mask, "V8xV8UsV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_mask, "V16xV16UsV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph512_mask, "V32xV32UsV32xUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", 
"avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph128_mask, "V8xV4iV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_mask, "V8xV8iV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph512_mask, "V16xV16iV16xUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph128_mask, "V8xV4UiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_mask, "V8xV8UiV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph512_mask, "V16xV16UiV16xUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph128_mask, "V8xV2OiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_mask, "V8xV4OiV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph512_mask, "V8xV8OiV8xUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph128_mask, "V8xV2UOiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_mask, "V8xV4UOiV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph512_mask, "V8xV8UOiV8xUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2si32, "iV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtusi2sh, "V8xV8xUiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsi2sh, "V8xV8xiIi", 
"ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttsh2si32, "iV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_vcvtph2psx128_mask, "V4fV8xV4fUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_mask, "V8fV8xV8fUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2psx512_mask, "V16fV16xV16fUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtps2phx128_mask, "V8xV4fV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_mask, "V8xV8fV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtps2phx512_mask, "V16xV16fV16xUsIi", "ncV:512:", "avx512fp16") + // generic select intrinsics TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl") TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl") diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -92,6 +92,12 @@ TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fOiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dUOiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fUOiIi", "ncV:128:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtusi642sh, "V8xV8xUOiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsi642sh, "V8xV8xOiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri") // UINTR diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -12723,10 +12723,16 @@ case X86::BI__builtin_ia32_cvtdq2ps512_mask: case X86::BI__builtin_ia32_cvtqq2ps512_mask: case X86::BI__builtin_ia32_cvtqq2pd512_mask: + case X86::BI__builtin_ia32_vcvtw2ph512_mask: + case X86::BI__builtin_ia32_vcvtdq2ph512_mask: + case X86::BI__builtin_ia32_vcvtqq2ph512_mask: return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true); case X86::BI__builtin_ia32_cvtudq2ps512_mask: case X86::BI__builtin_ia32_cvtuqq2ps512_mask: case X86::BI__builtin_ia32_cvtuqq2pd512_mask: + case X86::BI__builtin_ia32_vcvtuw2ph512_mask: + case X86::BI__builtin_ia32_vcvtudq2ph512_mask: + case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false); case X86::BI__builtin_ia32_vfmaddss3: diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -947,6 +947,996 @@ return __b[0]; } +#define _mm512_cvt_roundpd_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ + (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \ + 
((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ + (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_pd(A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ + (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_pd(U, A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ + (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_ss(A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \ + (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( + (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W, + __mmask8 __U, + __m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U, + __m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( + (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundss_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ + (__v8hf)_mm_undefined_ph(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \ 
+ ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \ + (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U, + __m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsd_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ + (__v8hf)_mm_undefined_ph(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \ + (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A, + __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_sd(A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \ + (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A, + __m128h __B) { + return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W, + __mmask8 __U, + __m128d __A, + __m128h __B) { + return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 
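/* Editorial note, not part of this patch: a minimal usage sketch of the
   scalar half<->float/double conversions declared above, assuming an
   AVX512-FP16 target with <immintrin.h> included; the names d, h and f
   are illustrative.

     __m128d d = _mm_set_sd(1.5);
     __m128h h = _mm_cvt_roundsd_sh(_mm_setzero_ph(), d,
                                    _MM_FROUND_TO_NEAREST_INT |
                                        _MM_FROUND_NO_EXC);
     __m128  f = _mm_cvtsh_ss(_mm_setzero_ps(), h);

   The first vector operand supplies the upper elements of the result, so a
   zeroed vector is passed when only element 0 is of interest. */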
+_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) { + return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi16(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_undefined_epi32(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi16(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask( \ + (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi16_ph(A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \ + (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi16_ph(__m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, 
__m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu16(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \ + (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \ + (__v32hu)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu16(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \ + (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \ + (__v32hu)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu16_ph(A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \ + (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu16_ph(__m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + 
(__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi32(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \ + (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \ + (__v16si)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu32(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \ + (__v16su)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi32_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \ + (__v16si)(A), 
(__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi32_ph(__m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu32_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \ + (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu32_ph(__m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi32(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \ + (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \ + (__v16si)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu32(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \ + 
((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \ + (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi64_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ + (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ + (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi64_ph(__m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi64(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \ + (__v8di)_mm512_undefined_epi32(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu64_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ 
+ (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ + (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu64_ph(__m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu64(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi64(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, 
+ _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu64(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_i32(A, R) \ + ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R))) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) { + return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS128 +_mm_cvtsh_u32(__m128h __A) { + return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvt_roundsh_i64(A, R) \ + ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R))) + +static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) { + return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvtsh_u64(__m128h __A) { + return (unsigned long long)__builtin_ia32_vcvtsh2usi64( + (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} +#endif // __x86_64__ + +#define _mm_cvt_roundu32_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_cvtu32_sh(__m128h __A, unsigned int __B) { + __A[0] = __B; + return __A; +} + +#ifdef __x86_64__ +#define _mm_cvt_roundu64_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \ + (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_cvtu64_sh(__m128h __A, unsigned long long __B) { + __A[0] = __B; + return __A; +} +#endif + +#define _mm_cvt_roundi32_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A, + int __B) { + __A[0] = __B; + return __A; +} + +#ifdef __x86_64__ +#define _mm_cvt_roundi64_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A, + long long __B) { + __A[0] = __B; + return __A; +} +#endif + +#define 
_mm_cvtt_roundsh_i32(A, R) \ + ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R))) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) { + return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundsh_i64(A, R) \ + ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R))) + +static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) { + return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm_cvtt_roundsh_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS128 +_mm_cvttsh_u32(__m128h __A) { + return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundsh_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvttsh_u64(__m128h __A) { + return (unsigned long long)__builtin_ia32_vcvttsh2usi64( + (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm512_cvtx_roundph_ps(A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask( \ + (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R))) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtx_roundps_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \ + (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + 
(__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_reduce_add_ph(__m512h __W) { return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -327,6 +327,772 @@ ((__mmask8)__builtin_ia32_cmpph128_mask( \ (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( + (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W, + __mmask8 __U, + __m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( + (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( + (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( + (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask( + (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W, + __mmask8 __U, + __m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W, + (__mmask8)__U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask( + (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask( + (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W, + (__mmask8)__U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask( + (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) { + 
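/* Editorial note, not part of this patch: a sketch of the merge- and
   zero-masking forms defined in this file, assuming an AVX512-FP16 +
   AVX512VL target; src, a and the 0x0F mask value are illustrative.

     __m128h a   = _mm_set1_ph((_Float16)3.0);
     __m128i src = _mm_set1_epi16(-1);
     __m128i lo  = _mm_mask_cvtph_epi16(src, 0x0F, a); // lanes 4..7 kept from src
     __m128i z   = _mm_maskz_cvtph_epi16(0x0F, a);     // lanes 4..7 zeroed
*/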
return (__m128i)__builtin_ia32_vcvtph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) { + return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi16_ph(__m256i __A) { + return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_cvtepi16_ph(__A), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) { + return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_cvtepu16_ph(__m256i __A) { + return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_cvtepu16_ph(__A), + 
(__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( + (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( + (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi32_ph(__m256i __A) { + return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, 
(__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( + (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( + (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepu32_ph(__m256i __A) { + return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu32(__m128h __A) { + return 
(__m256i)__builtin_ia32_vcvttph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( + (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( + (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi64_ph(__m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( + (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( + (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( + (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 
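// Illustrative aside, not part of the patch: the conversions above all follow
// the usual AVX-512 masking pattern. The mask_ form merges elements from the
// extra __W operand wherever the corresponding bit of __U is clear, while the
// maskz_ form zeroes those elements. A minimal usage sketch, built only from
// intrinsics defined in this patch plus pre-existing AVX-512VL compares
// (narrow_epi64_nonzero is a hypothetical helper name):
//
//   static __inline__ __m128h __DEFAULT_FN_ATTRS128
//   narrow_epi64_nonzero(__m128i __V) {
//     __mmask8 __NZ = _mm_cmpneq_epi64_mask(__V, _mm_setzero_si128());
//     // Lanes whose 64-bit input was zero stay 0.0 in the half result.
//     return _mm_maskz_cvtepi64_ph(__NZ, __V);
//   }
//
// Only the low two half elements of the __m128h result come from the two
// 64-bit sources; the narrowing instruction zeroes the upper elements.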
+_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( + (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepu64_ph(__m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( + (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( + (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_undefined_si128(), 
(__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask( + (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W, + __mmask8 __U, + __m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W, + (__mmask8)__U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask( + (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) { + return (__m256)__builtin_ia32_vcvtph2psx256_mask( + (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) { + return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W, + (__mmask8)__U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { + return (__m256)__builtin_ia32_vcvtph2psx256_mask( + (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask( + (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W, + __mmask8 __U, + __m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask( + (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask( + (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask( + (__v8sf)__A, 
(__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3878,6 +3878,10 @@ case X86::BI__builtin_ia32_vcvttss2si64: case X86::BI__builtin_ia32_vcvttss2usi32: case X86::BI__builtin_ia32_vcvttss2usi64: + case X86::BI__builtin_ia32_vcvttsh2si32: + case X86::BI__builtin_ia32_vcvttsh2si64: + case X86::BI__builtin_ia32_vcvttsh2usi32: + case X86::BI__builtin_ia32_vcvttsh2usi64: ArgNum = 1; break; case X86::BI__builtin_ia32_maxpd512: @@ -3888,6 +3892,8 @@ case X86::BI__builtin_ia32_minph512: ArgNum = 2; break; + case X86::BI__builtin_ia32_vcvtph2pd512_mask: + case X86::BI__builtin_ia32_vcvtph2psx512_mask: case X86::BI__builtin_ia32_cvtps2pd512_mask: case X86::BI__builtin_ia32_cvttpd2dq512_mask: case X86::BI__builtin_ia32_cvttpd2qq512_mask: @@ -3897,6 +3903,12 @@ case X86::BI__builtin_ia32_cvttps2qq512_mask: case X86::BI__builtin_ia32_cvttps2udq512_mask: case X86::BI__builtin_ia32_cvttps2uqq512_mask: + case X86::BI__builtin_ia32_vcvttph2w512_mask: + case X86::BI__builtin_ia32_vcvttph2uw512_mask: + case X86::BI__builtin_ia32_vcvttph2dq512_mask: + case X86::BI__builtin_ia32_vcvttph2udq512_mask: + case X86::BI__builtin_ia32_vcvttph2qq512_mask: + case X86::BI__builtin_ia32_vcvttph2uqq512_mask: case X86::BI__builtin_ia32_exp2pd_mask: case X86::BI__builtin_ia32_exp2ps_mask: case X86::BI__builtin_ia32_getexppd512_mask: @@ -3916,6 +3928,8 @@ case X86::BI__builtin_ia32_cmpsd_mask: case X86::BI__builtin_ia32_cmpss_mask: case X86::BI__builtin_ia32_cmpsh_mask: + case X86::BI__builtin_ia32_vcvtsh2sd_round_mask: + case X86::BI__builtin_ia32_vcvtsh2ss_round_mask: case X86::BI__builtin_ia32_cvtss2sd_round_mask: case X86::BI__builtin_ia32_getexpsd128_round_mask: case X86::BI__builtin_ia32_getexpss128_round_mask: @@ -3965,6 +3979,10 @@ case X86::BI__builtin_ia32_vcvtss2si64: case X86::BI__builtin_ia32_vcvtss2usi32: case X86::BI__builtin_ia32_vcvtss2usi64: + case X86::BI__builtin_ia32_vcvtsh2si32: + case X86::BI__builtin_ia32_vcvtsh2si64: + case X86::BI__builtin_ia32_vcvtsh2usi32: + case X86::BI__builtin_ia32_vcvtsh2usi64: case X86::BI__builtin_ia32_sqrtpd512: case X86::BI__builtin_ia32_sqrtps512: ArgNum = 1; @@ -3988,11 +4006,17 @@ case X86::BI__builtin_ia32_cvtusi2sd64: case X86::BI__builtin_ia32_cvtusi2ss32: case X86::BI__builtin_ia32_cvtusi2ss64: + case X86::BI__builtin_ia32_vcvtusi2sh: + case X86::BI__builtin_ia32_vcvtusi642sh: + case X86::BI__builtin_ia32_vcvtsi2sh: + case X86::BI__builtin_ia32_vcvtsi642sh: ArgNum = 2; HasRC = true; break; case X86::BI__builtin_ia32_cvtdq2ps512_mask: case X86::BI__builtin_ia32_cvtudq2ps512_mask: + case X86::BI__builtin_ia32_vcvtpd2ph512_mask: + case X86::BI__builtin_ia32_vcvtps2phx512_mask: case X86::BI__builtin_ia32_cvtpd2ps512_mask: case X86::BI__builtin_ia32_cvtpd2dq512_mask: case X86::BI__builtin_ia32_cvtpd2qq512_mask: @@ -4006,6 +4030,18 @@ case X86::BI__builtin_ia32_cvtqq2ps512_mask: case X86::BI__builtin_ia32_cvtuqq2pd512_mask: case X86::BI__builtin_ia32_cvtuqq2ps512_mask: + case X86::BI__builtin_ia32_vcvtdq2ph512_mask: + case X86::BI__builtin_ia32_vcvtudq2ph512_mask: + case X86::BI__builtin_ia32_vcvtw2ph512_mask: + case X86::BI__builtin_ia32_vcvtuw2ph512_mask: + case X86::BI__builtin_ia32_vcvtph2w512_mask: + case X86::BI__builtin_ia32_vcvtph2uw512_mask: + case X86::BI__builtin_ia32_vcvtph2dq512_mask: + case 
X86::BI__builtin_ia32_vcvtph2udq512_mask: + case X86::BI__builtin_ia32_vcvtph2qq512_mask: + case X86::BI__builtin_ia32_vcvtph2uqq512_mask: + case X86::BI__builtin_ia32_vcvtqq2ph512_mask: + case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: ArgNum = 3; HasRC = true; break; @@ -4026,6 +4062,8 @@ case X86::BI__builtin_ia32_scalefsd_round_mask: case X86::BI__builtin_ia32_scalefss_round_mask: case X86::BI__builtin_ia32_cvtsd2ss_round_mask: + case X86::BI__builtin_ia32_vcvtss2sh_round_mask: + case X86::BI__builtin_ia32_vcvtsd2sh_round_mask: case X86::BI__builtin_ia32_sqrtsd_round_mask: case X86::BI__builtin_ia32_sqrtss_round_mask: case X86::BI__builtin_ia32_vfmaddsd3_mask: diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -1542,6 +1542,1096 @@ return _mm_cvtsi16_si128(A); } +__m128h test_mm512_cvt_roundpd_ph(__m512d A) { + // CHECK-LABEL: test_mm512_cvt_roundpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512 + return _mm512_cvt_roundpd_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_mask_cvt_roundpd_ph(__m128h A, __mmask8 B, __m512d C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512 + return _mm512_mask_cvt_roundpd_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_maskz_cvt_roundpd_ph(__mmask8 A, __m512d B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512 + return _mm512_maskz_cvt_roundpd_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_cvtpd_ph(__m512d A) { + // CHECK-LABEL: test_mm512_cvtpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512 + return _mm512_cvtpd_ph(A); +} + +__m128h test_mm512_mask_cvtpd_ph(__m128h A, __mmask8 B, __m512d C) { + // CHECK-LABEL: test_mm512_mask_cvtpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512 + return _mm512_mask_cvtpd_ph(A, B, C); +} + +__m128h test_mm512_maskz_cvtpd_ph(__mmask8 A, __m512d B) { + // CHECK-LABEL: test_mm512_maskz_cvtpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512 + return _mm512_maskz_cvtpd_ph(A, B); +} + +__m512d test_mm512_cvt_roundph_pd(__m128h A) { + // CHECK-LABEL: test_mm512_cvt_roundph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512 + return _mm512_cvt_roundph_pd(A, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_mask_cvt_roundph_pd(__m512d A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512 + return _mm512_mask_cvt_roundph_pd(A, B, C, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_maskz_cvt_roundph_pd(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512 + return _mm512_maskz_cvt_roundph_pd(A, B, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_cvtph_pd(__m128h A) { + // CHECK-LABEL: test_mm512_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512 + return _mm512_cvtph_pd(A); +} + +__m512d test_mm512_mask_cvtph_pd(__m512d A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512 + return _mm512_mask_cvtph_pd(A, B, C); +} + +__m512d test_mm512_maskz_cvtph_pd(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512 + return _mm512_maskz_cvtph_pd(A, B); +} + +__m128 
test_mm_cvt_roundsh_ss(__m128 A, __m128h B) { + // CHECK-LABEL: test_mm_cvt_roundsh_ss + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round + return _mm_cvt_roundsh_ss(A, B, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_mask_cvt_roundsh_ss(__m128 A, __mmask8 B, __m128 C, __m128h D) { + // CHECK-LABEL: test_mm_mask_cvt_roundsh_ss + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round + return _mm_mask_cvt_roundsh_ss(A, B, C, D, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_maskz_cvt_roundsh_ss(__mmask8 A, __m128 B, __m128h C) { + // CHECK-LABEL: test_mm_maskz_cvt_roundsh_ss + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round + return _mm_maskz_cvt_roundsh_ss(A, B, C, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_cvtsh_ss(__m128 A, __m128h B) { + // CHECK-LABEL: test_mm_cvtsh_ss + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round + return _mm_cvtsh_ss(A, B); +} + +__m128 test_mm_mask_cvtsh_ss(__m128 A, __mmask8 B, __m128 C, __m128h D) { + // CHECK-LABEL: test_mm_mask_cvtsh_ss + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round + return _mm_mask_cvtsh_ss(A, B, C, D); +} + +__m128 test_mm_maskz_cvtsh_ss(__mmask8 A, __m128 B, __m128h C) { + // CHECK-LABEL: test_mm_maskz_cvtsh_ss + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round + return _mm_maskz_cvtsh_ss(A, B, C); +} + +__m128h test_mm_cvt_roundss_sh(__m128h A, __m128 B) { + // CHECK-LABEL: test_mm_cvt_roundss_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round + return _mm_cvt_roundss_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_cvt_roundss_sh(__m128h A, __mmask8 B, __m128h C, __m128 D) { + // CHECK-LABEL: test_mm_mask_cvt_roundss_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round + return _mm_mask_cvt_roundss_sh(A, B, C, D, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_cvt_roundss_sh(__mmask8 A, __m128h B, __m128 C) { + // CHECK-LABEL: test_mm_maskz_cvt_roundss_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round + return _mm_maskz_cvt_roundss_sh(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_cvtss_sh(__m128h A, __m128 B) { + // CHECK-LABEL: test_mm_cvtss_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round + return _mm_cvtss_sh(A, B); +} + +__m128h test_mm_mask_cvtss_sh(__m128h A, __mmask8 B, __m128h C, __m128 D) { + // CHECK-LABEL: test_mm_mask_cvtss_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round + return _mm_mask_cvtss_sh(A, B, C, D); +} + +__m128h test_mm_maskz_cvtss_sh(__mmask8 A, __m128h B, __m128 C) { + // CHECK-LABEL: test_mm_maskz_cvtss_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round + return _mm_maskz_cvtss_sh(A, B, C); +} + +__m128h test_mm_cvt_roundsd_sh(__m128h A, __m128d B) { + // CHECK-LABEL: test_mm_cvt_roundsd_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round + return _mm_cvt_roundsd_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_cvt_roundsd_sh(__m128h A, __mmask8 B, __m128h C, __m128d D) { + // CHECK-LABEL: test_mm_mask_cvt_roundsd_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round + return _mm_mask_cvt_roundsd_sh(A, B, C, D, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_cvt_roundsd_sh(__mmask8 A, __m128h B, __m128d C) { + // CHECK-LABEL: test_mm_maskz_cvt_roundsd_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round + return _mm_maskz_cvt_roundsd_sh(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_cvtsd_sh(__m128h A, __m128d B) { + // CHECK-LABEL: test_mm_cvtsd_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round + 
return _mm_cvtsd_sh(A, B); +} + +__m128h test_mm_mask_cvtsd_sh(__m128h A, __mmask8 B, __m128h C, __m128d D) { + // CHECK-LABEL: test_mm_mask_cvtsd_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round + return _mm_mask_cvtsd_sh(A, B, C, D); +} + +__m128h test_mm_maskz_cvtsd_sh(__mmask8 A, __m128h B, __m128d C) { + // CHECK-LABEL: test_mm_maskz_cvtsd_sh + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round + return _mm_maskz_cvtsd_sh(A, B, C); +} + +__m128d test_mm_cvt_roundsh_sd(__m128d A, __m128h B) { + // CHECK-LABEL: test_mm_cvt_roundsh_sd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round + return _mm_cvt_roundsh_sd(A, B, _MM_FROUND_NO_EXC); +} + +__m128d test_mm_mask_cvt_roundsh_sd(__m128d A, __mmask8 B, __m128d C, __m128h D) { + // CHECK-LABEL: test_mm_mask_cvt_roundsh_sd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round + return _mm_mask_cvt_roundsh_sd(A, B, C, D, _MM_FROUND_NO_EXC); +} + +__m128d test_mm_maskz_cvt_roundsh_sd(__mmask8 A, __m128d B, __m128h C) { + // CHECK-LABEL: test_mm_maskz_cvt_roundsh_sd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round + return _mm_maskz_cvt_roundsh_sd(A, B, C, _MM_FROUND_NO_EXC); +} + +__m128d test_mm_cvtsh_sd(__m128d A, __m128h B) { + // CHECK-LABEL: test_mm_cvtsh_sd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round + return _mm_cvtsh_sd(A, B); +} + +__m128d test_mm_mask_cvtsh_sd(__m128d A, __mmask8 B, __m128d C, __m128h D) { + // CHECK-LABEL: test_mm_mask_cvtsh_sd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round + return _mm_mask_cvtsh_sd(A, B, C, D); +} + +__m128d test_mm_maskz_cvtsh_sd(__mmask8 A, __m128d B, __m128h C) { + // CHECK-LABEL: test_mm_maskz_cvtsh_sd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round + return _mm_maskz_cvtsh_sd(A, B, C); +} + +__m512i test_mm512_cvt_roundph_epi16(__m512h A) { + // CHECK-LABEL: test_mm512_cvt_roundph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512 + return _mm512_cvt_roundph_epi16(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvt_roundph_epi16(__m512i A, __mmask32 B, __m512h C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512 + return _mm512_mask_cvt_roundph_epi16(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvt_roundph_epi16(__mmask32 A, __m512h B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512 + return _mm512_maskz_cvt_roundph_epi16(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvtph_epi16(__m512h A) { + // CHECK-LABEL: test_mm512_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512 + return _mm512_cvtph_epi16(A); +} + +__m512i test_mm512_mask_cvtph_epi16(__m512i A, __mmask32 B, __m512h C) { + // CHECK-LABEL: test_mm512_mask_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512 + return _mm512_mask_cvtph_epi16(A, B, C); +} + +__m512i test_mm512_maskz_cvtph_epi16(__mmask32 A, __m512h B) { + // CHECK-LABEL: test_mm512_maskz_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512 + return _mm512_maskz_cvtph_epi16(A, B); +} + +__m512i test_mm512_cvtt_roundph_epi16(__m512h A) { + // CHECK-LABEL: test_mm512_cvtt_roundph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512 + return _mm512_cvtt_roundph_epi16(A, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvtt_roundph_epi16(__m512i A, __mmask32 B, __m512h C) { + // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512 + return 
_mm512_mask_cvtt_roundph_epi16(A, B, C, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvtt_roundph_epi16(__mmask32 A, __m512h B) { + // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512 + return _mm512_maskz_cvtt_roundph_epi16(A, B, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvttph_epi16(__m512h A) { + // CHECK-LABEL: test_mm512_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512 + return _mm512_cvttph_epi16(A); +} + +__m512i test_mm512_mask_cvttph_epi16(__m512i A, __mmask32 B, __m512h C) { + // CHECK-LABEL: test_mm512_mask_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512 + return _mm512_mask_cvttph_epi16(A, B, C); +} + +__m512i test_mm512_maskz_cvttph_epi16(__mmask32 A, __m512h B) { + // CHECK-LABEL: test_mm512_maskz_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512 + return _mm512_maskz_cvttph_epi16(A, B); +} + +__m512h test_mm512_cvt_roundepi16_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvt_roundepi16_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v32f16.v32i16 + return _mm512_cvt_roundepi16_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_cvt_roundepi16_ph(__m512h A, __mmask32 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundepi16_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v32f16.v32i16 + return _mm512_mask_cvt_roundepi16_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_cvt_roundepi16_ph(__mmask32 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundepi16_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v32f16.v32i16 + return _mm512_maskz_cvt_roundepi16_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_cvtepi16_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <32 x i16> %{{.*}} to <32 x half> + return _mm512_cvtepi16_ph(A); +} + +__m512h test_mm512_mask_cvtepi16_ph(__m512h A, __mmask32 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <32 x i16> %{{.*}} to <32 x half> + return _mm512_mask_cvtepi16_ph(A, B, C); +} + +__m512h test_mm512_maskz_cvtepi16_ph(__mmask32 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <32 x i16> %{{.*}} to <32 x half> + return _mm512_maskz_cvtepi16_ph(A, B); +} + +__m512i test_mm512_cvt_roundph_epu16(__m512h A) { + // CHECK-LABEL: test_mm512_cvt_roundph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512 + return _mm512_cvt_roundph_epu16(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvt_roundph_epu16(__m512i A, __mmask32 B, __m512h C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512 + return _mm512_mask_cvt_roundph_epu16(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvt_roundph_epu16(__mmask32 A, __m512h B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512 + return _mm512_maskz_cvt_roundph_epu16(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvtph_epu16(__m512h A) { + // CHECK-LABEL: test_mm512_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512 + return _mm512_cvtph_epu16(A); +} + +__m512i test_mm512_mask_cvtph_epu16(__m512i A, __mmask32 B, __m512h C) { + // CHECK-LABEL: test_mm512_mask_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512 + return _mm512_mask_cvtph_epu16(A, B, C); +} + +__m512i 
test_mm512_maskz_cvtph_epu16(__mmask32 A, __m512h B) { + // CHECK-LABEL: test_mm512_maskz_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512 + return _mm512_maskz_cvtph_epu16(A, B); +} + +__m512i test_mm512_cvtt_roundph_epu16(__m512h A) { + // CHECK-LABEL: test_mm512_cvtt_roundph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512 + return _mm512_cvtt_roundph_epu16(A, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvtt_roundph_epu16(__m512i A, __mmask32 B, __m512h C) { + // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512 + return _mm512_mask_cvtt_roundph_epu16(A, B, C, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvtt_roundph_epu16(__mmask32 A, __m512h B) { + // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512 + return _mm512_maskz_cvtt_roundph_epu16(A, B, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvttph_epu16(__m512h A) { + // CHECK-LABEL: test_mm512_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512 + return _mm512_cvttph_epu16(A); +} + +__m512i test_mm512_mask_cvttph_epu16(__m512i A, __mmask32 B, __m512h C) { + // CHECK-LABEL: test_mm512_mask_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512 + return _mm512_mask_cvttph_epu16(A, B, C); +} + +__m512i test_mm512_maskz_cvttph_epu16(__mmask32 A, __m512h B) { + // CHECK-LABEL: test_mm512_maskz_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512 + return _mm512_maskz_cvttph_epu16(A, B); +} + +__m512h test_mm512_cvt_roundepu16_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvt_roundepu16_ph + // CHECK: @llvm.x86.avx512.uitofp.round.v32f16.v32i16 + return _mm512_cvt_roundepu16_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_cvt_roundepu16_ph(__m512h A, __mmask32 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundepu16_ph + // CHECK: @llvm.x86.avx512.uitofp.round.v32f16.v32i16 + return _mm512_mask_cvt_roundepu16_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_cvt_roundepu16_ph(__mmask32 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundepu16_ph + // CHECK: @llvm.x86.avx512.uitofp.round.v32f16.v32i16 + return _mm512_maskz_cvt_roundepu16_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_cvtepu16_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <32 x i16> %{{.*}} to <32 x half> + return _mm512_cvtepu16_ph(A); +} + +__m512h test_mm512_mask_cvtepu16_ph(__m512h A, __mmask32 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <32 x i16> %{{.*}} to <32 x half> + return _mm512_mask_cvtepu16_ph(A, B, C); +} + +__m512h test_mm512_maskz_cvtepu16_ph(__mmask32 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <32 x i16> %{{.*}} to <32 x half> + return _mm512_maskz_cvtepu16_ph(A, B); +} + +__m512i test_mm512_cvt_roundph_epi32(__m256h A) { + // CHECK-LABEL: test_mm512_cvt_roundph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512 + return _mm512_cvt_roundph_epi32(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvt_roundph_epi32(__m512i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512 + return _mm512_mask_cvt_roundph_epi32(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvt_roundph_epi32(__mmask16 A, __m256h B) { + 
// CHECK-LABEL: test_mm512_maskz_cvt_roundph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512 + return _mm512_maskz_cvt_roundph_epi32(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvtph_epi32(__m256h A) { + // CHECK-LABEL: test_mm512_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512 + return _mm512_cvtph_epi32(A); +} + +__m512i test_mm512_mask_cvtph_epi32(__m512i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512 + return _mm512_mask_cvtph_epi32(A, B, C); +} + +__m512i test_mm512_maskz_cvtph_epi32(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512 + return _mm512_maskz_cvtph_epi32(A, B); +} + +__m512i test_mm512_cvt_roundph_epu32(__m256h A) { + // CHECK-LABEL: test_mm512_cvt_roundph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512 + return _mm512_cvt_roundph_epu32(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvt_roundph_epu32(__m512i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512 + return _mm512_mask_cvt_roundph_epu32(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvt_roundph_epu32(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512 + return _mm512_maskz_cvt_roundph_epu32(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvtph_epu32(__m256h A) { + // CHECK-LABEL: test_mm512_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512 + return _mm512_cvtph_epu32(A); +} + +__m512i test_mm512_mask_cvtph_epu32(__m512i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512 + return _mm512_mask_cvtph_epu32(A, B, C); +} + +__m512i test_mm512_maskz_cvtph_epu32(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512 + return _mm512_maskz_cvtph_epu32(A, B); +} + +__m256h test_mm512_cvt_roundepi32_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvt_roundepi32_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v16f16.v16i32 + return _mm512_cvt_roundepi32_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_mask_cvt_roundepi32_ph(__m256h A, __mmask16 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundepi32_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v16f16.v16i32 + return _mm512_mask_cvt_roundepi32_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_maskz_cvt_roundepi32_ph(__mmask16 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundepi32_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v16f16.v16i32 + return _mm512_maskz_cvt_roundepi32_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_cvtepi32_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvtepi32_ph + // CHECK: %{{.*}} = sitofp <16 x i32> %{{.*}} to <16 x half> + return _mm512_cvtepi32_ph(A); +} + +__m256h test_mm512_mask_cvtepi32_ph(__m256h A, __mmask16 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvtepi32_ph + // CHECK: %{{.*}} = sitofp <16 x i32> %{{.*}} to <16 x half> + return _mm512_mask_cvtepi32_ph(A, B, C); +} + +__m256h test_mm512_maskz_cvtepi32_ph(__mmask16 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvtepi32_ph + // CHECK: %{{.*}} = sitofp <16 x i32> %{{.*}} 
to <16 x half> + return _mm512_maskz_cvtepi32_ph(A, B); +} + +__m256h test_mm512_cvt_roundepu32_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvt_roundepu32_ph + // CHECK: @llvm.x86.avx512.uitofp.round.v16f16.v16i32 + return _mm512_cvt_roundepu32_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_mask_cvt_roundepu32_ph(__m256h A, __mmask16 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundepu32_ph + // CHECK: @llvm.x86.avx512.uitofp.round.v16f16.v16i32 + return _mm512_mask_cvt_roundepu32_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_maskz_cvt_roundepu32_ph(__mmask16 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundepu32_ph + // CHECK: @llvm.x86.avx512.uitofp.round.v16f16.v16i32 + return _mm512_maskz_cvt_roundepu32_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_cvtepu32_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvtepu32_ph + // CHECK: %{{.*}} = uitofp <16 x i32> %{{.*}} to <16 x half> + return _mm512_cvtepu32_ph(A); +} + +__m256h test_mm512_mask_cvtepu32_ph(__m256h A, __mmask16 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvtepu32_ph + // CHECK: %{{.*}} = uitofp <16 x i32> %{{.*}} to <16 x half> + return _mm512_mask_cvtepu32_ph(A, B, C); +} + +__m256h test_mm512_maskz_cvtepu32_ph(__mmask16 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvtepu32_ph + // CHECK: %{{.*}} = uitofp <16 x i32> %{{.*}} to <16 x half> + return _mm512_maskz_cvtepu32_ph(A, B); +} + +__m512i test_mm512_cvtt_roundph_epi32(__m256h A) { + // CHECK-LABEL: test_mm512_cvtt_roundph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512 + return _mm512_cvtt_roundph_epi32(A, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvtt_roundph_epi32(__m512i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512 + return _mm512_mask_cvtt_roundph_epi32(A, B, C, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvtt_roundph_epi32(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512 + return _mm512_maskz_cvtt_roundph_epi32(A, B, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvttph_epi32(__m256h A) { + // CHECK-LABEL: test_mm512_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512 + return _mm512_cvttph_epi32(A); +} + +__m512i test_mm512_mask_cvttph_epi32(__m512i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512 + return _mm512_mask_cvttph_epi32(A, B, C); +} + +__m512i test_mm512_maskz_cvttph_epi32(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512 + return _mm512_maskz_cvttph_epi32(A, B); +} + +__m512i test_mm512_cvtt_roundph_epu32(__m256h A) { + // CHECK-LABEL: test_mm512_cvtt_roundph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512 + return _mm512_cvtt_roundph_epu32(A, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvtt_roundph_epu32(__m512i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512 + return _mm512_mask_cvtt_roundph_epu32(A, B, C, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvtt_roundph_epu32(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512 + return _mm512_maskz_cvtt_roundph_epu32(A, B, 
_MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvttph_epu32(__m256h A) { + // CHECK-LABEL: test_mm512_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512 + return _mm512_cvttph_epu32(A); +} + +__m512i test_mm512_mask_cvttph_epu32(__m512i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512 + return _mm512_mask_cvttph_epu32(A, B, C); +} + +__m512i test_mm512_maskz_cvttph_epu32(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512 + return _mm512_maskz_cvttph_epu32(A, B); +} + +__m128h test_mm512_cvt_roundepi64_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvt_roundepi64_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v8f16.v8i64 + return _mm512_cvt_roundepi64_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_mask_cvt_roundepi64_ph(__m128h A, __mmask8 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundepi64_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v8f16.v8i64 + return _mm512_mask_cvt_roundepi64_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_maskz_cvt_roundepi64_ph(__mmask8 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundepi64_ph + // CHECK: @llvm.x86.avx512.sitofp.round.v8f16.v8i64 + return _mm512_maskz_cvt_roundepi64_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_cvtepi64_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvtepi64_ph + // CHECK: %{{.*}} = sitofp <8 x i64> %{{.*}} to <8 x half> + return _mm512_cvtepi64_ph(A); +} + +__m128h test_mm512_mask_cvtepi64_ph(__m128h A, __mmask8 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvtepi64_ph + // CHECK: %{{.*}} = sitofp <8 x i64> %{{.*}} to <8 x half> + return _mm512_mask_cvtepi64_ph(A, B, C); +} + +__m128h test_mm512_maskz_cvtepi64_ph(__mmask8 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvtepi64_ph + // CHECK: %{{.*}} = sitofp <8 x i64> %{{.*}} to <8 x half> + return _mm512_maskz_cvtepi64_ph(A, B); +} + +__m512i test_mm512_cvt_roundph_epi64(__m128h A) { + // CHECK-LABEL: test_mm512_cvt_roundph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512 + return _mm512_cvt_roundph_epi64(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvt_roundph_epi64(__m512i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512 + return _mm512_mask_cvt_roundph_epi64(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvt_roundph_epi64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512 + return _mm512_maskz_cvt_roundph_epi64(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvtph_epi64(__m128h A) { + // CHECK-LABEL: test_mm512_cvtph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512 + return _mm512_cvtph_epi64(A); +} + +__m512i test_mm512_mask_cvtph_epi64(__m512i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvtph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512 + return _mm512_mask_cvtph_epi64(A, B, C); +} + +__m512i test_mm512_maskz_cvtph_epi64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvtph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512 + return _mm512_maskz_cvtph_epi64(A, B); +} + +__m128h test_mm512_cvt_roundepu64_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvt_roundepu64_ph + // CHECK: 
@llvm.x86.avx512.uitofp.round.v8f16.v8i64 + return _mm512_cvt_roundepu64_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_mask_cvt_roundepu64_ph(__m128h A, __mmask8 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundepu64_ph + // CHECK: @llvm.x86.avx512.uitofp.round.v8f16.v8i64 + return _mm512_mask_cvt_roundepu64_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_maskz_cvt_roundepu64_ph(__mmask8 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundepu64_ph + // CHECK: @llvm.x86.avx512.uitofp.round.v8f16.v8i64 + return _mm512_maskz_cvt_roundepu64_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm512_cvtepu64_ph(__m512i A) { + // CHECK-LABEL: test_mm512_cvtepu64_ph + // CHECK: %{{.*}} = uitofp <8 x i64> %{{.*}} to <8 x half> + return _mm512_cvtepu64_ph(A); +} + +__m128h test_mm512_mask_cvtepu64_ph(__m128h A, __mmask8 B, __m512i C) { + // CHECK-LABEL: test_mm512_mask_cvtepu64_ph + // CHECK: %{{.*}} = uitofp <8 x i64> %{{.*}} to <8 x half> + return _mm512_mask_cvtepu64_ph(A, B, C); +} + +__m128h test_mm512_maskz_cvtepu64_ph(__mmask8 A, __m512i B) { + // CHECK-LABEL: test_mm512_maskz_cvtepu64_ph + // CHECK: %{{.*}} = uitofp <8 x i64> %{{.*}} to <8 x half> + return _mm512_maskz_cvtepu64_ph(A, B); +} + +__m512i test_mm512_cvt_roundph_epu64(__m128h A) { + // CHECK-LABEL: test_mm512_cvt_roundph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512 + return _mm512_cvt_roundph_epu64(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvt_roundph_epu64(__m512i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvt_roundph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512 + return _mm512_mask_cvt_roundph_epu64(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvt_roundph_epu64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512 + return _mm512_maskz_cvt_roundph_epu64(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvtph_epu64(__m128h A) { + // CHECK-LABEL: test_mm512_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512 + return _mm512_cvtph_epu64(A); +} + +__m512i test_mm512_mask_cvtph_epu64(__m512i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512 + return _mm512_mask_cvtph_epu64(A, B, C); +} + +__m512i test_mm512_maskz_cvtph_epu64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512 + return _mm512_maskz_cvtph_epu64(A, B); +} + +__m512i test_mm512_cvtt_roundph_epi64(__m128h A) { + // CHECK-LABEL: test_mm512_cvtt_roundph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512 + return _mm512_cvtt_roundph_epi64(A, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvtt_roundph_epi64(__m512i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512 + return _mm512_mask_cvtt_roundph_epi64(A, B, C, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvtt_roundph_epi64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512 + return _mm512_maskz_cvtt_roundph_epi64(A, B, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvttph_epi64(__m128h A) { + // CHECK-LABEL: test_mm512_cvttph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512 + 
return _mm512_cvttph_epi64(A); +} + +__m512i test_mm512_mask_cvttph_epi64(__m512i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvttph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512 + return _mm512_mask_cvttph_epi64(A, B, C); +} + +__m512i test_mm512_maskz_cvttph_epi64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvttph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512 + return _mm512_maskz_cvttph_epi64(A, B); +} + +__m512i test_mm512_cvtt_roundph_epu64(__m128h A) { + // CHECK-LABEL: test_mm512_cvtt_roundph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512 + return _mm512_cvtt_roundph_epu64(A, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_mask_cvtt_roundph_epu64(__m512i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512 + return _mm512_mask_cvtt_roundph_epu64(A, B, C, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_maskz_cvtt_roundph_epu64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512 + return _mm512_maskz_cvtt_roundph_epu64(A, B, _MM_FROUND_NO_EXC); +} + +__m512i test_mm512_cvttph_epu64(__m128h A) { + // CHECK-LABEL: test_mm512_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512 + return _mm512_cvttph_epu64(A); +} + +__m512i test_mm512_mask_cvttph_epu64(__m512i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm512_mask_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512 + return _mm512_mask_cvttph_epu64(A, B, C); +} + +__m512i test_mm512_maskz_cvttph_epu64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm512_maskz_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512 + return _mm512_maskz_cvttph_epu64(A, B); +} + +int test_mm_cvt_roundsh_i32(__m128h A) { + // CHECK-LABEL: test_mm_cvt_roundsh_i32 + // CHECK: @llvm.x86.avx512fp16.vcvtsh2si32 + return _mm_cvt_roundsh_i32(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +int test_mm_cvtsh_i32(__m128h A) { + // CHECK-LABEL: test_mm_cvtsh_i32 + // CHECK: @llvm.x86.avx512fp16.vcvtsh2si32 + return _mm_cvtsh_i32(A); +} + +unsigned int test_mm_cvt_roundsh_u32(__m128h A) { + // CHECK-LABEL: test_mm_cvt_roundsh_u32 + // CHECK: @llvm.x86.avx512fp16.vcvtsh2usi32 + return _mm_cvt_roundsh_u32(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +unsigned int test_mm_cvtsh_u32(__m128h A) { + // CHECK-LABEL: test_mm_cvtsh_u32 + // CHECK: @llvm.x86.avx512fp16.vcvtsh2usi32 + return _mm_cvtsh_u32(A); +} + +#ifdef __x86_64__ +long long test_mm_cvt_roundsh_i64(__m128h A) { + // CHECK-LABEL: test_mm_cvt_roundsh_i64 + // CHECK: @llvm.x86.avx512fp16.vcvtsh2si64 + return _mm_cvt_roundsh_i64(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +long long test_mm_cvtsh_i64(__m128h A) { + // CHECK-LABEL: test_mm_cvtsh_i64 + // CHECK: @llvm.x86.avx512fp16.vcvtsh2si64 + return _mm_cvtsh_i64(A); +} + +unsigned long long test_mm_cvt_roundsh_u64(__m128h A) { + // CHECK-LABEL: test_mm_cvt_roundsh_u64 + // CHECK: @llvm.x86.avx512fp16.vcvtsh2usi64 + return _mm_cvt_roundsh_u64(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +unsigned long long test_mm_cvtsh_u64(__m128h A) { + // CHECK-LABEL: test_mm_cvtsh_u64 + // CHECK: @llvm.x86.avx512fp16.vcvtsh2usi64 + return _mm_cvtsh_u64(A); +} +#endif + +__m128h test_mm_cvt_roundu32_sh(__m128h A, unsigned int B) { + // CHECK-LABEL: test_mm_cvt_roundu32_sh + // CHECK: @llvm.x86.avx512fp16.vcvtusi2sh + return _mm_cvt_roundu32_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 
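  // Illustrative aside, not part of the test: the *_round_* scalar forms take
  // an immediate that pairs a rounding direction with _MM_FROUND_NO_EXC (or
  // uses _MM_FROUND_CUR_DIRECTION alone); the SemaChecking additions above are
  // meant to reject any other value. A hedged round-trip sketch using the
  // intrinsics under test here plus _mm_setzero_ph():
  //
  //   __m128h h = _mm_cvt_roundu32_sh(_mm_setzero_ph(), 42u,
  //                                   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  //   unsigned u = _mm_cvt_roundsh_u32(h, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
  //
  // The truncating scalar variants (_mm_cvtt_roundsh_u32 and friends) take only
  // the exception-suppression operand, which is why they were added to the
  // ArgNum = 1 group without rounding control in SemaChecking.cpp.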
+} + +__m128h test_mm_cvtu32_sh(__m128h A, unsigned int B) { + // CHECK-LABEL: test_mm_cvtu32_sh + // CHECK: %{{.*}} = uitofp i32 %{{.*}} to half + return _mm_cvtu32_sh(A, B); +} + +#ifdef __x86_64__ +__m128h test_mm_cvt_roundu64_sh(__m128h A, unsigned long long B) { + // CHECK-LABEL: test_mm_cvt_roundu64_sh + // CHECK: @llvm.x86.avx512fp16.vcvtusi642sh + return _mm_cvt_roundu64_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_cvtu64_sh(__m128h A, unsigned long long B) { + // CHECK-LABEL: test_mm_cvtu64_sh + // CHECK: %{{.*}} = uitofp i64 %{{.*}} to half + return _mm_cvtu64_sh(A, B); +} +#endif + +__m128h test_mm_cvt_roundi32_sh(__m128h A, int B) { + // CHECK-LABEL: test_mm_cvt_roundi32_sh + // CHECK: @llvm.x86.avx512fp16.vcvtsi2sh + return _mm_cvt_roundi32_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_cvti32_sh(__m128h A, int B) { + // CHECK-LABEL: test_mm_cvti32_sh + // CHECK: %{{.*}} = sitofp i32 %{{.*}} to half + return _mm_cvti32_sh(A, B); +} + +#ifdef __x86_64__ +__m128h test_mm_cvt_roundi64_sh(__m128h A, long long B) { + // CHECK-LABEL: test_mm_cvt_roundi64_sh + // CHECK: @llvm.x86.avx512fp16.vcvtsi642sh + return _mm_cvt_roundi64_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_cvti64_sh(__m128h A, long long B) { + // CHECK-LABEL: test_mm_cvti64_sh + // CHECK: %{{.*}} = sitofp i64 %{{.*}} to half + return _mm_cvti64_sh(A, B); +} +#endif + +int test_mm_cvtt_roundsh_i32(__m128h A) { + // CHECK-LABEL: test_mm_cvtt_roundsh_i32 + // CHECK: @llvm.x86.avx512fp16.vcvttsh2si32 + return _mm_cvtt_roundsh_i32(A, _MM_FROUND_NO_EXC); +} + +int test_mm_cvttsh_i32(__m128h A) { + // CHECK-LABEL: test_mm_cvttsh_i32 + // CHECK: @llvm.x86.avx512fp16.vcvttsh2si32 + return _mm_cvttsh_i32(A); +} + +#ifdef __x86_64__ +long long test_mm_cvtt_roundsh_i64(__m128h A) { + // CHECK-LABEL: test_mm_cvtt_roundsh_i64 + // CHECK: @llvm.x86.avx512fp16.vcvttsh2si64 + return _mm_cvtt_roundsh_i64(A, _MM_FROUND_NO_EXC); +} + +long long test_mm_cvttsh_i64(__m128h A) { + // CHECK-LABEL: test_mm_cvttsh_i64 + // CHECK: @llvm.x86.avx512fp16.vcvttsh2si64 + return _mm_cvttsh_i64(A); +} +#endif + +unsigned int test_mm_cvtt_roundsh_u32(__m128h A) { + // CHECK-LABEL: test_mm_cvtt_roundsh_u32 + // CHECK: @llvm.x86.avx512fp16.vcvttsh2usi32 + return _mm_cvtt_roundsh_u32(A, _MM_FROUND_NO_EXC); +} + +unsigned int test_mm_cvttsh_u32(__m128h A) { + // CHECK-LABEL: test_mm_cvttsh_u32 + // CHECK: @llvm.x86.avx512fp16.vcvttsh2usi32 + return _mm_cvttsh_u32(A); +} + +#ifdef __x86_64__ +unsigned long long test_mm_cvtt_roundsh_u64(__m128h A) { + // CHECK-LABEL: test_mm_cvtt_roundsh_u64 + // CHECK: @llvm.x86.avx512fp16.vcvttsh2usi64 + return _mm_cvtt_roundsh_u64(A, _MM_FROUND_NO_EXC); +} + +unsigned long long test_mm_cvttsh_u64(__m128h A) { + // CHECK-LABEL: test_mm_cvttsh_u64 + // CHECK: @llvm.x86.avx512fp16.vcvttsh2usi64 + return _mm_cvttsh_u64(A); +} +#endif + +__m512 test_mm512_cvtx_roundph_ps(__m256h A) { + // CHECK-LABEL: test_mm512_cvtx_roundph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512 + return _mm512_cvtx_roundph_ps(A, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_mask_cvtx_roundph_ps(__m512 A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvtx_roundph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512 + return _mm512_mask_cvtx_roundph_ps(A, B, C, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_maskz_cvtx_roundph_ps(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvtx_roundph_ps + // CHECK: 
@llvm.x86.avx512fp16.mask.vcvtph2psx.512 + return _mm512_maskz_cvtx_roundph_ps(A, B, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_cvtxph_ps(__m256h A) { + // CHECK-LABEL: test_mm512_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512 + return _mm512_cvtxph_ps(A); +} + +__m512 test_mm512_mask_cvtxph_ps(__m512 A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm512_mask_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512 + return _mm512_mask_cvtxph_ps(A, B, C); +} + +__m512 test_mm512_maskz_cvtxph_ps(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm512_maskz_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512 + return _mm512_maskz_cvtxph_ps(A, B); +} + +__m256h test_mm512_cvtx_roundps_ph(__m512 A) { + // CHECK-LABEL: test_mm512_cvtx_roundps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512 + return _mm512_cvtx_roundps_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_mask_cvtx_roundps_ph(__m256h A, __mmask16 B, __m512 C) { + // CHECK-LABEL: test_mm512_mask_cvtx_roundps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512 + return _mm512_mask_cvtx_roundps_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_maskz_cvtx_roundps_ph(__mmask16 A, __m512 B) { + // CHECK-LABEL: test_mm512_maskz_cvtx_roundps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512 + return _mm512_maskz_cvtx_roundps_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm512_cvtxps_ph(__m512 A) { + // CHECK-LABEL: test_mm512_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512 + return _mm512_cvtxps_ph(A); +} + +__m256h test_mm512_mask_cvtxps_ph(__m256h A, __mmask16 B, __m512 C) { + // CHECK-LABEL: test_mm512_mask_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512 + return _mm512_mask_cvtxps_ph(A, B, C); +} + +__m256h test_mm512_maskz_cvtxps_ph(__mmask16 A, __m512 B) { + // CHECK-LABEL: test_mm512_maskz_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512 + return _mm512_maskz_cvtxps_ph(A, B); +} + _Float16 test_mm512_reduce_add_ph(__m512h __W) { // CHECK-LABEL: @test_mm512_reduce_add_ph // CHECK: call reassoc half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c --- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -1215,6 +1215,798 @@ return _mm_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US); } +__m128h test_mm_cvtpd_ph(__m128d A) { + // CHECK-LABEL: test_mm_cvtpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.128 + return _mm_cvtpd_ph(A); +} + +__m128h test_mm_mask_cvtpd_ph(__m128h A, __mmask8 B, __m128d C) { + // CHECK-LABEL: test_mm_mask_cvtpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.128 + return _mm_mask_cvtpd_ph(A, B, C); +} + +__m128h test_mm_maskz_cvtpd_ph(__mmask8 A, __m128d B) { + // CHECK-LABEL: test_mm_maskz_cvtpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.128 + return _mm_maskz_cvtpd_ph(A, B); +} + +__m128h test_mm256_cvtpd_ph(__m256d A) { + // CHECK-LABEL: test_mm256_cvtpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.256 + return _mm256_cvtpd_ph(A); +} + +__m128h test_mm256_mask_cvtpd_ph(__m128h A, __mmask8 B, __m256d C) { + // CHECK-LABEL: test_mm256_mask_cvtpd_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.256 + return _mm256_mask_cvtpd_ph(A, B, C); +} + +__m128h test_mm256_maskz_cvtpd_ph(__mmask8 A, __m256d B) { + // CHECK-LABEL: test_mm256_maskz_cvtpd_ph + // 
CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.256 + return _mm256_maskz_cvtpd_ph(A, B); +} + +__m128d test_mm_cvtph_pd(__m128h A) { + // CHECK-LABEL: test_mm_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.128 + return _mm_cvtph_pd(A); +} + +__m128d test_mm_mask_cvtph_pd(__m128d A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.128 + return _mm_mask_cvtph_pd(A, B, C); +} + +__m128d test_mm_maskz_cvtph_pd(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.128 + return _mm_maskz_cvtph_pd(A, B); +} + +__m256d test_mm256_cvtph_pd(__m128h A) { + // CHECK-LABEL: test_mm256_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.256 + return _mm256_cvtph_pd(A); +} + +__m256d test_mm256_mask_cvtph_pd(__m256d A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.256 + return _mm256_mask_cvtph_pd(A, B, C); +} + +__m256d test_mm256_maskz_cvtph_pd(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvtph_pd + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.256 + return _mm256_maskz_cvtph_pd(A, B); +} + +__m128i test_mm_cvtph_epi16(__m128h A) { + // CHECK-LABEL: test_mm_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.128 + return _mm_cvtph_epi16(A); +} + +__m128i test_mm_mask_cvtph_epi16(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.128 + return _mm_mask_cvtph_epi16(A, B, C); +} + +__m128i test_mm_maskz_cvtph_epi16(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.128 + return _mm_maskz_cvtph_epi16(A, B); +} + +__m256i test_mm256_cvtph_epi16(__m256h A) { + // CHECK-LABEL: test_mm256_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.256 + return _mm256_cvtph_epi16(A); +} + +__m256i test_mm256_mask_cvtph_epi16(__m256i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm256_mask_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.256 + return _mm256_mask_cvtph_epi16(A, B, C); +} + +__m256i test_mm256_maskz_cvtph_epi16(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm256_maskz_cvtph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.256 + return _mm256_maskz_cvtph_epi16(A, B); +} + +__m128i test_mm_cvttph_epi16(__m128h A) { + // CHECK-LABEL: test_mm_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.128 + return _mm_cvttph_epi16(A); +} + +__m128i test_mm_mask_cvttph_epi16(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.128 + return _mm_mask_cvttph_epi16(A, B, C); +} + +__m128i test_mm_maskz_cvttph_epi16(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.128 + return _mm_maskz_cvttph_epi16(A, B); +} + +__m256i test_mm256_cvttph_epi16(__m256h A) { + // CHECK-LABEL: test_mm256_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.256 + return _mm256_cvttph_epi16(A); +} + +__m256i test_mm256_mask_cvttph_epi16(__m256i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm256_mask_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.256 + return _mm256_mask_cvttph_epi16(A, B, C); +} + +__m256i test_mm256_maskz_cvttph_epi16(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm256_maskz_cvttph_epi16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.256 + return 
_mm256_maskz_cvttph_epi16(A, B); +} + +__m128h test_mm_cvtepi16_ph(__m128i A) { + // CHECK-LABEL: test_mm_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <8 x i16> %{{.*}} to <8 x half> + return _mm_cvtepi16_ph(A); +} + +__m128h test_mm_mask_cvtepi16_ph(__m128h A, __mmask8 B, __m128i C) { + // CHECK-LABEL: test_mm_mask_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <8 x i16> %{{.*}} to <8 x half> + return _mm_mask_cvtepi16_ph(A, B, C); +} + +__m128h test_mm_maskz_cvtepi16_ph(__mmask8 A, __m128i B) { + // CHECK-LABEL: test_mm_maskz_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <8 x i16> %{{.*}} to <8 x half> + return _mm_maskz_cvtepi16_ph(A, B); +} + +__m256h test_mm256_cvtepi16_ph(__m256i A) { + // CHECK-LABEL: test_mm256_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <16 x i16> %{{.*}} to <16 x half> + return _mm256_cvtepi16_ph(A); +} + +__m256h test_mm256_mask_cvtepi16_ph(__m256h A, __mmask16 B, __m256i C) { + // CHECK-LABEL: test_mm256_mask_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <16 x i16> %{{.*}} to <16 x half> + return _mm256_mask_cvtepi16_ph(A, B, C); +} + +__m256h test_mm256_maskz_cvtepi16_ph(__mmask16 A, __m256i B) { + // CHECK-LABEL: test_mm256_maskz_cvtepi16_ph + // CHECK: %{{.*}} = sitofp <16 x i16> %{{.*}} to <16 x half> + return _mm256_maskz_cvtepi16_ph(A, B); +} + +__m128i test_mm_cvtph_epu16(__m128h A) { + // CHECK-LABEL: test_mm_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.128 + return _mm_cvtph_epu16(A); +} + +__m128i test_mm_mask_cvtph_epu16(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.128 + return _mm_mask_cvtph_epu16(A, B, C); +} + +__m128i test_mm_maskz_cvtph_epu16(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.128 + return _mm_maskz_cvtph_epu16(A, B); +} + +__m256i test_mm256_cvtph_epu16(__m256h A) { + // CHECK-LABEL: test_mm256_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.256 + return _mm256_cvtph_epu16(A); +} + +__m256i test_mm256_mask_cvtph_epu16(__m256i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm256_mask_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.256 + return _mm256_mask_cvtph_epu16(A, B, C); +} + +__m256i test_mm256_maskz_cvtph_epu16(__mmask16 A, __m256h B) { + // CHECK-LABEL: test_mm256_maskz_cvtph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.256 + return _mm256_maskz_cvtph_epu16(A, B); +} + +__m128i test_mm_cvttph_epu16(__m128h A) { + // CHECK-LABEL: test_mm_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.128 + return _mm_cvttph_epu16(A); +} + +__m128i test_mm_mask_cvttph_epu16(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.128 + return _mm_mask_cvttph_epu16(A, B, C); +} + +__m128i test_mm_maskz_cvttph_epu16(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.128 + return _mm_maskz_cvttph_epu16(A, B); +} + +__m256i test_mm256_cvttph_epu16(__m256h A) { + // CHECK-LABEL: test_mm256_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.256 + return _mm256_cvttph_epu16(A); +} + +__m256i test_mm256_mask_cvttph_epu16(__m256i A, __mmask16 B, __m256h C) { + // CHECK-LABEL: test_mm256_mask_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.256 + return _mm256_mask_cvttph_epu16(A, B, C); +} + +__m256i test_mm256_maskz_cvttph_epu16(__mmask16 A, __m256h B) { + // CHECK-LABEL: 
test_mm256_maskz_cvttph_epu16 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.256 + return _mm256_maskz_cvttph_epu16(A, B); +} + +__m128h test_mm_cvtepu16_ph(__m128i A) { + // CHECK-LABEL: test_mm_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <8 x i16> %{{.*}} to <8 x half> + return _mm_cvtepu16_ph(A); +} + +__m128h test_mm_mask_cvtepu16_ph(__m128h A, __mmask8 B, __m128i C) { + // CHECK-LABEL: test_mm_mask_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <8 x i16> %{{.*}} to <8 x half> + return _mm_mask_cvtepu16_ph(A, B, C); +} + +__m128h test_mm_maskz_cvtepu16_ph(__mmask8 A, __m128i B) { + // CHECK-LABEL: test_mm_maskz_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <8 x i16> %{{.*}} to <8 x half> + return _mm_maskz_cvtepu16_ph(A, B); +} + +__m256h test_mm256_cvtepu16_ph(__m256i A) { + // CHECK-LABEL: test_mm256_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <16 x i16> %{{.*}} to <16 x half> + return _mm256_cvtepu16_ph(A); +} + +__m256h test_mm256_mask_cvtepu16_ph(__m256h A, __mmask16 B, __m256i C) { + // CHECK-LABEL: test_mm256_mask_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <16 x i16> %{{.*}} to <16 x half> + return _mm256_mask_cvtepu16_ph(A, B, C); +} + +__m256h test_mm256_maskz_cvtepu16_ph(__mmask16 A, __m256i B) { + // CHECK-LABEL: test_mm256_maskz_cvtepu16_ph + // CHECK: %{{.*}} = uitofp <16 x i16> %{{.*}} to <16 x half> + return _mm256_maskz_cvtepu16_ph(A, B); +} + +__m128i test_mm_cvtph_epi32(__m128h A) { + // CHECK-LABEL: test_mm_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.128 + return _mm_cvtph_epi32(A); +} + +__m128i test_mm_mask_cvtph_epi32(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.128 + return _mm_mask_cvtph_epi32(A, B, C); +} + +__m128i test_mm_maskz_cvtph_epi32(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.128 + return _mm_maskz_cvtph_epi32(A, B); +} + +__m256i test_mm256_cvtph_epi32(__m128h A) { + // CHECK-LABEL: test_mm256_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.256 + return _mm256_cvtph_epi32(A); +} + +__m256i test_mm256_mask_cvtph_epi32(__m256i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.256 + return _mm256_mask_cvtph_epi32(A, B, C); +} + +__m256i test_mm256_maskz_cvtph_epi32(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvtph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.256 + return _mm256_maskz_cvtph_epi32(A, B); +} + +__m128i test_mm_cvtph_epu32(__m128h A) { + // CHECK-LABEL: test_mm_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.128 + return _mm_cvtph_epu32(A); +} + +__m128i test_mm_mask_cvtph_epu32(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.128 + return _mm_mask_cvtph_epu32(A, B, C); +} + +__m128i test_mm_maskz_cvtph_epu32(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.128 + return _mm_maskz_cvtph_epu32(A, B); +} + +__m256i test_mm256_cvtph_epu32(__m128h A) { + // CHECK-LABEL: test_mm256_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.256 + return _mm256_cvtph_epu32(A); +} + +__m256i test_mm256_mask_cvtph_epu32(__m256i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.256 + return _mm256_mask_cvtph_epu32(A, B, C); +} + +__m256i 
test_mm256_maskz_cvtph_epu32(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvtph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.256 + return _mm256_maskz_cvtph_epu32(A, B); +} + +__m128h test_mm_cvtepi32_ph(__m128i A) { + // CHECK-LABEL: test_mm_cvtepi32_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtdq2ph.128 + return _mm_cvtepi32_ph(A); +} + +__m128h test_mm_mask_cvtepi32_ph(__m128h A, __mmask8 B, __m128i C) { + // CHECK-LABEL: test_mm_mask_cvtepi32_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtdq2ph.128 + return _mm_mask_cvtepi32_ph(A, B, C); +} + +__m128h test_mm_maskz_cvtepi32_ph(__mmask8 A, __m128i B) { + // CHECK-LABEL: test_mm_maskz_cvtepi32_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtdq2ph.128 + return _mm_maskz_cvtepi32_ph(A, B); +} + +__m128h test_mm256_cvtepi32_ph(__m256i A) { + // CHECK-LABEL: test_mm256_cvtepi32_ph + // CHECK: %{{.*}} = sitofp <8 x i32> %{{.*}} to <8 x half> + return _mm256_cvtepi32_ph(A); +} + +__m128h test_mm256_mask_cvtepi32_ph(__m128h A, __mmask8 B, __m256i C) { + // CHECK-LABEL: test_mm256_mask_cvtepi32_ph + // CHECK: %{{.*}} = sitofp <8 x i32> %{{.*}} to <8 x half> + return _mm256_mask_cvtepi32_ph(A, B, C); +} + +__m128h test_mm256_maskz_cvtepi32_ph(__mmask8 A, __m256i B) { + // CHECK-LABEL: test_mm256_maskz_cvtepi32_ph + // CHECK: %{{.*}} = sitofp <8 x i32> %{{.*}} to <8 x half> + return _mm256_maskz_cvtepi32_ph(A, B); +} + +__m128h test_mm_cvtepu32_ph(__m128i A) { + // CHECK-LABEL: test_mm_cvtepu32_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtudq2ph.128 + return _mm_cvtepu32_ph(A); +} + +__m128h test_mm_mask_cvtepu32_ph(__m128h A, __mmask8 B, __m128i C) { + // CHECK-LABEL: test_mm_mask_cvtepu32_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtudq2ph.128 + return _mm_mask_cvtepu32_ph(A, B, C); +} + +__m128h test_mm_maskz_cvtepu32_ph(__mmask8 A, __m128i B) { + // CHECK-LABEL: test_mm_maskz_cvtepu32_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtudq2ph.128 + return _mm_maskz_cvtepu32_ph(A, B); +} + +__m128h test_mm256_cvtepu32_ph(__m256i A) { + // CHECK-LABEL: test_mm256_cvtepu32_ph + // CHECK: %{{.*}} = uitofp <8 x i32> %{{.*}} to <8 x half> + return _mm256_cvtepu32_ph(A); +} + +__m128h test_mm256_mask_cvtepu32_ph(__m128h A, __mmask8 B, __m256i C) { + // CHECK-LABEL: test_mm256_mask_cvtepu32_ph + // CHECK: %{{.*}} = uitofp <8 x i32> %{{.*}} to <8 x half> + return _mm256_mask_cvtepu32_ph(A, B, C); +} + +__m128h test_mm256_maskz_cvtepu32_ph(__mmask8 A, __m256i B) { + // CHECK-LABEL: test_mm256_maskz_cvtepu32_ph + // CHECK: %{{.*}} = uitofp <8 x i32> %{{.*}} to <8 x half> + return _mm256_maskz_cvtepu32_ph(A, B); +} + +__m128i test_mm_cvttph_epi32(__m128h A) { + // CHECK-LABEL: test_mm_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.128 + return _mm_cvttph_epi32(A); +} + +__m128i test_mm_mask_cvttph_epi32(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.128 + return _mm_mask_cvttph_epi32(A, B, C); +} + +__m128i test_mm_maskz_cvttph_epi32(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.128 + return _mm_maskz_cvttph_epi32(A, B); +} + +__m256i test_mm256_cvttph_epi32(__m128h A) { + // CHECK-LABEL: test_mm256_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.256 + return _mm256_cvttph_epi32(A); +} + +__m256i test_mm256_mask_cvttph_epi32(__m256i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.256 + 
return _mm256_mask_cvttph_epi32(A, B, C); +} + +__m256i test_mm256_maskz_cvttph_epi32(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvttph_epi32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.256 + return _mm256_maskz_cvttph_epi32(A, B); +} + +__m128i test_mm_cvttph_epu32(__m128h A) { + // CHECK-LABEL: test_mm_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.128 + return _mm_cvttph_epu32(A); +} + +__m128i test_mm_mask_cvttph_epu32(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.128 + return _mm_mask_cvttph_epu32(A, B, C); +} + +__m128i test_mm_maskz_cvttph_epu32(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.128 + return _mm_maskz_cvttph_epu32(A, B); +} + +__m256i test_mm256_cvttph_epu32(__m128h A) { + // CHECK-LABEL: test_mm256_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.256 + return _mm256_cvttph_epu32(A); +} + +__m256i test_mm256_mask_cvttph_epu32(__m256i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.256 + return _mm256_mask_cvttph_epu32(A, B, C); +} + +__m256i test_mm256_maskz_cvttph_epu32(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvttph_epu32 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.256 + return _mm256_maskz_cvttph_epu32(A, B); +} + +__m128h test_mm_cvtepi64_ph(__m128i A) { + // CHECK-LABEL: test_mm_cvtepi64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtqq2ph.128 + return _mm_cvtepi64_ph(A); +} + +__m128h test_mm_mask_cvtepi64_ph(__m128h A, __mmask8 B, __m128i C) { + // CHECK-LABEL: test_mm_mask_cvtepi64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtqq2ph.128 + return _mm_mask_cvtepi64_ph(A, B, C); +} + +__m128h test_mm_maskz_cvtepi64_ph(__mmask8 A, __m128i B) { + // CHECK-LABEL: test_mm_maskz_cvtepi64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtqq2ph.128 + return _mm_maskz_cvtepi64_ph(A, B); +} + +__m128h test_mm256_cvtepi64_ph(__m256i A) { + // CHECK-LABEL: test_mm256_cvtepi64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtqq2ph.256 + return _mm256_cvtepi64_ph(A); +} + +__m128h test_mm256_mask_cvtepi64_ph(__m128h A, __mmask8 B, __m256i C) { + // CHECK-LABEL: test_mm256_mask_cvtepi64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtqq2ph.256 + return _mm256_mask_cvtepi64_ph(A, B, C); +} + +__m128h test_mm256_maskz_cvtepi64_ph(__mmask8 A, __m256i B) { + // CHECK-LABEL: test_mm256_maskz_cvtepi64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtqq2ph.256 + return _mm256_maskz_cvtepi64_ph(A, B); +} + +__m128i test_mm_cvtph_epi64(__m128h A) { + // CHECK-LABEL: test_mm_cvtph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.128 + return _mm_cvtph_epi64(A); +} + +__m128i test_mm_mask_cvtph_epi64(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvtph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.128 + return _mm_mask_cvtph_epi64(A, B, C); +} + +__m128i test_mm_maskz_cvtph_epi64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvtph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.128 + return _mm_maskz_cvtph_epi64(A, B); +} + +__m256i test_mm256_cvtph_epi64(__m128h A) { + // CHECK-LABEL: test_mm256_cvtph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.256 + return _mm256_cvtph_epi64(A); +} + +__m256i test_mm256_mask_cvtph_epi64(__m256i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvtph_epi64 + // CHECK: 
@llvm.x86.avx512fp16.mask.vcvtph2qq.256 + return _mm256_mask_cvtph_epi64(A, B, C); +} + +__m256i test_mm256_maskz_cvtph_epi64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvtph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.256 + return _mm256_maskz_cvtph_epi64(A, B); +} + +__m128h test_mm_cvtepu64_ph(__m128i A) { + // CHECK-LABEL: test_mm_cvtepu64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtuqq2ph.128 + return _mm_cvtepu64_ph(A); +} + +__m128h test_mm_mask_cvtepu64_ph(__m128h A, __mmask8 B, __m128i C) { + // CHECK-LABEL: test_mm_mask_cvtepu64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtuqq2ph.128 + return _mm_mask_cvtepu64_ph(A, B, C); +} + +__m128h test_mm_maskz_cvtepu64_ph(__mmask8 A, __m128i B) { + // CHECK-LABEL: test_mm_maskz_cvtepu64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtuqq2ph.128 + return _mm_maskz_cvtepu64_ph(A, B); +} + +__m128h test_mm256_cvtepu64_ph(__m256i A) { + // CHECK-LABEL: test_mm256_cvtepu64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtuqq2ph.256 + return _mm256_cvtepu64_ph(A); +} + +__m128h test_mm256_mask_cvtepu64_ph(__m128h A, __mmask8 B, __m256i C) { + // CHECK-LABEL: test_mm256_mask_cvtepu64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtuqq2ph.256 + return _mm256_mask_cvtepu64_ph(A, B, C); +} + +__m128h test_mm256_maskz_cvtepu64_ph(__mmask8 A, __m256i B) { + // CHECK-LABEL: test_mm256_maskz_cvtepu64_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtuqq2ph.256 + return _mm256_maskz_cvtepu64_ph(A, B); +} + +__m128i test_mm_cvtph_epu64(__m128h A) { + // CHECK-LABEL: test_mm_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.128 + return _mm_cvtph_epu64(A); +} + +__m128i test_mm_mask_cvtph_epu64(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.128 + return _mm_mask_cvtph_epu64(A, B, C); +} + +__m128i test_mm_maskz_cvtph_epu64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.128 + return _mm_maskz_cvtph_epu64(A, B); +} + +__m256i test_mm256_cvtph_epu64(__m128h A) { + // CHECK-LABEL: test_mm256_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.256 + return _mm256_cvtph_epu64(A); +} + +__m256i test_mm256_mask_cvtph_epu64(__m256i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.256 + return _mm256_mask_cvtph_epu64(A, B, C); +} + +__m256i test_mm256_maskz_cvtph_epu64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvtph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.256 + return _mm256_maskz_cvtph_epu64(A, B); +} + +__m128i test_mm_cvttph_epi64(__m128h A) { + // CHECK-LABEL: test_mm_cvttph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.128 + return _mm_cvttph_epi64(A); +} + +__m128i test_mm_mask_cvttph_epi64(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvttph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.128 + return _mm_mask_cvttph_epi64(A, B, C); +} + +__m128i test_mm_maskz_cvttph_epi64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvttph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.128 + return _mm_maskz_cvttph_epi64(A, B); +} + +__m256i test_mm256_cvttph_epi64(__m128h A) { + // CHECK-LABEL: test_mm256_cvttph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.256 + return _mm256_cvttph_epi64(A); +} + +__m256i test_mm256_mask_cvttph_epi64(__m256i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvttph_epi64 + // 
CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.256 + return _mm256_mask_cvttph_epi64(A, B, C); +} + +__m256i test_mm256_maskz_cvttph_epi64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvttph_epi64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.256 + return _mm256_maskz_cvttph_epi64(A, B); +} + +__m128i test_mm_cvttph_epu64(__m128h A) { + // CHECK-LABEL: test_mm_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.128 + return _mm_cvttph_epu64(A); +} + +__m128i test_mm_mask_cvttph_epu64(__m128i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.128 + return _mm_mask_cvttph_epu64(A, B, C); +} + +__m128i test_mm_maskz_cvttph_epu64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.128 + return _mm_maskz_cvttph_epu64(A, B); +} + +__m256i test_mm256_cvttph_epu64(__m128h A) { + // CHECK-LABEL: test_mm256_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.256 + return _mm256_cvttph_epu64(A); +} + +__m256i test_mm256_mask_cvttph_epu64(__m256i A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.256 + return _mm256_mask_cvttph_epu64(A, B, C); +} + +__m256i test_mm256_maskz_cvttph_epu64(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvttph_epu64 + // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.256 + return _mm256_maskz_cvttph_epu64(A, B); +} + +__m128 test_mm_cvtxph_ps(__m128h A) { + // CHECK-LABEL: test_mm_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.128 + return _mm_cvtxph_ps(A); +} + +__m128 test_mm_mask_cvtxph_ps(__m128 A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm_mask_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.128 + return _mm_mask_cvtxph_ps(A, B, C); +} + +__m128 test_mm_maskz_cvtxph_ps(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm_maskz_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.128 + return _mm_maskz_cvtxph_ps(A, B); +} + +__m256 test_mm256_cvtxph_ps(__m128h A) { + // CHECK-LABEL: test_mm256_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.256 + return _mm256_cvtxph_ps(A); +} + +__m256 test_mm256_mask_cvtxph_ps(__m256 A, __mmask8 B, __m128h C) { + // CHECK-LABEL: test_mm256_mask_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.256 + return _mm256_mask_cvtxph_ps(A, B, C); +} + +__m256 test_mm256_maskz_cvtxph_ps(__mmask8 A, __m128h B) { + // CHECK-LABEL: test_mm256_maskz_cvtxph_ps + // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.256 + return _mm256_maskz_cvtxph_ps(A, B); +} + +__m128h test_mm_cvtxps_ph(__m128 A) { + // CHECK-LABEL: test_mm_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.128 + return _mm_cvtxps_ph(A); +} + +__m128h test_mm_mask_cvtxps_ph(__m128h A, __mmask8 B, __m128 C) { + // CHECK-LABEL: test_mm_mask_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.128 + return _mm_mask_cvtxps_ph(A, B, C); +} + +__m128h test_mm_maskz_cvtxps_ph(__mmask8 A, __m128 B) { + // CHECK-LABEL: test_mm_maskz_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.128 + return _mm_maskz_cvtxps_ph(A, B); +} + +__m128h test_mm256_cvtxps_ph(__m256 A) { + // CHECK-LABEL: test_mm256_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.256 + return _mm256_cvtxps_ph(A); +} + +__m128h test_mm256_mask_cvtxps_ph(__m128h A, __mmask8 B, __m256 C) { + // CHECK-LABEL: test_mm256_mask_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.256 + 
return _mm256_mask_cvtxps_ph(A, B, C); +} + +__m128h test_mm256_maskz_cvtxps_ph(__mmask8 A, __m256 B) { + // CHECK-LABEL: test_mm256_maskz_cvtxps_ph + // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.256 + return _mm256_maskz_cvtxps_ph(A, B); +} + __m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { // CHECK-LABEL: @test_mm_mask_blend_ph // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5224,4 +5224,321 @@ Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg>, ImmArg> ]>; + + def int_x86_avx512fp16_mask_vcvtph2psx_128 + : GCCBuiltin<"__builtin_ia32_vcvtph2psx128_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v8f16_ty, llvm_v4f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vcvtph2psx_256 + : GCCBuiltin<"__builtin_ia32_vcvtph2psx256_mask">, + Intrinsic<[ llvm_v8f32_ty ], + [ llvm_v8f16_ty, llvm_v8f32_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vcvtph2psx_512 + : GCCBuiltin<"__builtin_ia32_vcvtph2psx512_mask">, + Intrinsic<[ llvm_v16f32_ty ], + [ llvm_v16f16_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_vcvtps2phx_128 + : GCCBuiltin<"__builtin_ia32_vcvtps2phx128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v4f32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vcvtps2phx_256 + : GCCBuiltin<"__builtin_ia32_vcvtps2phx256_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vcvtps2phx_512 + : GCCBuiltin<"__builtin_ia32_vcvtps2phx512_mask">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f32_ty, llvm_v16f16_ty, llvm_i16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_vcvtpd2ph_128 + : GCCBuiltin<"__builtin_ia32_vcvtpd2ph128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v2f64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vcvtpd2ph_256 + : GCCBuiltin<"__builtin_ia32_vcvtpd2ph256_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v4f64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vcvtpd2ph_512 + : GCCBuiltin<"__builtin_ia32_vcvtpd2ph512_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f64_ty, llvm_v8f16_ty, llvm_i8_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_vcvtph2pd_128 + : GCCBuiltin<"__builtin_ia32_vcvtph2pd128_mask">, + Intrinsic<[ llvm_v2f64_ty ], + [ llvm_v8f16_ty, llvm_v2f64_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vcvtph2pd_256 + : GCCBuiltin<"__builtin_ia32_vcvtph2pd256_mask">, + Intrinsic<[ llvm_v4f64_ty ], + [ llvm_v8f16_ty, llvm_v4f64_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_vcvtph2pd_512 + : GCCBuiltin<"__builtin_ia32_vcvtph2pd512_mask">, + Intrinsic<[ llvm_v8f64_ty ], + [ llvm_v8f16_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_vcvtsh2ss_round + : GCCBuiltin<"__builtin_ia32_vcvtsh2ss_round_mask">, + Intrinsic<[ llvm_v4f32_ty ], + [ llvm_v4f32_ty, llvm_v8f16_ty, llvm_v4f32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_vcvtss2sh_round + : GCCBuiltin<"__builtin_ia32_vcvtss2sh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v4f32_ty, 
llvm_v8f16_ty, llvm_i8_ty,
+                    llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+  def int_x86_avx512fp16_mask_vcvtsd2sh_round
+      : GCCBuiltin<"__builtin_ia32_vcvtsd2sh_round_mask">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v8f16_ty, llvm_v2f64_ty, llvm_v8f16_ty, llvm_i8_ty,
+                    llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+  def int_x86_avx512fp16_mask_vcvtsh2sd_round
+      : GCCBuiltin<"__builtin_ia32_vcvtsh2sd_round_mask">,
+        Intrinsic<[ llvm_v2f64_ty ],
+                  [ llvm_v2f64_ty, llvm_v8f16_ty, llvm_v2f64_ty, llvm_i8_ty,
+                    llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<4>> ]>;
+
+  def int_x86_avx512fp16_mask_vcvtph2w_128
+      : GCCBuiltin<"__builtin_ia32_vcvtph2w128_mask">,
+        Intrinsic<[ llvm_v8i16_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2w_256
+      : GCCBuiltin<"__builtin_ia32_vcvtph2w256_mask">,
+        Intrinsic<[ llvm_v16i16_ty ],
+                  [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ],
+                  [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2w_512
+      : GCCBuiltin<"__builtin_ia32_vcvtph2w512_mask">,
+        Intrinsic<[ llvm_v32i16_ty ],
+                  [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvttph2w_128
+      : GCCBuiltin<"__builtin_ia32_vcvttph2w128_mask">,
+        Intrinsic<[ llvm_v8i16_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2w_256
+      : GCCBuiltin<"__builtin_ia32_vcvttph2w256_mask">,
+        Intrinsic<[ llvm_v16i16_ty ],
+                  [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ],
+                  [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2w_512
+      : GCCBuiltin<"__builtin_ia32_vcvttph2w512_mask">,
+        Intrinsic<[ llvm_v32i16_ty ],
+                  [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvtph2uw_128
+      : GCCBuiltin<"__builtin_ia32_vcvtph2uw128_mask">,
+        Intrinsic<[ llvm_v8i16_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2uw_256
+      : GCCBuiltin<"__builtin_ia32_vcvtph2uw256_mask">,
+        Intrinsic<[ llvm_v16i16_ty ],
+                  [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ],
+                  [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2uw_512
+      : GCCBuiltin<"__builtin_ia32_vcvtph2uw512_mask">,
+        Intrinsic<[ llvm_v32i16_ty ],
+                  [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvttph2uw_128
+      : GCCBuiltin<"__builtin_ia32_vcvttph2uw128_mask">,
+        Intrinsic<[ llvm_v8i16_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2uw_256
+      : GCCBuiltin<"__builtin_ia32_vcvttph2uw256_mask">,
+        Intrinsic<[ llvm_v16i16_ty ],
+                  [ llvm_v16f16_ty, llvm_v16i16_ty, llvm_i16_ty ],
+                  [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2uw_512
+      : GCCBuiltin<"__builtin_ia32_vcvttph2uw512_mask">,
+        Intrinsic<[ llvm_v32i16_ty ],
+                  [ llvm_v32f16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+
+  def int_x86_avx512fp16_mask_vcvtph2dq_128
+      : GCCBuiltin<"__builtin_ia32_vcvtph2dq128_mask">,
+        Intrinsic<[ llvm_v4i32_ty ],
+                  [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2dq_256
+      : GCCBuiltin<"__builtin_ia32_vcvtph2dq256_mask">,
+        Intrinsic<[ llvm_v8i32_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2dq_512
+      : GCCBuiltin<"__builtin_ia32_vcvtph2dq512_mask">,
+        Intrinsic<[ llvm_v16i32_ty ],
+                  [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvtph2udq_128
+      : GCCBuiltin<"__builtin_ia32_vcvtph2udq128_mask">,
+        Intrinsic<[ llvm_v4i32_ty ],
+                  [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2udq_256
+      : GCCBuiltin<"__builtin_ia32_vcvtph2udq256_mask">,
+        Intrinsic<[ llvm_v8i32_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2udq_512
+      : GCCBuiltin<"__builtin_ia32_vcvtph2udq512_mask">,
+        Intrinsic<[ llvm_v16i32_ty ],
+                  [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvtdq2ph_128
+      : GCCBuiltin<"__builtin_ia32_vcvtdq2ph128_mask">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v4i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtudq2ph_128
+      : GCCBuiltin<"__builtin_ia32_vcvtudq2ph128_mask">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v4i32_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2dq_128
+      : GCCBuiltin<"__builtin_ia32_vcvttph2dq128_mask">,
+        Intrinsic<[ llvm_v4i32_ty ],
+                  [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2dq_256
+      : GCCBuiltin<"__builtin_ia32_vcvttph2dq256_mask">,
+        Intrinsic<[ llvm_v8i32_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2dq_512
+      : GCCBuiltin<"__builtin_ia32_vcvttph2dq512_mask">,
+        Intrinsic<[ llvm_v16i32_ty ],
+                  [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvttph2udq_128
+      : GCCBuiltin<"__builtin_ia32_vcvttph2udq128_mask">,
+        Intrinsic<[ llvm_v4i32_ty ],
+                  [ llvm_v8f16_ty, llvm_v4i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2udq_256
+      : GCCBuiltin<"__builtin_ia32_vcvttph2udq256_mask">,
+        Intrinsic<[ llvm_v8i32_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i32_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2udq_512
+      : GCCBuiltin<"__builtin_ia32_vcvttph2udq512_mask">,
+        Intrinsic<[ llvm_v16i32_ty ],
+                  [ llvm_v16f16_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+
+  def int_x86_avx512fp16_mask_vcvtqq2ph_128
+      : GCCBuiltin<"__builtin_ia32_vcvtqq2ph128_mask">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v2i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtqq2ph_256
+      : GCCBuiltin<"__builtin_ia32_vcvtqq2ph256_mask">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v4i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2qq_128
+      : GCCBuiltin<"__builtin_ia32_vcvtph2qq128_mask">,
+        Intrinsic<[ llvm_v2i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2qq_256
+      : GCCBuiltin<"__builtin_ia32_vcvtph2qq256_mask">,
+        Intrinsic<[ llvm_v4i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2qq_512
+      : GCCBuiltin<"__builtin_ia32_vcvtph2qq512_mask">,
+        Intrinsic<[ llvm_v8i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvtuqq2ph_128
+      : GCCBuiltin<"__builtin_ia32_vcvtuqq2ph128_mask">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v2i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtuqq2ph_256
+      : GCCBuiltin<"__builtin_ia32_vcvtuqq2ph256_mask">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v4i64_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2uqq_128
+      : GCCBuiltin<"__builtin_ia32_vcvtph2uqq128_mask">,
+        Intrinsic<[ llvm_v2i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2uqq_256
+      : GCCBuiltin<"__builtin_ia32_vcvtph2uqq256_mask">,
+        Intrinsic<[ llvm_v4i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvtph2uqq_512
+      : GCCBuiltin<"__builtin_ia32_vcvtph2uqq512_mask">,
+        Intrinsic<[ llvm_v8i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvttph2qq_128
+      : GCCBuiltin<"__builtin_ia32_vcvttph2qq128_mask">,
+        Intrinsic<[ llvm_v2i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2qq_256
+      : GCCBuiltin<"__builtin_ia32_vcvttph2qq256_mask">,
+        Intrinsic<[ llvm_v4i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2qq_512
+      : GCCBuiltin<"__builtin_ia32_vcvttph2qq512_mask">,
+        Intrinsic<[ llvm_v8i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+  def int_x86_avx512fp16_mask_vcvttph2uqq_128
+      : GCCBuiltin<"__builtin_ia32_vcvttph2uqq128_mask">,
+        Intrinsic<[ llvm_v2i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v2i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2uqq_256
+      : GCCBuiltin<"__builtin_ia32_vcvttph2uqq256_mask">,
+        Intrinsic<[ llvm_v4i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v4i64_ty, llvm_i8_ty ], [ IntrNoMem ]>;
+  def int_x86_avx512fp16_mask_vcvttph2uqq_512
+      : GCCBuiltin<"__builtin_ia32_vcvttph2uqq512_mask">,
+        Intrinsic<[ llvm_v8i64_ty ],
+                  [ llvm_v8f16_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<3>> ]>;
+
+  def int_x86_avx512fp16_vcvtsh2si32
+      : GCCBuiltin<"__builtin_ia32_vcvtsh2si32">,
+        Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+  def int_x86_avx512fp16_vcvtsh2usi32
+      : GCCBuiltin<"__builtin_ia32_vcvtsh2usi32">,
+        Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+  def int_x86_avx512fp16_vcvtsh2si64
+      : GCCBuiltin<"__builtin_ia32_vcvtsh2si64">,
+        Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+  def int_x86_avx512fp16_vcvtsh2usi64
+      : GCCBuiltin<"__builtin_ia32_vcvtsh2usi64">,
+        Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+  def int_x86_avx512fp16_vcvtusi2sh
+      : GCCBuiltin<"__builtin_ia32_vcvtusi2sh">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+  def int_x86_avx512fp16_vcvtusi642sh
+      : GCCBuiltin<"__builtin_ia32_vcvtusi642sh">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v8f16_ty, llvm_i64_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+  def int_x86_avx512fp16_vcvtsi2sh
+      : GCCBuiltin<"__builtin_ia32_vcvtsi2sh">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+  def int_x86_avx512fp16_vcvtsi642sh
+      : GCCBuiltin<"__builtin_ia32_vcvtsi642sh">,
+        Intrinsic<[ llvm_v8f16_ty ],
+                  [ llvm_v8f16_ty, llvm_i64_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<2>> ]>;
+  def int_x86_avx512fp16_vcvttsh2si32
+      : GCCBuiltin<"__builtin_ia32_vcvttsh2si32">,
+        Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ],
+                  [ IntrNoMem, ImmArg<ArgIndex<1>> ]>;
+  def int_x86_avx512fp16_vcvttsh2si64
+      :
GCCBuiltin<"__builtin_ia32_vcvttsh2si64">, + Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_vcvttsh2usi32 + : GCCBuiltin<"__builtin_ia32_vcvttsh2usi32">, + Intrinsic<[ llvm_i32_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_vcvttsh2usi64 + : GCCBuiltin<"__builtin_ia32_vcvttsh2usi64">, + Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -287,6 +287,7 @@ HANDLE_LIBCALL(FPEXT_F64_F128, "__extenddftf2") HANDLE_LIBCALL(FPEXT_F32_F128, "__extendsftf2") HANDLE_LIBCALL(FPEXT_F16_F128, "__extendhftf2") +HANDLE_LIBCALL(FPEXT_F16_F80, "__extendhfxf2") HANDLE_LIBCALL(FPEXT_F32_F64, "__extendsfdf2") HANDLE_LIBCALL(FPEXT_F16_F64, "__extendhfdf2") HANDLE_LIBCALL(FPEXT_F16_F32, "__gnu_h2f_ieee") diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -237,6 +237,8 @@ return FPEXT_F16_F32; if (RetVT == MVT::f64) return FPEXT_F16_F64; + if (RetVT == MVT::f80) + return FPEXT_F16_F80; if (RetVT == MVT::f128) return FPEXT_F16_F128; } else if (OpVT == MVT::f32) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1931,6 +1931,13 @@ setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); + if (isTypeLegal(MVT::f80)) { + setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); + } setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); @@ -1939,8 +1946,31 @@ setGroup(MVT::v32f16); addRegisterClass(MVT::v32f16, &X86::VR512RegClass); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8, + MVT::v32i16); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8, + MVT::v32i16); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1, + 
MVT::v32i16); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1, + MVT::v32i16); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom); @@ -1960,6 +1990,21 @@ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal); + + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); @@ -2001,6 +2046,37 @@ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } + if (Subtarget.hasFP16()) { + // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64 + setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom); + // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16 + setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom); + // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16 + setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom); + // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32 + setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom); + } + setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); @@ -19993,6 +20069,43 @@ DAG.getIntPtrConstant(0, dl)); } +// Try to use a packed 
vector operation to handle i64 on 32-bit targets. +static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert((Op.getOpcode() == ISD::SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP || + Op.getOpcode() == ISD::UINT_TO_FP) && + "Unexpected opcode!"); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + MVT SrcVT = Src.getSimpleValueType(); + MVT VT = Op.getSimpleValueType(); + + if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16) + return SDValue(); + + // Pack the i64 into a vector, do the operation and extract. + + assert(Subtarget.hasFP16() && "Expected FP16"); + + SDLoc dl(Op); + SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); + if (IsStrict) { + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other}, + {Op.getOperand(0), InVec}); + SDValue Chain = CvtVec.getValue(1); + SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, + DAG.getIntPtrConstant(0, dl)); + return DAG.getMergeValues({Value, Chain}, dl); + } + + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, + DAG.getIntPtrConstant(0, dl)); +} + static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget) { switch (Opcode) { @@ -20245,6 +20358,8 @@ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; + if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget)) + return V; // SSE doesn't have an i16 conversion so we need to promote. if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) { @@ -20724,6 +20839,8 @@ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; + if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget)) + return V; // The transform for i64->f64 isn't correct for 0 when rounding to negative // infinity. It produces -0.0, so disable under strictfp. @@ -21505,9 +21622,11 @@ Op.getOpcode() == ISD::STRICT_FP_TO_SINT; MVT VT = Op->getSimpleValueType(0); SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue(); MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); + SDValue Res; if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; @@ -21532,10 +21651,8 @@ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src, DAG.getIntPtrConstant(0, dl)); } - SDValue Res, Chain; if (IsStrict) { - Res = - DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src}); + Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src}); Chain = Res.getValue(1); } else { Res = DAG.getNode(Opc, dl, ResVT, Src); @@ -21549,6 +21666,67 @@ return Res; } + if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) { + if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) + return Op; + + MVT ResVT = VT; + MVT EleVT = VT.getVectorElementType(); + if (EleVT != MVT::i64) + ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16; + + if (SrcVT != MVT::v8f16) { + SDValue Tmp = + IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT); + SmallVector Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp); + Ops[0] = Src; + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops); + } + + if (IsStrict) { + Res = DAG.getNode(IsSigned ? 
X86ISD::STRICT_CVTTP2SI + : X86ISD::STRICT_CVTTP2UI, + dl, {ResVT, MVT::Other}, {Chain, Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, + ResVT, Src); + } + + // TODO: Need to add exception check code for strict FP. + if (EleVT.getSizeInBits() < 16) { + ResVT = MVT::getVectorVT(EleVT, 8); + Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res); + } + + if (ResVT != VT) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; + } + + if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) { + if (IsStrict) { + Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT + : ISD::STRICT_FP_TO_UINT, + dl, {MVT::v8i32, MVT::Other}, {Chain, Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, + MVT::v8i32, Src); + } + + // TODO: Need to add exception check code for strict FP. + Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; + } + // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32. if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) { assert(!IsSigned && "Expected unsigned conversion!"); @@ -21572,10 +21750,9 @@ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, DAG.getIntPtrConstant(0, dl)); - SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other}, - {Op->getOperand(0), Src}); + {Chain, Src}); Chain = Res.getValue(1); } else { Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src); @@ -21603,10 +21780,9 @@ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, DAG.getIntPtrConstant(0, dl)); - SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, - {Op->getOperand(0), Src}); + {Chain, Src}); Chain = Res.getValue(1); } else { Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src); @@ -21631,7 +21807,7 @@ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32, {Src, Zero, Zero, Zero}); Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, - {Op->getOperand(0), Tmp}); + {Chain, Tmp}); SDValue Chain = Tmp.getValue(1); Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp, DAG.getIntPtrConstant(0, dl)); @@ -21714,17 +21890,16 @@ // FIXME: This does not generate an invalid exception if the input does not // fit in i32. PR44019 if (Subtarget.is64Bit()) { - SDValue Res, Chain; if (IsStrict) { - Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other}, - { Op.getOperand(0), Src }); + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other}, + {Chain, Src}); Chain = Res.getValue(1); } else Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); if (IsStrict) - return DAG.getMergeValues({ Res, Chain }, dl); + return DAG.getMergeValues({Res, Chain}, dl); return Res; } @@ -21739,17 +21914,16 @@ // fit in i16. 
PR44019 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) { assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); - SDValue Res, Chain; if (IsStrict) { - Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other}, - { Op.getOperand(0), Src }); + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other}, + {Chain, Src}); Chain = Res.getValue(1); } else Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); if (IsStrict) - return DAG.getMergeValues({ Res, Chain }, dl); + return DAG.getMergeValues({Res, Chain}, dl); return Res; } @@ -21765,7 +21939,6 @@ else LC = RTLIB::getFPTOUINT(SrcVT, VT); - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; std::pair Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op), Chain); @@ -21777,7 +21950,6 @@ } // Fall back to X87. - SDValue Chain; if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) { if (IsStrict) return DAG.getMergeValues({V, Chain}, dl); @@ -22004,6 +22176,35 @@ if (VT == MVT::f128) return SDValue(); + if (VT == MVT::f80) { + if (SVT == MVT::f16) { + assert(Subtarget.hasFP16() && "Unexpected features!"); + RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); + MakeLibCallOptions CallOptions; + std::pair Tmp = + makeLibCall(DAG, LC, VT, In, CallOptions, DL, + IsStrict ? Op.getOperand(0) : SDValue()); + if (IsStrict) + return DAG.getMergeValues({Tmp.first, Tmp.second}, DL); + else + return Tmp.first; + } + return Op; + } + + if (SVT.getVectorElementType() == MVT::f16) { + assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!"); + if (SVT == MVT::v2f16) + In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In, + DAG.getUNDEF(MVT::v2f16)); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In, + DAG.getUNDEF(MVT::v4f16)); + if (IsStrict) + return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other}, + {Op->getOperand(0), Res}); + return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); + } + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); SDValue Res = @@ -22017,8 +22218,11 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); - // It's legal except when f128 is involved - if (In.getSimpleValueType() != MVT::f128) + MVT VT = Op.getSimpleValueType(); + MVT SVT = In.getSimpleValueType(); + + // It's legal except when f128 is involved or we're converting f80->f16. + if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80)) return Op; return SDValue(); @@ -31113,6 +31317,51 @@ SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); + if (VT.isVector() && Subtarget.hasFP16() && + SrcVT.getVectorElementType() == MVT::f16) { + EVT EleVT = VT.getVectorElementType(); + EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16; + + if (SrcVT != MVT::v8f16) { + SDValue Tmp = + IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT); + SmallVector Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp); + Ops[0] = Src; + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops); + } + + SDValue Res, Chain; + if (IsStrict) { + unsigned Opc = + IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + Res = + DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + unsigned Opc = IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + Res = DAG.getNode(Opc, dl, ResVT, Src); + } + + // TODO: Need to add exception check code for strict FP. + if (EleVT.getSizeInBits() < 16) { + MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8); + Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res); + + // Now widen to 128 bits. + unsigned NumConcats = 128 / TmpVT.getSizeInBits(); + MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats); + SmallVector ConcatOps(NumConcats, DAG.getUNDEF(TmpVT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); + } + + Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); + + return; + } + if (VT.isVector() && VT.getScalarSizeInBits() < 32) { assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); @@ -31287,9 +31536,31 @@ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP || N->getOpcode() == ISD::STRICT_SINT_TO_FP; EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); + if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() && + Subtarget.hasVLX()) { + if (Src.getValueType().getVectorElementType() == MVT::i16) + return; + + if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32) + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + IsStrict ? DAG.getConstant(0, dl, MVT::v2i32) + : DAG.getUNDEF(MVT::v2i32)); + if (IsStrict) { + unsigned Opc = + IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P; + SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other}, + {N->getOperand(0), Src}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } else { + unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P; + Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src)); + } + return; + } if (VT != MVT::v2f32) return; - SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { if (IsStrict) { @@ -31390,14 +31661,21 @@ case ISD::FP_ROUND: { bool IsStrict = N->isStrictFPOpcode(); SDValue Src = N->getOperand(IsStrict ? 1 : 0); + EVT VT = N->getValueType(0); + EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; + if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) { + SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32) + : DAG.getUNDEF(MVT::v2f32); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext); + } if (!isTypeLegal(Src.getValueType())) return; SDValue V; if (IsStrict) - V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other}, - {N->getOperand(0), N->getOperand(1)}); + V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other}, + {N->getOperand(0), Src}); else - V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src); Results.push_back(V); if (IsStrict) Results.push_back(V.getValue(1)); @@ -31409,6 +31687,21 @@ // No other ValueType for FP_EXTEND should reach this point. assert(N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"); + if (!Subtarget.hasFP16() || !Subtarget.hasVLX()) + return; + bool IsStrict = N->isStrictFPOpcode(); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); + SDValue Ext = IsStrict ? 
DAG.getConstantFP(0.0, dl, MVT::v2f16) + : DAG.getUNDEF(MVT::v2f16); + SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext); + if (IsStrict) + V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), V}); + else + V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V); + Results.push_back(V); + if (IsStrict) + Results.push_back(V.getValue(1)); return; } case ISD::INTRINSIC_W_CHAIN: { @@ -49415,10 +49708,31 @@ EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); + // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16)) + // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32)) + // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64)) + if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) { + unsigned ScalarSize = InVT.getScalarSizeInBits(); + if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64) + return SDValue(); + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), + ScalarSize < 16 ? MVT::i16 + : ScalarSize < 32 ? MVT::i32 + : MVT::i64, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + if (IsStrict) + return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other}, + {N->getOperand(0), P}); + return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); + } + // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32)) // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) - if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) { + if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 && + VT.getScalarType() != MVT::f16) { SDLoc dl(N); EVT DstVT = InVT.changeVectorElementType(MVT::i32); SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); @@ -49457,10 +49771,31 @@ EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); + // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16)) + // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32)) + // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64)) + if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) { + unsigned ScalarSize = InVT.getScalarSizeInBits(); + if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64) + return SDValue(); + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), + ScalarSize < 16 ? MVT::i16 + : ScalarSize < 32 ? MVT::i32 + : MVT::i64, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, + {N->getOperand(0), P}); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); + } + // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) - if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) { + if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 && + VT.getScalarType() != MVT::f16) { SDLoc dl(N); EVT DstVT = InVT.changeVectorElementType(MVT::i32); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); @@ -51306,6 +51641,9 @@ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) return SDValue(); + if (Subtarget.hasFP16()) + return SDValue(); + bool IsStrict = N->isStrictFPOpcode(); EVT VT = N->getValueType(0); SDValue Src = N->getOperand(IsStrict ? 
1 : 0); @@ -51414,6 +51752,9 @@ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) return SDValue(); + if (Subtarget.hasFP16()) + return SDValue(); + EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -7531,8 +7531,8 @@ X86VectorVTInfo DstVT, SDNode OpNode, SDNode OpNodeRnd, X86FoldableSchedWrite sched, string asm, - string aliasStr> { - let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in { + string aliasStr, Predicate prd = HasAVX512> { + let Predicates = [prd], ExeDomain = SrcVT.ExeDomain in { def rr_Int : SI, @@ -7548,7 +7548,7 @@ [(set DstVT.RC:$dst, (OpNode (SrcVT.ScalarIntMemFrags addr:$src)))]>, EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; - } // Predicates = [HasAVX512] + } // Predicates = [prd] def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", (!cast(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">; @@ -7712,8 +7712,9 @@ multiclass avx512_cvt_s_all opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDPatternOperator OpNode, SDNode OpNodeInt, SDNode OpNodeSAE, - X86FoldableSchedWrite sched, string aliasStr>{ -let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in { + X86FoldableSchedWrite sched, string aliasStr, + Predicate prd = HasAVX512> { +let Predicates = [prd], ExeDomain = _SrcRC.ExeDomain in { let isCodeGenOnly = 1 in { def rr : AVX512, EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; -} //HasAVX512 +} // Predicates = [prd] def : InstAlias(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">; @@ -7838,33 +7839,47 @@ EVEX_4V, VEX_LIG, Sched<[sched]>, EVEX_B, EVEX_RC; } -multiclass avx512_cvt_fp_scalar_sd2ss opc, string OpcodeStr, +multiclass avx512_cvt_fp_scalar_trunc opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86FoldableSchedWrite sched, - X86VectorVTInfo _src, X86VectorVTInfo _dst> { - let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in { + X86VectorVTInfo _src, X86VectorVTInfo _dst, + Predicate prd = HasAVX512> { + let Predicates = [prd], ExeDomain = SSEPackedSingle in { defm Z : avx512_cvt_fp_scalar, avx512_cvt_fp_rc_scalar, VEX_W, EVEX_CD8<64, CD8VT1>, XD; + OpNodeRnd, sched>, EVEX_CD8<_src.EltSize, CD8VT1>; } } -multiclass avx512_cvt_fp_scalar_ss2sd opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeSAE, - X86FoldableSchedWrite sched, - X86VectorVTInfo _src, X86VectorVTInfo _dst> { - let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in { +multiclass avx512_cvt_fp_scalar_extend opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeSAE, + X86FoldableSchedWrite sched, + X86VectorVTInfo _src, X86VectorVTInfo _dst, + Predicate prd = HasAVX512> { + let Predicates = [prd], ExeDomain = SSEPackedSingle in { defm Z : avx512_cvt_fp_scalar, avx512_cvt_fp_sae_scalar, - EVEX_CD8<32, CD8VT1>, XS; + EVEX_CD8<_src.EltSize, CD8VT1>; } } -defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86frounds, +defm VCVTSD2SS : avx512_cvt_fp_scalar_trunc<0x5A, "vcvtsd2ss", X86frounds, X86froundsRnd, WriteCvtSD2SS, f64x_info, - f32x_info>; -defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts, + f32x_info>, XD, VEX_W; +defm VCVTSS2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtss2sd", X86fpexts, X86fpextsSAE, WriteCvtSS2SD, f32x_info, - f64x_info>; + f64x_info>, XS; +defm VCVTSD2SH : 
avx512_cvt_fp_scalar_trunc<0x5A, "vcvtsd2sh", X86frounds, + X86froundsRnd, WriteCvtSD2SS, f64x_info, + f16x_info, HasFP16>, T_MAP5XD, VEX_W; +defm VCVTSH2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtsh2sd", X86fpexts, + X86fpextsSAE, WriteCvtSS2SD, f16x_info, + f64x_info, HasFP16>, T_MAP5XS; +defm VCVTSS2SH : avx512_cvt_fp_scalar_trunc<0x1D, "vcvtss2sh", X86frounds, + X86froundsRnd, WriteCvtSD2SS, f32x_info, + f16x_info, HasFP16>, T_MAP5PS; +defm VCVTSH2SS : avx512_cvt_fp_scalar_extend<0x13, "vcvtsh2ss", X86fpexts, + X86fpextsSAE, WriteCvtSS2SD, f16x_info, + f32x_info, HasFP16>, T_MAP6PS; def : Pat<(f64 (any_fpextend FR32X:$src)), (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>, @@ -7877,6 +7892,27 @@ (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>, Requires<[HasAVX512]>; +def : Pat<(f32 (any_fpextend FR16X:$src)), + (VCVTSH2SSZrr (f32 (IMPLICIT_DEF)), FR16X:$src)>, + Requires<[HasFP16]>; +def : Pat<(f32 (any_fpextend (loadf16 addr:$src))), + (VCVTSH2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasFP16, OptForSize]>; + +def : Pat<(f64 (any_fpextend FR16X:$src)), + (VCVTSH2SDZrr (f64 (IMPLICIT_DEF)), FR16X:$src)>, + Requires<[HasFP16]>; +def : Pat<(f64 (any_fpextend (loadf16 addr:$src))), + (VCVTSH2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasFP16, OptForSize]>; + +def : Pat<(f16 (any_fpround FR32X:$src)), + (VCVTSS2SHZrr (f16 (IMPLICIT_DEF)), FR32X:$src)>, + Requires<[HasFP16]>; +def : Pat<(f16 (any_fpround FR64X:$src)), + (VCVTSD2SHZrr (f16 (IMPLICIT_DEF)), FR64X:$src)>, + Requires<[HasFP16]>; + def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector @@ -7990,39 +8026,82 @@ (_.VT (!cast("extload"#_Src.VTName) addr:$src)), (_.VT (!cast("extload"#_Src.VTName) addr:$src))>; -// Extend Float to Double -multiclass avx512_cvtps2pd opc, string OpcodeStr, - X86SchedWriteWidths sched> { - let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fpextend opc, string OpcodeStr, + AVX512VLVectorVTInfo _dst, AVX512VLVectorVTInfo _src, + X86SchedWriteWidths sched, Predicate prd = HasAVX512> { + let Predicates = [prd] in { + defm Z : avx512_vcvt_fpextend, - avx512_vcvt_fp_sae, EVEX_V512; } - let Predicates = [HasVLX] in { - defm Z128 : avx512_vcvt_fpextend, EVEX_V128; - defm Z256 : avx512_vcvt_fpextend, EVEX_V256; - } -} - -// Truncate Double to Float -multiclass avx512_cvtpd2ps opc, string OpcodeStr, X86SchedWriteWidths sched> { - let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp, EVEX_V256; + } +} + +// Truncate [Double to Float, Float to Half] +multiclass avx512_cvt_trunc opc, string OpcodeStr, + AVX512VLVectorVTInfo _dst, AVX512VLVectorVTInfo _src, + X86SchedWriteWidths sched, Predicate prd = HasAVX512, + PatFrag bcast128 = _src.info128.BroadcastLdFrag, + PatFrag bcast256 = _src.info256.BroadcastLdFrag, + PatFrag bcast512 = _src.info512.BroadcastLdFrag, + PatFrag loadVT128 = _src.info128.LdFrag, + PatFrag loadVT256 = _src.info256.LdFrag, + PatFrag loadVT512 = _src.info512.LdFrag, + RegisterClass maskRC128 = _src.info128.KRCWM, + RegisterClass maskRC256 = _src.info256.KRCWM, + RegisterClass maskRC512 = _src.info512.KRCWM> { + let Predicates = [prd] in { + defm Z : avx512_vcvt_fp, - avx512_vcvt_fp_rc, EVEX_V512; } - let Predicates = [HasVLX] in { - defm Z128 : avx512_vcvt_fp, EVEX_V128; - defm Z256 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; + sched.YMM, _src.info256.BroadcastStr, "{y}">, EVEX_V256; + + // Special patterns to allow use of X86vmfpround for masking. Instruction + // patterns have been disabled with null_frag. 
+ def : Pat<(_dst.info128.VT (X86any_vfpround (_src.info128.VT VR128X:$src))), + (!cast(NAME # "Z128rr") VR128X:$src)>; + def : Pat<(X86vmfpround (_src.info128.VT VR128X:$src), (_dst.info128.VT VR128X:$src0), + maskRC128:$mask), + (!cast(NAME # "Z128rrk") VR128X:$src0, maskRC128:$mask, VR128X:$src)>; + def : Pat<(X86vmfpround (_src.info128.VT VR128X:$src), _dst.info128.ImmAllZerosV, + maskRC128:$mask), + (!cast(NAME # "Z128rrkz") maskRC128:$mask, VR128X:$src)>; + + def : Pat<(_dst.info128.VT (X86any_vfpround (loadVT128 addr:$src))), + (!cast(NAME # "Z128rm") addr:$src)>; + def : Pat<(X86vmfpround (loadVT128 addr:$src), (_dst.info128.VT VR128X:$src0), + maskRC128:$mask), + (!cast(NAME # "Z128rmk") VR128X:$src0, maskRC128:$mask, addr:$src)>; + def : Pat<(X86vmfpround (loadVT128 addr:$src), _dst.info128.ImmAllZerosV, + maskRC128:$mask), + (!cast(NAME # "Z128rmkz") maskRC128:$mask, addr:$src)>; + + def : Pat<(_dst.info128.VT (X86any_vfpround (_src.info128.VT (bcast128 addr:$src)))), + (!cast(NAME # "Z128rmb") addr:$src)>; + def : Pat<(X86vmfpround (_src.info128.VT (bcast128 addr:$src)), + (_dst.info128.VT VR128X:$src0), maskRC128:$mask), + (!cast(NAME # "Z128rmbk") VR128X:$src0, maskRC128:$mask, addr:$src)>; + def : Pat<(X86vmfpround (_src.info128.VT (bcast128 addr:$src)), + _dst.info128.ImmAllZerosV, maskRC128:$mask), + (!cast(NAME # "Z128rmbkz") maskRC128:$mask, addr:$src)>; } def : InstAlias; } -defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>, +defm VCVTPD2PS : avx512_cvt_trunc<0x5A, "vcvtpd2ps", + avx512vl_f32_info, avx512vl_f64_info, SchedWriteCvtPD2PS>, VEX_W, PD, EVEX_CD8<64, CD8VF>; -defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>, - PS, EVEX_CD8<32, CD8VH>; +defm VCVTPS2PD : avx512_cvt_extend<0x5A, "vcvtps2pd", + avx512vl_f64_info, avx512vl_f32_info, SchedWriteCvtPS2PD>, + PS, EVEX_CD8<32, CD8VH>; -let Predicates = [HasVLX] in { +// Extend Half to Double +multiclass avx512_cvtph2pd opc, string OpcodeStr, + X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fpextend, + avx512_vcvt_fp_sae, EVEX_V512; + def : Pat<(v8f64 (extloadv8f16 addr:$src)), + (!cast(NAME # "Zrm") addr:$src)>; + } + let Predicates = [HasFP16, HasVLX] in { + defm Z128 : avx512_vcvt_fpextend, EVEX_V128; + defm Z256 : avx512_vcvt_fpextend, EVEX_V256; + } +} + +// Truncate Double to Half +multiclass avx512_cvtpd2ph opc, string OpcodeStr, X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_rc, EVEX_V512; + } + let Predicates = [HasFP16, HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; + } + def : InstAlias(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, i64mem:$src), 0, "att">; + + def : InstAlias(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias(NAME # 
"Z256rmbk") VR128X:$dst, + VK4WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, i64mem:$src), 0, "att">; + + def : InstAlias(NAME # "Zrr") VR128X:$dst, + VR512:$src), 0, "att">; + def : InstAlias(NAME # "Zrrk") VR128X:$dst, + VK8WM:$mask, VR512:$src), 0, "att">; + def : InstAlias(NAME # "Zrrkz") VR128X:$dst, + VK8WM:$mask, VR512:$src), 0, "att">; + def : InstAlias(NAME # "Zrmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Zrmbk") VR128X:$dst, + VK8WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Zrmbkz") VR128X:$dst, + VK8WM:$mask, i64mem:$src), 0, "att">; +} + +defm VCVTPS2PHX : avx512_cvt_trunc<0x1D, "vcvtps2phx", avx512vl_f16_info, + avx512vl_f32_info, SchedWriteCvtPD2PS, + HasFP16>, T_MAP5PD, EVEX_CD8<32, CD8VF>; +defm VCVTPH2PSX : avx512_cvt_extend<0x13, "vcvtph2psx", avx512vl_f32_info, + avx512vl_f16_info, SchedWriteCvtPS2PD, + HasFP16>, T_MAP6PD, EVEX_CD8<16, CD8VH>; +defm VCVTPD2PH : avx512_cvtpd2ph<0x5A, "vcvtpd2ph", SchedWriteCvtPD2PS>, + VEX_W, T_MAP5PD, EVEX_CD8<64, CD8VF>; +defm VCVTPH2PD : avx512_cvtph2pd<0x5A, "vcvtph2pd", SchedWriteCvtPS2PD>, + T_MAP5PS, EVEX_CD8<16, CD8VQ>; + +let Predicates = [HasFP16, HasVLX] in { // Special patterns to allow use of X86vmfpround for masking. Instruction // patterns have been disabled with null_frag. - def : Pat<(X86any_vfpround (v2f64 VR128X:$src)), - (VCVTPD2PSZ128rr VR128X:$src)>; - def : Pat<(X86vmfpround (v2f64 VR128X:$src), (v4f32 VR128X:$src0), + def : Pat<(v8f16 (X86any_vfpround (v4f64 VR256X:$src))), + (VCVTPD2PHZ256rr VR256X:$src)>; + def : Pat<(v8f16 (X86vmfpround (v4f64 VR256X:$src), (v8f16 VR128X:$src0), + VK4WM:$mask)), + (VCVTPD2PHZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>; + def : Pat<(X86vmfpround (v4f64 VR256X:$src), v8f16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTPD2PHZ256rrkz VK4WM:$mask, VR256X:$src)>; + + def : Pat<(v8f16 (X86any_vfpround (loadv4f64 addr:$src))), + (VCVTPD2PHZ256rm addr:$src)>; + def : Pat<(X86vmfpround (loadv4f64 addr:$src), (v8f16 VR128X:$src0), + VK4WM:$mask), + (VCVTPD2PHZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86vmfpround (loadv4f64 addr:$src), v8f16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTPD2PHZ256rmkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8f16 (X86any_vfpround (v4f64 (X86VBroadcastld64 addr:$src)))), + (VCVTPD2PHZ256rmb addr:$src)>; + def : Pat<(X86vmfpround (v4f64 (X86VBroadcastld64 addr:$src)), + (v8f16 VR128X:$src0), VK4WM:$mask), + (VCVTPD2PHZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86vmfpround (v4f64 (X86VBroadcastld64 addr:$src)), + v8f16x_info.ImmAllZerosV, VK4WM:$mask), + (VCVTPD2PHZ256rmbkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8f16 (X86any_vfpround (v2f64 VR128X:$src))), + (VCVTPD2PHZ128rr VR128X:$src)>; + def : Pat<(X86vmfpround (v2f64 VR128X:$src), (v8f16 VR128X:$src0), VK2WM:$mask), - (VCVTPD2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; - def : Pat<(X86vmfpround (v2f64 VR128X:$src), v4f32x_info.ImmAllZerosV, + (VCVTPD2PHZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; + def : Pat<(X86vmfpround (v2f64 VR128X:$src), v8f16x_info.ImmAllZerosV, VK2WM:$mask), - (VCVTPD2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; + (VCVTPD2PHZ128rrkz VK2WM:$mask, VR128X:$src)>; - def : Pat<(X86any_vfpround (loadv2f64 addr:$src)), - (VCVTPD2PSZ128rm addr:$src)>; - def : Pat<(X86vmfpround (loadv2f64 addr:$src), (v4f32 VR128X:$src0), + def : Pat<(v8f16 (X86any_vfpround (loadv2f64 addr:$src))), + (VCVTPD2PHZ128rm addr:$src)>; + def : Pat<(X86vmfpround (loadv2f64 
addr:$src), (v8f16 VR128X:$src0), VK2WM:$mask), - (VCVTPD2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86vmfpround (loadv2f64 addr:$src), v4f32x_info.ImmAllZerosV, + (VCVTPD2PHZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86vmfpround (loadv2f64 addr:$src), v8f16x_info.ImmAllZerosV, VK2WM:$mask), - (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>; + (VCVTPD2PHZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(X86any_vfpround (v2f64 (X86VBroadcastld64 addr:$src))), - (VCVTPD2PSZ128rmb addr:$src)>; + def : Pat<(v8f16 (X86any_vfpround (v2f64 (X86VBroadcastld64 addr:$src)))), + (VCVTPD2PHZ128rmb addr:$src)>; def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), - (v4f32 VR128X:$src0), VK2WM:$mask), - (VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; + (v8f16 VR128X:$src0), VK2WM:$mask), + (VCVTPD2PHZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), - v4f32x_info.ImmAllZerosV, VK2WM:$mask), - (VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>; + v8f16x_info.ImmAllZerosV, VK2WM:$mask), + (VCVTPD2PHZ128rmbkz VK2WM:$mask, addr:$src)>; } // Convert Signed/Unsigned Doubleword to Double @@ -8420,26 +8644,60 @@ } // Convert Signed/Unsigned Quardword to Float -multiclass avx512_cvtqq2ps opc, string OpcodeStr, SDPatternOperator OpNode, - SDNode MaskOpNode, SDNode OpNodeRnd, - X86SchedWriteWidths sched> { - let Predicates = [HasDQI] in { - defm Z : avx512_vcvt_fp opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, SDPatternOperator OpNode128, + SDPatternOperator OpNode128M, SDPatternOperator OpNodeRnd, + AVX512VLVectorVTInfo _dst, AVX512VLVectorVTInfo _src, + X86SchedWriteWidths sched, Predicate prd = HasDQI> { + let Predicates = [prd] in { + defm Z : avx512_vcvt_fp, - avx512_vcvt_fp_rc, EVEX_V512; } - let Predicates = [HasDQI, HasVLX] in { + let Predicates = [prd, HasVLX] in { // we need "x"/"y" suffixes in order to distinguish between 128 and 256 // memory forms of these instructions in Asm Parcer. They have the same // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. - defm Z128 : avx512_vcvt_fp, + defm Z128 : avx512_vcvt_fp, EVEX_V128, NotEVEX2VEXConvertible; - defm Z256 : avx512_vcvt_fp, EVEX_V256, + defm Z256 : avx512_vcvt_fp, EVEX_V256, NotEVEX2VEXConvertible; + + // Special patterns to allow use of X86VM[SU]intToFP for masking. Instruction + // patterns have been disabled with null_frag. 
+ def : Pat<(_dst.info128.VT (OpNode128 (_src.info128.VT VR128X:$src))), + (!cast(NAME # "Z128rr") VR128X:$src)>; + def : Pat<(OpNode128M (_src.info128.VT VR128X:$src), (_dst.info128.VT VR128X:$src0), + _src.info128.KRCWM:$mask), + (!cast(NAME # "Z128rrk") VR128X:$src0, _src.info128.KRCWM:$mask, VR128X:$src)>; + def : Pat<(OpNode128M (_src.info128.VT VR128X:$src), _dst.info128.ImmAllZerosV, + _src.info128.KRCWM:$mask), + (!cast(NAME # "Z128rrkz") _src.info128.KRCWM:$mask, VR128X:$src)>; + + def : Pat<(_dst.info128.VT (OpNode128 (_src.info128.LdFrag addr:$src))), + (!cast(NAME # "Z128rm") addr:$src)>; + def : Pat<(OpNode128M (_src.info128.LdFrag addr:$src), (_dst.info128.VT VR128X:$src0), + _src.info128.KRCWM:$mask), + (!cast(NAME # "Z128rmk") VR128X:$src0, _src.info128.KRCWM:$mask, addr:$src)>; + def : Pat<(OpNode128M (_src.info128.LdFrag addr:$src), _dst.info128.ImmAllZerosV, + _src.info128.KRCWM:$mask), + (!cast(NAME # "Z128rmkz") _src.info128.KRCWM:$mask, addr:$src)>; + + def : Pat<(_dst.info128.VT (OpNode128 (_src.info128.VT (X86VBroadcastld64 addr:$src)))), + (!cast(NAME # "Z128rmb") addr:$src)>; + def : Pat<(OpNode128M (_src.info128.VT (X86VBroadcastld64 addr:$src)), + (_dst.info128.VT VR128X:$src0), _src.info128.KRCWM:$mask), + (!cast(NAME # "Z128rmbk") VR128X:$src0, _src.info128.KRCWM:$mask, addr:$src)>; + def : Pat<(OpNode128M (_src.info128.VT (X86VBroadcastld64 addr:$src)), + _dst.info128.ImmAllZerosV, _src.info128.KRCWM:$mask), + (!cast(NAME # "Z128rmbkz") _src.info128.KRCWM:$mask, addr:$src)>; } def : InstAlias, VEX_W, XS, EVEX_CD8<64, CD8VF>; -defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp, - sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, - VEX_W, PS, EVEX_CD8<64, CD8VF>; +defm VCVTDQ2PH : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtdq2ph", any_sint_to_fp, sint_to_fp, + X86any_VSintToFP, X86VMSintToFP, + X86VSintToFpRnd, avx512vl_f16_info, avx512vl_i32_info, + SchedWriteCvtDQ2PS, HasFP16>, + T_MAP5PS, EVEX_CD8<32, CD8VF>; + +defm VCVTUDQ2PH : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtudq2ph", any_uint_to_fp, uint_to_fp, + X86any_VUintToFP, X86VMUintToFP, + X86VUintToFpRnd, avx512vl_f16_info, avx512vl_i32_info, + SchedWriteCvtDQ2PS, HasFP16>, T_MAP5XD, + EVEX_CD8<32, CD8VF>; + +defm VCVTQQ2PS : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtqq2ps", any_sint_to_fp, sint_to_fp, + X86any_VSintToFP, X86VMSintToFP, + X86VSintToFpRnd, avx512vl_f32_info, avx512vl_i64_info, + SchedWriteCvtDQ2PS>, VEX_W, PS, + EVEX_CD8<64, CD8VF>; -defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp, - uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>, - VEX_W, XD, EVEX_CD8<64, CD8VF>; +defm VCVTUQQ2PS : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtuqq2ps", any_uint_to_fp, uint_to_fp, + X86any_VUintToFP, X86VMUintToFP, + X86VUintToFpRnd, avx512vl_f32_info, avx512vl_i64_info, + SchedWriteCvtDQ2PS>, VEX_W, XD, + EVEX_CD8<64, CD8VF>; let Predicates = [HasVLX] in { // Special patterns to allow use of X86mcvtp2Int for masking. Instruction @@ -8777,66 +9051,6 @@ (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>; } -let Predicates = [HasDQI, HasVLX] in { - // Special patterns to allow use of X86VMSintToFP for masking. Instruction - // patterns have been disabled with null_frag. 
- def : Pat<(v4f32 (X86any_VSintToFP (v2i64 VR128X:$src))), - (VCVTQQ2PSZ128rr VR128X:$src)>; - def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0), - VK2WM:$mask), - (VCVTQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; - def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV, - VK2WM:$mask), - (VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; - - def : Pat<(v4f32 (X86any_VSintToFP (loadv2i64 addr:$src))), - (VCVTQQ2PSZ128rm addr:$src)>; - def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0), - VK2WM:$mask), - (VCVTQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV, - VK2WM:$mask), - (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - - def : Pat<(v4f32 (X86any_VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), - (VCVTQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), - (v4f32 VR128X:$src0), VK2WM:$mask), - (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), - v4f32x_info.ImmAllZerosV, VK2WM:$mask), - (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; - - // Special patterns to allow use of X86VMUintToFP for masking. Instruction - // patterns have been disabled with null_frag. - def : Pat<(v4f32 (X86any_VUintToFP (v2i64 VR128X:$src))), - (VCVTUQQ2PSZ128rr VR128X:$src)>; - def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0), - VK2WM:$mask), - (VCVTUQQ2PSZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; - def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v4f32x_info.ImmAllZerosV, - VK2WM:$mask), - (VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; - - def : Pat<(v4f32 (X86any_VUintToFP (loadv2i64 addr:$src))), - (VCVTUQQ2PSZ128rm addr:$src)>; - def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0), - VK2WM:$mask), - (VCVTUQQ2PSZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v4f32x_info.ImmAllZerosV, - VK2WM:$mask), - (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - - def : Pat<(v4f32 (X86any_VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), - (VCVTUQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), - (v4f32 VR128X:$src0), VK2WM:$mask), - (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), - v4f32x_info.ImmAllZerosV, VK2WM:$mask), - (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; -} - //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// @@ -12663,3 +12877,510 @@ def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>; } + +// Convert 16-bit float to i16/u16 +multiclass avx512_cvtph2w opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, SDNode OpNodeRnd, + AVX512VLVectorVTInfo _Dst, + AVX512VLVectorVTInfo _Src, + X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_rc, EVEX_V512; + } + let Predicates = [HasFP16, HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; + } +} + +// Convert 16-bit float to i16/u16 truncate +multiclass avx512_cvttph2w opc, string OpcodeStr, SDPatternOperator OpNode, + 
SDPatternOperator MaskOpNode, SDNode OpNodeRnd, + AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src, + X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_sae, EVEX_V512; + } + let Predicates = [HasFP16, HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; + } +} + +defm VCVTPH2UW : avx512_cvtph2w<0x7D, "vcvtph2uw", X86cvtp2UInt, X86cvtp2UInt, + X86cvtp2UIntRnd, avx512vl_i16_info, + avx512vl_f16_info, SchedWriteCvtPD2DQ>, + T_MAP5PS, EVEX_CD8<16, CD8VF>; +defm VCVTUW2PH : avx512_cvtph2w<0x7D, "vcvtuw2ph", any_uint_to_fp, uint_to_fp, + X86VUintToFpRnd, avx512vl_f16_info, + avx512vl_i16_info, SchedWriteCvtPD2DQ>, + T_MAP5XD, EVEX_CD8<16, CD8VF>; +defm VCVTTPH2W : avx512_cvttph2w<0x7C, "vcvttph2w", X86any_cvttp2si, + X86cvttp2si, X86cvttp2siSAE, + avx512vl_i16_info, avx512vl_f16_info, + SchedWriteCvtPD2DQ>, T_MAP5PD, EVEX_CD8<16, CD8VF>; +defm VCVTTPH2UW : avx512_cvttph2w<0x7C, "vcvttph2uw", X86any_cvttp2ui, + X86cvttp2ui, X86cvttp2uiSAE, + avx512vl_i16_info, avx512vl_f16_info, + SchedWriteCvtPD2DQ>, T_MAP5PS, EVEX_CD8<16, CD8VF>; +defm VCVTPH2W : avx512_cvtph2w<0x7D, "vcvtph2w", X86cvtp2Int, X86cvtp2Int, + X86cvtp2IntRnd, avx512vl_i16_info, + avx512vl_f16_info, SchedWriteCvtPD2DQ>, + T_MAP5PD, EVEX_CD8<16, CD8VF>; +defm VCVTW2PH : avx512_cvtph2w<0x7D, "vcvtw2ph", any_sint_to_fp, sint_to_fp, + X86VSintToFpRnd, avx512vl_f16_info, + avx512vl_i16_info, SchedWriteCvtPD2DQ>, + T_MAP5XS, EVEX_CD8<16, CD8VF>; + +// Convert Half to Signed/Unsigned Doubleword +multiclass avx512_cvtph2dq opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_rc, EVEX_V512; + } + let Predicates = [HasFP16, HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; + } +} + +// Convert Half to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttph2dq opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_sae, EVEX_V512; + } + let Predicates = [HasFP16, HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; + } +} + + +defm VCVTPH2DQ : avx512_cvtph2dq<0x5B, "vcvtph2dq", X86cvtp2Int, X86cvtp2Int, + X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD, + EVEX_CD8<16, CD8VH>; +defm VCVTPH2UDQ : avx512_cvtph2dq<0x79, "vcvtph2udq", X86cvtp2UInt, X86cvtp2UInt, + X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5PS, + EVEX_CD8<16, CD8VH>; + +defm VCVTTPH2DQ : avx512_cvttph2dq<0x5B, "vcvttph2dq", X86any_cvttp2si, + X86cvttp2si, X86cvttp2siSAE, + SchedWriteCvtPS2DQ>, T_MAP5XS, + EVEX_CD8<16, CD8VH>; + +defm VCVTTPH2UDQ : avx512_cvttph2dq<0x78, "vcvttph2udq", X86any_cvttp2ui, + X86cvttp2ui, X86cvttp2uiSAE, + SchedWriteCvtPS2DQ>, T_MAP5PS, + EVEX_CD8<16, CD8VH>; + +// Convert Half to Signed/Unsigned Quardword +multiclass avx512_cvtph2qq opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_rc, EVEX_V512; + } + let Predicates = [HasFP16, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v8f16x_info source + defm Z128 : avx512_vcvt_fp, + EVEX_V128; + // Explicitly 
specified broadcast string, since we take only 4 elements + // from v8f16x_info source + defm Z256 : avx512_vcvt_fp, + EVEX_V256; + } +} + +// Convert Half to Signed/Unsigned Quardword with truncation +multiclass avx512_cvttph2qq opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_sae, EVEX_V512; + } + let Predicates = [HasFP16, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v8f16x_info source + defm Z128 : avx512_vcvt_fp, EVEX_V128; + // Explicitly specified broadcast string, since we take only 4 elements + // from v8f16x_info source + defm Z256 : avx512_vcvt_fp, EVEX_V256; + } +} + +defm VCVTPH2QQ : avx512_cvtph2qq<0x7B, "vcvtph2qq", X86cvtp2Int, X86cvtp2Int, + X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD, + EVEX_CD8<16, CD8VQ>; + +defm VCVTPH2UQQ : avx512_cvtph2qq<0x79, "vcvtph2uqq", X86cvtp2UInt, X86cvtp2UInt, + X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD, + EVEX_CD8<16, CD8VQ>; + +defm VCVTTPH2QQ : avx512_cvttph2qq<0x7A, "vcvttph2qq", X86any_cvttp2si, + X86cvttp2si, X86cvttp2siSAE, + SchedWriteCvtPS2DQ>, T_MAP5PD, + EVEX_CD8<16, CD8VQ>; + +defm VCVTTPH2UQQ : avx512_cvttph2qq<0x78, "vcvttph2uqq", X86any_cvttp2ui, + X86cvttp2ui, X86cvttp2uiSAE, + SchedWriteCvtPS2DQ>, T_MAP5PD, + EVEX_CD8<16, CD8VQ>; + +// Convert Signed/Unsigned Quardword to Half +multiclass avx512_cvtqq2ph opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { + // we need "x"/"y"/"z" suffixes in order to distinguish between 128, 256 and + // 512 memory forms of these instructions in Asm Parcer. They have the same + // dest type - 'v8f16x_info'. We also specify the broadcast string explicitly + // due to the same reason. 
+ let Predicates = [HasFP16] in { + defm Z : avx512_vcvt_fp, + avx512_vcvt_fp_rc, EVEX_V512; + } + let Predicates = [HasFP16, HasVLX] in { + defm Z128 : avx512_vcvt_fp, + EVEX_V128, NotEVEX2VEXConvertible; + defm Z256 : avx512_vcvt_fp, + EVEX_V256, NotEVEX2VEXConvertible; + } + + def : InstAlias(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrk") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rrkz") VR128X:$dst, + VK2WM:$mask, VR128X:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbk") VR128X:$dst, + VK2WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z128rmbkz") VR128X:$dst, + VK2WM:$mask, i64mem:$src), 0, "att">; + + def : InstAlias(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrk") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rrkz") VR128X:$dst, + VK4WM:$mask, VR256X:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbk") VR128X:$dst, + VK4WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Z256rmbkz") VR128X:$dst, + VK4WM:$mask, i64mem:$src), 0, "att">; + + def : InstAlias(NAME # "Zrr") VR128X:$dst, + VR512:$src), 0, "att">; + def : InstAlias(NAME # "Zrrk") VR128X:$dst, + VK8WM:$mask, VR512:$src), 0, "att">; + def : InstAlias(NAME # "Zrrkz") VR128X:$dst, + VK8WM:$mask, VR512:$src), 0, "att">; + def : InstAlias(NAME # "Zrmb") VR128X:$dst, + i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Zrmbk") VR128X:$dst, + VK8WM:$mask, i64mem:$src), 0, "att">; + def : InstAlias(NAME # "Zrmbkz") VR128X:$dst, + VK8WM:$mask, i64mem:$src), 0, "att">; +} + +defm VCVTQQ2PH : avx512_cvtqq2ph<0x5B, "vcvtqq2ph", any_sint_to_fp, sint_to_fp, + X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, T_MAP5PS, + EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PH : avx512_cvtqq2ph<0x7A, "vcvtuqq2ph", any_uint_to_fp, uint_to_fp, + X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, T_MAP5XD, + EVEX_CD8<64, CD8VF>; + +// Convert half to signed/unsigned int 32/64 +defm VCVTSH2SIZ: avx512_cvt_s_int_round<0x2D, f16x_info, i32x_info, X86cvts2si, + X86cvts2siRnd, WriteCvtSS2I, "cvtsh2si", "{l}", HasFP16>, + T_MAP5XS, EVEX_CD8<16, CD8VT1>; +defm VCVTSH2SI64Z: avx512_cvt_s_int_round<0x2D, f16x_info, i64x_info, X86cvts2si, + X86cvts2siRnd, WriteCvtSS2I, "cvtsh2si", "{q}", HasFP16>, + T_MAP5XS, VEX_W, EVEX_CD8<16, CD8VT1>; +defm VCVTSH2USIZ: avx512_cvt_s_int_round<0x79, f16x_info, i32x_info, X86cvts2usi, + X86cvts2usiRnd, WriteCvtSS2I, "cvtsh2usi", "{l}", HasFP16>, + T_MAP5XS, EVEX_CD8<16, CD8VT1>; +defm VCVTSH2USI64Z: avx512_cvt_s_int_round<0x79, f16x_info, i64x_info, X86cvts2usi, + X86cvts2usiRnd, WriteCvtSS2I, "cvtsh2usi", "{q}", HasFP16>, + T_MAP5XS, VEX_W, EVEX_CD8<16, CD8VT1>; + +defm VCVTTSH2SIZ: avx512_cvt_s_all<0x2C, "vcvttsh2si", f16x_info, i32x_info, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, + "{l}", HasFP16>, T_MAP5XS, EVEX_CD8<16, CD8VT1>; +defm VCVTTSH2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsh2si", f16x_info, i64x_info, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, + "{q}", HasFP16>, VEX_W, T_MAP5XS, EVEX_CD8<16, CD8VT1>; +defm VCVTTSH2USIZ: avx512_cvt_s_all<0x78, "vcvttsh2usi", f16x_info, i32x_info, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, + "{l}", HasFP16>, T_MAP5XS, EVEX_CD8<16, CD8VT1>; +defm VCVTTSH2USI64Z: avx512_cvt_s_all<0x78, "vcvttsh2usi", 
f16x_info, i64x_info, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, + "{q}", HasFP16>, T_MAP5XS, VEX_W, EVEX_CD8<16, CD8VT1>; + +let Predicates = [HasFP16] in { + defm VCVTSI2SHZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR32, + v8f16x_info, i32mem, loadi32, "cvtsi2sh", "l">, + T_MAP5XS, EVEX_CD8<32, CD8VT1>; + defm VCVTSI642SHZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR64, + v8f16x_info, i64mem, loadi64, "cvtsi2sh","q">, + T_MAP5XS, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCVTUSI2SHZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR32, + v8f16x_info, i32mem, loadi32, + "cvtusi2sh","l">, T_MAP5XS, EVEX_CD8<32, CD8VT1>; + defm VCVTUSI642SHZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR64, + v8f16x_info, i64mem, loadi64, "cvtusi2sh", "q">, + T_MAP5XS, VEX_W, EVEX_CD8<64, CD8VT1>; + def : InstAlias<"vcvtsi2sh\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SHZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; + + def : InstAlias<"vcvtusi2sh\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTUSI2SHZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; + + + def : Pat<(f16 (any_sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SHZrm (f16 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f16 (any_sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SHZrm (f16 (IMPLICIT_DEF)), addr:$src)>; + + def : Pat<(f16 (any_sint_to_fp GR32:$src)), + (VCVTSI2SHZrr (f16 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f16 (any_sint_to_fp GR64:$src)), + (VCVTSI642SHZrr (f16 (IMPLICIT_DEF)), GR64:$src)>; + + def : Pat<(f16 (any_uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SHZrm (f16 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f16 (any_uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SHZrm (f16 (IMPLICIT_DEF)), addr:$src)>; + + def : Pat<(f16 (any_uint_to_fp GR32:$src)), + (VCVTUSI2SHZrr (f16 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f16 (any_uint_to_fp GR64:$src)), + (VCVTUSI642SHZrr (f16 (IMPLICIT_DEF)), GR64:$src)>; + + // Patterns used for matching vcvtsi2sh intrinsic sequences from clang + // which produce unnecessary vmovsh instructions + def : Pat<(v8f16 (X86Movsh + (v8f16 VR128X:$dst), + (v8f16 (scalar_to_vector (f16 (any_sint_to_fp GR64:$src)))))), + (VCVTSI642SHZrr_Int VR128X:$dst, GR64:$src)>; + + def : Pat<(v8f16 (X86Movsh + (v8f16 VR128X:$dst), + (v8f16 (scalar_to_vector (f16 (any_sint_to_fp (loadi64 addr:$src))))))), + (VCVTSI642SHZrm_Int VR128X:$dst, addr:$src)>; + + def : Pat<(v8f16 (X86Movsh + (v8f16 VR128X:$dst), + (v8f16 (scalar_to_vector (f16 (any_sint_to_fp GR32:$src)))))), + (VCVTSI2SHZrr_Int VR128X:$dst, GR32:$src)>; + + def : Pat<(v8f16 (X86Movsh + (v8f16 VR128X:$dst), + (v8f16 (scalar_to_vector (f16 (any_sint_to_fp (loadi32 addr:$src))))))), + (VCVTSI2SHZrm_Int VR128X:$dst, addr:$src)>; + + def : Pat<(v8f16 (X86Movsh + (v8f16 VR128X:$dst), + (v8f16 (scalar_to_vector (f16 (any_uint_to_fp GR64:$src)))))), + (VCVTUSI642SHZrr_Int VR128X:$dst, GR64:$src)>; + + def : Pat<(v8f16 (X86Movsh + (v8f16 VR128X:$dst), + (v8f16 (scalar_to_vector (f16 (any_uint_to_fp (loadi64 addr:$src))))))), + (VCVTUSI642SHZrm_Int VR128X:$dst, addr:$src)>; + + def : Pat<(v8f16 (X86Movsh + (v8f16 VR128X:$dst), + (v8f16 (scalar_to_vector (f16 (any_uint_to_fp GR32:$src)))))), + (VCVTUSI2SHZrr_Int VR128X:$dst, GR32:$src)>; + + def : Pat<(v8f16 (X86Movsh + (v8f16 VR128X:$dst), + (v8f16 (scalar_to_vector (f16 (any_uint_to_fp (loadi32 addr:$src))))))), + (VCVTUSI2SHZrm_Int VR128X:$dst, addr:$src)>; +} // 
Predicates = [HasFP16] + +let Predicates = [HasFP16, HasVLX] in { + // Special patterns to allow use of X86VMSintToFP for masking. Instruction + // patterns have been disabled with null_frag. + def : Pat<(v8f16 (X86any_VSintToFP (v4i64 VR256X:$src))), + (VCVTQQ2PHZ256rr VR256X:$src)>; + def : Pat<(X86VMSintToFP (v4i64 VR256X:$src), (v8f16 VR128X:$src0), + VK4WM:$mask), + (VCVTQQ2PHZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>; + def : Pat<(X86VMSintToFP (v4i64 VR256X:$src), v8f16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTQQ2PHZ256rrkz VK4WM:$mask, VR256X:$src)>; + + def : Pat<(v8f16 (X86any_VSintToFP (loadv4i64 addr:$src))), + (VCVTQQ2PHZ256rm addr:$src)>; + def : Pat<(X86VMSintToFP (loadv4i64 addr:$src), (v8f16 VR128X:$src0), + VK4WM:$mask), + (VCVTQQ2PHZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86VMSintToFP (loadv4i64 addr:$src), v8f16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTQQ2PHZ256rmkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8f16 (X86any_VSintToFP (v4i64 (X86VBroadcastld64 addr:$src)))), + (VCVTQQ2PHZ256rmb addr:$src)>; + def : Pat<(X86VMSintToFP (v4i64 (X86VBroadcastld64 addr:$src)), + (v8f16 VR128X:$src0), VK4WM:$mask), + (VCVTQQ2PHZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86VMSintToFP (v4i64 (X86VBroadcastld64 addr:$src)), + v8f16x_info.ImmAllZerosV, VK4WM:$mask), + (VCVTQQ2PHZ256rmbkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8f16 (X86any_VSintToFP (v2i64 VR128X:$src))), + (VCVTQQ2PHZ128rr VR128X:$src)>; + def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v8f16 VR128X:$src0), + VK2WM:$mask), + (VCVTQQ2PHZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; + def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), v8f16x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTQQ2PHZ128rrkz VK2WM:$mask, VR128X:$src)>; + + def : Pat<(v8f16 (X86any_VSintToFP (loadv2i64 addr:$src))), + (VCVTQQ2PHZ128rm addr:$src)>; + def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v8f16 VR128X:$src0), + VK2WM:$mask), + (VCVTQQ2PHZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), v8f16x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTQQ2PHZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v8f16 (X86any_VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), + (VCVTQQ2PHZ128rmb addr:$src)>; + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), + (v8f16 VR128X:$src0), VK2WM:$mask), + (VCVTQQ2PHZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), + v8f16x_info.ImmAllZerosV, VK2WM:$mask), + (VCVTQQ2PHZ128rmbkz VK2WM:$mask, addr:$src)>; + + // Special patterns to allow use of X86VMUintToFP for masking. Instruction + // patterns have been disabled with null_frag. 
+ def : Pat<(v8f16 (X86any_VUintToFP (v4i64 VR256X:$src))), + (VCVTUQQ2PHZ256rr VR256X:$src)>; + def : Pat<(X86VMUintToFP (v4i64 VR256X:$src), (v8f16 VR128X:$src0), + VK4WM:$mask), + (VCVTUQQ2PHZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>; + def : Pat<(X86VMUintToFP (v4i64 VR256X:$src), v8f16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTUQQ2PHZ256rrkz VK4WM:$mask, VR256X:$src)>; + + def : Pat<(v8f16 (X86any_VUintToFP (loadv4i64 addr:$src))), + (VCVTUQQ2PHZ256rm addr:$src)>; + def : Pat<(X86VMUintToFP (loadv4i64 addr:$src), (v8f16 VR128X:$src0), + VK4WM:$mask), + (VCVTUQQ2PHZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86VMUintToFP (loadv4i64 addr:$src), v8f16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTUQQ2PHZ256rmkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8f16 (X86any_VUintToFP (v4i64 (X86VBroadcastld64 addr:$src)))), + (VCVTUQQ2PHZ256rmb addr:$src)>; + def : Pat<(X86VMUintToFP (v4i64 (X86VBroadcastld64 addr:$src)), + (v8f16 VR128X:$src0), VK4WM:$mask), + (VCVTUQQ2PHZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86VMUintToFP (v4i64 (X86VBroadcastld64 addr:$src)), + v8f16x_info.ImmAllZerosV, VK4WM:$mask), + (VCVTUQQ2PHZ256rmbkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8f16 (X86any_VUintToFP (v2i64 VR128X:$src))), + (VCVTUQQ2PHZ128rr VR128X:$src)>; + def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v8f16 VR128X:$src0), + VK2WM:$mask), + (VCVTUQQ2PHZ128rrk VR128X:$src0, VK2WM:$mask, VR128X:$src)>; + def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), v8f16x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTUQQ2PHZ128rrkz VK2WM:$mask, VR128X:$src)>; + + def : Pat<(v8f16 (X86any_VUintToFP (loadv2i64 addr:$src))), + (VCVTUQQ2PHZ128rm addr:$src)>; + def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v8f16 VR128X:$src0), + VK2WM:$mask), + (VCVTUQQ2PHZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), v8f16x_info.ImmAllZerosV, + VK2WM:$mask), + (VCVTUQQ2PHZ128rmkz VK2WM:$mask, addr:$src)>; + + def : Pat<(v8f16 (X86any_VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), + (VCVTUQQ2PHZ128rmb addr:$src)>; + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), + (v8f16 VR128X:$src0), VK2WM:$mask), + (VCVTUQQ2PHZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), + v8f16x_info.ImmAllZerosV, VK2WM:$mask), + (VCVTUQQ2PHZ128rmbkz VK2WM:$mask, addr:$src)>; +} diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -4455,8 +4455,12 @@ { X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0 }, { X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0 }, { X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0 }, + { X86::VCVTSD2SHZrr_Intk, X86::VCVTSD2SHZrm_Intk, TB_NO_REVERSE }, { X86::VCVTSD2SSZrr_Intk, X86::VCVTSD2SSZrm_Intk, TB_NO_REVERSE }, + { X86::VCVTSH2SDZrr_Intk, X86::VCVTSH2SDZrm_Intk, TB_NO_REVERSE }, + { X86::VCVTSH2SSZrr_Intk, X86::VCVTSH2SSZrm_Intk, TB_NO_REVERSE }, { X86::VCVTSS2SDZrr_Intk, X86::VCVTSS2SDZrm_Intk, TB_NO_REVERSE }, + { X86::VCVTSS2SHZrr_Intk, X86::VCVTSS2SHZrm_Intk, TB_NO_REVERSE }, { X86::VDBPSADBWZ128rrik, X86::VDBPSADBWZ128rmik, 0 }, { X86::VDBPSADBWZ256rrik, X86::VDBPSADBWZ256rmik, 0 }, { X86::VDBPSADBWZrrik, X86::VDBPSADBWZrmik, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ 
b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -130,14 +130,12 @@ def X86vmtruncus : SDNode<"X86ISD::VMTRUNCUS", SDTVmtrunc>; def X86vfpext : SDNode<"X86ISD::VFPEXT", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>, - SDTCVecEltisVT<1, f32>, - SDTCisSameSizeAs<0, 1>]>>; + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisFP<1>, SDTCisVec<1>]>>; def X86strict_vfpext : SDNode<"X86ISD::STRICT_VFPEXT", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>, - SDTCVecEltisVT<1, f32>, - SDTCisSameSizeAs<0, 1>]>, + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisFP<1>, SDTCisVec<1>]>, [SDNPHasChain]>; def X86any_vfpext : PatFrags<(ops node:$src), @@ -145,13 +143,13 @@ (X86vfpext node:$src)]>; def X86vfpround: SDNode<"X86ISD::VFPROUND", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, f64>, + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisFP<1>, SDTCisVec<1>, SDTCisOpSmallerThanOp<0, 1>]>>; def X86strict_vfpround: SDNode<"X86ISD::STRICT_VFPROUND", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, f64>, + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisFP<1>, SDTCisVec<1>, SDTCisOpSmallerThanOp<0, 1>]>, [SDNPHasChain]>; @@ -160,33 +158,32 @@ (X86vfpround node:$src)]>; def X86frounds : SDNode<"X86ISD::VFPROUNDS", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCVecEltisVT<2, f64>, + SDTCisFP<2>, SDTCisVec<2>, SDTCisSameSizeAs<0, 2>]>>; def X86froundsRnd: SDNode<"X86ISD::VFPROUNDS_RND", - SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCVecEltisVT<2, f64>, + SDTCisFP<2>, SDTCisVec<2>, SDTCisSameSizeAs<0, 2>, SDTCisVT<3, i32>]>>; def X86fpexts : SDNode<"X86ISD::VFPEXTS", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCVecEltisVT<2, f32>, + SDTCisFP<2>, SDTCisVec<2>, SDTCisSameSizeAs<0, 2>]>>; def X86fpextsSAE : SDNode<"X86ISD::VFPEXTS_SAE", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCVecEltisVT<2, f32>, + SDTCisFP<2>, SDTCisVec<2>, SDTCisSameSizeAs<0, 2>]>>; def X86vmfpround: SDNode<"X86ISD::VMFPROUND", - SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, f64>, - SDTCisSameSizeAs<0, 1>, + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisFP<1>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<1, 3>]>>; @@ -709,7 +706,6 @@ // Masked versions of above def SDTMVintToFP: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisInt<1>, - SDTCisSameSizeAs<0, 1>, SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<1, 3>]>; @@ -757,12 +753,12 @@ SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<1, 4>]> >; def X86vfpextSAE : SDNode<"X86ISD::VFPEXT_SAE", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>, - SDTCVecEltisVT<1, f32>, + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisFP<1>, SDTCisVec<1>, SDTCisOpSmallerThanOp<1, 0>]>>; def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, f64>, + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisFP<1>, SDTCisVec<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisVT<2, i32>]>>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -5177,6 +5177,26 @@ case 
X86::VCVTUSI642SDZrr_Int: case X86::VCVTUSI642SDZrrb_Int: case X86::VCVTUSI642SDZrm_Int: + case X86::VCVTSI2SHZrr: + case X86::VCVTSI2SHZrm: + case X86::VCVTSI2SHZrr_Int: + case X86::VCVTSI2SHZrrb_Int: + case X86::VCVTSI2SHZrm_Int: + case X86::VCVTSI642SHZrr: + case X86::VCVTSI642SHZrm: + case X86::VCVTSI642SHZrr_Int: + case X86::VCVTSI642SHZrrb_Int: + case X86::VCVTSI642SHZrm_Int: + case X86::VCVTUSI2SHZrr: + case X86::VCVTUSI2SHZrm: + case X86::VCVTUSI2SHZrr_Int: + case X86::VCVTUSI2SHZrrb_Int: + case X86::VCVTUSI2SHZrm_Int: + case X86::VCVTUSI642SHZrr: + case X86::VCVTUSI642SHZrm: + case X86::VCVTUSI642SHZrr_Int: + case X86::VCVTUSI642SHZrrb_Int: + case X86::VCVTUSI642SHZrm_Int: // Load folding won't effect the undef register update since the input is // a GPR. return OpNum == 1 && !ForLoadFold; @@ -5278,6 +5298,26 @@ case X86::VSQRTSDZrb_Int: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: + case X86::VCVTSD2SHZrr: + case X86::VCVTSD2SHZrr_Int: + case X86::VCVTSD2SHZrrb_Int: + case X86::VCVTSD2SHZrm: + case X86::VCVTSD2SHZrm_Int: + case X86::VCVTSS2SHZrr: + case X86::VCVTSS2SHZrr_Int: + case X86::VCVTSS2SHZrrb_Int: + case X86::VCVTSS2SHZrm: + case X86::VCVTSS2SHZrm_Int: + case X86::VCVTSH2SDZrr: + case X86::VCVTSH2SDZrr_Int: + case X86::VCVTSH2SDZrrb_Int: + case X86::VCVTSH2SDZrm: + case X86::VCVTSH2SDZrm_Int: + case X86::VCVTSH2SSZrr: + case X86::VCVTSH2SSZrr_Int: + case X86::VCVTSH2SSZrrb_Int: + case X86::VCVTSH2SSZrm: + case X86::VCVTSH2SSZrm_Int: return OpNum == 1; case X86::VMOVSSZrrk: case X86::VMOVSDZrrk: diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -1747,20 +1747,20 @@ // XMM only def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>, + [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>, VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG; def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>, + [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>, VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG; def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86any_vfpround VR256:$src))]>, + [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>, VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>, + [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG; } // Predicates = [HasAVX, NoVLX] @@ -1771,11 +1771,11 @@ def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>, + [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>, Sched<[WriteCvtPD2PS]>, SIMD_EXC; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>, + [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>, Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC; 
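The legacy SSE/AVX patterns above now spell out the v4f32 result type explicitly because the VFPROUND/VFPEXT type profiles in X86InstrFragmentsSIMD.td were relaxed earlier in this change to also admit f16 element types. A minimal, illustrative IR sketch of the conversions the new FP16 handling is meant to select — not part of the patch, function names invented, assuming a subtarget built with +avx512fp16 and +avx512vl (exact instruction choice depends on the remaining feature set):

define <8 x half> @trunc_pd2ph(<8 x double> %a) {
  ; expected to select vcvtpd2ph (zmm source, xmm result)
  %r = fptrunc <8 x double> %a to <8 x half>
  ret <8 x half> %r
}

define <8 x double> @ext_ph2pd(<8 x half> %a) {
  ; expected to select vcvtph2pd
  %r = fpext <8 x half> %a to <8 x double>
  ret <8 x double> %r
}

define <8 x i16> @cvtt_ph2w(<8 x half> %a) {
  ; truncating float-to-signed-int, expected to select vcvttph2w
  %r = fptosi <8 x half> %a to <8 x i16>
  ret <8 x i16> %r
}

define <8 x half> @cvt_uw2ph(<8 x i16> %a) {
  ; unsigned-int-to-float, expected to select vcvtuw2ph
  %r = uitofp <8 x i16> %a to <8 x half>
  ret <8 x half> %r
}

Without FP16 but with F16C, the same IR would instead be handled through the single-precision paths in the combines shown earlier, which now bail out when the FP16 feature is available.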
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -1006,6 +1006,117 @@ X86ISD::FMULS, X86ISD::FMULS_RND), X86_INTRINSIC_DATA(avx512fp16_mask_sub_sh_round, INTR_TYPE_SCALAR_MASK, X86ISD::FSUBS, X86ISD::FSUBS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtdq2ph_128, TRUNCATE_TO_REG, + X86ISD::CVTSI2P, X86ISD::MCVTSI2P), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtpd2ph_128, TRUNCATE_TO_REG, + X86ISD::VFPROUND, X86ISD::VMFPROUND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtpd2ph_256, TRUNCATE_TO_REG, + X86ISD::VFPROUND, X86ISD::VMFPROUND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtpd2ph_512, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, X86ISD::VFPROUND_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2pd_128, INTR_TYPE_1OP_MASK, X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2pd_256, INTR_TYPE_1OP_MASK, X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2pd_512, INTR_TYPE_1OP_MASK_SAE, + ISD::FP_EXTEND, X86ISD::VFPEXT_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2psx_128, INTR_TYPE_1OP_MASK, X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2psx_256, INTR_TYPE_1OP_MASK, ISD::FP_EXTEND, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2psx_512, INTR_TYPE_1OP_MASK_SAE, + ISD::FP_EXTEND, X86ISD::VFPEXT_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uw_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uw_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2uw_512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2w_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2w_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtph2w_512, INTR_TYPE_1OP_MASK, + X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtps2phx_128, TRUNCATE_TO_REG, + X86ISD::VFPROUND, X86ISD::VMFPROUND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtps2phx_256, INTR_TYPE_1OP_MASK, X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtps2phx_512, 
INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, X86ISD::VFPROUND_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtqq2ph_128, TRUNCATE_TO_REG, + X86ISD::CVTSI2P, X86ISD::MCVTSI2P), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtqq2ph_256, TRUNCATE_TO_REG, + X86ISD::CVTSI2P, X86ISD::MCVTSI2P), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtsd2sh_round, INTR_TYPE_SCALAR_MASK_RND, + X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtsh2sd_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtsh2ss_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::VFPEXTS, X86ISD::VFPEXTS_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtss2sh_round, INTR_TYPE_SCALAR_MASK_RND, + X86ISD::VFPROUNDS, X86ISD::VFPROUNDS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2dq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2qq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2udq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uqq_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uw_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uw_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2UI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2uw_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2UI, X86ISD::CVTTP2UI_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2w_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2w_256, INTR_TYPE_1OP_MASK, + X86ISD::CVTTP2SI, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvttph2w_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::CVTTP2SI, X86ISD::CVTTP2SI_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtudq2ph_128, TRUNCATE_TO_REG, + X86ISD::CVTUI2P, X86ISD::MCVTUI2P), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtuqq2ph_128, TRUNCATE_TO_REG, + X86ISD::CVTUI2P, X86ISD::MCVTUI2P), + X86_INTRINSIC_DATA(avx512fp16_mask_vcvtuqq2ph_256, TRUNCATE_TO_REG, + X86ISD::CVTUI2P, X86ISD::MCVTUI2P), X86_INTRINSIC_DATA(avx512fp16_max_ph_128, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512fp16_max_ph_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512fp16_max_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), @@ -1015,6 +1126,23 @@ X86_INTRINSIC_DATA(avx512fp16_mul_ph_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512fp16_sub_ph_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512fp16_vcomi_sh, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + /*fp16 scalar convert instruction*/ + 
X86_INTRINSIC_DATA(avx512fp16_vcvtsh2si32, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND), + X86_INTRINSIC_DATA(avx512fp16_vcvtsh2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, X86ISD::CVTS2SI_RND), + X86_INTRINSIC_DATA(avx512fp16_vcvtsh2usi32, INTR_TYPE_1OP, X86ISD::CVTS2UI, X86ISD::CVTS2UI_RND), + X86_INTRINSIC_DATA(avx512fp16_vcvtsh2usi64, INTR_TYPE_1OP, X86ISD::CVTS2UI, X86ISD::CVTS2UI_RND), + X86_INTRINSIC_DATA(avx512fp16_vcvtsi2sh, INTR_TYPE_2OP, + X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512fp16_vcvtsi642sh, INTR_TYPE_2OP, + X86ISD::SCALAR_SINT_TO_FP, X86ISD::SCALAR_SINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512fp16_vcvttsh2si32, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512fp16_vcvttsh2si64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2SI, X86ISD::CVTTS2SI_SAE), + X86_INTRINSIC_DATA(avx512fp16_vcvttsh2usi32, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512fp16_vcvttsh2usi64, INTR_TYPE_1OP_SAE, X86ISD::CVTTS2UI, X86ISD::CVTTS2UI_SAE), + X86_INTRINSIC_DATA(avx512fp16_vcvtusi2sh, INTR_TYPE_2OP, + X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND), + X86_INTRINSIC_DATA(avx512fp16_vcvtusi642sh, INTR_TYPE_2OP, + X86ISD::SCALAR_UINT_TO_FP, X86ISD::SCALAR_UINT_TO_FP_RND), X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll @@ -282,3 +282,364 @@ %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer ret <32 x half> %res1 } + +declare <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half>, <8 x double>, i8, i32) + +define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd(<8 x half> %x0, <8 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_ph2pd: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2pd %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> %x0, <8 x double> %x1, i8 %x2, i32 4) + ret <8 x double> %res +} + +define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_sae(<8 x half> %x0, <8 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_ph2pd_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2pd {sae}, %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> %x0, <8 x double> %x1, i8 %x2, i32 8) + ret <8 x double> %res +} + +define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_nomask(<8 x half> %x0, <8 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_ph2pd_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> %x0, <8 x double> %x1, i8 -1, i32 4) + ret <8 x double> %res +} + +define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_load(<8 x half>* %px0, <8 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_ph2pd_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2pd (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %x0 = 
load <8 x half>, <8 x half>* %px0, align 16 + %res = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> %x0, <8 x double> %x1, i8 %x2, i32 4) + ret <8 x double> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph(<8 x double> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_pd2ph: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2ph %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> %x0, <8 x half> %x1, i8 %x2, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_r(<8 x double> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_pd2ph_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2ph {rz-sae}, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> %x0, <8 x half> %x1, i8 %x2, i32 11) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_load(<8 x double>* %px0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_pd2ph_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtpd2phz (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %x0 = load <8 x double>, <8 x double>* %px0, align 64 + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> %x0, <8 x half> %x1, i8 %x2, i32 4) + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half>, <4 x float>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round(<8 x half> %x0, <4 x float> %x1, <8 x half> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_ss2sh_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtss2sh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> %x0, <4 x float> %x1, <8 x half> %x2, i8 %x3, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_r(<8 x half> %x0, <4 x float> %x1, <8 x half> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_ss2sh_round_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtss2sh {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> %x0, <4 x float> %x1, <8 x half> %x2, i8 %x3, i32 11) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_nomask(<8 x half> %x0, <4 x float> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_ss2sh_round_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtss2sh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> %x0, <4 x float> %x1, <8 x half> %x2, i8 -1, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_z(<8 x half> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_ss2sh_round_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtss2sh %xmm1, %xmm0, %xmm0 {%k1} 
{z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> %x0, <4 x float> %x1, <8 x half> zeroinitializer, i8 %x2, i32 4) + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half>, <2 x double>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round(<8 x half> %x0, <2 x double> %x1, <8 x half> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sd2sh_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsd2sh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> %x0, <2 x double> %x1, <8 x half> %x2, i8 %x3, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_r(<8 x half> %x0, <2 x double> %x1, <8 x half> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sd2sh_round_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsd2sh {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> %x0, <2 x double> %x1, <8 x half> %x2, i8 %x3, i32 11) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_nomask(<8 x half> %x0, <2 x double> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sd2sh_round_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsd2sh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> %x0, <2 x double> %x1, <8 x half> %x2, i8 -1, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_z(<8 x half> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sd2sh_round_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsd2sh %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> %x0, <2 x double> %x1, <8 x half> zeroinitializer, i8 %x2, i32 4) + ret <8 x half> %res +} + +declare <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float>, <8 x half>, <4 x float>, i8, i32) + +define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round(<4 x float> %x0, <8 x half> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sh2ss_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> %x0, <8 x half> %x1, <4 x float> %x2, i8 %x3, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_r(<4 x float> %x0, <8 x half> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sh2ss_round_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsh2ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> %x0, <8 x half> %x1, <4 x float> %x2, i8 %x3, i32 8) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_nomask(<4 x float> %x0, <8 x half> %x1, <4 x float> %x2) { +; CHECK-LABEL: 
test_int_x86_avx512fp16_mask_cvt_sh2ss_round_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> %x0, <8 x half> %x1, <4 x float> %x2, i8 -1, i32 4) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_z(<4 x float> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sh2ss_round_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> %x0, <8 x half> %x1, <4 x float> zeroinitializer, i8 %x2, i32 4) + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double>, <8 x half>, <2 x double>, i8, i32) + +define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round(<2 x double> %x0, <8 x half> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sh2sd_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsh2sd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> %x0, <8 x half> %x1, <2 x double> %x2, i8 %x3, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_r(<2 x double> %x0, <8 x half> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sh2sd_round_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsh2sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> %x0, <8 x half> %x1, <2 x double> %x2, i8 %x3, i32 8) + ret <2 x double> %res +} + +define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_nomask(<2 x double> %x0, <8 x half> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sh2sd_round_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsh2sd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> %x0, <8 x half> %x1, <2 x double> %x2, i8 -1, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_z(<2 x double> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvt_sh2sd_round_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtsh2sd %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> %x0, <8 x half> %x1, <2 x double> zeroinitializer, i8 %x2, i32 4) + ret <2 x double> %res +} + +declare <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512_cvt_ph2psx_512(<16 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2psx_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %ymm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> %x0, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512_mask_cvt_ph2psx_512(<16 x half> %x0, <16 x float> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2psx_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2psx %ymm0, 
%zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> %x0, <16 x float> %x1, i16 %x2, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_512(<16 x half> %x0, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2psx_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2psx %ymm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> %x0, <16 x float> zeroinitializer, i16 %x2, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512_cvt_ph2psx_512r(<16 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2psx_512r: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx {sae}, %ymm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> %x0, <16 x float> undef, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512_mask_cvt_ph2psx_512r(<16 x half> %x0, <16 x float> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2psx_512r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2psx {sae}, %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> %x0, <16 x float> %x1, i16 %x2, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_512r(<16 x half> %x0, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2psx_512r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2psx {sae}, %ymm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> %x0, <16 x float> zeroinitializer, i16 %x2, i32 8) + ret <16 x float> %res +} + +declare <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float>, <16 x half>, i16, i32) + +define <16 x half> @test_int_x86_avx512_cvt_ps2phx_512(<16 x float> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2phx_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %zmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> %x0, <16 x half> undef, i16 -1, i32 4) + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512_mask_cvt_ps2phx_512(<16 x float> %x0, <16 x half> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2phx_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2phx %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> %x0, <16 x half> %x1, i16 %x2, i32 4) + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512_maskz_cvt_ps2phx_512(<16 x float> %x0, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ps2phx_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2phx %zmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> %x0, <16 x half> zeroinitializer, i16 %x2, i32 4) + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512_mask_cvt_ps2phx_512r(<16 x float> %x0, <16 x half> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2phx_512r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2phx {rd-sae}, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: 
vcvtps2phx {ru-sae}, %zmm0, %ymm0 +; CHECK-NEXT: vaddph %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> %x0, <16 x half> %x1, i16 %x2, i32 9) + %res1 = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> %x0, <16 x half> %x1, i16 -1, i32 10) + %res2 = fadd <16 x half> %res, %res1 + ret <16 x half> %res2 +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll @@ -402,3 +402,403 @@ %res0 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %x1, <16 x half> %x2) ret <16 x half> %res0 } + +declare <4 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.256(<8 x half>, <4 x double>, i8) + +define <4 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_256(<8 x half> %x0, <4 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_ph2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2pd %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.256(<8 x half> %x0, <4 x double> %x1, i8 %x2) + ret <4 x double> %res +} + +define <4 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_256_nomask(<8 x half> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_ph2pd_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.256(<8 x half> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.128(<8 x half>, <2 x double>, i8) + +define <2 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_128(<8 x half> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_ph2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2pd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.128(<8 x half> %x0, <2 x double> %x1, i8 %x2) + ret <2 x double> %res +} + +define <2 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_128_nomask(<8 x half> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_ph2pd_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.128(<8 x half> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.256(<4 x double>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_256(<4 x double> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_pd2ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2ph %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.256(<4 x double> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_256_load(<4 x double>* %px0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_pd2ph_256_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtpd2phy (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %x0 = load <4 x double>, <4 x double>* %px0, align 32 + 
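+  ; The masked convert consumes the <4 x double> loaded just above, so the
+  ; memory operand is folded and the y-suffixed form in the CHECK lines
+  ; (vcvtpd2phy (%rdi), %xmm0 {%k1}) is selected.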
%res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.256(<4 x double> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.128(<2 x double>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_128(<2 x double> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_pd2ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2ph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.128(<2 x double> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_128_load(<2 x double>* %px0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vcvt_pd2ph_128_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtpd2phx (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %x0 = load <2 x double>, <2 x double>* %px0, align 16 + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.128(<2 x double> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +declare <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.128(<8 x half>, <4 x i32>, i8) + +define <4 x i32> @test_int_x86_avx512_cvt_ph2udq_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2udq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.128(<8 x half> %x0, <4 x i32> undef, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_maskz_cvt_ph2udq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.128(<8 x half> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.256(<8 x half>, <8 x i32>, i8) + +define <8 x i32> @test_int_x86_avx512_cvt_ph2udq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2udq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.256(<8 x half> %x0, <8 x i32> undef, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_maskz_cvt_ph2udq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x 
i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.256(<8 x half> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.128(<8 x half>, <4 x i32>, i8) + +define <4 x i32> @test_int_x86_avx512_cvtt_ph2dq_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.128(<8 x half> %x0, <4 x i32> undef, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_maskz_cvtt_ph2dq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.128(<8 x half> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.256(<8 x half>, <8 x i32>, i8) + +define <8 x i32> @test_int_x86_avx512_cvtt_ph2dq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.256(<8 x half> %x0, <8 x i32> undef, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_maskz_cvtt_ph2dq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.256(<8 x half> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.128(<8 x half>, <4 x i32>, i8) + +define <4 x i32> @test_int_x86_avx512_cvtt_ph2udq_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.128(<8 x half> %x0, <4 x i32> undef, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32> 
@test_int_x86_avx512_maskz_cvtt_ph2udq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.128(<8 x half> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.256(<8 x half>, <8 x i32>, i8) + +define <8 x i32> @test_int_x86_avx512_cvtt_ph2udq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.256(<8 x half> %x0, <8 x i32> undef, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_maskz_cvtt_ph2udq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.256(<8 x half> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.128(<8 x half>, <4 x float>, i8) + +define <4 x float> @test_int_x86_avx512_cvt_ph2psx_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2psx_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.128(<8 x half> %x0, <4 x float> undef, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512_mask_cvt_ph2psx_128(<8 x half> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2psx_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2psx %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.128(<8 x half> %x0, <4 x float> %x1, i8 %x2) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2psx_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.128(<8 x half> %x0, <4 x float> zeroinitializer, i8 %x2) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.256(<8 x half>, <8 x float>, i8) + +define <8 x float> @test_int_x86_avx512_cvt_ph2psx_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2psx_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.256(<8 x half> %x0, <8 x float> undef, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512_mask_cvt_ph2psx_256(<8 x half> %x0, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_cvt_ph2psx_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2psx %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.256(<8 x half> %x0, <8 x float> %x1, i8 %x2) + ret <8 x float> %res +} + +define <8 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2psx_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2psx %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.256(<8 x half> %x0, <8 x float> zeroinitializer, i8 %x2) + ret <8 x float> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.128(<4 x float>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_ps2phx_128(<4 x float> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2phx_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2phx %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vcvtps2phx %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.128(<4 x float> %x0, <8 x half> %x1, i8 %x2) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.128(<4 x float> %x0, <8 x half> %x1, i8 -1) + %res2 = fadd <8 x half> %res, %res1 + ret <8 x half> %res2 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.256(<8 x float>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_cvt_ps2phx_256(<8 x float> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2phx_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.256(<8 x float> %x0, <8 x half> undef, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_ps2phx_256(<8 x float> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2phx_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2phx %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.256(<8 x float> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_maskz_cvt_ps2phx_256(<8 x float> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ps2phx_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtps2phx %ymm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.256(<8 x float> %x0, <8 x half> zeroinitializer, i8 %x2) + ret <8 x half> %res +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll --- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll @@ -274,6 +274,68 @@ ret <32 x i1> %0 } +define <8 x half> @regression_test1(<8 x half> %x, <8 x half> %y) #0 { +; CHECK-LABEL: regression_test1: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; CHECK-NEXT: retq +entry: + %a = fsub <8 x half> %x, %y + %b = fadd <8 x half> %x, %y + %c = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> + ret <8 x 
half> %c +} + +define <8 x i16> @regression_test2(<8 x float> %x) #0 { +; CHECK-LABEL: regression_test2: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0 +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %a = fptoui <8 x float> %x to <8 x i16> + ret <8 x i16> %a +} + +define <8 x i16> @regression_test3(<8 x float> %x) #0 { +; CHECK-LABEL: regression_test3: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %a = fptosi <8 x float> %x to <8 x i16> + ret <8 x i16> %a +} + +define <8 x i16> @regression_test4(<8 x double> %x) #0 { +; CHECK-LABEL: regression_test4: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcvttpd2udq %zmm0, %ymm0 +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %a = fptoui <8 x double> %x to <8 x i16> + ret <8 x i16> %a +} + +define <8 x i16> @regression_test5(<8 x double> %x) #0 { +; CHECK-LABEL: regression_test5: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm0 +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %a = fptosi <8 x double> %x to <8 x i16> + ret <8 x i16> %a +} + define <8 x i1> @fcmp_v8f16(<8 x half> %a, <8 x half> %b) ; CHECK-LABEL: fcmp_v8f16: ; CHECK: ## %bb.0: ## %entry diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-intrinsics.ll @@ -0,0 +1,549 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unkown-unkown -mattr=+avx512bw -mattr=+avx512fp16 | FileCheck %s + +declare <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16>, i32) + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512(<32 x i16> %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> %arg0, i32 4) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_2(<32 x i16> %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_2: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = sitofp <32 x i16> %arg0 to <32 x half> + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_b(i16* %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtw2ph (%rdi){1to32}, %zmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %scalar = load i16, i16* %arg0 + %scalar_in_vector = insertelement <32 x i16> undef, i16 %scalar, i32 0 + %val = shufflevector <32 x i16> %scalar_in_vector, <32 x i16> undef, <32 x i32> zeroinitializer + %res0 = call <32 x half> 
@llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> %val, i32 4) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_b_2(i16* %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_b_2: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtw2ph (%rdi){1to32}, %zmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %scalar = load i16, i16* %arg0 + %scalar_in_vector = insertelement <32 x i16> undef, i16 %scalar, i32 0 + %val = shufflevector <32 x i16> %scalar_in_vector, <32 x i16> undef, <32 x i32> zeroinitializer + %res0 = sitofp <32 x i16> %val to <32 x half> + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_r(<32 x i16> %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph {ru-sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> %arg0, i32 10) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_nomask(<32 x i16> %arg0) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> %arg0, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_nomask_2(<32 x i16> %arg0) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_nomask_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = sitofp <32 x i16> %arg0 to <32 x half> + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_z(<32 x i16> %arg0, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> %arg0, i32 4) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_z_2(<32 x i16> %arg0, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_z_2: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = sitofp <32 x i16> %arg0 to <32 x half> + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_load(<32 x i16>* %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtw2ph (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %val = load <32 x i16>, <32 x i16>* %arg0 + %res0 = call <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> %val, i32 4) + %res = select 
<32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_512_load_2(<32 x i16>* %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_512_load_2: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtw2ph (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %val = load <32 x i16>, <32 x i16>* %arg0 + %res0 = sitofp <32 x i16> %val to <32 x half> + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +declare <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half>, <32 x i16>, i32, i32) + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2w %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_512_b(half* %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_512_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2w (%rdi){1to32}, %zmm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <32 x half> undef, half %scalar, i32 0 + %val = shufflevector <32 x half> %scalar_in_vector, <32 x half> undef, <32 x i32> zeroinitializer + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> %val, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_512_r(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_512_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2w {rd-sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask, i32 9) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_512_nomask(<32 x half> %arg0, <32 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2w %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> %arg0, <32 x i16> %arg1, i32 -1, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_512_z(<32 x half> %arg0, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2w %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> %arg0, <32 x i16> zeroinitializer, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_512_load(<32 x half>* %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_512_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2w (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %val = load <32 x half>, <32 x half>* %arg0 + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> %val, <32 x i16> %arg1, i32 
%mask, i32 4) + ret <32 x i16> %res +} + + +declare <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16>, i32) + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512(<32 x i16> %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> %arg0, i32 4) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_2(<32 x i16> %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_2: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = uitofp <32 x i16> %arg0 to <32 x half> + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_b(i16* %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtuw2ph (%rdi){1to32}, %zmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %scalar = load i16, i16* %arg0 + %scalar_in_vector = insertelement <32 x i16> undef, i16 %scalar, i32 0 + %val = shufflevector <32 x i16> %scalar_in_vector, <32 x i16> undef, <32 x i32> zeroinitializer + %res0 = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> %val, i32 4) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_b_2(i16* %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_b_2: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtuw2ph (%rdi){1to32}, %zmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %scalar = load i16, i16* %arg0 + %scalar_in_vector = insertelement <32 x i16> undef, i16 %scalar, i32 0 + %val = shufflevector <32 x i16> %scalar_in_vector, <32 x i16> undef, <32 x i32> zeroinitializer + %res0 = uitofp <32 x i16> %val to <32 x half> + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_r(<32 x i16> %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph {ru-sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> %arg0, i32 10) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_nomask(<32 x i16> %arg0) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> %arg0, i32 4) + ret <32 x half> %res +} + +define <32 x half> 
@test_int_x86_avx512fp16_mask_cvtuw2ph_512_nomask_2(<32 x i16> %arg0) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_nomask_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = uitofp <32 x i16> %arg0 to <32 x half> + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_z(<32 x i16> %arg0, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> %arg0, i32 4) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_z_2(<32 x i16> %arg0, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_z_2: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %res0 = uitofp <32 x i16> %arg0 to <32 x half> + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_load(<32 x i16>* %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtuw2ph (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %val = load <32 x i16>, <32 x i16>* %arg0 + %res0 = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> %val, i32 4) + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_512_load_2(<32 x i16>* %arg0, <32 x half> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_512_load_2: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtuw2ph (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i32 %mask to <32 x i1> + %val = load <32 x i16>, <32 x i16>* %arg0 + %res0 = uitofp <32 x i16> %val to <32 x half> + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %arg1 + ret <32 x half> %res +} + +declare <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half>, <32 x i16>, i32, i32) + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2uw %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_512_b(half* %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_512_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2uw (%rdi){1to32}, %zmm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <32 x half> undef, half %scalar, i32 0 + %val = shufflevector <32 x half> %scalar_in_vector, <32 x half> undef, <32 x i32> zeroinitializer + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> %val, <32 x i16> 
%arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_512_r(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_512_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2uw {rd-sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask, i32 9) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_512_nomask(<32 x half> %arg0, <32 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2uw %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> %arg0, <32 x i16> %arg1, i32 -1, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_512_z(<32 x half> %arg0, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2uw %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> %arg0, <32 x i16> zeroinitializer, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_512_load(<32 x half>* %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_512_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2uw (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %val = load <32 x half>, <32 x half>* %arg0 + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> %val, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half>, <32 x i16>, i32, i32) + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2w %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_512_b(half* %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_512_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2w (%rdi){1to32}, %zmm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <32 x half> undef, half %scalar, i32 0 + %val = shufflevector <32 x half> %scalar_in_vector, <32 x half> undef, <32 x i32> zeroinitializer + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> %val, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_512_sae(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2w {sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask, i32 8) + ret <32 x i16> %res +} + +define 
<32 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_512_nomask(<32 x half> %arg0, <32 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> %arg0, <32 x i16> %arg1, i32 -1, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_512_z(<32 x half> %arg0, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2w %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> %arg0, <32 x i16> zeroinitializer, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_512_load(<32 x half>* %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_512_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2w (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %val = load <32 x half>, <32 x half>* %arg0 + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> %val, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half>, <32 x i16>, i32, i32) + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uw %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_512_b(half* %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_512_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2uw (%rdi){1to32}, %zmm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <32 x half> undef, half %scalar, i32 0 + %val = shufflevector <32 x half> %scalar_in_vector, <32 x half> undef, <32 x i32> zeroinitializer + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> %val, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_512_sae(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uw {sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> %arg0, <32 x i16> %arg1, i32 %mask, i32 8) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_512_nomask(<32 x half> %arg0, <32 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uw %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> %arg0, <32 x i16> %arg1, i32 -1, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_512_z(<32 x half> %arg0, i32 %mask) { +; CHECK-LABEL: 
test_int_x86_avx512fp16_mask_cvttph2uw_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uw %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> %arg0, <32 x i16> zeroinitializer, i32 %mask, i32 4) + ret <32 x i16> %res +} + +define <32 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_512_load(<32 x half>* %arg0, <32 x i16> %arg1, i32 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_512_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2uw (%rdi), %zmm0 {%k1} +; CHECK-NEXT: retq + %val = load <32 x half>, <32 x half>* %arg0 + %res = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> %val, <32 x i16> %arg1, i32 %mask, i32 4) + ret <32 x i16> %res +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll @@ -0,0 +1,770 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512fp16 | FileCheck %s + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_256(<16 x i16> %arg0, <16 x half> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = sitofp <16 x i16> %arg0 to <16 x half> + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %arg1 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_256_b(i16* %arg0, <16 x half> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_256_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtw2ph (%rdi){1to16}, %ymm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %scalar = load i16, i16* %arg0 + %scalar_in_vector = insertelement <16 x i16> undef, i16 %scalar, i32 0 + %val = shufflevector <16 x i16> %scalar_in_vector, <16 x i16> undef, <16 x i32> zeroinitializer + %res0 = sitofp <16 x i16> %val to <16 x half> + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %arg1 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_256_nomask(<16 x i16> %arg0, <16 x half> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtw2ph %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = sitofp <16 x i16> %arg0 to <16 x half> + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_256_z(<16 x i16> %arg0, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = sitofp <16 x i16> %arg0 to <16 x half> + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_256_load(<16 x i16>* %arg0, <16 x half> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_256_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtw2ph (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i16 
%mask to <16 x i1> + %val = load <16 x i16>, <16 x i16>* %arg0 + %res0 = sitofp <16 x i16> %val to <16 x half> + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %arg1 + ret <16 x half> %res +} + +declare <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.256(<16 x half>, <16 x i16>, i16) + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_256(<16 x half> %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2w %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.256(<16 x half> %arg0, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_256_b(half* %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_256_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2w (%rdi){1to16}, %ymm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <16 x half> undef, half %scalar, i32 0 + %val = shufflevector <16 x half> %scalar_in_vector, <16 x half> undef, <16 x i32> zeroinitializer + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.256(<16 x half> %val, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_256_nomask(<16 x half> %arg0, <16 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2w %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.256(<16 x half> %arg0, <16 x i16> %arg1, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_256_z(<16 x half> %arg0, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2w %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.256(<16 x half> %arg0, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_256_load(<16 x half>* %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_256_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2w (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %val = load <16 x half>, <16 x half>* %arg0 + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.256(<16 x half> %val, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_256(<16 x i16> %arg0, <16 x half> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = uitofp <16 x i16> %arg0 to <16 x half> + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %arg1 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_256_b(i16* %arg0, <16 x half> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_256_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtuw2ph (%rdi){1to16}, %ymm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %scalar = load i16, i16* %arg0 + 
%scalar_in_vector = insertelement <16 x i16> undef, i16 %scalar, i32 0 + %val = shufflevector <16 x i16> %scalar_in_vector, <16 x i16> undef, <16 x i32> zeroinitializer + %res0 = uitofp <16 x i16> %val to <16 x half> + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %arg1 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_256_nomask(<16 x i16> %arg0, <16 x half> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuw2ph %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = uitofp <16 x i16> %arg0 to <16 x half> + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_256_z(<16 x i16> %arg0, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = uitofp <16 x i16> %arg0 to <16 x half> + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_256_load(<16 x i16>* %arg0, <16 x half> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_256_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtuw2ph (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x i16>, <16 x i16>* %arg0 + %res0 = uitofp <16 x i16> %val to <16 x half> + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %arg1 + ret <16 x half> %res +} + +declare <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.256(<16 x half>, <16 x i16>, i16) + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_256(<16 x half> %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2uw %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.256(<16 x half> %arg0, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_256_b(half* %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_256_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2uw (%rdi){1to16}, %ymm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <16 x half> undef, half %scalar, i32 0 + %val = shufflevector <16 x half> %scalar_in_vector, <16 x half> undef, <16 x i32> zeroinitializer + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.256(<16 x half> %val, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_256_nomask(<16 x half> %arg0, <16 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2uw %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.256(<16 x half> %arg0, <16 x i16> %arg1, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_256_z(<16 x half> %arg0, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2uw %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x i16> 
@llvm.x86.avx512fp16.mask.vcvtph2uw.256(<16 x half> %arg0, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_256_load(<16 x half>* %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_256_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2uw (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %val = load <16 x half>, <16 x half>* %arg0 + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.256(<16 x half> %val, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.256(<16 x half>, <16 x i16>, i16) + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_256(<16 x half> %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2w %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.256(<16 x half> %arg0, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_256_b(half* %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_256_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2w (%rdi){1to16}, %ymm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <16 x half> undef, half %scalar, i32 0 + %val = shufflevector <16 x half> %scalar_in_vector, <16 x half> undef, <16 x i32> zeroinitializer + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.256(<16 x half> %val, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_256_nomask(<16 x half> %arg0, <16 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.256(<16 x half> %arg0, <16 x i16> %arg1, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_256_z(<16 x half> %arg0, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2w %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.256(<16 x half> %arg0, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_256_load(<16 x half>* %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_256_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2w (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %val = load <16 x half>, <16 x half>* %arg0 + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.256(<16 x half> %val, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.256(<16 x half>, <16 x i16>, i16) + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_256(<16 x half> %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uw %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> 
@llvm.x86.avx512fp16.mask.vcvttph2uw.256(<16 x half> %arg0, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_256_b(half* %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_256_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2uw (%rdi){1to16}, %ymm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <16 x half> undef, half %scalar, i32 0 + %val = shufflevector <16 x half> %scalar_in_vector, <16 x half> undef, <16 x i32> zeroinitializer + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.256(<16 x half> %val, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_256_nomask(<16 x half> %arg0, <16 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uw %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.256(<16 x half> %arg0, <16 x i16> %arg1, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_256_z(<16 x half> %arg0, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uw %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.256(<16 x half> %arg0, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_256_load(<16 x half>* %arg0, <16 x i16> %arg1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_256_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2uw (%rdi), %ymm0 {%k1} +; CHECK-NEXT: retq + %val = load <16 x half>, <16 x half>* %arg0 + %res = call <16 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.256(<16 x half> %val, <16 x i16> %arg1, i16 %mask) + ret <16 x i16> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_128(<8 x i16> %arg0, <8 x half> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = sitofp <8 x i16> %arg0 to <8 x half> + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %arg1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_128_b(i16* %arg0, <8 x half> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_128_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtw2ph (%rdi){1to8}, %xmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %scalar = load i16, i16* %arg0 + %scalar_in_vector = insertelement <8 x i16> undef, i16 %scalar, i32 0 + %val = shufflevector <8 x i16> %scalar_in_vector, <8 x i16> undef, <8 x i32> zeroinitializer + %res0 = sitofp <8 x i16> %val to <8 x half> + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %arg1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_128_nomask(<8 x i16> %arg0, <8 x half> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtw2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = sitofp <8 x i16> %arg0 to <8 x half> + ret <8 x half> %res +} + +define <8 x 
half> @test_int_x86_avx512fp16_mask_cvtw2ph_128_z(<8 x i16> %arg0, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtw2ph %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = sitofp <8 x i16> %arg0 to <8 x half> + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtw2ph_128_load(<8 x i16>* %arg0, <8 x half> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtw2ph_128_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtw2ph (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x i16>, <8 x i16>* %arg0 + %res0 = sitofp <8 x i16> %val to <8 x half> + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %arg1 + ret <8 x half> %res +} + +declare <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.128(<8 x half>, <8 x i16>, i8) + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_128(<8 x half> %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2w %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.128(<8 x half> %arg0, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_128_b(half* %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_128_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2w (%rdi){1to8}, %xmm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <8 x half> undef, half %scalar, i32 0 + %val = shufflevector <8 x half> %scalar_in_vector, <8 x half> undef, <8 x i32> zeroinitializer + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.128(<8 x half> %val, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_128_nomask(<8 x half> %arg0, <8 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2w %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.128(<8 x half> %arg0, <8 x i16> %arg1, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_128_z(<8 x half> %arg0, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2w %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.128(<8 x half> %arg0, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2w_128_load(<8 x half>* %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2w_128_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2w (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %val = load <8 x half>, <8 x half>* %arg0 + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.128(<8 x half> %val, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_128(<8 x i16> %arg0, <8 x half> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_128: +; CHECK: # 
%bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = uitofp <8 x i16> %arg0 to <8 x half> + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %arg1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_128_b(i16* %arg0, <8 x half> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_128_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtuw2ph (%rdi){1to8}, %xmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %scalar = load i16, i16* %arg0 + %scalar_in_vector = insertelement <8 x i16> undef, i16 %scalar, i32 0 + %val = shufflevector <8 x i16> %scalar_in_vector, <8 x i16> undef, <8 x i32> zeroinitializer + %res0 = uitofp <8 x i16> %val to <8 x half> + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %arg1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_128_nomask(<8 x i16> %arg0, <8 x half> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = uitofp <8 x i16> %arg0 to <8 x half> + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_128_z(<8 x i16> %arg0, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = uitofp <8 x i16> %arg0 to <8 x half> + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_cvtuw2ph_128_load(<8 x i16>* %arg0, <8 x half> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtuw2ph_128_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtuw2ph (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x i16>, <8 x i16>* %arg0 + %res0 = uitofp <8 x i16> %val to <8 x half> + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %arg1 + ret <8 x half> %res +} + +declare <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.128(<8 x half>, <8 x i16>, i8) + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_128(<8 x half> %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2uw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.128(<8 x half> %arg0, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_128_b(half* %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_128_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2uw (%rdi){1to8}, %xmm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <8 x half> undef, half %scalar, i32 0 + %val = shufflevector <8 x half> %scalar_in_vector, <8 x half> undef, <8 x i32> zeroinitializer + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.128(<8 x half> %val, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_128_nomask(<8 x half> %arg0, <8 x i16> %arg1) { +; 
CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2uw %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.128(<8 x half> %arg0, <8 x i16> %arg1, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_128_z(<8 x half> %arg0, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2uw %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.128(<8 x half> %arg0, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvtph2uw_128_load(<8 x half>* %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvtph2uw_128_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtph2uw (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %val = load <8 x half>, <8 x half>* %arg0 + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.128(<8 x half> %val, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.128(<8 x half>, <8 x i16>, i8) + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_128(<8 x half> %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2w %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.128(<8 x half> %arg0, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_128_b(half* %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_128_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2w (%rdi){1to8}, %xmm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <8 x half> undef, half %scalar, i32 0 + %val = shufflevector <8 x half> %scalar_in_vector, <8 x half> undef, <8 x i32> zeroinitializer + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.128(<8 x half> %val, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_128_nomask(<8 x half> %arg0, <8 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.128(<8 x half> %arg0, <8 x i16> %arg1, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_128_z(<8 x half> %arg0, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.128(<8 x half> %arg0, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2w_128_load(<8 x half>* %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2w_128_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2w (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %val = load <8 x half>, <8 x half>* %arg0 + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.128(<8 x half> %val, <8 x i16> 
%arg1, i8 %mask) + ret <8 x i16> %res +} + +declare <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.128(<8 x half>, <8 x i16>, i8) + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_128(<8 x half> %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.128(<8 x half> %arg0, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_128_b(half* %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_128_b: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2uw (%rdi){1to8}, %xmm0 {%k1} +; CHECK-NEXT: retq + %scalar = load half, half* %arg0 + %scalar_in_vector = insertelement <8 x half> undef, half %scalar, i32 0 + %val = shufflevector <8 x half> %scalar_in_vector, <8 x half> undef, <8 x i32> zeroinitializer + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.128(<8 x half> %val, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_128_nomask(<8 x half> %arg0, <8 x i16> %arg1) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.128(<8 x half> %arg0, <8 x i16> %arg1, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_128_z(<8 x half> %arg0, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.128(<8 x half> %arg0, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16> @test_int_x86_avx512fp16_mask_cvttph2uw_128_load(<8 x half>* %arg0, <8 x i16> %arg1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_cvttph2uw_128_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvttph2uw (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %val = load <8 x half>, <8 x half>* %arg0 + %res = call <8 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.128(<8 x half> %val, <8 x i16> %arg1, i8 %mask) + ret <8 x i16> %res +} + +define <4 x half> @test_u16tofp4(<4 x i16> %arg0) { +; CHECK-LABEL: test_u16tofp4: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = uitofp <4 x i16> %arg0 to <4 x half> + ret <4 x half> %res +} + +define <2 x half> @test_s16tofp2(<2 x i16> %arg0) { +; CHECK-LABEL: test_s16tofp2: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtw2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = sitofp <2 x i16> %arg0 to <2 x half> + ret <2 x half> %res +} + +define <4 x half> @test_u8tofp4(<4 x i8> %arg0) { +; CHECK-LABEL: test_u8tofp4: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = uitofp <4 x i8> %arg0 to <4 x half> + ret <4 x half> %res +} + +define <2 x half> @test_s8tofp2(<2 x i8> %arg0) { +; CHECK-LABEL: test_s8tofp2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 +; CHECK-NEXT: vcvtw2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = sitofp <2 
x i8> %arg0 to <2 x half> + ret <2 x half> %res +} + +define <2 x half> @test_u1tofp2(<2 x i1> %arg0) { +; CHECK-LABEL: test_u1tofp2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = uitofp <2 x i1> %arg0 to <2 x half> + ret <2 x half> %res +} + +define <4 x half> @test_s17tofp4(<4 x i17> %arg0) { +; CHECK-LABEL: test_s17tofp4: +; CHECK: # %bb.0: +; CHECK-NEXT: vpslld $15, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $15, %xmm0, %xmm0 +; CHECK-NEXT: vcvtdq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = sitofp <4 x i17> %arg0 to <4 x half> + ret <4 x half> %res +} + +define <2 x half> @test_u33tofp2(<2 x i33> %arg0) { +; CHECK-LABEL: test_u33tofp2: +; CHECK: # %bb.0: +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vcvtuqq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = uitofp <2 x i33> %arg0 to <2 x half> + ret <2 x half> %res +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll @@ -0,0 +1,1029 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,X86 + +define half @f32tof16(float %b) nounwind { +; X64-LABEL: f32tof16: +; X64: # %bb.0: +; X64-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: f32tof16: +; X86: # %bb.0: +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; X86-NEXT: retl + %a = fptrunc float %b to half + ret half %a +} + +define half @f64tof16(double %b) nounwind { +; X64-LABEL: f64tof16: +; X64: # %bb.0: +; X64-NEXT: vcvtsd2sh %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: f64tof16: +; X86: # %bb.0: +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcvtsd2sh %xmm0, %xmm0, %xmm0 +; X86-NEXT: retl + %a = fptrunc double %b to half + ret half %a +} + +define <16 x half> @f32to16f16(<16 x float> %b) nounwind { +; CHECK-LABEL: f32to16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <16 x float> %b to <16 x half> + ret <16 x half> %a +} + +define <8 x half> @f32to8f16(<8 x float> %b) { +; CHECK-LABEL: f32to8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <8 x float> %b to <8 x half> + ret <8 x half> %a +} + +define <4 x half> @f32to4f16(<4 x float> %b) { +; CHECK-LABEL: f32to4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <4 x float> %b to <4 x half> + ret <4 x half> %a +} + +define <2 x half> @f32to2f16(<2 x float> %b) { +; CHECK-LABEL: f32to2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <2 x float> %b to <2 x half> + ret <2 x half> %a +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.128(<4 x float>, <8 x half>, i8) +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.256(<8 x float>, <8 x half>, i8) + +define <8 x half> @f32to4f16_mask(<4 x float> %a, <8 x half> %b, i8 %mask) { +; X64-LABEL: 
f32to4f16_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vcvtps2phx %xmm0, %xmm1 {%k1} +; X64-NEXT: vmovaps %xmm1, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: f32to4f16_mask: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vcvtps2phx %xmm0, %xmm1 {%k1} +; X86-NEXT: vmovaps %xmm1, %xmm0 +; X86-NEXT: retl + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.128(<4 x float> %a, <8 x half> %b, i8 %mask) + ret <8 x half> %res +} + +define <8 x half> @f32to8f16_mask(<8 x float> %a, <8 x half> %b, i8 %mask) { +; X64-LABEL: f32to8f16_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vcvtps2phx %ymm0, %xmm1 {%k1} +; X64-NEXT: vmovaps %xmm1, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: f32to8f16_mask: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vcvtps2phx %ymm0, %xmm1 {%k1} +; X86-NEXT: vmovaps %xmm1, %xmm0 +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.256(<8 x float> %a, <8 x half> %b, i8 %mask) + ret <8 x half> %res +} + +define <8 x half> @f32to8f16_mask2(<8 x float> %b, <8 x i1> %mask) { +; CHECK-LABEL: f32to8f16_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsllw $15, %xmm1, %xmm1 +; CHECK-NEXT: vpmovw2m %xmm1, %k1 +; CHECK-NEXT: vcvtps2phx %ymm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <8 x float> %b to <8 x half> + %c = select <8 x i1>%mask, <8 x half>%a, <8 x half> zeroinitializer + ret <8 x half> %c +} + +define <16 x half> @f32to16f16_mask(<16 x float> %b, <16 x i1> %mask) { +; CHECK-LABEL: f32to16f16_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1 +; CHECK-NEXT: vpmovb2m %xmm1, %k1 +; CHECK-NEXT: vcvtps2phx %zmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <16 x float> %b to <16 x half> + %c = select <16 x i1>%mask, <16 x half>%a, <16 x half> zeroinitializer + ret <16 x half> %c +} + +define float @f16tof32(half %b) nounwind { +; X64-LABEL: f16tof32: +; X64: # %bb.0: +; X64-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: f16tof32: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %a = fpext half %b to float + ret float %a +} + +define double @f16tof64(half %b) nounwind { +; X64-LABEL: f16tof64: +; X64: # %bb.0: +; X64-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: f16tof64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsh 8(%ebp), %xmm0 +; X86-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %a = fpext half %b to double + ret double %a +} + +define <16 x float> @f16to16f32(<16 x half> %b) nounwind { +; CHECK-LABEL: f16to16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <16 x half> %b to <16 x float> + ret <16 x float> %a +} + +define <8 x float> @f16to8f32(<8 x half> %b) nounwind { +; CHECK-LABEL: f16to8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <8 x half> %b to <8 x float> + ret <8 x float> %a +} + +define <4 x float> @f16to4f32(<4 x half> %b) nounwind { +; CHECK-LABEL: f16to4f32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <4 x half> %b to <4 x float> + ret <4 x float> %a +} + +define <2 x float> @f16to2f32(<2 x half> %b) nounwind { +; CHECK-LABEL: f16to2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <2 x half> %b to <2 x float> + ret <2 x float> %a +} + +define <16 x float> @f16to16f32_mask(<16 x half> %b, <16 x float> %b1, <16 x float> %a1) { +; CHECK-LABEL: f16to16f32_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpltps %zmm2, %zmm1, %k1 +; CHECK-NEXT: vcvtph2psx %ymm0, %zmm0 {%k1} {z} +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <16 x half> %b to <16 x float> + %mask = fcmp ogt <16 x float> %a1, %b1 + %c = select <16 x i1> %mask, <16 x float> %a, <16 x float> zeroinitializer + ret <16 x float> %c +} + +define <8 x float> @f16to8f32_mask(<8 x half> %b, <8 x float> %b1, <8 x float> %a1) { +; CHECK-LABEL: f16to8f32_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpltps %ymm2, %ymm1, %k1 +; CHECK-NEXT: vcvtph2psx %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <8 x half> %b to <8 x float> + %mask = fcmp ogt <8 x float> %a1, %b1 + %c = select <8 x i1> %mask, <8 x float> %a, <8 x float> zeroinitializer + ret <8 x float> %c +} + +define <4 x float> @f16to4f32_mask(<4 x half> %b, <4 x float> %b1, <4 x float> %a1) { +; CHECK-LABEL: f16to4f32_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpltps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <4 x half> %b to <4 x float> + %mask = fcmp ogt <4 x float> %a1, %b1 + %c = select <4 x i1> %mask, <4 x float> %a, <4 x float> zeroinitializer + ret <4 x float> %c +} + +define <2 x float> @f16to2f32_mask(<2 x half> %b, <2 x float> %b1, <2 x float> %a1) { +; CHECK-LABEL: f16to2f32_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpltps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <2 x half> %b to <2 x float> + %mask = fcmp ogt <2 x float> %a1, %b1 + %c = select <2 x i1> %mask, <2 x float> %a, <2 x float> zeroinitializer + ret <2 x float> %c +} + +define <2 x double> @f16to2f64(<2 x half> %b) nounwind { +; CHECK-LABEL: f16to2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <2 x half> %b to <2 x double> + ret <2 x double> %a +} + +define <2 x double> @f16to2f64_mask(<2 x half> %b, <2 x double> %b1, <2 x double> %a1) { +; CHECK-LABEL: f16to2f64_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpltpd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vcvtph2pd %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <2 x half> %b to <2 x double> + %mask = fcmp ogt <2 x double> %a1, %b1 + %c = select <2 x i1> %mask, <2 x double> %a, <2 x double> zeroinitializer + ret <2 x double> %c +} + +define <4 x double> @f16to4f64(<4 x half> %b) nounwind { +; CHECK-LABEL: f16to4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <4 x half> %b to <4 x double> + ret <4 x double> %a +} + +define <4 x double> @f16to4f64_mask(<4 x half> %b, <4 x double> %b1, <4 x double> %a1) { +; CHECK-LABEL: f16to4f64_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpltpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vcvtph2pd %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <4 x half> %b to <4 x double> + %mask = fcmp ogt <4 x double> %a1, %b1 + %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer + ret <4 x double> %c +} + +define <8 x double> @f16to8f64(<8 x half> 
%b) nounwind { +; CHECK-LABEL: f16to8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <8 x half> %b to <8 x double> + ret <8 x double> %a +} + +define <8 x double> @f16to8f64_mask(<8 x half> %b, <8 x double> %b1, <8 x double> %a1) { +; CHECK-LABEL: f16to8f64_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpltpd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vcvtph2pd %xmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: ret{{[l|q]}} + %a = fpext <8 x half> %b to <8 x double> + %mask = fcmp ogt <8 x double> %a1, %b1 + %c = select <8 x i1> %mask, <8 x double> %a, <8 x double> zeroinitializer + ret <8 x double> %c +} + +define <2 x half> @f64to2f16(<2 x double> %b) { +; CHECK-LABEL: f64to2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <2 x double> %b to <2 x half> + ret <2 x half> %a +} + +define <4 x half> @f64to4f16(<4 x double> %b) { +; CHECK-LABEL: f64to4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <4 x double> %b to <4 x half> + ret <4 x half> %a +} + +define <8 x half> @f64to8f16(<8 x double> %b) { +; CHECK-LABEL: f64to8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ph %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %a = fptrunc <8 x double> %b to <8 x half> + ret <8 x half> %a +} + +define float @extload_f16_f32(half* %x) { +; X64-LABEL: extload_f16_f32: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_f16_f32: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %a = load half, half* %x + %b = fpext half %a to float + ret float %b +} + +define double @extload_f16_f64(half* %x) { +; X64-LABEL: extload_f16_f64: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_f16_f64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl + %a = load half, half* %x + %b = fpext half %a to double + ret double %b +} + +define float @extload_f16_f32_optsize(half* %x) optsize { +; X64-LABEL: extload_f16_f32_optsize: +; X64: # %bb.0: +; X64-NEXT: vcvtsh2ss (%rdi), %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_f16_f32_optsize: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsh2ss (%eax), %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %a = load half, half* %x + %b = fpext half %a to float + ret float %b +} + +define double @extload_f16_f64_optsize(half* %x) optsize { +; X64-LABEL: extload_f16_f64_optsize: +; X64: # %bb.0: +; X64-NEXT: vcvtsh2sd 
(%rdi), %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_f16_f64_optsize: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: vcvtsh2sd (%eax), %xmm0, %xmm0 +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl + %a = load half, half* %x + %b = fpext half %a to double + ret double %b +} + +define <16 x float> @extload_v16f16_v16f32(<16 x half>* %x) { +; X64-LABEL: extload_v16f16_v16f32: +; X64: # %bb.0: +; X64-NEXT: vcvtph2psx (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_v16f16_v16f32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtph2psx (%eax), %zmm0 +; X86-NEXT: retl + %a = load <16 x half>, <16 x half>* %x + %b = fpext <16 x half> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x float> @extload_v8f16_v8f32(<8 x half>* %x) { +; X64-LABEL: extload_v8f16_v8f32: +; X64: # %bb.0: +; X64-NEXT: vcvtph2psx (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_v8f16_v8f32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtph2psx (%eax), %ymm0 +; X86-NEXT: retl + %a = load <8 x half>, <8 x half>* %x + %b = fpext <8 x half> %a to <8 x float> + ret <8 x float> %b +} + +define <4 x float> @extload_v4f16_v4f32(<4 x half>* %x) { +; X64-LABEL: extload_v4f16_v4f32: +; X64: # %bb.0: +; X64-NEXT: vcvtph2psx (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_v4f16_v4f32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtph2psx (%eax), %xmm0 +; X86-NEXT: retl + %a = load <4 x half>, <4 x half>* %x + %b = fpext <4 x half> %a to <4 x float> + ret <4 x float> %b +} + +define <8 x double> @extload_v8f16_v8f64(<8 x half>* %x) { +; X64-LABEL: extload_v8f16_v8f64: +; X64: # %bb.0: +; X64-NEXT: vcvtph2pd (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_v8f16_v8f64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtph2pd (%eax), %zmm0 +; X86-NEXT: retl + %a = load <8 x half>, <8 x half>* %x + %b = fpext <8 x half> %a to <8 x double> + ret <8 x double> %b +} + +define <4 x double> @extload_v4f16_v4f64(<4 x half>* %x) { +; X64-LABEL: extload_v4f16_v4f64: +; X64: # %bb.0: +; X64-NEXT: vcvtph2pd (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_v4f16_v4f64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtph2pd (%eax), %ymm0 +; X86-NEXT: retl + %a = load <4 x half>, <4 x half>* %x + %b = fpext <4 x half> %a to <4 x double> + ret <4 x double> %b +} + +define <2 x double> @extload_v2f16_v2f64(<2 x half>* %x) { +; X64-LABEL: extload_v2f16_v2f64: +; X64: # %bb.0: +; X64-NEXT: vcvtph2pd (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: extload_v2f16_v2f64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtph2pd (%eax), %xmm0 +; X86-NEXT: retl + %a = load <2 x half>, <2 x half>* %x + %b = fpext <2 x half> %a to <2 x double> + ret <2 x double> %b +} + +define half @s8_to_half(i8 %x) { +; X64-LABEL: s8_to_half: +; X64: # %bb.0: +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: s8_to_half: +; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl + %a = sitofp i8 %x to half + 
ret half %a +} + +define half @s16_to_half(i16 %x) { +; X64-LABEL: s16_to_half: +; X64: # %bb.0: +; X64-NEXT: movswl %di, %eax +; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: s16_to_half: +; X86: # %bb.0: +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl + %a = sitofp i16 %x to half + ret half %a +} + +define half @s32_to_half(i32 %x) { +; X64-LABEL: s32_to_half: +; X64: # %bb.0: +; X64-NEXT: vcvtsi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: s32_to_half: +; X86: # %bb.0: +; X86-NEXT: vcvtsi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl + %a = sitofp i32 %x to half + ret half %a +} + +define half @s64_to_half(i64 %x) { +; X64-LABEL: s64_to_half: +; X64: # %bb.0: +; X64-NEXT: vcvtsi2sh %rdi, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: s64_to_half: +; X86: # %bb.0: +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcvtqq2ph %xmm0, %xmm0 +; X86-NEXT: retl + %a = sitofp i64 %x to half + ret half %a +} + +define half @s128_to_half(i128 %x) { +; X64-LABEL: s128_to_half: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: callq __floattihf@PLT +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; X86-LABEL: s128_to_half: +; X86: # %bb.0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovups %xmm0, (%esp) +; X86-NEXT: calll __floattihf +; X86-NEXT: addl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %a = sitofp i128 %x to half + ret half %a +} + +define half @u8_to_half(i8 %x) { +; X64-LABEL: u8_to_half: +; X64: # %bb.0: +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: u8_to_half: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl + %a = uitofp i8 %x to half + ret half %a +} + +define half @u16_to_half(i16 %x) { +; X64-LABEL: u16_to_half: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: u16_to_half: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl + %a = uitofp i16 %x to half + ret half %a +} + +define half @u32_to_half(i32 %x) { +; X64-LABEL: u32_to_half: +; X64: # %bb.0: +; X64-NEXT: vcvtusi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: u32_to_half: +; X86: # %bb.0: +; X86-NEXT: vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl + %a = uitofp i32 %x to half + ret half %a +} + +define half @u64_to_half(i64 %x) { +; X64-LABEL: u64_to_half: +; X64: # %bb.0: +; X64-NEXT: vcvtusi2sh %rdi, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: u64_to_half: +; X86: # %bb.0: +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcvtuqq2ph %xmm0, %xmm0 +; X86-NEXT: retl + %a = uitofp i64 %x to half + ret half %a +} + +define half @u128_to_half(i128 %x) { +; X64-LABEL: u128_to_half: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: callq __floatuntihf@PLT +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; X86-LABEL: u128_to_half: +; X86: # %bb.0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovups %xmm0, (%esp) +; X86-NEXT: calll __floatuntihf +; X86-NEXT: addl $16, %esp +; X86-NEXT: 
.cfi_def_cfa_offset 4 +; X86-NEXT: retl + %a = uitofp i128 %x to half + ret half %a +} + +define i8 @half_to_s8(half %x) { +; X64-LABEL: half_to_s8: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: half_to_s8: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %a = fptosi half %x to i8 + ret i8 %a +} + +define i16 @half_to_s16(half %x) { +; X64-LABEL: half_to_s16: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: half_to_s16: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %a = fptosi half %x to i16 + ret i16 %a +} + +define i32 @half_to_s32(half %x) { +; X64-LABEL: half_to_s32: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: retq +; +; X86-LABEL: half_to_s32: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl + %a = fptosi half %x to i32 + ret i32 %a +} + +define i64 @half_to_s64(half %x) { +; X64-LABEL: half_to_s64: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %rax +; X64-NEXT: retq +; +; X86-LABEL: half_to_s64: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vcvttph2qq %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vpextrd $1, %xmm0, %edx +; X86-NEXT: retl + %a = fptosi half %x to i64 + ret i64 %a +} + +define i128 @half_to_s128(half %x) { +; X64-LABEL: half_to_s128: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: callq __fixhfti@PLT +; X64-NEXT: popq %rcx +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; X86-LABEL: half_to_s128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: vmovsh 12(%ebp), %xmm0 +; X86-NEXT: vmovsh %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __fixhfti +; X86-NEXT: subl $4, %esp +; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovups %xmm0, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl $4 + %a = fptosi half %x to i128 + ret i128 %a +} + +define i8 @half_to_u8(half %x) { +; X64-LABEL: half_to_u8: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: half_to_u8: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %a = fptoui half %x to i8 + ret i8 %a +} + +define i16 @half_to_u16(half %x) { +; X64-LABEL: half_to_u16: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: half_to_u16: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %a = fptoui half %x to i16 + ret i16 %a +} + +define i32 @half_to_u32(half %x) { +; X64-LABEL: half_to_u32: +; X64: # %bb.0: +; X64-NEXT: 
vcvttsh2usi %xmm0, %eax +; X64-NEXT: retq +; +; X86-LABEL: half_to_u32: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2usi {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl + %a = fptoui half %x to i32 + ret i32 %a +} + +define i64 @half_to_u64(half %x) { +; X64-LABEL: half_to_u64: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2usi %xmm0, %rax +; X64-NEXT: retq +; +; X86-LABEL: half_to_u64: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vcvttph2uqq %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vpextrd $1, %xmm0, %edx +; X86-NEXT: retl + %a = fptoui half %x to i64 + ret i64 %a +} + +define i128 @half_to_u128(half %x) { +; X64-LABEL: half_to_u128: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: callq __fixunshfti@PLT +; X64-NEXT: popq %rcx +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; X86-LABEL: half_to_u128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: vmovsh 12(%ebp), %xmm0 +; X86-NEXT: vmovsh %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __fixunshfti +; X86-NEXT: subl $4, %esp +; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovups %xmm0, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl $4 + %a = fptoui half %x to i128 + ret i128 %a +} + +define x86_fp80 @half_to_f80(half %x) nounwind { +; X64-LABEL: half_to_f80: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: callq __extendhfxf2@PLT +; X64-NEXT: popq %rax +; X64-NEXT: retq +; +; X86-LABEL: half_to_f80: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh %xmm0, (%esp) +; X86-NEXT: calll __extendhfxf2 +; X86-NEXT: popl %eax +; X86-NEXT: retl + %a = fpext half %x to x86_fp80 + ret x86_fp80 %a +} + +define half @f80_to_half(x86_fp80 %x) nounwind { +; X64-LABEL: f80_to_half: +; X64: # %bb.0: +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: callq __truncxfhf2@PLT +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +; +; X86-LABEL: f80_to_half: +; X86: # %bb.0: +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: calll __truncxfhf2 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %a = fptrunc x86_fp80 %x to half + ret half %a +} + +; FIXME: We're doing a two-step conversion here on 32-bit. +; First from f16->f32 then f32->f128. This is occurring +; due to common code in LegalizeFloatTypes that thinks +; there are no libcalls for f16 to any type but f32. +; Changing this may break other non-x86 targets. The code +; generated here should be functional. 
+define fp128 @half_to_f128(half %x) nounwind { +; X64-LABEL: half_to_f128: +; X64: # %bb.0: +; X64-NEXT: jmp __extendhftf2@PLT # TAILCALL +; +; X86-LABEL: half_to_f128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: vmovsh 12(%ebp), %xmm0 +; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendsftf2 +; X86-NEXT: subl $4, %esp +; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovaps %xmm0, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %a = fpext half %x to fp128 + ret fp128 %a +} + +define half @f128_to_half(fp128 %x) nounwind { +; X64-LABEL: f128_to_half: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: callq __trunctfhf2@PLT +; X64-NEXT: popq %rax +; X64-NEXT: retq +; +; X86-LABEL: f128_to_half: +; X86: # %bb.0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovups %xmm0, (%esp) +; X86-NEXT: calll __trunctfhf2 +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl + %a = fptrunc fp128 %x to half + ret half %a +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -187,3 +187,506 @@ %res13 = and i8 %res11, %res12 ret i8 %res13 } + +declare <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32>, i32) + +define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512(<16 x i32> %x0, <16 x half> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtdq2ph %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %mask = bitcast i16 %x2 to <16 x i1> + %res0 = call <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32> %x0, i32 4) + %res = select <16 x i1> %mask, <16 x half> %res0, <16 x half> %x1 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512_r(<16 x i32> %x0, <16 x half> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_512_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtdq2ph {ru-sae}, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %mask = bitcast i16 %x2 to <16 x i1> + %res0 = call <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32> %x0, i32 10) + %res = select <16 x i1> %mask, <16 x half> %res0, <16 x half> %x1 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512_nomask(<16 x i32> %x0, <16 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %zmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32> %x0, i32 4) + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512_z(<16 x i32> %x0, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtdq2ph %zmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = bitcast i16 %x2 to <16 x i1> + %res0 = call <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32> 
%x0, i32 4) + %res = select <16 x i1> %mask, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define <16 x half> @sint_to_fp_16i32_to_16f16(<16 x i32> %x) { +; CHECK-LABEL: sint_to_fp_16i32_to_16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %zmm0, %ymm0 +; CHECK-NEXT: retq + %res = sitofp <16 x i32> %x to <16 x half> + ret <16 x half> %res +} + +declare <16 x half> @llvm.x86.avx512.uitofp.round.v16f16.v16i32(<16 x i32>, i32) + +define <16 x half> @test_int_x86_avx512_mask_cvt_udq2ph_512_r(<16 x i32> %x0, <16 x half> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ph_512_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtudq2ph {ru-sae}, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %mask = bitcast i16 %x2 to <16 x i1> + %res0 = call <16 x half> @llvm.x86.avx512.uitofp.round.v16f16.v16i32(<16 x i32> %x0, i32 10) + %res = select <16 x i1> %mask, <16 x half> %res0, <16 x half> %x1 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512_mask_cvt_udq2ph_512_nomask(<16 x i32> %x0, <16 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ph_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %zmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512.uitofp.round.v16f16.v16i32(<16 x i32> %x0, i32 4) + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512_mask_cvt_udq2ph_512_z(<16 x i32> %x0, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ph_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtudq2ph %zmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = bitcast i16 %x2 to <16 x i1> + %res0 = call <16 x half> @llvm.x86.avx512.uitofp.round.v16f16.v16i32(<16 x i32> %x0, i32 4) + %res = select <16 x i1> %mask, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define <16 x half> @uint_to_fp_16i32_to_16f16(<16 x i32> %x) { +; CHECK-LABEL: uint_to_fp_16i32_to_16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %zmm0, %ymm0 +; CHECK-NEXT: retq + %res = uitofp <16 x i32> %x to <16 x half> + ret <16 x half> %res +} + +declare <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half>, <16 x i32>, i16, i32) + +define <16 x i32> @test_int_x86_avx512_mask_cvt_ph2dq_512(<16 x half> %x0, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2dq_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2dq {ru-sae}, %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtph2dq {rn-sae}, %ymm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half> %x0, <16 x i32> %x1, i16 %x2, i32 10) + %res1 = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half>, <16 x i32>, i16, i32) + +define <16 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_512(<16 x half> %x0, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2udq_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq {ru-sae}, %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtph2udq {rn-sae}, %ymm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half> %x0, <16 x i32> %x1, i16 %x2, i32 10) + %res1 = call <16 x i32> 
@llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half>, <16 x i32>, i16, i32) + +define <16 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_512(<16 x half> %x0, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2dq_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttph2dq {sae}, %ymm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half> %x0, <16 x i32> %x1, i16 %x2, i32 4) + %res1 = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half>, <16 x i32>, i16, i32) + +define <16 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_512(<16 x half> %x0, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2udq_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %ymm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttph2udq {sae}, %ymm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half> %x0, <16 x i32> %x1, i16 %x2, i32 4) + %res1 = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64>, i32) + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512(<8 x i64> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ph %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = call <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64> %x0, i32 4) + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> %x1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512_r(<8 x i64> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_512_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ph {ru-sae}, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = call <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64> %x0, i32 10) + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> %x1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512_nomask(<8 x i64> %x0, <8 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ph %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64> %x0, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512_z(<8 x i64> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ph %zmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = 
bitcast i8 %x2 to <8 x i1> + %res0 = call <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64> %x0, i32 4) + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64>, i32) + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512(<8 x i64> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ph %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = call <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64> %x0, i32 4) + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> %x1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512_r(<8 x i64> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_512_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ph {ru-sae}, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = call <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64> %x0, i32 10) + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> %x1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512_nomask(<8 x i64> %x0, <8 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_512_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ph %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64> %x0, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512_z(<8 x i64> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_512_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ph %zmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = call <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64> %x0, i32 4) + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +declare <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half>, <8 x i64>, i8, i32) + +define <8 x i64> @test_int_x86_avx512_mask_cvt_ph2qq_512(<8 x half> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2qq_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2qq {ru-sae}, %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtph2qq {rn-sae}, %xmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half> %x0, <8 x i64> %x1, i8 %x2, i32 10) + %res1 = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half> %x0, <8 x i64> %x1, i8 -1, i32 8) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half>, <8 x i64>, i8, i32) + +define <8 x i64> @test_int_x86_avx512_mask_cvt_ph2uqq_512(<8 x half> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2uqq_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2uqq {ru-sae}, %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvtph2uqq {rn-sae}, %xmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = 
call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half> %x0, <8 x i64> %x1, i8 %x2, i32 10) + %res1 = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half> %x0, <8 x i64> %x1, i8 -1, i32 8) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half>, <8 x i64>, i8, i32) + +define <8 x i64> @test_int_x86_avx512_mask_cvtt_ph2uqq_512(<8 x half> %x0, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2uqq_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uqq {sae}, %xmm0, %zmm1 {%k1} +; CHECK-NEXT: vcvttph2uqq %xmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half> %x0, <8 x i64> %x1, i8 %x2, i32 8) + %res1 = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half> %x0, <8 x i64> %x1, i8 -1, i32 4) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare i32 @llvm.x86.avx512fp16.vcvtsh2si32(<8 x half>, i32) + +define i32 @test_x86_avx512fp16_vcvtsh2si32(<8 x half> %arg0) { +; CHECK-LABEL: test_x86_avx512fp16_vcvtsh2si32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsh2si %xmm0, %ecx +; CHECK-NEXT: vcvtsh2si {rz-sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: retq + %res1 = call i32 @llvm.x86.avx512fp16.vcvtsh2si32(<8 x half> %arg0, i32 4) + %res2 = call i32 @llvm.x86.avx512fp16.vcvtsh2si32(<8 x half> %arg0, i32 11) + %res = add i32 %res1, %res2 + ret i32 %res +} + +declare i64 @llvm.x86.avx512fp16.vcvtsh2si64(<8 x half>, i32) + +define i64 @test_x86_avx512fp16_vcvtsh2si64(<8 x half> %arg0) { +; CHECK-LABEL: test_x86_avx512fp16_vcvtsh2si64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsh2si %xmm0, %rcx +; CHECK-NEXT: vcvtsh2si {ru-sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq + %res1 = call i64 @llvm.x86.avx512fp16.vcvtsh2si64(<8 x half> %arg0, i32 4) + %res2 = call i64 @llvm.x86.avx512fp16.vcvtsh2si64(<8 x half> %arg0, i32 10) + %res = add i64 %res1, %res2 + ret i64 %res +} + +declare i32 @llvm.x86.avx512fp16.vcvttsh2si32(<8 x half>, i32) + +define i32 @test_x86_avx512fp16_vcvttsh2si32(<8 x half> %arg0) { +; CHECK-LABEL: test_x86_avx512fp16_vcvttsh2si32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttsh2si %xmm0, %ecx +; CHECK-NEXT: vcvttsh2si {sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: retq + %res1 = call i32 @llvm.x86.avx512fp16.vcvttsh2si32(<8 x half> %arg0, i32 4) + %res2 = call i32 @llvm.x86.avx512fp16.vcvttsh2si32(<8 x half> %arg0, i32 8) + %res = add i32 %res1, %res2 + ret i32 %res +} + +declare i64 @llvm.x86.avx512fp16.vcvttsh2si64(<8 x half>, i32) + +define i64 @test_x86_avx512fp16_vcvttsh2si64(<8 x half> %arg0) { +; CHECK-LABEL: test_x86_avx512fp16_vcvttsh2si64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttsh2si %xmm0, %rcx +; CHECK-NEXT: vcvttsh2si {sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq + %res1 = call i64 @llvm.x86.avx512fp16.vcvttsh2si64(<8 x half> %arg0, i32 4) + %res2 = call i64 @llvm.x86.avx512fp16.vcvttsh2si64(<8 x half> %arg0, i32 8) + %res = add i64 %res1, %res2 + ret i64 %res +} + + +declare i32 @llvm.x86.avx512fp16.vcvtsh2usi32(<8 x half>, i32) + +define i32 @test_x86_avx512fp16_vcvtsh2usi32(<8 x half> %arg0) { +; CHECK-LABEL: test_x86_avx512fp16_vcvtsh2usi32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsh2usi %xmm0, %ecx +; CHECK-NEXT: vcvtsh2usi {rd-sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: retq + %res1 = call i32 
@llvm.x86.avx512fp16.vcvtsh2usi32(<8 x half> %arg0, i32 4) + %res2 = call i32 @llvm.x86.avx512fp16.vcvtsh2usi32(<8 x half> %arg0, i32 9) + %res = add i32 %res1, %res2 + ret i32 %res +} + + +declare i64 @llvm.x86.avx512fp16.vcvtsh2usi64(<8 x half>, i32) + +define i64 @test_x86_avx512fp16_vcvtsh2usi64(<8 x half> %arg0) { +; CHECK-LABEL: test_x86_avx512fp16_vcvtsh2usi64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsh2usi %xmm0, %rcx +; CHECK-NEXT: vcvtsh2usi {ru-sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq + %res1 = call i64 @llvm.x86.avx512fp16.vcvtsh2usi64(<8 x half> %arg0, i32 4) + %res2 = call i64 @llvm.x86.avx512fp16.vcvtsh2usi64(<8 x half> %arg0, i32 10) + %res = add i64 %res1, %res2 + ret i64 %res +} + +declare i32 @llvm.x86.avx512fp16.vcvttsh2usi32(<8 x half>, i32) + +define i32 @test_x86_avx512fp16_vcvttsh2usi32(<8 x half> %arg0) { +; CHECK-LABEL: test_x86_avx512fp16_vcvttsh2usi32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttsh2usi %xmm0, %ecx +; CHECK-NEXT: vcvttsh2usi {sae}, %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: retq + %res1 = call i32 @llvm.x86.avx512fp16.vcvttsh2usi32(<8 x half> %arg0, i32 4) + %res2 = call i32 @llvm.x86.avx512fp16.vcvttsh2usi32(<8 x half> %arg0, i32 8) + %res = add i32 %res1, %res2 + ret i32 %res +} + +declare i64 @llvm.x86.avx512fp16.vcvttsh2usi64(<8 x half>, i32) + +define i64 @test_x86_avx512fp16_vcvttsh2usi64(<8 x half> %arg0) { +; CHECK-LABEL: test_x86_avx512fp16_vcvttsh2usi64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttsh2usi %xmm0, %rcx +; CHECK-NEXT: vcvttsh2usi {sae}, %xmm0, %rax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq + %res1 = call i64 @llvm.x86.avx512fp16.vcvttsh2usi64(<8 x half> %arg0, i32 4) + %res2 = call i64 @llvm.x86.avx512fp16.vcvttsh2usi64(<8 x half> %arg0, i32 8) + %res = add i64 %res1, %res2 + ret i64 %res +} + +declare <8 x half> @llvm.x86.avx512fp16.vcvtsi2sh(<8 x half>, i32, i32) + +define <8 x half> @test_x86_avx512fp16_vcvtsi2sh(<8 x half> %arg0, i32 %arg1) { +; CHECK-LABEL: test_x86_avx512fp16_vcvtsi2sh: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsi2sh %edi, %xmm0, %xmm1 +; CHECK-NEXT: vcvtsi2sh %edi, {rd-sae}, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res1 = call <8 x half> @llvm.x86.avx512fp16.vcvtsi2sh(<8 x half> %arg0, i32 %arg1, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.vcvtsi2sh(<8 x half> %arg0, i32 %arg1, i32 9) + %res = fadd <8 x half> %res1, %res2 + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.vcvtsi642sh(<8 x half>, i64, i32) + +define <8 x half> @test_x86_avx512fp16_vcvtsi642sh(<8 x half> %arg0, i64 %arg1) { +; CHECK-LABEL: test_x86_avx512fp16_vcvtsi642sh: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsi2sh %rdi, %xmm0, %xmm1 +; CHECK-NEXT: vcvtsi2sh %rdi, {rn-sae}, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res1 = call <8 x half> @llvm.x86.avx512fp16.vcvtsi642sh(<8 x half> %arg0, i64 %arg1, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.vcvtsi642sh(<8 x half> %arg0, i64 %arg1, i32 8) + %res = fadd <8 x half> %res1, %res2 + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.vcvtusi2sh(<8 x half>, i32, i32) + +define <8 x half> @test_x86_avx512fp16_vcvtusi2sh(<8 x half> %arg0, i32 %arg1) { +; CHECK-LABEL: test_x86_avx512fp16_vcvtusi2sh: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtusi2sh %edi, %xmm0, %xmm1 +; CHECK-NEXT: vcvtusi2sh %edi, {rd-sae}, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res1 = call <8 x half> @llvm.x86.avx512fp16.vcvtusi2sh(<8 
x half> %arg0, i32 %arg1, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.vcvtusi2sh(<8 x half> %arg0, i32 %arg1, i32 9) + %res = fadd <8 x half> %res1, %res2 + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.vcvtusi642sh(<8 x half>, i64, i32) + +define <8 x half> @test_x86_avx512fp16_vcvtusi642sh(<8 x half> %arg0, i64 %arg1) { +; CHECK-LABEL: test_x86_avx512fp16_vcvtusi642sh: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtusi2sh %rdi, %xmm0, %xmm1 +; CHECK-NEXT: vcvtusi2sh %rdi, {rd-sae}, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res1 = call <8 x half> @llvm.x86.avx512fp16.vcvtusi642sh(<8 x half> %arg0, i64 %arg1, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.vcvtusi642sh(<8 x half> %arg0, i64 %arg1, i32 9) + %res = fadd <8 x half> %res1, %res2 + ret <8 x half> %res +} diff --git a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll @@ -23,3 +23,925 @@ %0 = bitcast <8 x i16> %vecinit7.i to <2 x i64> ret <2 x i64> %0 } + +define <8 x half> @test_int_x86_avx512_mask_cvt_dq2ph_256(<8 x i32> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtdq2ph %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = sitofp <8 x i32> %x0 to <8 x half> + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> %x1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_dq2ph_256_z(<8 x i32> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtdq2ph %ymm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = sitofp <8 x i32> %x0 to <8 x half> + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define <8 x half> @sint_to_fp_8i32_to_8f16(<8 x i32> %x) { +; CHECK-LABEL: sint_to_fp_8i32_to_8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = sitofp <8 x i32> %x to <8 x half> + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtdq2ph.128(<4 x i32>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_dq2ph_128(<4 x i32> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtdq2ph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtdq2ph.128(<4 x i32> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_dq2ph_128_nomask(<4 x i32> %x0, <8 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtdq2ph.128(<4 x i32> %x0, <8 x half> %x1, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_dq2ph_128_z(<4 x i32> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ph_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtdq2ph %xmm0, %xmm0 {%k1} {z} +; 
CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtdq2ph.128(<4 x i32> %x0, <8 x half> zeroinitializer, i8 %x2) + ret <8 x half> %res +} + +define <4 x half> @sint_to_fp_4i32_to_4f16(<4 x i32> %x) { +; CHECK-LABEL: sint_to_fp_4i32_to_4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = sitofp <4 x i32> %x to <4 x half> + ret <4 x half> %res +} + +define <2 x half> @sint_to_fp_2i32_to_2f16(<2 x i32> %x) { +; CHECK-LABEL: sint_to_fp_2i32_to_2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = sitofp <2 x i32> %x to <2 x half> + ret <2 x half> %res +} + +define <4 x i32> @fp_to_sint_4f16_to_4i32(<4 x half> %x) { +; CHECK-LABEL: fp_to_sint_4f16_to_4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fptosi <4 x half> %x to <4 x i32> + ret <4 x i32> %res +} + +define <2 x i32> @fp_to_sint_2f16_to_2i32(<2 x half> %x) { +; CHECK-LABEL: fp_to_sint_2f16_to_2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fptosi <2 x half> %x to <2 x i32> + ret <2 x i32> %res +} + +define <2 x i16> @fp_to_sint_2f16_to_2i16(<2 x half> %x) { +; CHECK-LABEL: fp_to_sint_2f16_to_2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fptosi <2 x half> %x to <2 x i16> + ret <2 x i16> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_udq2ph_256(<8 x i32> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtudq2ph %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = uitofp <8 x i32> %x0 to <8 x half> + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> %x1 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_udq2ph_256_z(<8 x i32> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ph_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtudq2ph %ymm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %mask = bitcast i8 %x2 to <8 x i1> + %res0 = uitofp <8 x i32> %x0 to <8 x half> + %res = select <8 x i1> %mask, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define <8 x half> @uint_to_fp_8i32_to_8f16(<8 x i32> %x) { +; CHECK-LABEL: uint_to_fp_8i32_to_8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = uitofp <8 x i32> %x to <8 x half> + ret <8 x half> %res +} + +define <8 x i32> @fp_to_uint_8f16_to_8i32(<8 x half> %x) { +; CHECK-LABEL: fp_to_uint_8f16_to_8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = fptoui <8 x half> %x to <8 x i32> + ret <8 x i32> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtudq2ph.128(<4 x i32>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_udq2ph_128(<4 x i32> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtudq2ph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtudq2ph.128(<4 x i32> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_udq2ph_128_nomask(<4 x i32> %x0, <8 x half> %x1) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ph_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtudq2ph.128(<4 x i32> %x0, <8 x half> %x1, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_udq2ph_128_z(<4 x i32> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ph_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtudq2ph %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtudq2ph.128(<4 x i32> %x0, <8 x half> zeroinitializer, i8 %x2) + ret <8 x half> %res +} + +define <4 x half> @uint_to_fp_4i32_to_4f16(<4 x i32> %x) { +; CHECK-LABEL: uint_to_fp_4i32_to_4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = uitofp <4 x i32> %x to <4 x half> + ret <4 x half> %res +} + +define <2 x half> @uint_to_fp_2i32_to_2f16(<2 x i32> %x) { +; CHECK-LABEL: uint_to_fp_2i32_to_2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = uitofp <2 x i32> %x to <2 x half> + ret <2 x half> %res +} + +define <4 x i32> @fp_to_uint_4f16_to_4i32(<4 x half> %x) { +; CHECK-LABEL: fp_to_uint_4f16_to_4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fptoui <4 x half> %x to <4 x i32> + ret <4 x i32> %res +} + +define <2 x i32> @fp_to_uint_2f16_to_2i32(<2 x half> %x) { +; CHECK-LABEL: fp_to_uint_2f16_to_2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fptoui <2 x half> %x to <2 x i32> + ret <2 x i32> %res +} + +define <2 x i16> @fp_to_uint_2f16_to_2i16(<2 x half> %x) { +; CHECK-LABEL: fp_to_uint_2f16_to_2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fptoui <2 x half> %x to <2 x i16> + ret <2 x i16> %res +} + +declare <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.128(<8 x half>, <4 x i32>, i8) + +define <4 x i32> @test_int_x86_avx512_cvt_ph2dq_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2dq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.128(<8 x half> %x0, <4 x i32> undef, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_mask_cvt_ph2dq_128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2dq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_maskz_cvt_ph2dq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2dq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.128(<8 x half> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.256(<8 x half>, <8 x i32>, i8) + +define <8 x i32> @test_int_x86_avx512_cvt_ph2dq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2dq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.256(<8 x half> %x0, <8 x i32> 
undef, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_mask_cvt_ph2dq_256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2dq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_maskz_cvt_ph2dq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2dq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.256(<8 x half> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.128(<8 x half>, <4 x i32>, i8) + +define <4 x i32> @test_int_x86_avx512_cvt_ph2udq_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2udq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.128(<8 x half> %x0, <4 x i32> undef, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_maskz_cvt_ph2udq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.128(<8 x half> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.256(<8 x half>, <8 x i32>, i8) + +define <8 x i32> @test_int_x86_avx512_cvt_ph2udq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2udq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.256(<8 x half> %x0, <8 x i32> undef, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_maskz_cvt_ph2udq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2udq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.256(<8 x half> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.128(<8 x half>, <4 x i32>, i8) + +define <4 x i32> @test_int_x86_avx512_cvtt_ph2dq_128(<8 x 
half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.128(<8 x half> %x0, <4 x i32> undef, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_maskz_cvtt_ph2dq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.128(<8 x half> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.256(<8 x half>, <8 x i32>, i8) + +define <8 x i32> @test_int_x86_avx512_cvtt_ph2dq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.256(<8 x half> %x0, <8 x i32> undef, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_maskz_cvtt_ph2dq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.256(<8 x half> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.128(<8 x half>, <4 x i32>, i8) + +define <4 x i32> @test_int_x86_avx512_cvtt_ph2udq_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.128(<8 x half> %x0, <4 x i32> undef, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.128(<8 x half> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32> @test_int_x86_avx512_maskz_cvtt_ph2udq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i32> 
@llvm.x86.avx512fp16.mask.vcvttph2udq.128(<8 x half> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.256(<8 x half>, <8 x i32>, i8) + +define <8 x i32> @test_int_x86_avx512_cvtt_ph2udq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.256(<8 x half> %x0, <8 x i32> undef, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.256(<8 x half> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32> @test_int_x86_avx512_maskz_cvtt_ph2udq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2udq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.256(<8 x half> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <4 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.256(<8 x half>, <4 x double>, i8) + +define <4 x double> @test_int_x86_avx512_mask_cvt_ph2pd_256(<8 x half> %x0, <4 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2pd %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.256(<8 x half> %x0, <4 x double> %x1, i8 %x2) + ret <4 x double> %res +} + +define <4 x double> @test_int_x86_avx512_mask_cvt_ph2pd_256_nomask(<8 x half> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2pd_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.256(<8 x half> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.128(<8 x half>, <2 x double>, i8) + +define <2 x double> @test_int_x86_avx512_mask_cvt_ph2pd_128(<8 x half> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtph2pd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.128(<8 x half> %x0, <2 x double> %x1, i8 %x2) + ret <2 x double> %res +} + +define <2 x double> @test_int_x86_avx512_mask_cvt_ph2pd_128_nomask(<8 x half> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ph2pd_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.128(<8 x half> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.256(<4 x double>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_pd2ph_256(<4 x double> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; 
CHECK-NEXT: vcvtpd2ph %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.256(<4 x double> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_pd2ph_256_load(<4 x double>* %px0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ph_256_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtpd2phy (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %x0 = load <4 x double>, <4 x double>* %px0, align 32 + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.256(<4 x double> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.128(<2 x double>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_pd2ph_128(<2 x double> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtpd2ph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.128(<2 x double> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_pd2ph_128_load(<2 x double>* %px0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ph_128_load: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vcvtpd2phx (%rdi), %xmm0 {%k1} +; CHECK-NEXT: retq + %x0 = load <2 x double>, <2 x double>* %px0, align 16 + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.128(<2 x double> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtqq2ph.256(<4 x i64>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_256(<4 x i64> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ph %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtqq2ph.256(<4 x i64> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_256_nomask(<4 x i64> %x0, <8 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtqq2ph.256(<4 x i64> %x0, <8 x half> %x1, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_256_z(<4 x i64> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ph %ymm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtqq2ph.256(<4 x i64> %x0, <8 x half> zeroinitializer, i8 %x2) + ret <8 x half> %res +} + +define <4 x half> @sint_to_fp_4i64_to_4f16(<4 x i64> %x) { +; CHECK-LABEL: sint_to_fp_4i64_to_4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = sitofp <4 x i64> %x to <4 x half> + ret <4 x half> %res +} + +define <4 x i64> @fp_to_sint_4f16_to_4i64(<4 x half> %x) { +; CHECK-LABEL: fp_to_sint_4f16_to_4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2qq %xmm0, 
%ymm0 +; CHECK-NEXT: retq + %res = fptosi <4 x half> %x to <4 x i64> + ret <4 x i64> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtqq2ph.128(<2 x i64>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_128(<2 x i64> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtqq2ph.128(<2 x i64> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_128_nomask(<2 x i64> %x0, <8 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtqq2ph.128(<2 x i64> %x0, <8 x half> %x1, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_128_z(<2 x i64> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ph_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtqq2ph %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtqq2ph.128(<2 x i64> %x0, <8 x half> zeroinitializer, i8 %x2) + ret <8 x half> %res +} + +define <2 x half> @sint_to_fp_2i64_to_2f16(<2 x i64> %x) { +; CHECK-LABEL: sint_to_fp_2i64_to_2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = sitofp <2 x i64> %x to <2 x half> + ret <2 x half> %res +} + +define <2 x i64> @fp_to_sint_2f16_to_2i64(<2 x half> %x) { +; CHECK-LABEL: fp_to_sint_2f16_to_2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2qq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fptosi <2 x half> %x to <2 x i64> + ret <2 x i64> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtuqq2ph.256(<4 x i64>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_256(<4 x i64> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ph %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtuqq2ph.256(<4 x i64> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_256_nomask(<4 x i64> %x0, <8 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_256_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtuqq2ph.256(<4 x i64> %x0, <8 x half> %x1, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_256_z(<4 x i64> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_256_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ph %ymm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtuqq2ph.256(<4 x i64> %x0, <8 x half> zeroinitializer, i8 %x2) + ret <8 x half> %res +} + +define <4 x half> @uint_to_fp_4i64_to_4f16(<4 x i64> %x) { +; CHECK-LABEL: uint_to_fp_4i64_to_4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = uitofp <4 x i64> %x to <4 x half> + ret <4 x half> %res +} 
+ +define <4 x i64> @fp_to_uint_4f16_to_4i64(<4 x half> %x) { +; CHECK-LABEL: fp_to_uint_4f16_to_4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uqq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = fptoui <4 x half> %x to <4 x i64> + ret <4 x i64> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.vcvtuqq2ph.128(<2 x i64>, <8 x half>, i8) + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_128(<2 x i64> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtuqq2ph.128(<2 x i64> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_128_nomask(<2 x i64> %x0, <8 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_128_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtuqq2ph.128(<2 x i64> %x0, <8 x half> %x1, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_128_z(<2 x i64> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ph_128_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvtuqq2ph %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtuqq2ph.128(<2 x i64> %x0, <8 x half> zeroinitializer, i8 %x2) + ret <8 x half> %res +} + +define <2 x half> @uint_to_fp_2i64_to_2f16(<2 x i64> %x) { +; CHECK-LABEL: uint_to_fp_2i64_to_2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = uitofp <2 x i64> %x to <2 x half> + ret <2 x half> %res +} + +define <2 x i64> @fp_to_uint_2f16_to_2i64(<2 x half> %x) { +; CHECK-LABEL: fp_to_uint_2f16_to_2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uqq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fptoui <2 x half> %x to <2 x i64> + ret <2 x i64> %res +} + +declare <2 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.128(<8 x half>, <2 x i64>, i8) + +define <2 x i64> @test_int_x86_avx512_cvtt_ph2qq_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2qq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.128(<8 x half> %x0, <2 x i64> undef, i8 -1) + ret <2 x i64> %res +} + +define <2 x i64> @test_int_x86_avx512_mask_cvtt_ph2qq_128(<8 x half> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2qq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.128(<8 x half> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ph2qq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2qq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.128(<8 x half> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.256(<8 x half>, <4 x i64>, i8) + +define <4 x i64> @test_int_x86_avx512_cvtt_ph2qq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2qq_256: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vcvttph2qq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.256(<8 x half> %x0, <4 x i64> undef, i8 -1) + ret <4 x i64> %res +} + +define <4 x i64> @test_int_x86_avx512_mask_cvtt_ph2qq_256(<8 x half> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2qq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.256(<8 x half> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res +} + +define <4 x i64> @test_int_x86_avx512_maskz_cvtt_ph2qq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2qq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.256(<8 x half> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res +} + +declare <2 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.128(<8 x half>, <2 x i64>, i8) + +define <2 x i64> @test_int_x86_avx512_cvtt_ph2uqq_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uqq %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.128(<8 x half> %x0, <2 x i64> undef, i8 -1) + ret <2 x i64> %res +} + +define <2 x i64> @test_int_x86_avx512_mask_cvtt_ph2uqq_128(<8 x half> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uqq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.128(<8 x half> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ph2uqq_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uqq %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.128(<8 x half> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.256(<8 x half>, <4 x i64>, i8) + +define <4 x i64> @test_int_x86_avx512_cvtt_ph2uqq_256(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ph2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uqq %xmm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.256(<8 x half> %x0, <4 x i64> undef, i8 -1) + ret <4 x i64> %res +} + +define <4 x i64> @test_int_x86_avx512_mask_cvtt_ph2uqq_256(<8 x half> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ph2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uqq %xmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.256(<8 x half> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res +} + +define <4 x i64> @test_int_x86_avx512_maskz_cvtt_ph2uqq_256(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_cvtt_ph2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcvttph2uqq %xmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.256(<8 x half> %x0, <4 x i64> 
zeroinitializer, i8 %x2) + ret <4 x i64> %res +} diff --git a/llvm/test/CodeGen/X86/cvt16-2.ll b/llvm/test/CodeGen/X86/cvt16-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/cvt16-2.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-avx512fp16 | FileCheck %s -check-prefix=LIBCALL +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512fp16 | FileCheck %s -check-prefix=FP16 + +define void @test1(float %src, i16* %dest) { +; LIBCALL-LABEL: test1: +; LIBCALL: # %bb.0: +; LIBCALL-NEXT: pushq %rbx +; LIBCALL-NEXT: .cfi_def_cfa_offset 16 +; LIBCALL-NEXT: .cfi_offset %rbx, -16 +; LIBCALL-NEXT: movq %rdi, %rbx +; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; LIBCALL-NEXT: movw %ax, (%rbx) +; LIBCALL-NEXT: popq %rbx +; LIBCALL-NEXT: .cfi_def_cfa_offset 8 +; LIBCALL-NEXT: retq +; +; FP16-LABEL: test1: +; FP16: # %bb.0: +; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vmovsh %xmm0, (%rdi) +; FP16-NEXT: retq + %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src) + store i16 %1, i16* %dest, align 2 + ret void +} + +define float @test2(i16* nocapture %src) { +; LIBCALL-LABEL: test2: +; LIBCALL: # %bb.0: +; LIBCALL-NEXT: movzwl (%rdi), %edi +; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL +; +; FP16-LABEL: test2: +; FP16: # %bb.0: +; FP16-NEXT: vmovsh (%rdi), %xmm0 +; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; FP16-NEXT: retq + %1 = load i16, i16* %src, align 2 + %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1) + ret float %2 +} + +define float @test3(float %src) nounwind uwtable readnone { +; LIBCALL-LABEL: test3: +; LIBCALL: # %bb.0: +; LIBCALL-NEXT: pushq %rax +; LIBCALL-NEXT: .cfi_def_cfa_offset 16 +; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; LIBCALL-NEXT: movzwl %ax, %edi +; LIBCALL-NEXT: popq %rax +; LIBCALL-NEXT: .cfi_def_cfa_offset 8 +; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL +; +; FP16-LABEL: test3: +; FP16: # %bb.0: +; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; FP16-NEXT: retq + %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src) + %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1) + ret float %2 +} + +; FIXME: Should it be __extendhfdf2? +define double @test4(i16* nocapture %src) { +; LIBCALL-LABEL: test4: +; LIBCALL: # %bb.0: +; LIBCALL-NEXT: pushq %rax +; LIBCALL-NEXT: .cfi_def_cfa_offset 16 +; LIBCALL-NEXT: movzwl (%rdi), %edi +; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT +; LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 +; LIBCALL-NEXT: popq %rax +; LIBCALL-NEXT: .cfi_def_cfa_offset 8 +; LIBCALL-NEXT: retq +; +; FP16-LABEL: test4: +; FP16: # %bb.0: +; FP16-NEXT: vmovsh (%rdi), %xmm0 +; FP16-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0 +; FP16-NEXT: retq + %1 = load i16, i16* %src, align 2 + %2 = tail call double @llvm.convert.from.fp16.f64(i16 %1) + ret double %2 +} + +define i16 @test5(double %src) { +; LIBCALL-LABEL: test5: +; LIBCALL: # %bb.0: +; LIBCALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL +; +; FP16-LABEL: test5: +; FP16: # %bb.0: +; FP16-NEXT: vcvtsd2sh %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: # kill: def $ax killed $ax killed $eax +; FP16-NEXT: retq + %val = tail call i16 @llvm.convert.to.fp16.f64(double %src) + ret i16 %val +} + +; FIXME: Should it be __extendhfxf2? 
+define x86_fp80 @test6(i16* nocapture %src) {
+; LIBCALL-LABEL: test6:
+; LIBCALL: # %bb.0:
+; LIBCALL-NEXT: pushq %rax
+; LIBCALL-NEXT: .cfi_def_cfa_offset 16
+; LIBCALL-NEXT: movzwl (%rdi), %edi
+; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT
+; LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp)
+; LIBCALL-NEXT: flds {{[0-9]+}}(%rsp)
+; LIBCALL-NEXT: popq %rax
+; LIBCALL-NEXT: .cfi_def_cfa_offset 8
+; LIBCALL-NEXT: retq
+;
+; FP16-LABEL: test6:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: .cfi_def_cfa_offset 16
+; FP16-NEXT: vmovsh (%rdi), %xmm0
+; FP16-NEXT: callq __extendhfxf2@PLT
+; FP16-NEXT: popq %rax
+; FP16-NEXT: .cfi_def_cfa_offset 8
+; FP16-NEXT: retq
+ %1 = load i16, i16* %src, align 2
+ %2 = tail call x86_fp80 @llvm.convert.from.fp16.f80(i16 %1)
+ ret x86_fp80 %2
+}
+
+define i16 @test7(x86_fp80 %src) {
+; LIBCALL-LABEL: test7:
+; LIBCALL: # %bb.0:
+; LIBCALL-NEXT: jmp __truncxfhf2@PLT # TAILCALL
+;
+; FP16-LABEL: test7:
+; FP16: # %bb.0:
+; FP16-NEXT: subq $24, %rsp
+; FP16-NEXT: .cfi_def_cfa_offset 32
+; FP16-NEXT: fldt {{[0-9]+}}(%rsp)
+; FP16-NEXT: fstpt (%rsp)
+; FP16-NEXT: callq __truncxfhf2@PLT
+; FP16-NEXT: vmovw %xmm0, %eax
+; FP16-NEXT: # kill: def $ax killed $ax killed $eax
+; FP16-NEXT: addq $24, %rsp
+; FP16-NEXT: .cfi_def_cfa_offset 8
+; FP16-NEXT: retq
+ %val = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %src)
+ ret i16 %val
+}
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
+declare x86_fp80 @llvm.convert.from.fp16.f80(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f80(x86_fp80) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -6,6 +6,10 @@
 declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata)
 declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata)
 declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata)
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
+declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata)
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
+declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata)
 
 define half @fadd_f16(half %a, half %b) nounwind strictfp {
 ; X86-LABEL: fadd_f16:
@@ -75,4 +79,98 @@
 ret half %ret
 }
+define void @fpext_f16_to_f32(half* %val, float* %ret) nounwind strictfp {
+; X86-LABEL: fpext_f16_to_f32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovsh (%ecx), %xmm0
+; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovss %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: fpext_f16_to_f32:
+; X64: # %bb.0:
+; X64-NEXT: vmovsh (%rdi), %xmm0
+; X64-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovss %xmm0, (%rsi)
+; X64-NEXT: retq
+ %1 = load half, half* %val, align 4
+ %res = call float @llvm.experimental.constrained.fpext.f32.f16(half %1,
+ metadata !"fpexcept.strict") #0
+ store float %res, float* %ret, align 8
+ ret void
+}
+
+define void @fpext_f16_to_f64(half* %val, double* %ret) nounwind strictfp {
+; X86-LABEL: fpext_f16_to_f64:
+; X86:
# %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovsh (%ecx), %xmm0
+; X86-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovsd %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: fpext_f16_to_f64:
+; X64: # %bb.0:
+; X64-NEXT: vmovsh (%rdi), %xmm0
+; X64-NEXT: vcvtsh2sd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovsd %xmm0, (%rsi)
+; X64-NEXT: retq
+ %1 = load half, half* %val, align 4
+ %res = call double @llvm.experimental.constrained.fpext.f64.f16(half %1,
+ metadata !"fpexcept.strict") #0
+ store double %res, double* %ret, align 8
+ ret void
+}
+
+define void @fptrunc_float_to_f16(float* %val, half *%ret) nounwind strictfp {
+; X86-LABEL: fptrunc_float_to_f16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovsh %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: fptrunc_float_to_f16:
+; X64: # %bb.0:
+; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovsh %xmm0, (%rsi)
+; X64-NEXT: retq
+ %1 = load float, float* %val, align 8
+ %res = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ store half %res, half* %ret, align 4
+ ret void
+}
+
+define void @fptrunc_double_to_f16(double* %val, half *%ret) nounwind strictfp {
+; X86-LABEL: fptrunc_double_to_f16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vcvtsd2sh %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovsh %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: fptrunc_double_to_f16:
+; X64: # %bb.0:
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vcvtsd2sh %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovsh %xmm0, (%rsi)
+; X64-NEXT: retq
+ %1 = load double, double* %val, align 8
+ %res = call half @llvm.experimental.constrained.fptrunc.f16.f64(double %1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ store half %res, half* %ret, align 4
+ ret void
+}
+
 attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
@@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64
+
+declare i1 @llvm.experimental.constrained.fptosi.i1.f16(half, metadata)
+declare i8 @llvm.experimental.constrained.fptosi.i8.f16(half, metadata)
+declare i16 @llvm.experimental.constrained.fptosi.i16.f16(half, metadata)
+declare i32 @llvm.experimental.constrained.fptosi.i32.f16(half, metadata)
+declare i64 @llvm.experimental.constrained.fptosi.i64.f16(half, metadata)
+declare i1 @llvm.experimental.constrained.fptoui.i1.f16(half, metadata)
+declare i8 @llvm.experimental.constrained.fptoui.i8.f16(half, metadata)
+declare i16 @llvm.experimental.constrained.fptoui.i16.f16(half, metadata)
+declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata)
+declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata)
+ +define i1 @fptosi_f16toi1(half %x) #0 { +; X86-LABEL: fptosi_f16toi1: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: fptosi_f16toi1: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %result = call i1 @llvm.experimental.constrained.fptosi.i1.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i1 %result +} + +define i8 @fptosi_f16toi8(half %x) #0 { +; X86-LABEL: fptosi_f16toi8: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: fptosi_f16toi8: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %result = call i8 @llvm.experimental.constrained.fptosi.i8.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i8 %result +} + +define i16 @fptosi_f16toi16(half %x) #0 { +; X86-LABEL: fptosi_f16toi16: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: fptosi_f16toi16: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %result = call i16 @llvm.experimental.constrained.fptosi.i16.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i16 %result +} + +define i32 @fptosi_f16toi32(half %x) #0 { +; X86-LABEL: fptosi_f16toi32: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: fptosi_f16toi32: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: retq + %result = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +define i64 @fptosi_f16toi64(half %x) #0 { +; X86-LABEL: fptosi_f16toi64: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vcvttph2qq %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vpextrd $1, %xmm0, %edx +; X86-NEXT: retl +; +; X64-LABEL: fptosi_f16toi64: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %rax +; X64-NEXT: retq + %result = call i64 @llvm.experimental.constrained.fptosi.i64.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +define i1 @fptoui_f16toi1(half %x) #0 { +; X86-LABEL: fptoui_f16toi1: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: fptoui_f16toi1: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %result = call i1 @llvm.experimental.constrained.fptoui.i1.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i1 %result +} + +define i8 @fptoui_f16toi8(half %x) #0 { +; X86-LABEL: fptoui_f16toi8: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: fptoui_f16toi8: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %result = call i8 @llvm.experimental.constrained.fptoui.i8.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i8 %result +} + +define i16 @fptoui_f16toi16(half %x) #0 { +; X86-LABEL: fptoui_f16toi16: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; 
X86-NEXT: retl +; +; X64-LABEL: fptoui_f16toi16: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2si %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %result = call i16 @llvm.experimental.constrained.fptoui.i16.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i16 %result +} + +define i32 @fptoui_f16toi32(half %x) #0 { +; X86-LABEL: fptoui_f16toi32: +; X86: # %bb.0: +; X86-NEXT: vcvttsh2usi {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: fptoui_f16toi32: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2usi %xmm0, %eax +; X64-NEXT: retq + %result = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +define i64 @fptoui_f16toi64(half %x) #0 { +; X86-LABEL: fptoui_f16toi64: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vcvttph2uqq %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vpextrd $1, %xmm0, %edx +; X86-NEXT: retl +; +; X64-LABEL: fptoui_f16toi64: +; X64: # %bb.0: +; X64-NEXT: vcvttsh2usi %xmm0, %rax +; X64-NEXT: retq + %result = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll @@ -0,0 +1,197 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64 + +declare half @llvm.experimental.constrained.sitofp.f16.i1(i1, metadata, metadata) +declare half @llvm.experimental.constrained.sitofp.f16.i8(i8, metadata, metadata) +declare half @llvm.experimental.constrained.sitofp.f16.i16(i16, metadata, metadata) +declare half @llvm.experimental.constrained.sitofp.f16.i32(i32, metadata, metadata) +declare half @llvm.experimental.constrained.sitofp.f16.i64(i64, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i1(i1, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i8(i8, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i16(i16, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata) + +define half @sitofp_i1tof16(i1 %x) #0 { +; X86-LABEL: sitofp_i1tof16: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: andb $1, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: sitofp_i1tof16: +; X64: # %bb.0: +; X64-NEXT: andb $1, %dil +; X64-NEXT: negb %dil +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.sitofp.f16.i1(i1 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +define half @sitofp_i8tof16(i8 %x) #0 { +; X86-LABEL: sitofp_i8tof16: +; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: sitofp_i8tof16: +; X64: # %bb.0: +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: vcvtsi2sh 
%eax, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.sitofp.f16.i8(i8 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +define half @sitofp_i16tof16(i16 %x) #0 { +; X86-LABEL: sitofp_i16tof16: +; X86: # %bb.0: +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: sitofp_i16tof16: +; X64: # %bb.0: +; X64-NEXT: movswl %di, %eax +; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +define half @sitofp_i32tof16(i32 %x) #0 { +; X86-LABEL: sitofp_i32tof16: +; X86: # %bb.0: +; X86-NEXT: vcvtsi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: sitofp_i32tof16: +; X64: # %bb.0: +; X64-NEXT: vcvtsi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +define half @sitofp_i64tof16(i64 %x) #0 { +; X86-LABEL: sitofp_i64tof16: +; X86: # %bb.0: +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcvtqq2ph %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: sitofp_i64tof16: +; X64: # %bb.0: +; X64-NEXT: vcvtsi2sh %rdi, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +define half @uitofp_i1tof16(i1 %x) #0 { +; X86-LABEL: uitofp_i1tof16: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: andb $1, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: uitofp_i1tof16: +; X64: # %bb.0: +; X64-NEXT: andl $1, %edi +; X64-NEXT: vcvtsi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.uitofp.f16.i1(i1 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +define half @uitofp_i8tof16(i8 %x) #0 { +; X86-LABEL: uitofp_i8tof16: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: uitofp_i8tof16: +; X64: # %bb.0: +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.uitofp.f16.i8(i8 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +define half @uitofp_i16tof16(i16 %x) #0 { +; X86-LABEL: uitofp_i16tof16: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: uitofp_i16tof16: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.uitofp.f16.i16(i16 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +define half @uitofp_i32tof16(i32 %x) #0 { +; X86-LABEL: uitofp_i32tof16: +; X86: # %bb.0: +; X86-NEXT: vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: uitofp_i32tof16: +; X64: # %bb.0: +; X64-NEXT: vcvtusi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.uitofp.f16.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + 
ret half %result +} + +define half @uitofp_i64tof16(i64 %x) #0 { +; X86-LABEL: uitofp_i64tof16: +; X86: # %bb.0: +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcvtuqq2ph %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: uitofp_i64tof16: +; X64: # %bb.0: +; X64-NEXT: vcvtusi2sh %rdi, %xmm0, %xmm0 +; X64-NEXT: retq + %result = call half @llvm.experimental.constrained.uitofp.f16.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %result +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll @@ -144,5 +144,21 @@ ret <16 x half> %2 } +define <8 x half> @stack_fold_subph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_subph + ;CHECK: vsubph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fsub <8 x half> %a0, %a1 + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_subph_ymm(<16 x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_subph_ymm + ;CHECK: vsubph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fsub <16 x half> %a0, %a1 + ret <16 x half> %2 +} + attributes #0 = { "unsafe-fp-math"="false" } attributes #1 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll @@ -6,6 +6,16 @@ declare <8 x half> @llvm.experimental.constrained.fsub.v8f16(<8 x half>, <8 x half>, metadata, metadata) declare <8 x half> @llvm.experimental.constrained.fmul.v8f16(<8 x half>, <8 x half>, metadata, metadata) declare <8 x half> @llvm.experimental.constrained.fdiv.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) +declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata) +declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) +declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) +declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f64(<2 x double>, metadata, metadata) +declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) +declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata) +declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) +declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) +declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half>, 
metadata) define <8 x half> @f2(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-LABEL: f2: @@ -51,4 +61,130 @@ ret <8 x half> %ret } +define <8 x half> @f11(<2 x double> %a0, <8 x half> %a1) #0 { +; CHECK-LABEL: f11: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsd2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ext = extractelement <2 x double> %a0, i32 0 + %cvt = call half @llvm.experimental.constrained.fptrunc.f16.f64(double %ext, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %res = insertelement <8 x half> %a1, half %cvt, i32 0 + ret <8 x half> %res +} + +define <2 x double> @f12(<2 x double> %a0, <8 x half> %a1) #0 { +; CHECK-LABEL: f12: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsh2sd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; CHECK-NEXT: ret{{[l|q]}} + %ext = extractelement <8 x half> %a1, i32 0 + %cvt = call double @llvm.experimental.constrained.fpext.f64.f16(half %ext, + metadata !"fpexcept.strict") #0 + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <2 x double> @f15(<2 x half> %a) #0 { +; CHECK-LABEL: f15: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16( + <2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x double> %ret +} + +define <2 x half> @f16(<2 x double> %a) #0 { +; CHECK-LABEL: f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f64( + <2 x double> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x half> %ret +} + +define <8 x half> @f17(<4 x float> %a0, <8 x half> %a1) #0 { +; CHECK-LABEL: f17: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ext = extractelement <4 x float> %a0, i32 0 + %cvt = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %ext, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %res = insertelement <8 x half> %a1, half %cvt, i32 0 + ret <8 x half> %res +} + +define <4 x float> @f18(<4 x float> %a0, <8 x half> %a1) #0 { +; CHECK-LABEL: f18: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; CHECK-NEXT: ret{{[l|q]}} + %ext = extractelement <8 x half> %a1, i32 0 + %cvt = call float @llvm.experimental.constrained.fpext.f32.f16(half %ext, + metadata !"fpexcept.strict") #0 + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} + +define <2 x float> @f19(<2 x half> %a) #0 { +; CHECK-LABEL: f19: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16( + <2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x float> %ret +} + +define <4 x float> @f20(<4 x half> %a) #0 { +; CHECK-LABEL: f20: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16( + <4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x float> %ret +} + +define <2 x half> @f21(<2 x float> %a) #0 { +; CHECK-LABEL: f21: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} 
xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvtps2phx %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32( + <2 x float> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x half> %ret +} + +define <4 x half> @f22(<4 x float> %a) #0 { +; CHECK-LABEL: f22: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32( + <4 x float> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x half> %ret +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll @@ -6,6 +6,10 @@ declare <16 x half> @llvm.experimental.constrained.fsub.v16f16(<16 x half>, <16 x half>, metadata, metadata) declare <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half>, <16 x half>, metadata, metadata) declare <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) +declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) +declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f64(<4 x double>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f32(<8 x float>, metadata, metadata) define <16 x half> @f2(<16 x half> %a, <16 x half> %b) #0 { ; CHECK-LABEL: f2: @@ -51,4 +55,52 @@ ret <16 x half> %ret } +define <4 x double> @f11(<4 x half> %a) #0 { +; CHECK-LABEL: f11: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16( + <4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x double> %ret +} + +define <4 x half> @f12(<4 x double> %a) #0 { +; CHECK-LABEL: f12: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f64( + <4 x double> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x half> %ret +} + +define <8 x float> @f14(<8 x half> %a) #0 { +; CHECK-LABEL: f14: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16( + <8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x float> %ret +} + +define <8 x half> @f15(<8 x float> %a) #0 { +; CHECK-LABEL: f15: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f32( + <8 x float> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll --- a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll @@ -6,6 +6,10 @@ declare <32 x half> @llvm.experimental.constrained.fsub.v32f16(<32 x half>, <32 x half>, metadata, metadata) declare <32 x half> @llvm.experimental.constrained.fmul.v32f16(<32 x half>, <32 x half>, metadata, metadata) declare <32 x half> 
@llvm.experimental.constrained.fdiv.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>, metadata) +declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata) +declare <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f64(<8 x double>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.fptrunc.v16f16.v16f32(<16 x float>, metadata, metadata) define <32 x half> @f2(<32 x half> %a, <32 x half> %b) #0 { ; CHECK-LABEL: f2: @@ -51,4 +55,51 @@ ret <32 x half> %ret } +define <8 x double> @f11(<8 x half> %a) #0 { +; CHECK-LABEL: f11: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2pd %xmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16( + <8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x double> %ret +} + +define <8 x half> @f12(<8 x double> %a) #0 { +; CHECK-LABEL: f12: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ph %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f64( + <8 x double> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +define <16 x float> @f14(<16 x half> %a) #0 { +; CHECK-LABEL: f14: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2psx %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16( + <16 x half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x float> %ret +} + +define <16 x half> @f15(<16 x float> %a) #0 { +; CHECK-LABEL: f15: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2phx %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fptrunc.v16f16.v16f32( + <16 x float> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll @@ -0,0 +1,323 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK + +declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f16(<2 x half>, metadata) +declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f16(<2 x half>, metadata) +declare <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f16(<2 x half>, metadata) +declare <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f16(<2 x half>, metadata) +declare <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(<2 x half>, metadata) +declare <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f16(<2 x half>, metadata) +declare <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f16(<2 x half>, metadata) +declare <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f16(<2 x half>, metadata) +declare <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f16(<2 x half>, metadata) +declare <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f16(<2 x half>, metadata) +declare <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f16(<4 x half>, metadata) +declare <4 x i32> 
@llvm.experimental.constrained.fptoui.v4i32.v4f16(<4 x half>, metadata) +declare <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f16(<4 x half>, metadata) +declare <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f16(<4 x half>, metadata) +declare <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f16(<4 x half>, metadata) +declare <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f16(<4 x half>, metadata) +declare <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f16(<4 x half>, metadata) +declare <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f16(<4 x half>, metadata) +declare <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f16(<8 x half>, metadata) +declare <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f16(<8 x half>, metadata) +declare <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f16(<8 x half>, metadata) +declare <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f16(<8 x half>, metadata) +declare <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f16(<8 x half>, metadata) +declare <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f16(<8 x half>, metadata) + +define <2 x i64> @strict_vector_fptosi_v2f16_to_v2i64(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2qq %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i64> %ret +} + +define <2 x i64> @strict_vector_fptoui_v2f16_to_v2i64(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2uqq %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i64> %ret +} + +define <2 x i32> @strict_vector_fptosi_v2f16_to_v2i32(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i32> %ret +} + +define <2 x i32> @strict_vector_fptoui_v2f16_to_v2i32(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i32> %ret +} + +define <2 x i16> @strict_vector_fptosi_v2f16_to_v2i16(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i16> %ret +} + +define <2 x i16> @strict_vector_fptoui_v2f16_to_v2i16(<2 x half> %a) #0 { +; 
CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i16> %ret +} + +define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i8> %ret +} + +define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i8> %ret +} + +define <2 x i1> @strict_vector_fptosi_v2f16_to_v2i1(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 +; CHECK-NEXT: vpmovw2m %xmm0, %k0 +; CHECK-NEXT: vpmovm2q %k0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i1> %ret +} + +define <2 x i1> @strict_vector_fptoui_v2f16_to_v2i1(<2 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 +; CHECK-NEXT: vpmovw2m %xmm0, %k0 +; CHECK-NEXT: vpmovm2q %k0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f16(<2 x half> %a, + metadata !"fpexcept.strict") #0 + ret <2 x i1> %ret +} + +define <4 x i32> @strict_vector_fptosi_v4f16_to_v4i32(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v4f16_to_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i32> %ret +} + +define <4 x i32> @strict_vector_fptoui_v4f16_to_v4i32(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v4f16_to_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i32> %ret +} + +define <4 x i16> @strict_vector_fptosi_v4f16_to_v4i16(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v4f16_to_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: 
vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i16> %ret +} + +define <4 x i16> @strict_vector_fptoui_v4f16_to_v4i16(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v4f16_to_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i16> %ret +} + +define <4 x i8> @strict_vector_fptosi_v4f16_to_v4i8(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v4f16_to_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i8> %ret +} + +define <4 x i8> @strict_vector_fptoui_v4f16_to_v4i8(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v4f16_to_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i8> %ret +} + +define <4 x i1> @strict_vector_fptosi_v4f16_to_v4i1(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v4f16_to_v4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 +; CHECK-NEXT: vpmovw2m %xmm0, %k0 +; CHECK-NEXT: vpmovm2d %k0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i1> %ret +} + +define <4 x i1> @strict_vector_fptoui_v4f16_to_v4i1(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v4f16_to_v4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 +; CHECK-NEXT: vpmovw2m %xmm0, %k0 +; CHECK-NEXT: vpmovm2d %k0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i1> %ret +} + +define <8 x i16> @strict_vector_fptosi_v8f16_to_v8i16(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v8f16_to_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i16> %ret +} + +define <8 x i16> @strict_vector_fptoui_v8f16_to_v8i16(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v8f16_to_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i16> %ret +} + +define <8 x i8> @strict_vector_fptosi_v8f16_to_v8i8(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v8f16_to_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i8> 
@llvm.experimental.constrained.fptosi.v8i8.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i8> %ret +} + +define <8 x i8> @strict_vector_fptoui_v8f16_to_v8i8(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v8f16_to_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i8> %ret +} + +define <8 x i1> @strict_vector_fptosi_v8f16_to_v8i1(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v8f16_to_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0 +; CHECK-NEXT: vpmovd2m %ymm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i1> %ret +} + +define <8 x i1> @strict_vector_fptoui_v8f16_to_v8i1(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v8f16_to_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0 +; CHECK-NEXT: vpslld $31, %ymm0, %ymm0 +; CHECK-NEXT: vpmovd2m %ymm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i1> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256-fp16.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK + + +declare <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f16(<4 x half>, metadata) +declare <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f16(<4 x half>, metadata) +declare <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(<8 x half>, metadata) +declare <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f16(<8 x half>, metadata) +declare <16 x i16> @llvm.experimental.constrained.fptosi.v16i16.v16f16(<16 x half>, metadata) +declare <16 x i16> @llvm.experimental.constrained.fptoui.v16i16.v16f16(<16 x half>, metadata) +declare <16 x i8> @llvm.experimental.constrained.fptosi.v16i8.v16f16(<16 x half>, metadata) +declare <16 x i8> @llvm.experimental.constrained.fptoui.v16i8.v16f16(<16 x half>, metadata) +declare <16 x i1> @llvm.experimental.constrained.fptosi.v16i1.v16f16(<16 x half>, metadata) +declare <16 x i1> @llvm.experimental.constrained.fptoui.v16i1.v16f16(<16 x half>, metadata) + +define <4 x i64> @strict_vector_fptosi_v4f16_to_v4i64(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v4f16_to_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2qq %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i64> %ret +} + +define <4 x i64> @strict_vector_fptoui_v4f16_to_v4i64(<4 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v4f16_to_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq 
{{.*#+}} xmm0 = xmm0[0],zero +; CHECK-NEXT: vcvttph2uqq %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f16(<4 x half> %a, + metadata !"fpexcept.strict") #0 + ret <4 x i64> %ret +} + +define <8 x i32> @strict_vector_fptosi_v8f16_to_v8i32(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v8f16_to_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i32> %ret +} + +define <8 x i32> @strict_vector_fptoui_v8f16_to_v8i32(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v8f16_to_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i32> %ret +} + +define <16 x i16> @strict_vector_fptosi_v16f16_to_v16i16(<16 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v16f16_to_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %ymm0, %zmm0 +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i16> @llvm.experimental.constrained.fptosi.v16i16.v16f16(<16 x half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x i16> %ret +} + +define <16 x i16> @strict_vector_fptoui_v16f16_to_v16i16(<16 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v16f16_to_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %ymm0, %zmm0 +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i16> @llvm.experimental.constrained.fptoui.v16i16.v16f16(<16 x half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x i16> %ret +} + +define <16 x i8> @strict_vector_fptosi_v16f16_to_v16i8(<16 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v16f16_to_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %ymm0, %zmm0 +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i8> @llvm.experimental.constrained.fptosi.v16i8.v16f16(<16 x half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x i8> %ret +} + +define <16 x i8> @strict_vector_fptoui_v16f16_to_v16i8(<16 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v16f16_to_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %ymm0, %zmm0 +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i8> @llvm.experimental.constrained.fptoui.v16i8.v16f16(<16 x half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x i8> %ret +} + +define <16 x i1> @strict_vector_fptosi_v16f16_to_v16i1(<16 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v16f16_to_v16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %ymm0, %zmm0 +; CHECK-NEXT: vpmovd2m %zmm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i1> @llvm.experimental.constrained.fptosi.v16i1.v16f16(<16 x half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x i1> %ret +} + +define <16 x i1> @strict_vector_fptoui_v16f16_to_v16i1(<16 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v16f16_to_v16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %ymm0, %zmm0 +; CHECK-NEXT: vpslld $31, %zmm0, %zmm0 +; CHECK-NEXT: vpmovd2m %zmm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i1> @llvm.experimental.constrained.fptoui.v16i1.v16f16(<16 x 
half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x i1> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512-fp16.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=CHECK + + +declare <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f16(<8 x half>, metadata) +declare <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f16(<8 x half>, metadata) +declare <16 x i32> @llvm.experimental.constrained.fptosi.v16i32.v16f16(<16 x half>, metadata) +declare <16 x i32> @llvm.experimental.constrained.fptoui.v16i32.v16f16(<16 x half>, metadata) +declare <32 x i16> @llvm.experimental.constrained.fptosi.v32i16.v32f16(<32 x half>, metadata) +declare <32 x i16> @llvm.experimental.constrained.fptoui.v32i16.v32f16(<32 x half>, metadata) +declare <32 x i8> @llvm.experimental.constrained.fptosi.v32i8.v32f16(<32 x half>, metadata) +declare <32 x i8> @llvm.experimental.constrained.fptoui.v32i8.v32f16(<32 x half>, metadata) +declare <32 x i1> @llvm.experimental.constrained.fptosi.v32i1.v32f16(<32 x half>, metadata) +declare <32 x i1> @llvm.experimental.constrained.fptoui.v32i1.v32f16(<32 x half>, metadata) + +define <8 x i64> @strict_vector_fptosi_v8f16_to_v8i64(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v8f16_to_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2qq %xmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i64> %ret +} + +define <8 x i64> @strict_vector_fptoui_v8f16_to_v8i64(<8 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v8f16_to_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uqq %xmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f16(<8 x half> %a, + metadata !"fpexcept.strict") #0 + ret <8 x i64> %ret +} + +define <16 x i32> @strict_vector_fptosi_v16f16_to_v16i32(<16 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v16f16_to_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2dq %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i32> @llvm.experimental.constrained.fptosi.v16i32.v16f16(<16 x half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x i32> %ret +} + +define <16 x i32> @strict_vector_fptoui_v16f16_to_v16i32(<16 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v16f16_to_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2udq %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i32> @llvm.experimental.constrained.fptoui.v16i32.v16f16(<16 x half> %a, + metadata !"fpexcept.strict") #0 + ret <16 x i32> %ret +} + +define <32 x i16> @strict_vector_fptosi_v32f16_to_v32i16(<32 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v32f16_to_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x i16> @llvm.experimental.constrained.fptosi.v32i16.v32f16(<32 x half> %a, + metadata !"fpexcept.strict") #0 + ret <32 x i16> %ret +} + +define <32 x i16> @strict_vector_fptoui_v32f16_to_v32i16(<32 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v32f16_to_v32i16: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2uw %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x i16> @llvm.experimental.constrained.fptoui.v32i16.v32f16(<32 x half> %a, + metadata !"fpexcept.strict") #0 + ret <32 x i16> %ret +} + +define <32 x i8> @strict_vector_fptosi_v32f16_to_v32i8(<32 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v32f16_to_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %zmm0, %zmm0 +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x i8> @llvm.experimental.constrained.fptosi.v32i8.v32f16(<32 x half> %a, + metadata !"fpexcept.strict") #0 + ret <32 x i8> %ret +} + +define <32 x i8> @strict_vector_fptoui_v32f16_to_v32i8(<32 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v32f16_to_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %zmm0, %zmm0 +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x i8> @llvm.experimental.constrained.fptoui.v32i8.v32f16(<32 x half> %a, + metadata !"fpexcept.strict") #0 + ret <32 x i8> %ret +} + +define <32 x i1> @strict_vector_fptosi_v32f16_to_v32i1(<32 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v32f16_to_v32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %zmm0, %zmm0 +; CHECK-NEXT: vpmovw2m %zmm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x i1> @llvm.experimental.constrained.fptosi.v32i1.v32f16(<32 x half> %a, + metadata !"fpexcept.strict") #0 + ret <32 x i1> %ret +} + +define <32 x i1> @strict_vector_fptoui_v32f16_to_v32i1(<32 x half> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v32f16_to_v32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttph2w %zmm0, %zmm0 +; CHECK-NEXT: vpsllw $15, %zmm0, %zmm0 +; CHECK-NEXT: vpmovw2m %zmm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x i1> @llvm.experimental.constrained.fptoui.v32i1.v32f16(<32 x half> %a, + metadata !"fpexcept.strict") #0 + ret <32 x i1> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,X64 + +declare <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i1(<8 x i1>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i1(<8 x i1>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i8(<8 x i8>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i8(<8 x i8>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i16(<8 x i16>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i16(<8 x i16>, metadata, metadata) +declare <4 x half> @llvm.experimental.constrained.sitofp.v4f16.v4i32(<4 x i32>, metadata, metadata) +declare <4 x half> @llvm.experimental.constrained.uitofp.v4f16.v4i32(<4 x i32>, metadata, metadata) +declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64>, metadata, metadata) +declare <2 x half> @llvm.experimental.constrained.uitofp.v2f16.v2i64(<2 x i64>, metadata, metadata) + +define 
<4 x half> @sitofp_v4i32_v4f16(<4 x i32> %x) #0 { +; CHECK-LABEL: sitofp_v4i32_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <4 x half> @llvm.experimental.constrained.sitofp.v4f16.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x half> %result +} + +define <4 x half> @uitofp_v4i32_v4f16(<4 x i32> %x) #0 { +; CHECK-LABEL: uitofp_v4i32_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <4 x half> @llvm.experimental.constrained.uitofp.v4f16.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x half> %result +} + +define <2 x half> @sitofp_v2i64_v2f16(<2 x i64> %x) #0 { +; CHECK-LABEL: sitofp_v2i64_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x half> %result +} + +define <2 x half> @uitofp_v2i64_v2f16(<2 x i64> %x) #0 { +; CHECK-LABEL: uitofp_v2i64_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <2 x half> @llvm.experimental.constrained.uitofp.v2f16.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x half> %result +} + +define <8 x half> @sitofp_v8i1_v8f16(<8 x i1> %x) #0 { +; CHECK-LABEL: sitofp_v8i1_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 +; CHECK-NEXT: vpsraw $15, %xmm0, %xmm0 +; CHECK-NEXT: vcvtw2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i1(<8 x i1> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +define <8 x half> @uitofp_v8i1_v8f16(<8 x i1> %x) #0 { +; X86-LABEL: uitofp_v8i1_v8f16: +; X86: # %bb.0: +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vcvtuw2ph %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: uitofp_v8i1_v8f16: +; X64: # %bb.0: +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vcvtuw2ph %xmm0, %xmm0 +; X64-NEXT: retq + %result = call <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i1(<8 x i1> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +define <8 x half> @sitofp_v8i8_v8f16(<8 x i8> %x) #0 { +; CHECK-LABEL: sitofp_v8i8_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 +; CHECK-NEXT: vcvtw2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i8(<8 x i8> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +define <8 x half> @uitofp_v8i8_v8f16(<8 x i8> %x) #0 { +; CHECK-LABEL: uitofp_v8i8_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i8(<8 x i8> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +define <8 x half> @sitofp_v8i16_v8f16(<8 x i16> %x) #0 { +; CHECK-LABEL: sitofp_v8i16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtw2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = 
call <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i16(<8 x i16> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +define <8 x half> @uitofp_v8i16_v8f16(<8 x i16> %x) #0 { +; CHECK-LABEL: uitofp_v8i16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i16(<8 x i16> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,X64 + +declare <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i1(<16 x i1>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i1(<16 x i1>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i8(<16 x i8>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i8(<16 x i8>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i16(<16 x i16>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i16(<16 x i16>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i32(<8 x i32>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i32(<8 x i32>, metadata, metadata) +declare <4 x half> @llvm.experimental.constrained.sitofp.v4f16.v4i64(<4 x i64>, metadata, metadata) +declare <4 x half> @llvm.experimental.constrained.uitofp.v4f16.v4i64(<4 x i64>, metadata, metadata) + +define <16 x half> @sitofp_v16i1_v16f16(<16 x i1> %x) #0 { +; CHECK-LABEL: sitofp_v16i1_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0 +; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0 +; CHECK-NEXT: vcvtw2ph %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i1(<16 x i1> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %result +} + +define <16 x half> @uitofp_v16i1_v16f16(<16 x i1> %x) #0 { +; X86-LABEL: uitofp_v16i1_v16f16: +; X86: # %bb.0: +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X86-NEXT: vcvtuw2ph %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: uitofp_v16i1_v16f16: +; X64: # %bb.0: +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X64-NEXT: vcvtuw2ph %ymm0, %ymm0 +; X64-NEXT: retq + %result = call <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i1(<16 x i1> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %result +} + +define <16 x half> @sitofp_v16i8_v16f16(<16 x i8> %x) #0 { +; CHECK-LABEL: sitofp_v16i8_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 +; CHECK-NEXT: vcvtw2ph %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i8(<16 x i8> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %result +} + +define <16 x half> @uitofp_v16i8_v16f16(<16 x i8> %x) #0 { +; CHECK-LABEL: uitofp_v16i8_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; CHECK-NEXT: vcvtuw2ph %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i8(<16 x i8> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %result +} + +define <16 x half> @sitofp_v16i16_v16f16(<16 x i16> %x) #0 { +; CHECK-LABEL: sitofp_v16i16_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtw2ph %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i16(<16 x i16> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %result +} + +define <16 x half> @uitofp_v16i16_v16f16(<16 x i16> %x) #0 { +; CHECK-LABEL: uitofp_v16i16_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuw2ph %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i16(<16 x i16> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %result +} + +define <8 x half> @sitofp_v8i32_v8f16(<8 x i32> %x) #0 { +; CHECK-LABEL: sitofp_v8i32_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %result = call <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i32(<8 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +define <8 x half> @uitofp_v8i32_v8f16(<8 x i32> %x) #0 { +; CHECK-LABEL: uitofp_v8i32_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %result = call <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i32(<8 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +define <4 x half> @sitofp_v4i64_v4f16(<4 x i64> %x) #0 { +; CHECK-LABEL: sitofp_v4i64_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ph %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %result = call <4 x half> @llvm.experimental.constrained.sitofp.v4f16.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x half> %result +} + +define <4 x half> @uitofp_v4i64_v4f16(<4 x i64> %x) #0 { +; CHECK-LABEL: uitofp_v4i64_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ph %ymm0, 
%xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %result = call <4 x half> @llvm.experimental.constrained.uitofp.v4f16.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x half> %result +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=CHECK,X64 + +declare <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i1(<32 x i1>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i1(<32 x i1>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i8(<32 x i8>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i8(<32 x i8>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i16(<32 x i16>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i16(<32 x i16>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i32(<16 x i32>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i32(<16 x i32>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i64(<8 x i64>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i64(<8 x i64>, metadata, metadata) + +define <32 x half> @sitofp_v32i1_v32f16(<32 x i1> %x) #0 { +; CHECK-LABEL: sitofp_v32i1_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; CHECK-NEXT: vpsllw $15, %zmm0, %zmm0 +; CHECK-NEXT: vpsraw $15, %zmm0, %zmm0 +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i1(<32 x i1> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %result +} + +define <32 x half> @uitofp_v32i1_v32f16(<32 x i1> %x) #0 { +; X86-LABEL: uitofp_v32i1_v32f16: +; X86: # %bb.0: +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; X86-NEXT: vcvtuw2ph %zmm0, %zmm0 +; X86-NEXT: retl +; +; X64-LABEL: uitofp_v32i1_v32f16: +; X64: # %bb.0: +; X64-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; X64-NEXT: vcvtuw2ph %zmm0, %zmm0 +; X64-NEXT: retq + %result = call <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i1(<32 x i1> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %result +} + +define <32 x half> @sitofp_v32i8_v32f16(<32 x i8> %x) #0 { +; CHECK-LABEL: sitofp_v32i8_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %ymm0, %zmm0 +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i8(<32 x i8> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %result +} + +define <32 x half> @uitofp_v32i8_v32f16(<32 x i8> %x) #0 { +; CHECK-LABEL: uitofp_v32i8_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i8(<32 x i8> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %result +} + +define <32 x half> @sitofp_v32i16_v32f16(<32 x i16> %x) #0 { +; CHECK-LABEL: sitofp_v32i16_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtw2ph %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i16(<32 x i16> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %result +} + +define <32 x half> @uitofp_v32i16_v32f16(<32 x i16> %x) #0 { +; CHECK-LABEL: uitofp_v32i16_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i16(<32 x i16> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %result +} + +define <16 x half> @sitofp_v16i32_v16f16(<16 x i32> %x) #0 { +; CHECK-LABEL: sitofp_v16i32_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ph %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i32(<16 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %result +} + +define <16 x half> @uitofp_v16i32_v16f16(<16 x i32> %x) #0 { +; CHECK-LABEL: uitofp_v16i32_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ph %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %result = call <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i32(<16 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %result +} + +define <8 x half> 
@sitofp_v8i64_v8f16(<8 x i64> %x) #0 { +; CHECK-LABEL: sitofp_v8i64_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ph %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %result = call <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i64(<8 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +define <8 x half> @uitofp_v8i64_v8f16(<8 x i64> %x) #0 { +; CHECK-LABEL: uitofp_v8i64_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ph %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %result = call <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i64(<8 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %result +} + +attributes #0 = { strictfp } diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt --- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -460,3 +460,899 @@ # ATT: vucomish -256(%rdx), %xmm30 # INTEL: vucomish xmm30, word ptr [rdx - 256] 0x62,0x65,0x7c,0x08,0x2e,0x72,0x80 + +# ATT: vcvtdq2ph %zmm29, %ymm30 +# INTEL: vcvtdq2ph ymm30, zmm29 +0x62,0x05,0x7c,0x48,0x5b,0xf5 + +# ATT: vcvtdq2ph {rn-sae}, %zmm29, %ymm30 +# INTEL: vcvtdq2ph ymm30, zmm29, {rn-sae} +0x62,0x05,0x7c,0x18,0x5b,0xf5 + +# ATT: vcvtdq2ph 268435456(%rbp,%r14,8), %ymm30 {%k7} +# INTEL: vcvtdq2ph ymm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x4f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtdq2ph (%r9){1to16}, %ymm30 +# INTEL: vcvtdq2ph ymm30, dword ptr [r9]{1to16} +0x62,0x45,0x7c,0x58,0x5b,0x31 + +# ATT: vcvtdq2ph 8128(%rcx), %ymm30 +# INTEL: vcvtdq2ph ymm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7c,0x48,0x5b,0x71,0x7f + +# ATT: vcvtdq2ph -512(%rdx){1to16}, %ymm30 {%k7} {z} +# INTEL: vcvtdq2ph ymm30 {k7} {z}, dword ptr [rdx - 512]{1to16} +0x62,0x65,0x7c,0xdf,0x5b,0x72,0x80 + +# ATT: vcvtpd2ph %zmm29, %xmm30 +# INTEL: vcvtpd2ph xmm30, zmm29 +0x62,0x05,0xfd,0x48,0x5a,0xf5 + +# ATT: vcvtpd2ph {rn-sae}, %zmm29, %xmm30 +# INTEL: vcvtpd2ph xmm30, zmm29, {rn-sae} +0x62,0x05,0xfd,0x18,0x5a,0xf5 + +# ATT: vcvtpd2phz 268435456(%rbp,%r14,8), %xmm30 {%k7} +# INTEL: vcvtpd2ph xmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0xfd,0x4f,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtpd2ph (%r9){1to8}, %xmm30 +# INTEL: vcvtpd2ph xmm30, qword ptr [r9]{1to8} +0x62,0x45,0xfd,0x58,0x5a,0x31 + +# ATT: vcvtpd2phz 8128(%rcx), %xmm30 +# INTEL: vcvtpd2ph xmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0xfd,0x48,0x5a,0x71,0x7f + +# ATT: vcvtpd2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} +# INTEL: vcvtpd2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to8} +0x62,0x65,0xfd,0xdf,0x5a,0x72,0x80 + +# ATT: vcvtph2dq %ymm29, %zmm30 +# INTEL: vcvtph2dq zmm30, ymm29 +0x62,0x05,0x7d,0x48,0x5b,0xf5 + +# ATT: vcvtph2dq {rn-sae}, %ymm29, %zmm30 +# INTEL: vcvtph2dq zmm30, ymm29, {rn-sae} +0x62,0x05,0x7d,0x18,0x5b,0xf5 + +# ATT: vcvtph2dq 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtph2dq zmm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x4f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2dq (%r9){1to16}, %zmm30 +# INTEL: vcvtph2dq zmm30, word ptr [r9]{1to16} +0x62,0x45,0x7d,0x58,0x5b,0x31 + +# ATT: vcvtph2dq 4064(%rcx), %zmm30 +# INTEL: vcvtph2dq zmm30, ymmword ptr [rcx + 4064] +0x62,0x65,0x7d,0x48,0x5b,0x71,0x7f + +# ATT: vcvtph2dq -256(%rdx){1to16}, %zmm30 {%k7} {z} +# INTEL: vcvtph2dq zmm30 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0x65,0x7d,0xdf,0x5b,0x72,0x80 + 
+# ATT: vcvtph2pd %xmm29, %zmm30 +# INTEL: vcvtph2pd zmm30, xmm29 +0x62,0x05,0x7c,0x48,0x5a,0xf5 + +# ATT: vcvtph2pd {sae}, %xmm29, %zmm30 +# INTEL: vcvtph2pd zmm30, xmm29, {sae} +0x62,0x05,0x7c,0x18,0x5a,0xf5 + +# ATT: vcvtph2pd 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtph2pd zmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x4f,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2pd (%r9){1to8}, %zmm30 +# INTEL: vcvtph2pd zmm30, word ptr [r9]{1to8} +0x62,0x45,0x7c,0x58,0x5a,0x31 + +# ATT: vcvtph2pd 2032(%rcx), %zmm30 +# INTEL: vcvtph2pd zmm30, xmmword ptr [rcx + 2032] +0x62,0x65,0x7c,0x48,0x5a,0x71,0x7f + +# ATT: vcvtph2pd -256(%rdx){1to8}, %zmm30 {%k7} {z} +# INTEL: vcvtph2pd zmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0x65,0x7c,0xdf,0x5a,0x72,0x80 + +# ATT: vcvtph2psx %ymm29, %zmm30 +# INTEL: vcvtph2psx zmm30, ymm29 +0x62,0x06,0x7d,0x48,0x13,0xf5 + +# ATT: vcvtph2psx {sae}, %ymm29, %zmm30 +# INTEL: vcvtph2psx zmm30, ymm29, {sae} +0x62,0x06,0x7d,0x18,0x13,0xf5 + +# ATT: vcvtph2psx 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtph2psx zmm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x7d,0x4f,0x13,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2psx (%r9){1to16}, %zmm30 +# INTEL: vcvtph2psx zmm30, word ptr [r9]{1to16} +0x62,0x46,0x7d,0x58,0x13,0x31 + +# ATT: vcvtph2psx 4064(%rcx), %zmm30 +# INTEL: vcvtph2psx zmm30, ymmword ptr [rcx + 4064] +0x62,0x66,0x7d,0x48,0x13,0x71,0x7f + +# ATT: vcvtph2psx -256(%rdx){1to16}, %zmm30 {%k7} {z} +# INTEL: vcvtph2psx zmm30 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0x66,0x7d,0xdf,0x13,0x72,0x80 + +# ATT: vcvtph2qq %xmm29, %zmm30 +# INTEL: vcvtph2qq zmm30, xmm29 +0x62,0x05,0x7d,0x48,0x7b,0xf5 + +# ATT: vcvtph2qq {rn-sae}, %xmm29, %zmm30 +# INTEL: vcvtph2qq zmm30, xmm29, {rn-sae} +0x62,0x05,0x7d,0x18,0x7b,0xf5 + +# ATT: vcvtph2qq 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtph2qq zmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x4f,0x7b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2qq (%r9){1to8}, %zmm30 +# INTEL: vcvtph2qq zmm30, word ptr [r9]{1to8} +0x62,0x45,0x7d,0x58,0x7b,0x31 + +# ATT: vcvtph2qq 2032(%rcx), %zmm30 +# INTEL: vcvtph2qq zmm30, xmmword ptr [rcx + 2032] +0x62,0x65,0x7d,0x48,0x7b,0x71,0x7f + +# ATT: vcvtph2qq -256(%rdx){1to8}, %zmm30 {%k7} {z} +# INTEL: vcvtph2qq zmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0x65,0x7d,0xdf,0x7b,0x72,0x80 + +# ATT: vcvtph2udq %ymm29, %zmm30 +# INTEL: vcvtph2udq zmm30, ymm29 +0x62,0x05,0x7c,0x48,0x79,0xf5 + +# ATT: vcvtph2udq {rn-sae}, %ymm29, %zmm30 +# INTEL: vcvtph2udq zmm30, ymm29, {rn-sae} +0x62,0x05,0x7c,0x18,0x79,0xf5 + +# ATT: vcvtph2udq 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtph2udq zmm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x4f,0x79,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2udq (%r9){1to16}, %zmm30 +# INTEL: vcvtph2udq zmm30, word ptr [r9]{1to16} +0x62,0x45,0x7c,0x58,0x79,0x31 + +# ATT: vcvtph2udq 4064(%rcx), %zmm30 +# INTEL: vcvtph2udq zmm30, ymmword ptr [rcx + 4064] +0x62,0x65,0x7c,0x48,0x79,0x71,0x7f + +# ATT: vcvtph2udq -256(%rdx){1to16}, %zmm30 {%k7} {z} +# INTEL: vcvtph2udq zmm30 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0x65,0x7c,0xdf,0x79,0x72,0x80 + +# ATT: vcvtph2uqq %xmm29, %zmm30 +# INTEL: vcvtph2uqq zmm30, xmm29 +0x62,0x05,0x7d,0x48,0x79,0xf5 + +# ATT: vcvtph2uqq {rn-sae}, %xmm29, %zmm30 +# INTEL: vcvtph2uqq zmm30, xmm29, {rn-sae} +0x62,0x05,0x7d,0x18,0x79,0xf5 + +# ATT: vcvtph2uqq 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtph2uqq zmm30 {k7}, xmmword ptr [rbp + 
8*r14 + 268435456] +0x62,0x25,0x7d,0x4f,0x79,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2uqq (%r9){1to8}, %zmm30 +# INTEL: vcvtph2uqq zmm30, word ptr [r9]{1to8} +0x62,0x45,0x7d,0x58,0x79,0x31 + +# ATT: vcvtph2uqq 2032(%rcx), %zmm30 +# INTEL: vcvtph2uqq zmm30, xmmword ptr [rcx + 2032] +0x62,0x65,0x7d,0x48,0x79,0x71,0x7f + +# ATT: vcvtph2uqq -256(%rdx){1to8}, %zmm30 {%k7} {z} +# INTEL: vcvtph2uqq zmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0x65,0x7d,0xdf,0x79,0x72,0x80 + +# ATT: vcvtph2uw %zmm29, %zmm30 +# INTEL: vcvtph2uw zmm30, zmm29 +0x62,0x05,0x7c,0x48,0x7d,0xf5 + +# ATT: vcvtph2uw {rn-sae}, %zmm29, %zmm30 +# INTEL: vcvtph2uw zmm30, zmm29, {rn-sae} +0x62,0x05,0x7c,0x18,0x7d,0xf5 + +# ATT: vcvtph2uw 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtph2uw zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x4f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2uw (%r9){1to32}, %zmm30 +# INTEL: vcvtph2uw zmm30, word ptr [r9]{1to32} +0x62,0x45,0x7c,0x58,0x7d,0x31 + +# ATT: vcvtph2uw 8128(%rcx), %zmm30 +# INTEL: vcvtph2uw zmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7c,0x48,0x7d,0x71,0x7f + +# ATT: vcvtph2uw -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vcvtph2uw zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x65,0x7c,0xdf,0x7d,0x72,0x80 + +# ATT: vcvtph2w %zmm29, %zmm30 +# INTEL: vcvtph2w zmm30, zmm29 +0x62,0x05,0x7d,0x48,0x7d,0xf5 + +# ATT: vcvtph2w {rn-sae}, %zmm29, %zmm30 +# INTEL: vcvtph2w zmm30, zmm29, {rn-sae} +0x62,0x05,0x7d,0x18,0x7d,0xf5 + +# ATT: vcvtph2w 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtph2w zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x4f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2w (%r9){1to32}, %zmm30 +# INTEL: vcvtph2w zmm30, word ptr [r9]{1to32} +0x62,0x45,0x7d,0x58,0x7d,0x31 + +# ATT: vcvtph2w 8128(%rcx), %zmm30 +# INTEL: vcvtph2w zmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7d,0x48,0x7d,0x71,0x7f + +# ATT: vcvtph2w -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vcvtph2w zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x65,0x7d,0xdf,0x7d,0x72,0x80 + +# ATT: vcvtps2phx %zmm29, %ymm30 +# INTEL: vcvtps2phx ymm30, zmm29 +0x62,0x05,0x7d,0x48,0x1d,0xf5 + +# ATT: vcvtps2phx {rn-sae}, %zmm29, %ymm30 +# INTEL: vcvtps2phx ymm30, zmm29, {rn-sae} +0x62,0x05,0x7d,0x18,0x1d,0xf5 + +# ATT: vcvtps2phx 268435456(%rbp,%r14,8), %ymm30 {%k7} +# INTEL: vcvtps2phx ymm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x4f,0x1d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtps2phx (%r9){1to16}, %ymm30 +# INTEL: vcvtps2phx ymm30, dword ptr [r9]{1to16} +0x62,0x45,0x7d,0x58,0x1d,0x31 + +# ATT: vcvtps2phx 8128(%rcx), %ymm30 +# INTEL: vcvtps2phx ymm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7d,0x48,0x1d,0x71,0x7f + +# ATT: vcvtps2phx -512(%rdx){1to16}, %ymm30 {%k7} {z} +# INTEL: vcvtps2phx ymm30 {k7} {z}, dword ptr [rdx - 512]{1to16} +0x62,0x65,0x7d,0xdf,0x1d,0x72,0x80 + +# ATT: vcvtqq2ph %zmm29, %xmm30 +# INTEL: vcvtqq2ph xmm30, zmm29 +0x62,0x05,0xfc,0x48,0x5b,0xf5 + +# ATT: vcvtqq2ph {rn-sae}, %zmm29, %xmm30 +# INTEL: vcvtqq2ph xmm30, zmm29, {rn-sae} +0x62,0x05,0xfc,0x18,0x5b,0xf5 + +# ATT: vcvtqq2phz 268435456(%rbp,%r14,8), %xmm30 {%k7} +# INTEL: vcvtqq2ph xmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0xfc,0x4f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtqq2ph (%r9){1to8}, %xmm30 +# INTEL: vcvtqq2ph xmm30, qword ptr [r9]{1to8} +0x62,0x45,0xfc,0x58,0x5b,0x31 + +# ATT: vcvtqq2phz 8128(%rcx), %xmm30 +# INTEL: vcvtqq2ph xmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0xfc,0x48,0x5b,0x71,0x7f + +# ATT: 
vcvtqq2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} +# INTEL: vcvtqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to8} +0x62,0x65,0xfc,0xdf,0x5b,0x72,0x80 + +# ATT: vcvtsd2sh %xmm28, %xmm29, %xmm30 +# INTEL: vcvtsd2sh xmm30, xmm29, xmm28 +0x62,0x05,0x97,0x00,0x5a,0xf4 + +# ATT: vcvtsd2sh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vcvtsd2sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x97,0x10,0x5a,0xf4 + +# ATT: vcvtsd2sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vcvtsd2sh xmm30 {k7}, xmm29, qword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x97,0x07,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtsd2sh (%r9), %xmm29, %xmm30 +# INTEL: vcvtsd2sh xmm30, xmm29, qword ptr [r9] +0x62,0x45,0x97,0x00,0x5a,0x31 + +# ATT: vcvtsd2sh 1016(%rcx), %xmm29, %xmm30 +# INTEL: vcvtsd2sh xmm30, xmm29, qword ptr [rcx + 1016] +0x62,0x65,0x97,0x00,0x5a,0x71,0x7f + +# ATT: vcvtsd2sh -1024(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vcvtsd2sh xmm30 {k7} {z}, xmm29, qword ptr [rdx - 1024] +0x62,0x65,0x97,0x87,0x5a,0x72,0x80 + +# ATT: vcvtsh2sd %xmm28, %xmm29, %xmm30 +# INTEL: vcvtsh2sd xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5a,0xf4 + +# ATT: vcvtsh2sd {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vcvtsh2sd xmm30, xmm29, xmm28, {sae} +0x62,0x05,0x16,0x10,0x5a,0xf4 + +# ATT: vcvtsh2sd 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vcvtsh2sd xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtsh2sd (%r9), %xmm29, %xmm30 +# INTEL: vcvtsh2sd xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5a,0x31 + +# ATT: vcvtsh2sd 254(%rcx), %xmm29, %xmm30 +# INTEL: vcvtsh2sd xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5a,0x71,0x7f + +# ATT: vcvtsh2sd -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vcvtsh2sd xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5a,0x72,0x80 + +# ATT: vcvtsh2si %xmm30, %edx +# INTEL: vcvtsh2si edx, xmm30 +0x62,0x95,0x7e,0x08,0x2d,0xd6 + +# ATT: vcvtsh2si {rn-sae}, %xmm30, %edx +# INTEL: vcvtsh2si edx, xmm30, {rn-sae} +0x62,0x95,0x7e,0x18,0x2d,0xd6 + +# ATT: vcvtsh2si %xmm30, %r12 +# INTEL: vcvtsh2si r12, xmm30 +0x62,0x15,0xfe,0x08,0x2d,0xe6 + +# ATT: vcvtsh2si {rn-sae}, %xmm30, %r12 +# INTEL: vcvtsh2si r12, xmm30, {rn-sae} +0x62,0x15,0xfe,0x18,0x2d,0xe6 + +# ATT: vcvtsh2si 268435456(%rbp,%r14,8), %edx +# INTEL: vcvtsh2si edx, word ptr [rbp + 8*r14 + 268435456] +0x62,0xb5,0x7e,0x08,0x2d,0x94,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtsh2si (%r9), %edx +# INTEL: vcvtsh2si edx, word ptr [r9] +0x62,0xd5,0x7e,0x08,0x2d,0x11 + +# ATT: vcvtsh2si 254(%rcx), %edx +# INTEL: vcvtsh2si edx, word ptr [rcx + 254] +0x62,0xf5,0x7e,0x08,0x2d,0x51,0x7f + +# ATT: vcvtsh2si -256(%rdx), %edx +# INTEL: vcvtsh2si edx, word ptr [rdx - 256] +0x62,0xf5,0x7e,0x08,0x2d,0x52,0x80 + +# ATT: vcvtsh2si 268435456(%rbp,%r14,8), %r12 +# INTEL: vcvtsh2si r12, word ptr [rbp + 8*r14 + 268435456] +0x62,0x35,0xfe,0x08,0x2d,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtsh2si (%r9), %r12 +# INTEL: vcvtsh2si r12, word ptr [r9] +0x62,0x55,0xfe,0x08,0x2d,0x21 + +# ATT: vcvtsh2si 254(%rcx), %r12 +# INTEL: vcvtsh2si r12, word ptr [rcx + 254] +0x62,0x75,0xfe,0x08,0x2d,0x61,0x7f + +# ATT: vcvtsh2si -256(%rdx), %r12 +# INTEL: vcvtsh2si r12, word ptr [rdx - 256] +0x62,0x75,0xfe,0x08,0x2d,0x62,0x80 + +# ATT: vcvtsh2ss %xmm28, %xmm29, %xmm30 +# INTEL: vcvtsh2ss xmm30, xmm29, xmm28 +0x62,0x06,0x14,0x00,0x13,0xf4 + +# ATT: vcvtsh2ss {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vcvtsh2ss xmm30, xmm29, xmm28, {sae} +0x62,0x06,0x14,0x10,0x13,0xf4 + +# ATT: vcvtsh2ss 
268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vcvtsh2ss xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x14,0x07,0x13,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtsh2ss (%r9), %xmm29, %xmm30 +# INTEL: vcvtsh2ss xmm30, xmm29, word ptr [r9] +0x62,0x46,0x14,0x00,0x13,0x31 + +# ATT: vcvtsh2ss 254(%rcx), %xmm29, %xmm30 +# INTEL: vcvtsh2ss xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x14,0x00,0x13,0x71,0x7f + +# ATT: vcvtsh2ss -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vcvtsh2ss xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x14,0x87,0x13,0x72,0x80 + +# ATT: vcvtsh2usi %xmm30, %edx +# INTEL: vcvtsh2usi edx, xmm30 +0x62,0x95,0x7e,0x08,0x79,0xd6 + +# ATT: vcvtsh2usi {rn-sae}, %xmm30, %edx +# INTEL: vcvtsh2usi edx, xmm30, {rn-sae} +0x62,0x95,0x7e,0x18,0x79,0xd6 + +# ATT: vcvtsh2usi %xmm30, %r12 +# INTEL: vcvtsh2usi r12, xmm30 +0x62,0x15,0xfe,0x08,0x79,0xe6 + +# ATT: vcvtsh2usi {rn-sae}, %xmm30, %r12 +# INTEL: vcvtsh2usi r12, xmm30, {rn-sae} +0x62,0x15,0xfe,0x18,0x79,0xe6 + +# ATT: vcvtsh2usi 268435456(%rbp,%r14,8), %edx +# INTEL: vcvtsh2usi edx, word ptr [rbp + 8*r14 + 268435456] +0x62,0xb5,0x7e,0x08,0x79,0x94,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtsh2usi (%r9), %edx +# INTEL: vcvtsh2usi edx, word ptr [r9] +0x62,0xd5,0x7e,0x08,0x79,0x11 + +# ATT: vcvtsh2usi 254(%rcx), %edx +# INTEL: vcvtsh2usi edx, word ptr [rcx + 254] +0x62,0xf5,0x7e,0x08,0x79,0x51,0x7f + +# ATT: vcvtsh2usi -256(%rdx), %edx +# INTEL: vcvtsh2usi edx, word ptr [rdx - 256] +0x62,0xf5,0x7e,0x08,0x79,0x52,0x80 + +# ATT: vcvtsh2usi 268435456(%rbp,%r14,8), %r12 +# INTEL: vcvtsh2usi r12, word ptr [rbp + 8*r14 + 268435456] +0x62,0x35,0xfe,0x08,0x79,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtsh2usi (%r9), %r12 +# INTEL: vcvtsh2usi r12, word ptr [r9] +0x62,0x55,0xfe,0x08,0x79,0x21 + +# ATT: vcvtsh2usi 254(%rcx), %r12 +# INTEL: vcvtsh2usi r12, word ptr [rcx + 254] +0x62,0x75,0xfe,0x08,0x79,0x61,0x7f + +# ATT: vcvtsh2usi -256(%rdx), %r12 +# INTEL: vcvtsh2usi r12, word ptr [rdx - 256] +0x62,0x75,0xfe,0x08,0x79,0x62,0x80 + +# ATT: vcvtsi2sh %r12, %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, r12 +0x62,0x45,0x96,0x00,0x2a,0xf4 + +# ATT: vcvtsi2sh %r12, {rn-sae}, %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, {rn-sae}, r12 +0x62,0x45,0x96,0x10,0x2a,0xf4 + +# ATT: vcvtsi2sh %edx, %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, edx +0x62,0x65,0x16,0x00,0x2a,0xf2 + +# ATT: vcvtsi2sh %edx, {rn-sae}, %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, {rn-sae}, edx +0x62,0x65,0x16,0x10,0x2a,0xf2 + +# ATT: vcvtsi2shl 268435456(%rbp,%r14,8), %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, dword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x00,0x2a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtsi2shl (%r9), %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, dword ptr [r9] +0x62,0x45,0x16,0x00,0x2a,0x31 + +# ATT: vcvtsi2shl 508(%rcx), %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, dword ptr [rcx + 508] +0x62,0x65,0x16,0x00,0x2a,0x71,0x7f + +# ATT: vcvtsi2shl -512(%rdx), %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, dword ptr [rdx - 512] +0x62,0x65,0x16,0x00,0x2a,0x72,0x80 + +# ATT: vcvtsi2shq 1016(%rcx), %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, qword ptr [rcx + 1016] +0x62,0x65,0x96,0x00,0x2a,0x71,0x7f + +# ATT: vcvtsi2shq -1024(%rdx), %xmm29, %xmm30 +# INTEL: vcvtsi2sh xmm30, xmm29, qword ptr [rdx - 1024] +0x62,0x65,0x96,0x00,0x2a,0x72,0x80 + +# ATT: vcvtss2sh %xmm28, %xmm29, %xmm30 +# INTEL: vcvtss2sh xmm30, xmm29, xmm28 +0x62,0x05,0x14,0x00,0x1d,0xf4 + +# ATT: vcvtss2sh {rn-sae}, %xmm28, %xmm29, 
%xmm30 +# INTEL: vcvtss2sh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x1d,0xf4 + +# ATT: vcvtss2sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vcvtss2sh xmm30 {k7}, xmm29, dword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x07,0x1d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtss2sh (%r9), %xmm29, %xmm30 +# INTEL: vcvtss2sh xmm30, xmm29, dword ptr [r9] +0x62,0x45,0x14,0x00,0x1d,0x31 + +# ATT: vcvtss2sh 508(%rcx), %xmm29, %xmm30 +# INTEL: vcvtss2sh xmm30, xmm29, dword ptr [rcx + 508] +0x62,0x65,0x14,0x00,0x1d,0x71,0x7f + +# ATT: vcvtss2sh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vcvtss2sh xmm30 {k7} {z}, xmm29, dword ptr [rdx - 512] +0x62,0x65,0x14,0x87,0x1d,0x72,0x80 + +# ATT: vcvttph2dq %ymm29, %zmm30 +# INTEL: vcvttph2dq zmm30, ymm29 +0x62,0x05,0x7e,0x48,0x5b,0xf5 + +# ATT: vcvttph2dq {sae}, %ymm29, %zmm30 +# INTEL: vcvttph2dq zmm30, ymm29, {sae} +0x62,0x05,0x7e,0x18,0x5b,0xf5 + +# ATT: vcvttph2dq 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvttph2dq zmm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7e,0x4f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2dq (%r9){1to16}, %zmm30 +# INTEL: vcvttph2dq zmm30, word ptr [r9]{1to16} +0x62,0x45,0x7e,0x58,0x5b,0x31 + +# ATT: vcvttph2dq 4064(%rcx), %zmm30 +# INTEL: vcvttph2dq zmm30, ymmword ptr [rcx + 4064] +0x62,0x65,0x7e,0x48,0x5b,0x71,0x7f + +# ATT: vcvttph2dq -256(%rdx){1to16}, %zmm30 {%k7} {z} +# INTEL: vcvttph2dq zmm30 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0x65,0x7e,0xdf,0x5b,0x72,0x80 + +# ATT: vcvttph2qq %xmm29, %zmm30 +# INTEL: vcvttph2qq zmm30, xmm29 +0x62,0x05,0x7d,0x48,0x7a,0xf5 + +# ATT: vcvttph2qq {sae}, %xmm29, %zmm30 +# INTEL: vcvttph2qq zmm30, xmm29, {sae} +0x62,0x05,0x7d,0x18,0x7a,0xf5 + +# ATT: vcvttph2qq 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvttph2qq zmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x4f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2qq (%r9){1to8}, %zmm30 +# INTEL: vcvttph2qq zmm30, word ptr [r9]{1to8} +0x62,0x45,0x7d,0x58,0x7a,0x31 + +# ATT: vcvttph2qq 2032(%rcx), %zmm30 +# INTEL: vcvttph2qq zmm30, xmmword ptr [rcx + 2032] +0x62,0x65,0x7d,0x48,0x7a,0x71,0x7f + +# ATT: vcvttph2qq -256(%rdx){1to8}, %zmm30 {%k7} {z} +# INTEL: vcvttph2qq zmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0x65,0x7d,0xdf,0x7a,0x72,0x80 + +# ATT: vcvttph2udq %ymm29, %zmm30 +# INTEL: vcvttph2udq zmm30, ymm29 +0x62,0x05,0x7c,0x48,0x78,0xf5 + +# ATT: vcvttph2udq {sae}, %ymm29, %zmm30 +# INTEL: vcvttph2udq zmm30, ymm29, {sae} +0x62,0x05,0x7c,0x18,0x78,0xf5 + +# ATT: vcvttph2udq 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvttph2udq zmm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x4f,0x78,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2udq (%r9){1to16}, %zmm30 +# INTEL: vcvttph2udq zmm30, word ptr [r9]{1to16} +0x62,0x45,0x7c,0x58,0x78,0x31 + +# ATT: vcvttph2udq 4064(%rcx), %zmm30 +# INTEL: vcvttph2udq zmm30, ymmword ptr [rcx + 4064] +0x62,0x65,0x7c,0x48,0x78,0x71,0x7f + +# ATT: vcvttph2udq -256(%rdx){1to16}, %zmm30 {%k7} {z} +# INTEL: vcvttph2udq zmm30 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0x65,0x7c,0xdf,0x78,0x72,0x80 + +# ATT: vcvttph2uqq %xmm29, %zmm30 +# INTEL: vcvttph2uqq zmm30, xmm29 +0x62,0x05,0x7d,0x48,0x78,0xf5 + +# ATT: vcvttph2uqq {sae}, %xmm29, %zmm30 +# INTEL: vcvttph2uqq zmm30, xmm29, {sae} +0x62,0x05,0x7d,0x18,0x78,0xf5 + +# ATT: vcvttph2uqq 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvttph2uqq zmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x4f,0x78,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# 
ATT: vcvttph2uqq (%r9){1to8}, %zmm30 +# INTEL: vcvttph2uqq zmm30, word ptr [r9]{1to8} +0x62,0x45,0x7d,0x58,0x78,0x31 + +# ATT: vcvttph2uqq 2032(%rcx), %zmm30 +# INTEL: vcvttph2uqq zmm30, xmmword ptr [rcx + 2032] +0x62,0x65,0x7d,0x48,0x78,0x71,0x7f + +# ATT: vcvttph2uqq -256(%rdx){1to8}, %zmm30 {%k7} {z} +# INTEL: vcvttph2uqq zmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0x65,0x7d,0xdf,0x78,0x72,0x80 + +# ATT: vcvttph2uw %zmm29, %zmm30 +# INTEL: vcvttph2uw zmm30, zmm29 +0x62,0x05,0x7c,0x48,0x7c,0xf5 + +# ATT: vcvttph2uw {sae}, %zmm29, %zmm30 +# INTEL: vcvttph2uw zmm30, zmm29, {sae} +0x62,0x05,0x7c,0x18,0x7c,0xf5 + +# ATT: vcvttph2uw 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvttph2uw zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x4f,0x7c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2uw (%r9){1to32}, %zmm30 +# INTEL: vcvttph2uw zmm30, word ptr [r9]{1to32} +0x62,0x45,0x7c,0x58,0x7c,0x31 + +# ATT: vcvttph2uw 8128(%rcx), %zmm30 +# INTEL: vcvttph2uw zmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7c,0x48,0x7c,0x71,0x7f + +# ATT: vcvttph2uw -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vcvttph2uw zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x65,0x7c,0xdf,0x7c,0x72,0x80 + +# ATT: vcvttph2w %zmm29, %zmm30 +# INTEL: vcvttph2w zmm30, zmm29 +0x62,0x05,0x7d,0x48,0x7c,0xf5 + +# ATT: vcvttph2w {sae}, %zmm29, %zmm30 +# INTEL: vcvttph2w zmm30, zmm29, {sae} +0x62,0x05,0x7d,0x18,0x7c,0xf5 + +# ATT: vcvttph2w 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvttph2w zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x4f,0x7c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2w (%r9){1to32}, %zmm30 +# INTEL: vcvttph2w zmm30, word ptr [r9]{1to32} +0x62,0x45,0x7d,0x58,0x7c,0x31 + +# ATT: vcvttph2w 8128(%rcx), %zmm30 +# INTEL: vcvttph2w zmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7d,0x48,0x7c,0x71,0x7f + +# ATT: vcvttph2w -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vcvttph2w zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x65,0x7d,0xdf,0x7c,0x72,0x80 + +# ATT: vcvttsh2si %xmm30, %edx +# INTEL: vcvttsh2si edx, xmm30 +0x62,0x95,0x7e,0x08,0x2c,0xd6 + +# ATT: vcvttsh2si {sae}, %xmm30, %edx +# INTEL: vcvttsh2si edx, xmm30, {sae} +0x62,0x95,0x7e,0x18,0x2c,0xd6 + +# ATT: vcvttsh2si %xmm30, %r12 +# INTEL: vcvttsh2si r12, xmm30 +0x62,0x15,0xfe,0x08,0x2c,0xe6 + +# ATT: vcvttsh2si {sae}, %xmm30, %r12 +# INTEL: vcvttsh2si r12, xmm30, {sae} +0x62,0x15,0xfe,0x18,0x2c,0xe6 + +# ATT: vcvttsh2si 268435456(%rbp,%r14,8), %edx +# INTEL: vcvttsh2si edx, word ptr [rbp + 8*r14 + 268435456] +0x62,0xb5,0x7e,0x08,0x2c,0x94,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttsh2si (%r9), %edx +# INTEL: vcvttsh2si edx, word ptr [r9] +0x62,0xd5,0x7e,0x08,0x2c,0x11 + +# ATT: vcvttsh2si 254(%rcx), %edx +# INTEL: vcvttsh2si edx, word ptr [rcx + 254] +0x62,0xf5,0x7e,0x08,0x2c,0x51,0x7f + +# ATT: vcvttsh2si -256(%rdx), %edx +# INTEL: vcvttsh2si edx, word ptr [rdx - 256] +0x62,0xf5,0x7e,0x08,0x2c,0x52,0x80 + +# ATT: vcvttsh2si 268435456(%rbp,%r14,8), %r12 +# INTEL: vcvttsh2si r12, word ptr [rbp + 8*r14 + 268435456] +0x62,0x35,0xfe,0x08,0x2c,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttsh2si (%r9), %r12 +# INTEL: vcvttsh2si r12, word ptr [r9] +0x62,0x55,0xfe,0x08,0x2c,0x21 + +# ATT: vcvttsh2si 254(%rcx), %r12 +# INTEL: vcvttsh2si r12, word ptr [rcx + 254] +0x62,0x75,0xfe,0x08,0x2c,0x61,0x7f + +# ATT: vcvttsh2si -256(%rdx), %r12 +# INTEL: vcvttsh2si r12, word ptr [rdx - 256] +0x62,0x75,0xfe,0x08,0x2c,0x62,0x80 + +# ATT: vcvttsh2usi %xmm30, %edx +# INTEL: vcvttsh2usi edx, xmm30 +0x62,0x95,0x7e,0x08,0x78,0xd6 + 
+# ATT: vcvttsh2usi {sae}, %xmm30, %edx +# INTEL: vcvttsh2usi edx, xmm30, {sae} +0x62,0x95,0x7e,0x18,0x78,0xd6 + +# ATT: vcvttsh2usi %xmm30, %r12 +# INTEL: vcvttsh2usi r12, xmm30 +0x62,0x15,0xfe,0x08,0x78,0xe6 + +# ATT: vcvttsh2usi {sae}, %xmm30, %r12 +# INTEL: vcvttsh2usi r12, xmm30, {sae} +0x62,0x15,0xfe,0x18,0x78,0xe6 + +# ATT: vcvttsh2usi 268435456(%rbp,%r14,8), %edx +# INTEL: vcvttsh2usi edx, word ptr [rbp + 8*r14 + 268435456] +0x62,0xb5,0x7e,0x08,0x78,0x94,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttsh2usi (%r9), %edx +# INTEL: vcvttsh2usi edx, word ptr [r9] +0x62,0xd5,0x7e,0x08,0x78,0x11 + +# ATT: vcvttsh2usi 254(%rcx), %edx +# INTEL: vcvttsh2usi edx, word ptr [rcx + 254] +0x62,0xf5,0x7e,0x08,0x78,0x51,0x7f + +# ATT: vcvttsh2usi -256(%rdx), %edx +# INTEL: vcvttsh2usi edx, word ptr [rdx - 256] +0x62,0xf5,0x7e,0x08,0x78,0x52,0x80 + +# ATT: vcvttsh2usi 268435456(%rbp,%r14,8), %r12 +# INTEL: vcvttsh2usi r12, word ptr [rbp + 8*r14 + 268435456] +0x62,0x35,0xfe,0x08,0x78,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvttsh2usi (%r9), %r12 +# INTEL: vcvttsh2usi r12, word ptr [r9] +0x62,0x55,0xfe,0x08,0x78,0x21 + +# ATT: vcvttsh2usi 254(%rcx), %r12 +# INTEL: vcvttsh2usi r12, word ptr [rcx + 254] +0x62,0x75,0xfe,0x08,0x78,0x61,0x7f + +# ATT: vcvttsh2usi -256(%rdx), %r12 +# INTEL: vcvttsh2usi r12, word ptr [rdx - 256] +0x62,0x75,0xfe,0x08,0x78,0x62,0x80 + +# ATT: vcvtudq2ph %zmm29, %ymm30 +# INTEL: vcvtudq2ph ymm30, zmm29 +0x62,0x05,0x7f,0x48,0x7a,0xf5 + +# ATT: vcvtudq2ph {rn-sae}, %zmm29, %ymm30 +# INTEL: vcvtudq2ph ymm30, zmm29, {rn-sae} +0x62,0x05,0x7f,0x18,0x7a,0xf5 + +# ATT: vcvtudq2ph 268435456(%rbp,%r14,8), %ymm30 {%k7} +# INTEL: vcvtudq2ph ymm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7f,0x4f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtudq2ph (%r9){1to16}, %ymm30 +# INTEL: vcvtudq2ph ymm30, dword ptr [r9]{1to16} +0x62,0x45,0x7f,0x58,0x7a,0x31 + +# ATT: vcvtudq2ph 8128(%rcx), %ymm30 +# INTEL: vcvtudq2ph ymm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7f,0x48,0x7a,0x71,0x7f + +# ATT: vcvtudq2ph -512(%rdx){1to16}, %ymm30 {%k7} {z} +# INTEL: vcvtudq2ph ymm30 {k7} {z}, dword ptr [rdx - 512]{1to16} +0x62,0x65,0x7f,0xdf,0x7a,0x72,0x80 + +# ATT: vcvtuqq2ph %zmm29, %xmm30 +# INTEL: vcvtuqq2ph xmm30, zmm29 +0x62,0x05,0xff,0x48,0x7a,0xf5 + +# ATT: vcvtuqq2ph {rn-sae}, %zmm29, %xmm30 +# INTEL: vcvtuqq2ph xmm30, zmm29, {rn-sae} +0x62,0x05,0xff,0x18,0x7a,0xf5 + +# ATT: vcvtuqq2phz 268435456(%rbp,%r14,8), %xmm30 {%k7} +# INTEL: vcvtuqq2ph xmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0xff,0x4f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtuqq2ph (%r9){1to8}, %xmm30 +# INTEL: vcvtuqq2ph xmm30, qword ptr [r9]{1to8} +0x62,0x45,0xff,0x58,0x7a,0x31 + +# ATT: vcvtuqq2phz 8128(%rcx), %xmm30 +# INTEL: vcvtuqq2ph xmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0xff,0x48,0x7a,0x71,0x7f + +# ATT: vcvtuqq2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} +# INTEL: vcvtuqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to8} +0x62,0x65,0xff,0xdf,0x7a,0x72,0x80 + +# ATT: vcvtusi2sh %r12, %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, r12 +0x62,0x45,0x96,0x00,0x7b,0xf4 + +# ATT: vcvtusi2sh %r12, {rn-sae}, %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, {rn-sae}, r12 +0x62,0x45,0x96,0x10,0x7b,0xf4 + +# ATT: vcvtusi2sh %edx, %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, edx +0x62,0x65,0x16,0x00,0x7b,0xf2 + +# ATT: vcvtusi2sh %edx, {rn-sae}, %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, {rn-sae}, edx +0x62,0x65,0x16,0x10,0x7b,0xf2 + +# ATT: vcvtusi2shl 268435456(%rbp,%r14,8), %xmm29, %xmm30 +# 
INTEL: vcvtusi2sh xmm30, xmm29, dword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x00,0x7b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtusi2shl (%r9), %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, dword ptr [r9] +0x62,0x45,0x16,0x00,0x7b,0x31 + +# ATT: vcvtusi2shl 508(%rcx), %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, dword ptr [rcx + 508] +0x62,0x65,0x16,0x00,0x7b,0x71,0x7f + +# ATT: vcvtusi2shl -512(%rdx), %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, dword ptr [rdx - 512] +0x62,0x65,0x16,0x00,0x7b,0x72,0x80 + +# ATT: vcvtusi2shq 1016(%rcx), %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, qword ptr [rcx + 1016] +0x62,0x65,0x96,0x00,0x7b,0x71,0x7f + +# ATT: vcvtusi2shq -1024(%rdx), %xmm29, %xmm30 +# INTEL: vcvtusi2sh xmm30, xmm29, qword ptr [rdx - 1024] +0x62,0x65,0x96,0x00,0x7b,0x72,0x80 + +# ATT: vcvtuw2ph %zmm29, %zmm30 +# INTEL: vcvtuw2ph zmm30, zmm29 +0x62,0x05,0x7f,0x48,0x7d,0xf5 + +# ATT: vcvtuw2ph {rn-sae}, %zmm29, %zmm30 +# INTEL: vcvtuw2ph zmm30, zmm29, {rn-sae} +0x62,0x05,0x7f,0x18,0x7d,0xf5 + +# ATT: vcvtuw2ph 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtuw2ph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7f,0x4f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtuw2ph (%r9){1to32}, %zmm30 +# INTEL: vcvtuw2ph zmm30, word ptr [r9]{1to32} +0x62,0x45,0x7f,0x58,0x7d,0x31 + +# ATT: vcvtuw2ph 8128(%rcx), %zmm30 +# INTEL: vcvtuw2ph zmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7f,0x48,0x7d,0x71,0x7f + +# ATT: vcvtuw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vcvtuw2ph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x65,0x7f,0xdf,0x7d,0x72,0x80 + +# ATT: vcvtw2ph %zmm29, %zmm30 +# INTEL: vcvtw2ph zmm30, zmm29 +0x62,0x05,0x7e,0x48,0x7d,0xf5 + +# ATT: vcvtw2ph {rn-sae}, %zmm29, %zmm30 +# INTEL: vcvtw2ph zmm30, zmm29, {rn-sae} +0x62,0x05,0x7e,0x18,0x7d,0xf5 + +# ATT: vcvtw2ph 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vcvtw2ph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7e,0x4f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtw2ph (%r9){1to32}, %zmm30 +# INTEL: vcvtw2ph zmm30, word ptr [r9]{1to32} +0x62,0x45,0x7e,0x58,0x7d,0x31 + +# ATT: vcvtw2ph 8128(%rcx), %zmm30 +# INTEL: vcvtw2ph zmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7e,0x48,0x7d,0x71,0x7f + +# ATT: vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vcvtw2ph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x65,0x7e,0xdf,0x7d,0x72,0x80 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt --- a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt @@ -280,3 +280,859 @@ # ATT: vsubph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} # INTEL: vsubph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} 0x62,0xf5,0x54,0x9f,0x5c,0x72,0x80 + +# ATT: vcvtdq2ph %xmm5, %xmm6 +# INTEL: vcvtdq2ph xmm6, xmm5 +0x62,0xf5,0x7c,0x08,0x5b,0xf5 + +# ATT: vcvtdq2ph %ymm5, %xmm6 +# INTEL: vcvtdq2ph xmm6, ymm5 +0x62,0xf5,0x7c,0x28,0x5b,0xf5 + +# ATT: vcvtdq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtdq2ph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x0f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtdq2ph (%ecx){1to4}, %xmm6 +# INTEL: vcvtdq2ph xmm6, dword ptr [ecx]{1to4} +0x62,0xf5,0x7c,0x18,0x5b,0x31 + +# ATT: vcvtdq2phx 2032(%ecx), %xmm6 +# INTEL: vcvtdq2ph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7c,0x08,0x5b,0x71,0x7f + +# ATT: vcvtdq2ph -512(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtdq2ph xmm6 {k7} {z}, dword ptr [edx - 512]{1to4} 
+0x62,0xf5,0x7c,0x9f,0x5b,0x72,0x80 + +# ATT: vcvtdq2ph (%ecx){1to8}, %xmm6 +# INTEL: vcvtdq2ph xmm6, dword ptr [ecx]{1to8} +0x62,0xf5,0x7c,0x38,0x5b,0x31 + +# ATT: vcvtdq2phy 4064(%ecx), %xmm6 +# INTEL: vcvtdq2ph xmm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7c,0x28,0x5b,0x71,0x7f + +# ATT: vcvtdq2ph -512(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvtdq2ph xmm6 {k7} {z}, dword ptr [edx - 512]{1to8} +0x62,0xf5,0x7c,0xbf,0x5b,0x72,0x80 + +# ATT: vcvtpd2ph %xmm5, %xmm6 +# INTEL: vcvtpd2ph xmm6, xmm5 +0x62,0xf5,0xfd,0x08,0x5a,0xf5 + +# ATT: vcvtpd2ph %ymm5, %xmm6 +# INTEL: vcvtpd2ph xmm6, ymm5 +0x62,0xf5,0xfd,0x28,0x5a,0xf5 + +# ATT: vcvtpd2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtpd2ph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0xfd,0x0f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtpd2ph (%ecx){1to2}, %xmm6 +# INTEL: vcvtpd2ph xmm6, qword ptr [ecx]{1to2} +0x62,0xf5,0xfd,0x18,0x5a,0x31 + +# ATT: vcvtpd2phx 2032(%ecx), %xmm6 +# INTEL: vcvtpd2ph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0xfd,0x08,0x5a,0x71,0x7f + +# ATT: vcvtpd2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} +# INTEL: vcvtpd2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to2} +0x62,0xf5,0xfd,0x9f,0x5a,0x72,0x80 + +# ATT: vcvtpd2ph (%ecx){1to4}, %xmm6 +# INTEL: vcvtpd2ph xmm6, qword ptr [ecx]{1to4} +0x62,0xf5,0xfd,0x38,0x5a,0x31 + +# ATT: vcvtpd2phy 4064(%ecx), %xmm6 +# INTEL: vcvtpd2ph xmm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0xfd,0x28,0x5a,0x71,0x7f + +# ATT: vcvtpd2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtpd2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to4} +0x62,0xf5,0xfd,0xbf,0x5a,0x72,0x80 + +# ATT: vcvtph2dq %xmm5, %xmm6 +# INTEL: vcvtph2dq xmm6, xmm5 +0x62,0xf5,0x7d,0x08,0x5b,0xf5 + +# ATT: vcvtph2dq %xmm5, %ymm6 +# INTEL: vcvtph2dq ymm6, xmm5 +0x62,0xf5,0x7d,0x28,0x5b,0xf5 + +# ATT: vcvtph2dq 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtph2dq xmm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x0f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2dq (%ecx){1to4}, %xmm6 +# INTEL: vcvtph2dq xmm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7d,0x18,0x5b,0x31 + +# ATT: vcvtph2dq 1016(%ecx), %xmm6 +# INTEL: vcvtph2dq xmm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7d,0x08,0x5b,0x71,0x7f + +# ATT: vcvtph2dq -256(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtph2dq xmm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7d,0x9f,0x5b,0x72,0x80 + +# ATT: vcvtph2dq 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtph2dq ymm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x2f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2dq (%ecx){1to8}, %ymm6 +# INTEL: vcvtph2dq ymm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7d,0x38,0x5b,0x31 + +# ATT: vcvtph2dq 2032(%ecx), %ymm6 +# INTEL: vcvtph2dq ymm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7d,0x28,0x5b,0x71,0x7f + +# ATT: vcvtph2dq -256(%edx){1to8}, %ymm6 {%k7} {z} +# INTEL: vcvtph2dq ymm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7d,0xbf,0x5b,0x72,0x80 + +# ATT: vcvtph2pd %xmm5, %xmm6 +# INTEL: vcvtph2pd xmm6, xmm5 +0x62,0xf5,0x7c,0x08,0x5a,0xf5 + +# ATT: vcvtph2pd %xmm5, %ymm6 +# INTEL: vcvtph2pd ymm6, xmm5 +0x62,0xf5,0x7c,0x28,0x5a,0xf5 + +# ATT: vcvtph2pd 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtph2pd xmm6 {k7}, dword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x0f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2pd (%ecx){1to2}, %xmm6 +# INTEL: vcvtph2pd xmm6, word ptr [ecx]{1to2} +0x62,0xf5,0x7c,0x18,0x5a,0x31 + +# ATT: vcvtph2pd 508(%ecx), %xmm6 +# INTEL: vcvtph2pd xmm6, dword ptr [ecx + 508] +0x62,0xf5,0x7c,0x08,0x5a,0x71,0x7f + 
+# ATT: vcvtph2pd -256(%edx){1to2}, %xmm6 {%k7} {z} +# INTEL: vcvtph2pd xmm6 {k7} {z}, word ptr [edx - 256]{1to2} +0x62,0xf5,0x7c,0x9f,0x5a,0x72,0x80 + +# ATT: vcvtph2pd 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtph2pd ymm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x2f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2pd (%ecx){1to4}, %ymm6 +# INTEL: vcvtph2pd ymm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7c,0x38,0x5a,0x31 + +# ATT: vcvtph2pd 1016(%ecx), %ymm6 +# INTEL: vcvtph2pd ymm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7c,0x28,0x5a,0x71,0x7f + +# ATT: vcvtph2pd -256(%edx){1to4}, %ymm6 {%k7} {z} +# INTEL: vcvtph2pd ymm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7c,0xbf,0x5a,0x72,0x80 + +# ATT: vcvtph2psx %xmm5, %xmm6 +# INTEL: vcvtph2psx xmm6, xmm5 +0x62,0xf6,0x7d,0x08,0x13,0xf5 + +# ATT: vcvtph2psx %xmm5, %ymm6 +# INTEL: vcvtph2psx ymm6, xmm5 +0x62,0xf6,0x7d,0x28,0x13,0xf5 + +# ATT: vcvtph2psx 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtph2psx xmm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7d,0x0f,0x13,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2psx (%ecx){1to4}, %xmm6 +# INTEL: vcvtph2psx xmm6, word ptr [ecx]{1to4} +0x62,0xf6,0x7d,0x18,0x13,0x31 + +# ATT: vcvtph2psx 1016(%ecx), %xmm6 +# INTEL: vcvtph2psx xmm6, qword ptr [ecx + 1016] +0x62,0xf6,0x7d,0x08,0x13,0x71,0x7f + +# ATT: vcvtph2psx -256(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtph2psx xmm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf6,0x7d,0x9f,0x13,0x72,0x80 + +# ATT: vcvtph2psx 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtph2psx ymm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7d,0x2f,0x13,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2psx (%ecx){1to8}, %ymm6 +# INTEL: vcvtph2psx ymm6, word ptr [ecx]{1to8} +0x62,0xf6,0x7d,0x38,0x13,0x31 + +# ATT: vcvtph2psx 2032(%ecx), %ymm6 +# INTEL: vcvtph2psx ymm6, xmmword ptr [ecx + 2032] +0x62,0xf6,0x7d,0x28,0x13,0x71,0x7f + +# ATT: vcvtph2psx -256(%edx){1to8}, %ymm6 {%k7} {z} +# INTEL: vcvtph2psx ymm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf6,0x7d,0xbf,0x13,0x72,0x80 + +# ATT: vcvtph2qq %xmm5, %xmm6 +# INTEL: vcvtph2qq xmm6, xmm5 +0x62,0xf5,0x7d,0x08,0x7b,0xf5 + +# ATT: vcvtph2qq %xmm5, %ymm6 +# INTEL: vcvtph2qq ymm6, xmm5 +0x62,0xf5,0x7d,0x28,0x7b,0xf5 + +# ATT: vcvtph2qq 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtph2qq xmm6 {k7}, dword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x0f,0x7b,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2qq (%ecx){1to2}, %xmm6 +# INTEL: vcvtph2qq xmm6, word ptr [ecx]{1to2} +0x62,0xf5,0x7d,0x18,0x7b,0x31 + +# ATT: vcvtph2qq 508(%ecx), %xmm6 +# INTEL: vcvtph2qq xmm6, dword ptr [ecx + 508] +0x62,0xf5,0x7d,0x08,0x7b,0x71,0x7f + +# ATT: vcvtph2qq -256(%edx){1to2}, %xmm6 {%k7} {z} +# INTEL: vcvtph2qq xmm6 {k7} {z}, word ptr [edx - 256]{1to2} +0x62,0xf5,0x7d,0x9f,0x7b,0x72,0x80 + +# ATT: vcvtph2qq 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtph2qq ymm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x2f,0x7b,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2qq (%ecx){1to4}, %ymm6 +# INTEL: vcvtph2qq ymm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7d,0x38,0x7b,0x31 + +# ATT: vcvtph2qq 1016(%ecx), %ymm6 +# INTEL: vcvtph2qq ymm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7d,0x28,0x7b,0x71,0x7f + +# ATT: vcvtph2qq -256(%edx){1to4}, %ymm6 {%k7} {z} +# INTEL: vcvtph2qq ymm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7d,0xbf,0x7b,0x72,0x80 + +# ATT: vcvtph2udq %xmm5, %xmm6 +# INTEL: vcvtph2udq xmm6, xmm5 +0x62,0xf5,0x7c,0x08,0x79,0xf5 + +# ATT: vcvtph2udq %xmm5, %ymm6 +# INTEL: 
vcvtph2udq ymm6, xmm5 +0x62,0xf5,0x7c,0x28,0x79,0xf5 + +# ATT: vcvtph2udq 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtph2udq xmm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x0f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2udq (%ecx){1to4}, %xmm6 +# INTEL: vcvtph2udq xmm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7c,0x18,0x79,0x31 + +# ATT: vcvtph2udq 1016(%ecx), %xmm6 +# INTEL: vcvtph2udq xmm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7c,0x08,0x79,0x71,0x7f + +# ATT: vcvtph2udq -256(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtph2udq xmm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7c,0x9f,0x79,0x72,0x80 + +# ATT: vcvtph2udq 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtph2udq ymm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x2f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2udq (%ecx){1to8}, %ymm6 +# INTEL: vcvtph2udq ymm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7c,0x38,0x79,0x31 + +# ATT: vcvtph2udq 2032(%ecx), %ymm6 +# INTEL: vcvtph2udq ymm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7c,0x28,0x79,0x71,0x7f + +# ATT: vcvtph2udq -256(%edx){1to8}, %ymm6 {%k7} {z} +# INTEL: vcvtph2udq ymm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7c,0xbf,0x79,0x72,0x80 + +# ATT: vcvtph2uqq %xmm5, %xmm6 +# INTEL: vcvtph2uqq xmm6, xmm5 +0x62,0xf5,0x7d,0x08,0x79,0xf5 + +# ATT: vcvtph2uqq %xmm5, %ymm6 +# INTEL: vcvtph2uqq ymm6, xmm5 +0x62,0xf5,0x7d,0x28,0x79,0xf5 + +# ATT: vcvtph2uqq 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtph2uqq xmm6 {k7}, dword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x0f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2uqq (%ecx){1to2}, %xmm6 +# INTEL: vcvtph2uqq xmm6, word ptr [ecx]{1to2} +0x62,0xf5,0x7d,0x18,0x79,0x31 + +# ATT: vcvtph2uqq 508(%ecx), %xmm6 +# INTEL: vcvtph2uqq xmm6, dword ptr [ecx + 508] +0x62,0xf5,0x7d,0x08,0x79,0x71,0x7f + +# ATT: vcvtph2uqq -256(%edx){1to2}, %xmm6 {%k7} {z} +# INTEL: vcvtph2uqq xmm6 {k7} {z}, word ptr [edx - 256]{1to2} +0x62,0xf5,0x7d,0x9f,0x79,0x72,0x80 + +# ATT: vcvtph2uqq 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtph2uqq ymm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x2f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2uqq (%ecx){1to4}, %ymm6 +# INTEL: vcvtph2uqq ymm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7d,0x38,0x79,0x31 + +# ATT: vcvtph2uqq 1016(%ecx), %ymm6 +# INTEL: vcvtph2uqq ymm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7d,0x28,0x79,0x71,0x7f + +# ATT: vcvtph2uqq -256(%edx){1to4}, %ymm6 {%k7} {z} +# INTEL: vcvtph2uqq ymm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7d,0xbf,0x79,0x72,0x80 + +# ATT: vcvtph2uw %xmm5, %xmm6 +# INTEL: vcvtph2uw xmm6, xmm5 +0x62,0xf5,0x7c,0x08,0x7d,0xf5 + +# ATT: vcvtph2uw %ymm5, %ymm6 +# INTEL: vcvtph2uw ymm6, ymm5 +0x62,0xf5,0x7c,0x28,0x7d,0xf5 + +# ATT: vcvtph2uw 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtph2uw xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x0f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2uw (%ecx){1to8}, %xmm6 +# INTEL: vcvtph2uw xmm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7c,0x18,0x7d,0x31 + +# ATT: vcvtph2uw 2032(%ecx), %xmm6 +# INTEL: vcvtph2uw xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7c,0x08,0x7d,0x71,0x7f + +# ATT: vcvtph2uw -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvtph2uw xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7c,0x9f,0x7d,0x72,0x80 + +# ATT: vcvtph2uw 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtph2uw ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x2f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2uw (%ecx){1to16}, 
%ymm6 +# INTEL: vcvtph2uw ymm6, word ptr [ecx]{1to16} +0x62,0xf5,0x7c,0x38,0x7d,0x31 + +# ATT: vcvtph2uw 4064(%ecx), %ymm6 +# INTEL: vcvtph2uw ymm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7c,0x28,0x7d,0x71,0x7f + +# ATT: vcvtph2uw -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vcvtph2uw ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7c,0xbf,0x7d,0x72,0x80 + +# ATT: vcvtph2w %xmm5, %xmm6 +# INTEL: vcvtph2w xmm6, xmm5 +0x62,0xf5,0x7d,0x08,0x7d,0xf5 + +# ATT: vcvtph2w %ymm5, %ymm6 +# INTEL: vcvtph2w ymm6, ymm5 +0x62,0xf5,0x7d,0x28,0x7d,0xf5 + +# ATT: vcvtph2w 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtph2w xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x0f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2w (%ecx){1to8}, %xmm6 +# INTEL: vcvtph2w xmm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7d,0x18,0x7d,0x31 + +# ATT: vcvtph2w 2032(%ecx), %xmm6 +# INTEL: vcvtph2w xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7d,0x08,0x7d,0x71,0x7f + +# ATT: vcvtph2w -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvtph2w xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7d,0x9f,0x7d,0x72,0x80 + +# ATT: vcvtph2w 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtph2w ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x2f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtph2w (%ecx){1to16}, %ymm6 +# INTEL: vcvtph2w ymm6, word ptr [ecx]{1to16} +0x62,0xf5,0x7d,0x38,0x7d,0x31 + +# ATT: vcvtph2w 4064(%ecx), %ymm6 +# INTEL: vcvtph2w ymm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7d,0x28,0x7d,0x71,0x7f + +# ATT: vcvtph2w -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vcvtph2w ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7d,0xbf,0x7d,0x72,0x80 + +# ATT: vcvtps2phx %xmm5, %xmm6 +# INTEL: vcvtps2phx xmm6, xmm5 +0x62,0xf5,0x7d,0x08,0x1d,0xf5 + +# ATT: vcvtps2phx %ymm5, %xmm6 +# INTEL: vcvtps2phx xmm6, ymm5 +0x62,0xf5,0x7d,0x28,0x1d,0xf5 + +# ATT: vcvtps2phxx 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtps2phx xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x0f,0x1d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtps2phx (%ecx){1to4}, %xmm6 +# INTEL: vcvtps2phx xmm6, dword ptr [ecx]{1to4} +0x62,0xf5,0x7d,0x18,0x1d,0x31 + +# ATT: vcvtps2phxx 2032(%ecx), %xmm6 +# INTEL: vcvtps2phx xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7d,0x08,0x1d,0x71,0x7f + +# ATT: vcvtps2phx -512(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtps2phx xmm6 {k7} {z}, dword ptr [edx - 512]{1to4} +0x62,0xf5,0x7d,0x9f,0x1d,0x72,0x80 + +# ATT: vcvtps2phx (%ecx){1to8}, %xmm6 +# INTEL: vcvtps2phx xmm6, dword ptr [ecx]{1to8} +0x62,0xf5,0x7d,0x38,0x1d,0x31 + +# ATT: vcvtps2phxy 4064(%ecx), %xmm6 +# INTEL: vcvtps2phx xmm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7d,0x28,0x1d,0x71,0x7f + +# ATT: vcvtps2phx -512(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvtps2phx xmm6 {k7} {z}, dword ptr [edx - 512]{1to8} +0x62,0xf5,0x7d,0xbf,0x1d,0x72,0x80 + +# ATT: vcvtqq2ph %xmm5, %xmm6 +# INTEL: vcvtqq2ph xmm6, xmm5 +0x62,0xf5,0xfc,0x08,0x5b,0xf5 + +# ATT: vcvtqq2ph %ymm5, %xmm6 +# INTEL: vcvtqq2ph xmm6, ymm5 +0x62,0xf5,0xfc,0x28,0x5b,0xf5 + +# ATT: vcvtqq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtqq2ph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0xfc,0x0f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtqq2ph (%ecx){1to2}, %xmm6 +# INTEL: vcvtqq2ph xmm6, qword ptr [ecx]{1to2} +0x62,0xf5,0xfc,0x18,0x5b,0x31 + +# ATT: vcvtqq2phx 2032(%ecx), %xmm6 +# INTEL: vcvtqq2ph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0xfc,0x08,0x5b,0x71,0x7f + +# ATT: vcvtqq2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} +# 
INTEL: vcvtqq2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to2} +0x62,0xf5,0xfc,0x9f,0x5b,0x72,0x80 + +# ATT: vcvtqq2ph (%ecx){1to4}, %xmm6 +# INTEL: vcvtqq2ph xmm6, qword ptr [ecx]{1to4} +0x62,0xf5,0xfc,0x38,0x5b,0x31 + +# ATT: vcvtqq2phy 4064(%ecx), %xmm6 +# INTEL: vcvtqq2ph xmm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0xfc,0x28,0x5b,0x71,0x7f + +# ATT: vcvtqq2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtqq2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to4} +0x62,0xf5,0xfc,0xbf,0x5b,0x72,0x80 + +# ATT: vcvttph2dq %xmm5, %xmm6 +# INTEL: vcvttph2dq xmm6, xmm5 +0x62,0xf5,0x7e,0x08,0x5b,0xf5 + +# ATT: vcvttph2dq %xmm5, %ymm6 +# INTEL: vcvttph2dq ymm6, xmm5 +0x62,0xf5,0x7e,0x28,0x5b,0xf5 + +# ATT: vcvttph2dq 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvttph2dq xmm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x0f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2dq (%ecx){1to4}, %xmm6 +# INTEL: vcvttph2dq xmm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7e,0x18,0x5b,0x31 + +# ATT: vcvttph2dq 1016(%ecx), %xmm6 +# INTEL: vcvttph2dq xmm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7e,0x08,0x5b,0x71,0x7f + +# ATT: vcvttph2dq -256(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvttph2dq xmm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7e,0x9f,0x5b,0x72,0x80 + +# ATT: vcvttph2dq 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvttph2dq ymm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x2f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2dq (%ecx){1to8}, %ymm6 +# INTEL: vcvttph2dq ymm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7e,0x38,0x5b,0x31 + +# ATT: vcvttph2dq 2032(%ecx), %ymm6 +# INTEL: vcvttph2dq ymm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7e,0x28,0x5b,0x71,0x7f + +# ATT: vcvttph2dq -256(%edx){1to8}, %ymm6 {%k7} {z} +# INTEL: vcvttph2dq ymm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7e,0xbf,0x5b,0x72,0x80 + +# ATT: vcvttph2qq %xmm5, %xmm6 +# INTEL: vcvttph2qq xmm6, xmm5 +0x62,0xf5,0x7d,0x08,0x7a,0xf5 + +# ATT: vcvttph2qq %xmm5, %ymm6 +# INTEL: vcvttph2qq ymm6, xmm5 +0x62,0xf5,0x7d,0x28,0x7a,0xf5 + +# ATT: vcvttph2qq 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvttph2qq xmm6 {k7}, dword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x0f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2qq (%ecx){1to2}, %xmm6 +# INTEL: vcvttph2qq xmm6, word ptr [ecx]{1to2} +0x62,0xf5,0x7d,0x18,0x7a,0x31 + +# ATT: vcvttph2qq 508(%ecx), %xmm6 +# INTEL: vcvttph2qq xmm6, dword ptr [ecx + 508] +0x62,0xf5,0x7d,0x08,0x7a,0x71,0x7f + +# ATT: vcvttph2qq -256(%edx){1to2}, %xmm6 {%k7} {z} +# INTEL: vcvttph2qq xmm6 {k7} {z}, word ptr [edx - 256]{1to2} +0x62,0xf5,0x7d,0x9f,0x7a,0x72,0x80 + +# ATT: vcvttph2qq 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvttph2qq ymm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x2f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2qq (%ecx){1to4}, %ymm6 +# INTEL: vcvttph2qq ymm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7d,0x38,0x7a,0x31 + +# ATT: vcvttph2qq 1016(%ecx), %ymm6 +# INTEL: vcvttph2qq ymm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7d,0x28,0x7a,0x71,0x7f + +# ATT: vcvttph2qq -256(%edx){1to4}, %ymm6 {%k7} {z} +# INTEL: vcvttph2qq ymm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7d,0xbf,0x7a,0x72,0x80 + +# ATT: vcvttph2udq %xmm5, %xmm6 +# INTEL: vcvttph2udq xmm6, xmm5 +0x62,0xf5,0x7c,0x08,0x78,0xf5 + +# ATT: vcvttph2udq %xmm5, %ymm6 +# INTEL: vcvttph2udq ymm6, xmm5 +0x62,0xf5,0x7c,0x28,0x78,0xf5 + +# ATT: vcvttph2udq 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvttph2udq xmm6 {k7}, qword ptr [esp + 8*esi + 268435456] 
+0x62,0xf5,0x7c,0x0f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2udq (%ecx){1to4}, %xmm6 +# INTEL: vcvttph2udq xmm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7c,0x18,0x78,0x31 + +# ATT: vcvttph2udq 1016(%ecx), %xmm6 +# INTEL: vcvttph2udq xmm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7c,0x08,0x78,0x71,0x7f + +# ATT: vcvttph2udq -256(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvttph2udq xmm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7c,0x9f,0x78,0x72,0x80 + +# ATT: vcvttph2udq 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvttph2udq ymm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x2f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2udq (%ecx){1to8}, %ymm6 +# INTEL: vcvttph2udq ymm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7c,0x38,0x78,0x31 + +# ATT: vcvttph2udq 2032(%ecx), %ymm6 +# INTEL: vcvttph2udq ymm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7c,0x28,0x78,0x71,0x7f + +# ATT: vcvttph2udq -256(%edx){1to8}, %ymm6 {%k7} {z} +# INTEL: vcvttph2udq ymm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7c,0xbf,0x78,0x72,0x80 + +# ATT: vcvttph2uqq %xmm5, %xmm6 +# INTEL: vcvttph2uqq xmm6, xmm5 +0x62,0xf5,0x7d,0x08,0x78,0xf5 + +# ATT: vcvttph2uqq %xmm5, %ymm6 +# INTEL: vcvttph2uqq ymm6, xmm5 +0x62,0xf5,0x7d,0x28,0x78,0xf5 + +# ATT: vcvttph2uqq 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvttph2uqq xmm6 {k7}, dword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x0f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2uqq (%ecx){1to2}, %xmm6 +# INTEL: vcvttph2uqq xmm6, word ptr [ecx]{1to2} +0x62,0xf5,0x7d,0x18,0x78,0x31 + +# ATT: vcvttph2uqq 508(%ecx), %xmm6 +# INTEL: vcvttph2uqq xmm6, dword ptr [ecx + 508] +0x62,0xf5,0x7d,0x08,0x78,0x71,0x7f + +# ATT: vcvttph2uqq -256(%edx){1to2}, %xmm6 {%k7} {z} +# INTEL: vcvttph2uqq xmm6 {k7} {z}, word ptr [edx - 256]{1to2} +0x62,0xf5,0x7d,0x9f,0x78,0x72,0x80 + +# ATT: vcvttph2uqq 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvttph2uqq ymm6 {k7}, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x2f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2uqq (%ecx){1to4}, %ymm6 +# INTEL: vcvttph2uqq ymm6, word ptr [ecx]{1to4} +0x62,0xf5,0x7d,0x38,0x78,0x31 + +# ATT: vcvttph2uqq 1016(%ecx), %ymm6 +# INTEL: vcvttph2uqq ymm6, qword ptr [ecx + 1016] +0x62,0xf5,0x7d,0x28,0x78,0x71,0x7f + +# ATT: vcvttph2uqq -256(%edx){1to4}, %ymm6 {%k7} {z} +# INTEL: vcvttph2uqq ymm6 {k7} {z}, word ptr [edx - 256]{1to4} +0x62,0xf5,0x7d,0xbf,0x78,0x72,0x80 + +# ATT: vcvttph2uw %xmm5, %xmm6 +# INTEL: vcvttph2uw xmm6, xmm5 +0x62,0xf5,0x7c,0x08,0x7c,0xf5 + +# ATT: vcvttph2uw %ymm5, %ymm6 +# INTEL: vcvttph2uw ymm6, ymm5 +0x62,0xf5,0x7c,0x28,0x7c,0xf5 + +# ATT: vcvttph2uw 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvttph2uw xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x0f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2uw (%ecx){1to8}, %xmm6 +# INTEL: vcvttph2uw xmm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7c,0x18,0x7c,0x31 + +# ATT: vcvttph2uw 2032(%ecx), %xmm6 +# INTEL: vcvttph2uw xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7c,0x08,0x7c,0x71,0x7f + +# ATT: vcvttph2uw -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvttph2uw xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7c,0x9f,0x7c,0x72,0x80 + +# ATT: vcvttph2uw 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvttph2uw ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x2f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2uw (%ecx){1to16}, %ymm6 +# INTEL: vcvttph2uw ymm6, word ptr [ecx]{1to16} +0x62,0xf5,0x7c,0x38,0x7c,0x31 + +# ATT: vcvttph2uw 4064(%ecx), %ymm6 +# 
INTEL: vcvttph2uw ymm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7c,0x28,0x7c,0x71,0x7f + +# ATT: vcvttph2uw -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vcvttph2uw ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7c,0xbf,0x7c,0x72,0x80 + +# ATT: vcvttph2w %xmm5, %xmm6 +# INTEL: vcvttph2w xmm6, xmm5 +0x62,0xf5,0x7d,0x08,0x7c,0xf5 + +# ATT: vcvttph2w %ymm5, %ymm6 +# INTEL: vcvttph2w ymm6, ymm5 +0x62,0xf5,0x7d,0x28,0x7c,0xf5 + +# ATT: vcvttph2w 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvttph2w xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x0f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2w (%ecx){1to8}, %xmm6 +# INTEL: vcvttph2w xmm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7d,0x18,0x7c,0x31 + +# ATT: vcvttph2w 2032(%ecx), %xmm6 +# INTEL: vcvttph2w xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7d,0x08,0x7c,0x71,0x7f + +# ATT: vcvttph2w -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvttph2w xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7d,0x9f,0x7c,0x72,0x80 + +# ATT: vcvttph2w 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvttph2w ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7d,0x2f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvttph2w (%ecx){1to16}, %ymm6 +# INTEL: vcvttph2w ymm6, word ptr [ecx]{1to16} +0x62,0xf5,0x7d,0x38,0x7c,0x31 + +# ATT: vcvttph2w 4064(%ecx), %ymm6 +# INTEL: vcvttph2w ymm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7d,0x28,0x7c,0x71,0x7f + +# ATT: vcvttph2w -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vcvttph2w ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7d,0xbf,0x7c,0x72,0x80 + +# ATT: vcvtudq2ph %xmm5, %xmm6 +# INTEL: vcvtudq2ph xmm6, xmm5 +0x62,0xf5,0x7f,0x08,0x7a,0xf5 + +# ATT: vcvtudq2ph %ymm5, %xmm6 +# INTEL: vcvtudq2ph xmm6, ymm5 +0x62,0xf5,0x7f,0x28,0x7a,0xf5 + +# ATT: vcvtudq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtudq2ph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7f,0x0f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtudq2ph (%ecx){1to4}, %xmm6 +# INTEL: vcvtudq2ph xmm6, dword ptr [ecx]{1to4} +0x62,0xf5,0x7f,0x18,0x7a,0x31 + +# ATT: vcvtudq2phx 2032(%ecx), %xmm6 +# INTEL: vcvtudq2ph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7f,0x08,0x7a,0x71,0x7f + +# ATT: vcvtudq2ph -512(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtudq2ph xmm6 {k7} {z}, dword ptr [edx - 512]{1to4} +0x62,0xf5,0x7f,0x9f,0x7a,0x72,0x80 + +# ATT: vcvtudq2ph (%ecx){1to8}, %xmm6 +# INTEL: vcvtudq2ph xmm6, dword ptr [ecx]{1to8} +0x62,0xf5,0x7f,0x38,0x7a,0x31 + +# ATT: vcvtudq2phy 4064(%ecx), %xmm6 +# INTEL: vcvtudq2ph xmm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7f,0x28,0x7a,0x71,0x7f + +# ATT: vcvtudq2ph -512(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvtudq2ph xmm6 {k7} {z}, dword ptr [edx - 512]{1to8} +0x62,0xf5,0x7f,0xbf,0x7a,0x72,0x80 + +# ATT: vcvtuqq2ph %xmm5, %xmm6 +# INTEL: vcvtuqq2ph xmm6, xmm5 +0x62,0xf5,0xff,0x08,0x7a,0xf5 + +# ATT: vcvtuqq2ph %ymm5, %xmm6 +# INTEL: vcvtuqq2ph xmm6, ymm5 +0x62,0xf5,0xff,0x28,0x7a,0xf5 + +# ATT: vcvtuqq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtuqq2ph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0xff,0x0f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtuqq2ph (%ecx){1to2}, %xmm6 +# INTEL: vcvtuqq2ph xmm6, qword ptr [ecx]{1to2} +0x62,0xf5,0xff,0x18,0x7a,0x31 + +# ATT: vcvtuqq2phx 2032(%ecx), %xmm6 +# INTEL: vcvtuqq2ph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0xff,0x08,0x7a,0x71,0x7f + +# ATT: vcvtuqq2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} +# INTEL: vcvtuqq2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to2} 
+0x62,0xf5,0xff,0x9f,0x7a,0x72,0x80 + +# ATT: vcvtuqq2ph (%ecx){1to4}, %xmm6 +# INTEL: vcvtuqq2ph xmm6, qword ptr [ecx]{1to4} +0x62,0xf5,0xff,0x38,0x7a,0x31 + +# ATT: vcvtuqq2phy 4064(%ecx), %xmm6 +# INTEL: vcvtuqq2ph xmm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0xff,0x28,0x7a,0x71,0x7f + +# ATT: vcvtuqq2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} +# INTEL: vcvtuqq2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to4} +0x62,0xf5,0xff,0xbf,0x7a,0x72,0x80 + +# ATT: vcvtuw2ph %xmm5, %xmm6 +# INTEL: vcvtuw2ph xmm6, xmm5 +0x62,0xf5,0x7f,0x08,0x7d,0xf5 + +# ATT: vcvtuw2ph %ymm5, %ymm6 +# INTEL: vcvtuw2ph ymm6, ymm5 +0x62,0xf5,0x7f,0x28,0x7d,0xf5 + +# ATT: vcvtuw2ph 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtuw2ph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7f,0x0f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtuw2ph (%ecx){1to8}, %xmm6 +# INTEL: vcvtuw2ph xmm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7f,0x18,0x7d,0x31 + +# ATT: vcvtuw2ph 2032(%ecx), %xmm6 +# INTEL: vcvtuw2ph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7f,0x08,0x7d,0x71,0x7f + +# ATT: vcvtuw2ph -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvtuw2ph xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7f,0x9f,0x7d,0x72,0x80 + +# ATT: vcvtuw2ph 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtuw2ph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7f,0x2f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtuw2ph (%ecx){1to16}, %ymm6 +# INTEL: vcvtuw2ph ymm6, word ptr [ecx]{1to16} +0x62,0xf5,0x7f,0x38,0x7d,0x31 + +# ATT: vcvtuw2ph 4064(%ecx), %ymm6 +# INTEL: vcvtuw2ph ymm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7f,0x28,0x7d,0x71,0x7f + +# ATT: vcvtuw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vcvtuw2ph ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7f,0xbf,0x7d,0x72,0x80 + +# ATT: vcvtw2ph %xmm5, %xmm6 +# INTEL: vcvtw2ph xmm6, xmm5 +0x62,0xf5,0x7e,0x08,0x7d,0xf5 + +# ATT: vcvtw2ph %ymm5, %ymm6 +# INTEL: vcvtw2ph ymm6, ymm5 +0x62,0xf5,0x7e,0x28,0x7d,0xf5 + +# ATT: vcvtw2ph 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vcvtw2ph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x0f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtw2ph (%ecx){1to8}, %xmm6 +# INTEL: vcvtw2ph xmm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7e,0x18,0x7d,0x31 + +# ATT: vcvtw2ph 2032(%ecx), %xmm6 +# INTEL: vcvtw2ph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7e,0x08,0x7d,0x71,0x7f + +# ATT: vcvtw2ph -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vcvtw2ph xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7e,0x9f,0x7d,0x72,0x80 + +# ATT: vcvtw2ph 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vcvtw2ph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x2f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtw2ph (%ecx){1to16}, %ymm6 +# INTEL: vcvtw2ph ymm6, word ptr [ecx]{1to16} +0x62,0xf5,0x7e,0x38,0x7d,0x31 + +# ATT: vcvtw2ph 4064(%ecx), %ymm6 +# INTEL: vcvtw2ph ymm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7e,0x28,0x7d,0x71,0x7f + +# ATT: vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vcvtw2ph ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7e,0xbf,0x7d,0x72,0x80 diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s --- a/llvm/test/MC/X86/avx512fp16.s +++ b/llvm/test/MC/X86/avx512fp16.s @@ -459,3 +459,899 @@ // CHECK: vucomish -256(%rdx), %xmm30 // CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2e,0x72,0x80] vucomish -256(%rdx), %xmm30 + +// CHECK: vcvtdq2ph %zmm29, %ymm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x48,0x5b,0xf5] + vcvtdq2ph %zmm29, %ymm30 + +// CHECK: vcvtdq2ph 
{rn-sae}, %zmm29, %ymm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x5b,0xf5] + vcvtdq2ph {rn-sae}, %zmm29, %ymm30 + +// CHECK: vcvtdq2ph 268435456(%rbp,%r14,8), %ymm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7c,0x4f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtdq2ph 268435456(%rbp,%r14,8), %ymm30 {%k7} + +// CHECK: vcvtdq2ph (%r9){1to16}, %ymm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x58,0x5b,0x31] + vcvtdq2ph (%r9){1to16}, %ymm30 + +// CHECK: vcvtdq2ph 8128(%rcx), %ymm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x48,0x5b,0x71,0x7f] + vcvtdq2ph 8128(%rcx), %ymm30 + +// CHECK: vcvtdq2ph -512(%rdx){1to16}, %ymm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7c,0xdf,0x5b,0x72,0x80] + vcvtdq2ph -512(%rdx){1to16}, %ymm30 {%k7} {z} + +// CHECK: vcvtpd2ph %zmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0xfd,0x48,0x5a,0xf5] + vcvtpd2ph %zmm29, %xmm30 + +// CHECK: vcvtpd2ph {rn-sae}, %zmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0xfd,0x18,0x5a,0xf5] + vcvtpd2ph {rn-sae}, %zmm29, %xmm30 + +// CHECK: vcvtpd2phz 268435456(%rbp,%r14,8), %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0xfd,0x4f,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtpd2phz 268435456(%rbp,%r14,8), %xmm30 {%k7} + +// CHECK: vcvtpd2ph (%r9){1to8}, %xmm30 +// CHECK: encoding: [0x62,0x45,0xfd,0x58,0x5a,0x31] + vcvtpd2ph (%r9){1to8}, %xmm30 + +// CHECK: vcvtpd2phz 8128(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0xfd,0x48,0x5a,0x71,0x7f] + vcvtpd2phz 8128(%rcx), %xmm30 + +// CHECK: vcvtpd2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0xfd,0xdf,0x5a,0x72,0x80] + vcvtpd2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} + +// CHECK: vcvtph2dq %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x48,0x5b,0xf5] + vcvtph2dq %ymm29, %zmm30 + +// CHECK: vcvtph2dq {rn-sae}, %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x18,0x5b,0xf5] + vcvtph2dq {rn-sae}, %ymm29, %zmm30 + +// CHECK: vcvtph2dq 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7d,0x4f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2dq 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtph2dq (%r9){1to16}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x58,0x5b,0x31] + vcvtph2dq (%r9){1to16}, %zmm30 + +// CHECK: vcvtph2dq 4064(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x48,0x5b,0x71,0x7f] + vcvtph2dq 4064(%rcx), %zmm30 + +// CHECK: vcvtph2dq -256(%rdx){1to16}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7d,0xdf,0x5b,0x72,0x80] + vcvtph2dq -256(%rdx){1to16}, %zmm30 {%k7} {z} + +// CHECK: vcvtph2pd %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x48,0x5a,0xf5] + vcvtph2pd %xmm29, %zmm30 + +// CHECK: vcvtph2pd {sae}, %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x5a,0xf5] + vcvtph2pd {sae}, %xmm29, %zmm30 + +// CHECK: vcvtph2pd 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7c,0x4f,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2pd 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtph2pd (%r9){1to8}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x58,0x5a,0x31] + vcvtph2pd (%r9){1to8}, %zmm30 + +// CHECK: vcvtph2pd 2032(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x48,0x5a,0x71,0x7f] + vcvtph2pd 2032(%rcx), %zmm30 + +// CHECK: vcvtph2pd -256(%rdx){1to8}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7c,0xdf,0x5a,0x72,0x80] + vcvtph2pd -256(%rdx){1to8}, %zmm30 {%k7} {z} + +// CHECK: vcvtph2psx %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x7d,0x48,0x13,0xf5] + vcvtph2psx %ymm29, %zmm30 + +// CHECK: vcvtph2psx {sae}, %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x7d,0x18,0x13,0xf5] + 
vcvtph2psx {sae}, %ymm29, %zmm30 + +// CHECK: vcvtph2psx 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x7d,0x4f,0x13,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2psx 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtph2psx (%r9){1to16}, %zmm30 +// CHECK: encoding: [0x62,0x46,0x7d,0x58,0x13,0x31] + vcvtph2psx (%r9){1to16}, %zmm30 + +// CHECK: vcvtph2psx 4064(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x66,0x7d,0x48,0x13,0x71,0x7f] + vcvtph2psx 4064(%rcx), %zmm30 + +// CHECK: vcvtph2psx -256(%rdx){1to16}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x7d,0xdf,0x13,0x72,0x80] + vcvtph2psx -256(%rdx){1to16}, %zmm30 {%k7} {z} + +// CHECK: vcvtph2qq %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x48,0x7b,0xf5] + vcvtph2qq %xmm29, %zmm30 + +// CHECK: vcvtph2qq {rn-sae}, %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x18,0x7b,0xf5] + vcvtph2qq {rn-sae}, %xmm29, %zmm30 + +// CHECK: vcvtph2qq 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7d,0x4f,0x7b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2qq 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtph2qq (%r9){1to8}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x58,0x7b,0x31] + vcvtph2qq (%r9){1to8}, %zmm30 + +// CHECK: vcvtph2qq 2032(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x48,0x7b,0x71,0x7f] + vcvtph2qq 2032(%rcx), %zmm30 + +// CHECK: vcvtph2qq -256(%rdx){1to8}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7d,0xdf,0x7b,0x72,0x80] + vcvtph2qq -256(%rdx){1to8}, %zmm30 {%k7} {z} + +// CHECK: vcvtph2udq %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x48,0x79,0xf5] + vcvtph2udq %ymm29, %zmm30 + +// CHECK: vcvtph2udq {rn-sae}, %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x79,0xf5] + vcvtph2udq {rn-sae}, %ymm29, %zmm30 + +// CHECK: vcvtph2udq 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7c,0x4f,0x79,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2udq 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtph2udq (%r9){1to16}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x58,0x79,0x31] + vcvtph2udq (%r9){1to16}, %zmm30 + +// CHECK: vcvtph2udq 4064(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x48,0x79,0x71,0x7f] + vcvtph2udq 4064(%rcx), %zmm30 + +// CHECK: vcvtph2udq -256(%rdx){1to16}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7c,0xdf,0x79,0x72,0x80] + vcvtph2udq -256(%rdx){1to16}, %zmm30 {%k7} {z} + +// CHECK: vcvtph2uqq %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x48,0x79,0xf5] + vcvtph2uqq %xmm29, %zmm30 + +// CHECK: vcvtph2uqq {rn-sae}, %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x18,0x79,0xf5] + vcvtph2uqq {rn-sae}, %xmm29, %zmm30 + +// CHECK: vcvtph2uqq 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7d,0x4f,0x79,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2uqq 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtph2uqq (%r9){1to8}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x58,0x79,0x31] + vcvtph2uqq (%r9){1to8}, %zmm30 + +// CHECK: vcvtph2uqq 2032(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x48,0x79,0x71,0x7f] + vcvtph2uqq 2032(%rcx), %zmm30 + +// CHECK: vcvtph2uqq -256(%rdx){1to8}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7d,0xdf,0x79,0x72,0x80] + vcvtph2uqq -256(%rdx){1to8}, %zmm30 {%k7} {z} + +// CHECK: vcvtph2uw %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x48,0x7d,0xf5] + vcvtph2uw %zmm29, %zmm30 + +// CHECK: vcvtph2uw {rn-sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x7d,0xf5] + vcvtph2uw {rn-sae}, %zmm29, %zmm30 + +// CHECK: 
vcvtph2uw 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7c,0x4f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2uw 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtph2uw (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x58,0x7d,0x31] + vcvtph2uw (%r9){1to32}, %zmm30 + +// CHECK: vcvtph2uw 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x48,0x7d,0x71,0x7f] + vcvtph2uw 8128(%rcx), %zmm30 + +// CHECK: vcvtph2uw -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7c,0xdf,0x7d,0x72,0x80] + vcvtph2uw -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vcvtph2w %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x48,0x7d,0xf5] + vcvtph2w %zmm29, %zmm30 + +// CHECK: vcvtph2w {rn-sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x18,0x7d,0xf5] + vcvtph2w {rn-sae}, %zmm29, %zmm30 + +// CHECK: vcvtph2w 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7d,0x4f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2w 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtph2w (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x58,0x7d,0x31] + vcvtph2w (%r9){1to32}, %zmm30 + +// CHECK: vcvtph2w 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x48,0x7d,0x71,0x7f] + vcvtph2w 8128(%rcx), %zmm30 + +// CHECK: vcvtph2w -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7d,0xdf,0x7d,0x72,0x80] + vcvtph2w -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vcvtps2phx %zmm29, %ymm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x48,0x1d,0xf5] + vcvtps2phx %zmm29, %ymm30 + +// CHECK: vcvtps2phx {rn-sae}, %zmm29, %ymm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x18,0x1d,0xf5] + vcvtps2phx {rn-sae}, %zmm29, %ymm30 + +// CHECK: vcvtps2phx 268435456(%rbp,%r14,8), %ymm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7d,0x4f,0x1d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtps2phx 268435456(%rbp,%r14,8), %ymm30 {%k7} + +// CHECK: vcvtps2phx (%r9){1to16}, %ymm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x58,0x1d,0x31] + vcvtps2phx (%r9){1to16}, %ymm30 + +// CHECK: vcvtps2phx 8128(%rcx), %ymm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x48,0x1d,0x71,0x7f] + vcvtps2phx 8128(%rcx), %ymm30 + +// CHECK: vcvtps2phx -512(%rdx){1to16}, %ymm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7d,0xdf,0x1d,0x72,0x80] + vcvtps2phx -512(%rdx){1to16}, %ymm30 {%k7} {z} + +// CHECK: vcvtqq2ph %zmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0xfc,0x48,0x5b,0xf5] + vcvtqq2ph %zmm29, %xmm30 + +// CHECK: vcvtqq2ph {rn-sae}, %zmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0xfc,0x18,0x5b,0xf5] + vcvtqq2ph {rn-sae}, %zmm29, %xmm30 + +// CHECK: vcvtqq2phz 268435456(%rbp,%r14,8), %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0xfc,0x4f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtqq2phz 268435456(%rbp,%r14,8), %xmm30 {%k7} + +// CHECK: vcvtqq2ph (%r9){1to8}, %xmm30 +// CHECK: encoding: [0x62,0x45,0xfc,0x58,0x5b,0x31] + vcvtqq2ph (%r9){1to8}, %xmm30 + +// CHECK: vcvtqq2phz 8128(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0xfc,0x48,0x5b,0x71,0x7f] + vcvtqq2phz 8128(%rcx), %xmm30 + +// CHECK: vcvtqq2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0xfc,0xdf,0x5b,0x72,0x80] + vcvtqq2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} + +// CHECK: vcvtsd2sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x97,0x00,0x5a,0xf4] + vcvtsd2sh %xmm28, %xmm29, %xmm30 + +// CHECK: vcvtsd2sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x97,0x10,0x5a,0xf4] + vcvtsd2sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vcvtsd2sh 268435456(%rbp,%r14,8), %xmm29, 
%xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x97,0x07,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtsd2sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vcvtsd2sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x97,0x00,0x5a,0x31] + vcvtsd2sh (%r9), %xmm29, %xmm30 + +// CHECK: vcvtsd2sh 1016(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x97,0x00,0x5a,0x71,0x7f] + vcvtsd2sh 1016(%rcx), %xmm29, %xmm30 + +// CHECK: vcvtsd2sh -1024(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x97,0x87,0x5a,0x72,0x80] + vcvtsd2sh -1024(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vcvtsh2sd %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x5a,0xf4] + vcvtsh2sd %xmm28, %xmm29, %xmm30 + +// CHECK: vcvtsh2sd {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x5a,0xf4] + vcvtsh2sd {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vcvtsh2sd 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtsh2sd 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vcvtsh2sd (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5a,0x31] + vcvtsh2sd (%r9), %xmm29, %xmm30 + +// CHECK: vcvtsh2sd 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x5a,0x71,0x7f] + vcvtsh2sd 254(%rcx), %xmm29, %xmm30 + +// CHECK: vcvtsh2sd -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5a,0x72,0x80] + vcvtsh2sd -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vcvtsh2si %xmm30, %edx +// CHECK: encoding: [0x62,0x95,0x7e,0x08,0x2d,0xd6] + vcvtsh2si %xmm30, %edx + +// CHECK: vcvtsh2si {rn-sae}, %xmm30, %edx +// CHECK: encoding: [0x62,0x95,0x7e,0x18,0x2d,0xd6] + vcvtsh2si {rn-sae}, %xmm30, %edx + +// CHECK: vcvtsh2si %xmm30, %r12 +// CHECK: encoding: [0x62,0x15,0xfe,0x08,0x2d,0xe6] + vcvtsh2si %xmm30, %r12 + +// CHECK: vcvtsh2si {rn-sae}, %xmm30, %r12 +// CHECK: encoding: [0x62,0x15,0xfe,0x18,0x2d,0xe6] + vcvtsh2si {rn-sae}, %xmm30, %r12 + +// CHECK: vcvtsh2si 268435456(%rbp,%r14,8), %edx +// CHECK: encoding: [0x62,0xb5,0x7e,0x08,0x2d,0x94,0xf5,0x00,0x00,0x00,0x10] + vcvtsh2si 268435456(%rbp,%r14,8), %edx + +// CHECK: vcvtsh2si (%r9), %edx +// CHECK: encoding: [0x62,0xd5,0x7e,0x08,0x2d,0x11] + vcvtsh2si (%r9), %edx + +// CHECK: vcvtsh2si 254(%rcx), %edx +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2d,0x51,0x7f] + vcvtsh2si 254(%rcx), %edx + +// CHECK: vcvtsh2si -256(%rdx), %edx +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2d,0x52,0x80] + vcvtsh2si -256(%rdx), %edx + +// CHECK: vcvtsh2si 268435456(%rbp,%r14,8), %r12 +// CHECK: encoding: [0x62,0x35,0xfe,0x08,0x2d,0xa4,0xf5,0x00,0x00,0x00,0x10] + vcvtsh2si 268435456(%rbp,%r14,8), %r12 + +// CHECK: vcvtsh2si (%r9), %r12 +// CHECK: encoding: [0x62,0x55,0xfe,0x08,0x2d,0x21] + vcvtsh2si (%r9), %r12 + +// CHECK: vcvtsh2si 254(%rcx), %r12 +// CHECK: encoding: [0x62,0x75,0xfe,0x08,0x2d,0x61,0x7f] + vcvtsh2si 254(%rcx), %r12 + +// CHECK: vcvtsh2si -256(%rdx), %r12 +// CHECK: encoding: [0x62,0x75,0xfe,0x08,0x2d,0x62,0x80] + vcvtsh2si -256(%rdx), %r12 + +// CHECK: vcvtsh2ss %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x14,0x00,0x13,0xf4] + vcvtsh2ss %xmm28, %xmm29, %xmm30 + +// CHECK: vcvtsh2ss {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x14,0x10,0x13,0xf4] + vcvtsh2ss {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vcvtsh2ss 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x14,0x07,0x13,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtsh2ss 268435456(%rbp,%r14,8), 
%xmm29, %xmm30 {%k7} + +// CHECK: vcvtsh2ss (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x14,0x00,0x13,0x31] + vcvtsh2ss (%r9), %xmm29, %xmm30 + +// CHECK: vcvtsh2ss 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x14,0x00,0x13,0x71,0x7f] + vcvtsh2ss 254(%rcx), %xmm29, %xmm30 + +// CHECK: vcvtsh2ss -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x14,0x87,0x13,0x72,0x80] + vcvtsh2ss -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vcvtsh2usi %xmm30, %edx +// CHECK: encoding: [0x62,0x95,0x7e,0x08,0x79,0xd6] + vcvtsh2usi %xmm30, %edx + +// CHECK: vcvtsh2usi {rn-sae}, %xmm30, %edx +// CHECK: encoding: [0x62,0x95,0x7e,0x18,0x79,0xd6] + vcvtsh2usi {rn-sae}, %xmm30, %edx + +// CHECK: vcvtsh2usi %xmm30, %r12 +// CHECK: encoding: [0x62,0x15,0xfe,0x08,0x79,0xe6] + vcvtsh2usi %xmm30, %r12 + +// CHECK: vcvtsh2usi {rn-sae}, %xmm30, %r12 +// CHECK: encoding: [0x62,0x15,0xfe,0x18,0x79,0xe6] + vcvtsh2usi {rn-sae}, %xmm30, %r12 + +// CHECK: vcvtsh2usi 268435456(%rbp,%r14,8), %edx +// CHECK: encoding: [0x62,0xb5,0x7e,0x08,0x79,0x94,0xf5,0x00,0x00,0x00,0x10] + vcvtsh2usi 268435456(%rbp,%r14,8), %edx + +// CHECK: vcvtsh2usi (%r9), %edx +// CHECK: encoding: [0x62,0xd5,0x7e,0x08,0x79,0x11] + vcvtsh2usi (%r9), %edx + +// CHECK: vcvtsh2usi 254(%rcx), %edx +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x79,0x51,0x7f] + vcvtsh2usi 254(%rcx), %edx + +// CHECK: vcvtsh2usi -256(%rdx), %edx +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x79,0x52,0x80] + vcvtsh2usi -256(%rdx), %edx + +// CHECK: vcvtsh2usi 268435456(%rbp,%r14,8), %r12 +// CHECK: encoding: [0x62,0x35,0xfe,0x08,0x79,0xa4,0xf5,0x00,0x00,0x00,0x10] + vcvtsh2usi 268435456(%rbp,%r14,8), %r12 + +// CHECK: vcvtsh2usi (%r9), %r12 +// CHECK: encoding: [0x62,0x55,0xfe,0x08,0x79,0x21] + vcvtsh2usi (%r9), %r12 + +// CHECK: vcvtsh2usi 254(%rcx), %r12 +// CHECK: encoding: [0x62,0x75,0xfe,0x08,0x79,0x61,0x7f] + vcvtsh2usi 254(%rcx), %r12 + +// CHECK: vcvtsh2usi -256(%rdx), %r12 +// CHECK: encoding: [0x62,0x75,0xfe,0x08,0x79,0x62,0x80] + vcvtsh2usi -256(%rdx), %r12 + +// CHECK: vcvtsi2sh %r12, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x96,0x00,0x2a,0xf4] + vcvtsi2sh %r12, %xmm29, %xmm30 + +// CHECK: vcvtsi2sh %r12, {rn-sae}, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x96,0x10,0x2a,0xf4] + vcvtsi2sh %r12, {rn-sae}, %xmm29, %xmm30 + +// CHECK: vcvtsi2sh %edx, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x2a,0xf2] + vcvtsi2sh %edx, %xmm29, %xmm30 + +// CHECK: vcvtsi2sh %edx, {rn-sae}, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x10,0x2a,0xf2] + vcvtsi2sh %edx, {rn-sae}, %xmm29, %xmm30 + +// CHECK: vcvtsi2shl 268435456(%rbp,%r14,8), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x25,0x16,0x00,0x2a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtsi2shl 268435456(%rbp,%r14,8), %xmm29, %xmm30 + +// CHECK: vcvtsi2shl (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x2a,0x31] + vcvtsi2shl (%r9), %xmm29, %xmm30 + +// CHECK: vcvtsi2shl 508(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x2a,0x71,0x7f] + vcvtsi2shl 508(%rcx), %xmm29, %xmm30 + +// CHECK: vcvtsi2shl -512(%rdx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x2a,0x72,0x80] + vcvtsi2shl -512(%rdx), %xmm29, %xmm30 + +// CHECK: vcvtsi2shq 1016(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x96,0x00,0x2a,0x71,0x7f] + vcvtsi2shq 1016(%rcx), %xmm29, %xmm30 + +// CHECK: vcvtsi2shq -1024(%rdx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x96,0x00,0x2a,0x72,0x80] + vcvtsi2shq -1024(%rdx), %xmm29, %xmm30 + +// CHECK: 
vcvtss2sh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x1d,0xf4] + vcvtss2sh %xmm28, %xmm29, %xmm30 + +// CHECK: vcvtss2sh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x1d,0xf4] + vcvtss2sh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vcvtss2sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x1d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtss2sh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vcvtss2sh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x00,0x1d,0x31] + vcvtss2sh (%r9), %xmm29, %xmm30 + +// CHECK: vcvtss2sh 508(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x1d,0x71,0x7f] + vcvtss2sh 508(%rcx), %xmm29, %xmm30 + +// CHECK: vcvtss2sh -512(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0x87,0x1d,0x72,0x80] + vcvtss2sh -512(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vcvttph2dq %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7e,0x48,0x5b,0xf5] + vcvttph2dq %ymm29, %zmm30 + +// CHECK: vcvttph2dq {sae}, %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7e,0x18,0x5b,0xf5] + vcvttph2dq {sae}, %ymm29, %zmm30 + +// CHECK: vcvttph2dq 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7e,0x4f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvttph2dq 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvttph2dq (%r9){1to16}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7e,0x58,0x5b,0x31] + vcvttph2dq (%r9){1to16}, %zmm30 + +// CHECK: vcvttph2dq 4064(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7e,0x48,0x5b,0x71,0x7f] + vcvttph2dq 4064(%rcx), %zmm30 + +// CHECK: vcvttph2dq -256(%rdx){1to16}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7e,0xdf,0x5b,0x72,0x80] + vcvttph2dq -256(%rdx){1to16}, %zmm30 {%k7} {z} + +// CHECK: vcvttph2qq %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x48,0x7a,0xf5] + vcvttph2qq %xmm29, %zmm30 + +// CHECK: vcvttph2qq {sae}, %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x18,0x7a,0xf5] + vcvttph2qq {sae}, %xmm29, %zmm30 + +// CHECK: vcvttph2qq 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7d,0x4f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvttph2qq 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvttph2qq (%r9){1to8}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x58,0x7a,0x31] + vcvttph2qq (%r9){1to8}, %zmm30 + +// CHECK: vcvttph2qq 2032(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x48,0x7a,0x71,0x7f] + vcvttph2qq 2032(%rcx), %zmm30 + +// CHECK: vcvttph2qq -256(%rdx){1to8}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7d,0xdf,0x7a,0x72,0x80] + vcvttph2qq -256(%rdx){1to8}, %zmm30 {%k7} {z} + +// CHECK: vcvttph2udq %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x48,0x78,0xf5] + vcvttph2udq %ymm29, %zmm30 + +// CHECK: vcvttph2udq {sae}, %ymm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x78,0xf5] + vcvttph2udq {sae}, %ymm29, %zmm30 + +// CHECK: vcvttph2udq 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7c,0x4f,0x78,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvttph2udq 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvttph2udq (%r9){1to16}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x58,0x78,0x31] + vcvttph2udq (%r9){1to16}, %zmm30 + +// CHECK: vcvttph2udq 4064(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x48,0x78,0x71,0x7f] + vcvttph2udq 4064(%rcx), %zmm30 + +// CHECK: vcvttph2udq -256(%rdx){1to16}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7c,0xdf,0x78,0x72,0x80] + vcvttph2udq -256(%rdx){1to16}, %zmm30 {%k7} {z} 
+ +// CHECK: vcvttph2uqq %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x48,0x78,0xf5] + vcvttph2uqq %xmm29, %zmm30 + +// CHECK: vcvttph2uqq {sae}, %xmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x18,0x78,0xf5] + vcvttph2uqq {sae}, %xmm29, %zmm30 + +// CHECK: vcvttph2uqq 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7d,0x4f,0x78,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvttph2uqq 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvttph2uqq (%r9){1to8}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x58,0x78,0x31] + vcvttph2uqq (%r9){1to8}, %zmm30 + +// CHECK: vcvttph2uqq 2032(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x48,0x78,0x71,0x7f] + vcvttph2uqq 2032(%rcx), %zmm30 + +// CHECK: vcvttph2uqq -256(%rdx){1to8}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7d,0xdf,0x78,0x72,0x80] + vcvttph2uqq -256(%rdx){1to8}, %zmm30 {%k7} {z} + +// CHECK: vcvttph2uw %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x48,0x7c,0xf5] + vcvttph2uw %zmm29, %zmm30 + +// CHECK: vcvttph2uw {sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x7c,0xf5] + vcvttph2uw {sae}, %zmm29, %zmm30 + +// CHECK: vcvttph2uw 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7c,0x4f,0x7c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvttph2uw 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvttph2uw (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x58,0x7c,0x31] + vcvttph2uw (%r9){1to32}, %zmm30 + +// CHECK: vcvttph2uw 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x48,0x7c,0x71,0x7f] + vcvttph2uw 8128(%rcx), %zmm30 + +// CHECK: vcvttph2uw -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7c,0xdf,0x7c,0x72,0x80] + vcvttph2uw -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vcvttph2w %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x48,0x7c,0xf5] + vcvttph2w %zmm29, %zmm30 + +// CHECK: vcvttph2w {sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7d,0x18,0x7c,0xf5] + vcvttph2w {sae}, %zmm29, %zmm30 + +// CHECK: vcvttph2w 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7d,0x4f,0x7c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvttph2w 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvttph2w (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x58,0x7c,0x31] + vcvttph2w (%r9){1to32}, %zmm30 + +// CHECK: vcvttph2w 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x48,0x7c,0x71,0x7f] + vcvttph2w 8128(%rcx), %zmm30 + +// CHECK: vcvttph2w -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7d,0xdf,0x7c,0x72,0x80] + vcvttph2w -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vcvttsh2si %xmm30, %edx +// CHECK: encoding: [0x62,0x95,0x7e,0x08,0x2c,0xd6] + vcvttsh2si %xmm30, %edx + +// CHECK: vcvttsh2si {sae}, %xmm30, %edx +// CHECK: encoding: [0x62,0x95,0x7e,0x18,0x2c,0xd6] + vcvttsh2si {sae}, %xmm30, %edx + +// CHECK: vcvttsh2si %xmm30, %r12 +// CHECK: encoding: [0x62,0x15,0xfe,0x08,0x2c,0xe6] + vcvttsh2si %xmm30, %r12 + +// CHECK: vcvttsh2si {sae}, %xmm30, %r12 +// CHECK: encoding: [0x62,0x15,0xfe,0x18,0x2c,0xe6] + vcvttsh2si {sae}, %xmm30, %r12 + +// CHECK: vcvttsh2si 268435456(%rbp,%r14,8), %edx +// CHECK: encoding: [0x62,0xb5,0x7e,0x08,0x2c,0x94,0xf5,0x00,0x00,0x00,0x10] + vcvttsh2si 268435456(%rbp,%r14,8), %edx + +// CHECK: vcvttsh2si (%r9), %edx +// CHECK: encoding: [0x62,0xd5,0x7e,0x08,0x2c,0x11] + vcvttsh2si (%r9), %edx + +// CHECK: vcvttsh2si 254(%rcx), %edx +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2c,0x51,0x7f] + vcvttsh2si 254(%rcx), %edx + +// CHECK: vcvttsh2si 
-256(%rdx), %edx +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2c,0x52,0x80] + vcvttsh2si -256(%rdx), %edx + +// CHECK: vcvttsh2si 268435456(%rbp,%r14,8), %r12 +// CHECK: encoding: [0x62,0x35,0xfe,0x08,0x2c,0xa4,0xf5,0x00,0x00,0x00,0x10] + vcvttsh2si 268435456(%rbp,%r14,8), %r12 + +// CHECK: vcvttsh2si (%r9), %r12 +// CHECK: encoding: [0x62,0x55,0xfe,0x08,0x2c,0x21] + vcvttsh2si (%r9), %r12 + +// CHECK: vcvttsh2si 254(%rcx), %r12 +// CHECK: encoding: [0x62,0x75,0xfe,0x08,0x2c,0x61,0x7f] + vcvttsh2si 254(%rcx), %r12 + +// CHECK: vcvttsh2si -256(%rdx), %r12 +// CHECK: encoding: [0x62,0x75,0xfe,0x08,0x2c,0x62,0x80] + vcvttsh2si -256(%rdx), %r12 + +// CHECK: vcvttsh2usi %xmm30, %edx +// CHECK: encoding: [0x62,0x95,0x7e,0x08,0x78,0xd6] + vcvttsh2usi %xmm30, %edx + +// CHECK: vcvttsh2usi {sae}, %xmm30, %edx +// CHECK: encoding: [0x62,0x95,0x7e,0x18,0x78,0xd6] + vcvttsh2usi {sae}, %xmm30, %edx + +// CHECK: vcvttsh2usi %xmm30, %r12 +// CHECK: encoding: [0x62,0x15,0xfe,0x08,0x78,0xe6] + vcvttsh2usi %xmm30, %r12 + +// CHECK: vcvttsh2usi {sae}, %xmm30, %r12 +// CHECK: encoding: [0x62,0x15,0xfe,0x18,0x78,0xe6] + vcvttsh2usi {sae}, %xmm30, %r12 + +// CHECK: vcvttsh2usi 268435456(%rbp,%r14,8), %edx +// CHECK: encoding: [0x62,0xb5,0x7e,0x08,0x78,0x94,0xf5,0x00,0x00,0x00,0x10] + vcvttsh2usi 268435456(%rbp,%r14,8), %edx + +// CHECK: vcvttsh2usi (%r9), %edx +// CHECK: encoding: [0x62,0xd5,0x7e,0x08,0x78,0x11] + vcvttsh2usi (%r9), %edx + +// CHECK: vcvttsh2usi 254(%rcx), %edx +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x78,0x51,0x7f] + vcvttsh2usi 254(%rcx), %edx + +// CHECK: vcvttsh2usi -256(%rdx), %edx +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x78,0x52,0x80] + vcvttsh2usi -256(%rdx), %edx + +// CHECK: vcvttsh2usi 268435456(%rbp,%r14,8), %r12 +// CHECK: encoding: [0x62,0x35,0xfe,0x08,0x78,0xa4,0xf5,0x00,0x00,0x00,0x10] + vcvttsh2usi 268435456(%rbp,%r14,8), %r12 + +// CHECK: vcvttsh2usi (%r9), %r12 +// CHECK: encoding: [0x62,0x55,0xfe,0x08,0x78,0x21] + vcvttsh2usi (%r9), %r12 + +// CHECK: vcvttsh2usi 254(%rcx), %r12 +// CHECK: encoding: [0x62,0x75,0xfe,0x08,0x78,0x61,0x7f] + vcvttsh2usi 254(%rcx), %r12 + +// CHECK: vcvttsh2usi -256(%rdx), %r12 +// CHECK: encoding: [0x62,0x75,0xfe,0x08,0x78,0x62,0x80] + vcvttsh2usi -256(%rdx), %r12 + +// CHECK: vcvtudq2ph %zmm29, %ymm30 +// CHECK: encoding: [0x62,0x05,0x7f,0x48,0x7a,0xf5] + vcvtudq2ph %zmm29, %ymm30 + +// CHECK: vcvtudq2ph {rn-sae}, %zmm29, %ymm30 +// CHECK: encoding: [0x62,0x05,0x7f,0x18,0x7a,0xf5] + vcvtudq2ph {rn-sae}, %zmm29, %ymm30 + +// CHECK: vcvtudq2ph 268435456(%rbp,%r14,8), %ymm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7f,0x4f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtudq2ph 268435456(%rbp,%r14,8), %ymm30 {%k7} + +// CHECK: vcvtudq2ph (%r9){1to16}, %ymm30 +// CHECK: encoding: [0x62,0x45,0x7f,0x58,0x7a,0x31] + vcvtudq2ph (%r9){1to16}, %ymm30 + +// CHECK: vcvtudq2ph 8128(%rcx), %ymm30 +// CHECK: encoding: [0x62,0x65,0x7f,0x48,0x7a,0x71,0x7f] + vcvtudq2ph 8128(%rcx), %ymm30 + +// CHECK: vcvtudq2ph -512(%rdx){1to16}, %ymm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7f,0xdf,0x7a,0x72,0x80] + vcvtudq2ph -512(%rdx){1to16}, %ymm30 {%k7} {z} + +// CHECK: vcvtuqq2ph %zmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0xff,0x48,0x7a,0xf5] + vcvtuqq2ph %zmm29, %xmm30 + +// CHECK: vcvtuqq2ph {rn-sae}, %zmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0xff,0x18,0x7a,0xf5] + vcvtuqq2ph {rn-sae}, %zmm29, %xmm30 + +// CHECK: vcvtuqq2phz 268435456(%rbp,%r14,8), %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0xff,0x4f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtuqq2phz 
268435456(%rbp,%r14,8), %xmm30 {%k7} + +// CHECK: vcvtuqq2ph (%r9){1to8}, %xmm30 +// CHECK: encoding: [0x62,0x45,0xff,0x58,0x7a,0x31] + vcvtuqq2ph (%r9){1to8}, %xmm30 + +// CHECK: vcvtuqq2phz 8128(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0xff,0x48,0x7a,0x71,0x7f] + vcvtuqq2phz 8128(%rcx), %xmm30 + +// CHECK: vcvtuqq2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0xff,0xdf,0x7a,0x72,0x80] + vcvtuqq2ph -1024(%rdx){1to8}, %xmm30 {%k7} {z} + +// CHECK: vcvtusi2sh %r12, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x96,0x00,0x7b,0xf4] + vcvtusi2sh %r12, %xmm29, %xmm30 + +// CHECK: vcvtusi2sh %r12, {rn-sae}, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x96,0x10,0x7b,0xf4] + vcvtusi2sh %r12, {rn-sae}, %xmm29, %xmm30 + +// CHECK: vcvtusi2sh %edx, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x7b,0xf2] + vcvtusi2sh %edx, %xmm29, %xmm30 + +// CHECK: vcvtusi2sh %edx, {rn-sae}, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x10,0x7b,0xf2] + vcvtusi2sh %edx, {rn-sae}, %xmm29, %xmm30 + +// CHECK: vcvtusi2shl 268435456(%rbp,%r14,8), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x25,0x16,0x00,0x7b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtusi2shl 268435456(%rbp,%r14,8), %xmm29, %xmm30 + +// CHECK: vcvtusi2shl (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x7b,0x31] + vcvtusi2shl (%r9), %xmm29, %xmm30 + +// CHECK: vcvtusi2shl 508(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x7b,0x71,0x7f] + vcvtusi2shl 508(%rcx), %xmm29, %xmm30 + +// CHECK: vcvtusi2shl -512(%rdx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x7b,0x72,0x80] + vcvtusi2shl -512(%rdx), %xmm29, %xmm30 + +// CHECK: vcvtusi2shq 1016(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x96,0x00,0x7b,0x71,0x7f] + vcvtusi2shq 1016(%rcx), %xmm29, %xmm30 + +// CHECK: vcvtusi2shq -1024(%rdx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x96,0x00,0x7b,0x72,0x80] + vcvtusi2shq -1024(%rdx), %xmm29, %xmm30 + +// CHECK: vcvtuw2ph %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7f,0x48,0x7d,0xf5] + vcvtuw2ph %zmm29, %zmm30 + +// CHECK: vcvtuw2ph {rn-sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7f,0x18,0x7d,0xf5] + vcvtuw2ph {rn-sae}, %zmm29, %zmm30 + +// CHECK: vcvtuw2ph 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7f,0x4f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtuw2ph 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtuw2ph (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7f,0x58,0x7d,0x31] + vcvtuw2ph (%r9){1to32}, %zmm30 + +// CHECK: vcvtuw2ph 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7f,0x48,0x7d,0x71,0x7f] + vcvtuw2ph 8128(%rcx), %zmm30 + +// CHECK: vcvtuw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7f,0xdf,0x7d,0x72,0x80] + vcvtuw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vcvtw2ph %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7e,0x48,0x7d,0xf5] + vcvtw2ph %zmm29, %zmm30 + +// CHECK: vcvtw2ph {rn-sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7e,0x18,0x7d,0xf5] + vcvtw2ph {rn-sae}, %zmm29, %zmm30 + +// CHECK: vcvtw2ph 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7e,0x4f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtw2ph 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vcvtw2ph (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7e,0x58,0x7d,0x31] + vcvtw2ph (%r9){1to32}, %zmm30 + +// CHECK: vcvtw2ph 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7e,0x48,0x7d,0x71,0x7f] + vcvtw2ph 8128(%rcx), %zmm30 + +// CHECK: 
vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7e,0xdf,0x7d,0x72,0x80] + vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s --- a/llvm/test/MC/X86/avx512fp16vl.s +++ b/llvm/test/MC/X86/avx512fp16vl.s @@ -279,3 +279,859 @@ // CHECK: vsubph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x5c,0x72,0x80] vsubph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vcvtdq2ph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x5b,0xf5] + vcvtdq2ph %xmm5, %xmm6 + +// CHECK: vcvtdq2ph %ymm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x5b,0xf5] + vcvtdq2ph %ymm5, %xmm6 + +// CHECK: vcvtdq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x0f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtdq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtdq2ph (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x5b,0x31] + vcvtdq2ph (%ecx){1to4}, %xmm6 + +// CHECK: vcvtdq2phx 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x5b,0x71,0x7f] + vcvtdq2phx 2032(%ecx), %xmm6 + +// CHECK: vcvtdq2ph -512(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0x9f,0x5b,0x72,0x80] + vcvtdq2ph -512(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvtdq2ph (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x38,0x5b,0x31] + vcvtdq2ph (%ecx){1to8}, %xmm6 + +// CHECK: vcvtdq2phy 4064(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x5b,0x71,0x7f] + vcvtdq2phy 4064(%ecx), %xmm6 + +// CHECK: vcvtdq2ph -512(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x5b,0x72,0x80] + vcvtdq2ph -512(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvtpd2ph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfd,0x08,0x5a,0xf5] + vcvtpd2ph %xmm5, %xmm6 + +// CHECK: vcvtpd2ph %ymm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfd,0x28,0x5a,0xf5] + vcvtpd2ph %ymm5, %xmm6 + +// CHECK: vcvtpd2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0xfd,0x0f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtpd2phx 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtpd2ph (%ecx){1to2}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfd,0x18,0x5a,0x31] + vcvtpd2ph (%ecx){1to2}, %xmm6 + +// CHECK: vcvtpd2phx 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfd,0x08,0x5a,0x71,0x7f] + vcvtpd2phx 2032(%ecx), %xmm6 + +// CHECK: vcvtpd2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0xfd,0x9f,0x5a,0x72,0x80] + vcvtpd2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} + +// CHECK: vcvtpd2ph (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfd,0x38,0x5a,0x31] + vcvtpd2ph (%ecx){1to4}, %xmm6 + +// CHECK: vcvtpd2phy 4064(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfd,0x28,0x5a,0x71,0x7f] + vcvtpd2phy 4064(%ecx), %xmm6 + +// CHECK: vcvtpd2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0xfd,0xbf,0x5a,0x72,0x80] + vcvtpd2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2dq %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x5b,0xf5] + vcvtph2dq %xmm5, %xmm6 + +// CHECK: vcvtph2dq %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x5b,0xf5] + vcvtph2dq %xmm5, %ymm6 + +// CHECK: vcvtph2dq 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2dq 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtph2dq (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x5b,0x31] + vcvtph2dq (%ecx){1to4}, %xmm6 + +// 
CHECK: vcvtph2dq 1016(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x5b,0x71,0x7f] + vcvtph2dq 1016(%ecx), %xmm6 + +// CHECK: vcvtph2dq -256(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x5b,0x72,0x80] + vcvtph2dq -256(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2dq 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2dq 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtph2dq (%ecx){1to8}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x5b,0x31] + vcvtph2dq (%ecx){1to8}, %ymm6 + +// CHECK: vcvtph2dq 2032(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x5b,0x71,0x7f] + vcvtph2dq 2032(%ecx), %ymm6 + +// CHECK: vcvtph2dq -256(%edx){1to8}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x5b,0x72,0x80] + vcvtph2dq -256(%edx){1to8}, %ymm6 {%k7} {z} + +// CHECK: vcvtph2pd %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x5a,0xf5] + vcvtph2pd %xmm5, %xmm6 + +// CHECK: vcvtph2pd %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x5a,0xf5] + vcvtph2pd %xmm5, %ymm6 + +// CHECK: vcvtph2pd 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x0f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2pd 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtph2pd (%ecx){1to2}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x5a,0x31] + vcvtph2pd (%ecx){1to2}, %xmm6 + +// CHECK: vcvtph2pd 508(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x5a,0x71,0x7f] + vcvtph2pd 508(%ecx), %xmm6 + +// CHECK: vcvtph2pd -256(%edx){1to2}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0x9f,0x5a,0x72,0x80] + vcvtph2pd -256(%edx){1to2}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2pd 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x2f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2pd 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtph2pd (%ecx){1to4}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x38,0x5a,0x31] + vcvtph2pd (%ecx){1to4}, %ymm6 + +// CHECK: vcvtph2pd 1016(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x5a,0x71,0x7f] + vcvtph2pd 1016(%ecx), %ymm6 + +// CHECK: vcvtph2pd -256(%edx){1to4}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x5a,0x72,0x80] + vcvtph2pd -256(%edx){1to4}, %ymm6 {%k7} {z} + +// CHECK: vcvtph2psx %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x13,0xf5] + vcvtph2psx %xmm5, %xmm6 + +// CHECK: vcvtph2psx %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x13,0xf5] + vcvtph2psx %xmm5, %ymm6 + +// CHECK: vcvtph2psx 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7d,0x0f,0x13,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2psx 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtph2psx (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x13,0x31] + vcvtph2psx (%ecx){1to4}, %xmm6 + +// CHECK: vcvtph2psx 1016(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x13,0x71,0x7f] + vcvtph2psx 1016(%ecx), %xmm6 + +// CHECK: vcvtph2psx -256(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7d,0x9f,0x13,0x72,0x80] + vcvtph2psx -256(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2psx 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7d,0x2f,0x13,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2psx 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtph2psx (%ecx){1to8}, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x38,0x13,0x31] + vcvtph2psx (%ecx){1to8}, %ymm6 + +// CHECK: vcvtph2psx 2032(%ecx), %ymm6 +// CHECK: encoding: 
[0x62,0xf6,0x7d,0x28,0x13,0x71,0x7f] + vcvtph2psx 2032(%ecx), %ymm6 + +// CHECK: vcvtph2psx -256(%edx){1to8}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7d,0xbf,0x13,0x72,0x80] + vcvtph2psx -256(%edx){1to8}, %ymm6 {%k7} {z} + +// CHECK: vcvtph2qq %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7b,0xf5] + vcvtph2qq %xmm5, %xmm6 + +// CHECK: vcvtph2qq %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x7b,0xf5] + vcvtph2qq %xmm5, %ymm6 + +// CHECK: vcvtph2qq 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x7b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2qq 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtph2qq (%ecx){1to2}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x7b,0x31] + vcvtph2qq (%ecx){1to2}, %xmm6 + +// CHECK: vcvtph2qq 508(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7b,0x71,0x7f] + vcvtph2qq 508(%ecx), %xmm6 + +// CHECK: vcvtph2qq -256(%edx){1to2}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x7b,0x72,0x80] + vcvtph2qq -256(%edx){1to2}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2qq 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x7b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2qq 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtph2qq (%ecx){1to4}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x7b,0x31] + vcvtph2qq (%ecx){1to4}, %ymm6 + +// CHECK: vcvtph2qq 1016(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x7b,0x71,0x7f] + vcvtph2qq 1016(%ecx), %ymm6 + +// CHECK: vcvtph2qq -256(%edx){1to4}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x7b,0x72,0x80] + vcvtph2qq -256(%edx){1to4}, %ymm6 {%k7} {z} + +// CHECK: vcvtph2udq %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x79,0xf5] + vcvtph2udq %xmm5, %xmm6 + +// CHECK: vcvtph2udq %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x79,0xf5] + vcvtph2udq %xmm5, %ymm6 + +// CHECK: vcvtph2udq 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x0f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2udq 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtph2udq (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x79,0x31] + vcvtph2udq (%ecx){1to4}, %xmm6 + +// CHECK: vcvtph2udq 1016(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x79,0x71,0x7f] + vcvtph2udq 1016(%ecx), %xmm6 + +// CHECK: vcvtph2udq -256(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0x9f,0x79,0x72,0x80] + vcvtph2udq -256(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2udq 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x2f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2udq 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtph2udq (%ecx){1to8}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x38,0x79,0x31] + vcvtph2udq (%ecx){1to8}, %ymm6 + +// CHECK: vcvtph2udq 2032(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x79,0x71,0x7f] + vcvtph2udq 2032(%ecx), %ymm6 + +// CHECK: vcvtph2udq -256(%edx){1to8}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x79,0x72,0x80] + vcvtph2udq -256(%edx){1to8}, %ymm6 {%k7} {z} + +// CHECK: vcvtph2uqq %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x79,0xf5] + vcvtph2uqq %xmm5, %xmm6 + +// CHECK: vcvtph2uqq %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x79,0xf5] + vcvtph2uqq %xmm5, %ymm6 + +// CHECK: vcvtph2uqq 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2uqq 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: 
vcvtph2uqq (%ecx){1to2}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x79,0x31] + vcvtph2uqq (%ecx){1to2}, %xmm6 + +// CHECK: vcvtph2uqq 508(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x79,0x71,0x7f] + vcvtph2uqq 508(%ecx), %xmm6 + +// CHECK: vcvtph2uqq -256(%edx){1to2}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x79,0x72,0x80] + vcvtph2uqq -256(%edx){1to2}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2uqq 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2uqq 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtph2uqq (%ecx){1to4}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x79,0x31] + vcvtph2uqq (%ecx){1to4}, %ymm6 + +// CHECK: vcvtph2uqq 1016(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x79,0x71,0x7f] + vcvtph2uqq 1016(%ecx), %ymm6 + +// CHECK: vcvtph2uqq -256(%edx){1to4}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x79,0x72,0x80] + vcvtph2uqq -256(%edx){1to4}, %ymm6 {%k7} {z} + +// CHECK: vcvtph2uw %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x7d,0xf5] + vcvtph2uw %xmm5, %xmm6 + +// CHECK: vcvtph2uw %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x7d,0xf5] + vcvtph2uw %ymm5, %ymm6 + +// CHECK: vcvtph2uw 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x0f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2uw 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtph2uw (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x7d,0x31] + vcvtph2uw (%ecx){1to8}, %xmm6 + +// CHECK: vcvtph2uw 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x7d,0x71,0x7f] + vcvtph2uw 2032(%ecx), %xmm6 + +// CHECK: vcvtph2uw -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0x9f,0x7d,0x72,0x80] + vcvtph2uw -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2uw 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x2f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2uw 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtph2uw (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x38,0x7d,0x31] + vcvtph2uw (%ecx){1to16}, %ymm6 + +// CHECK: vcvtph2uw 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x7d,0x71,0x7f] + vcvtph2uw 4064(%ecx), %ymm6 + +// CHECK: vcvtph2uw -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x7d,0x72,0x80] + vcvtph2uw -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vcvtph2w %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7d,0xf5] + vcvtph2w %xmm5, %xmm6 + +// CHECK: vcvtph2w %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x7d,0xf5] + vcvtph2w %ymm5, %ymm6 + +// CHECK: vcvtph2w 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2w 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtph2w (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x7d,0x31] + vcvtph2w (%ecx){1to8}, %xmm6 + +// CHECK: vcvtph2w 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7d,0x71,0x7f] + vcvtph2w 2032(%ecx), %xmm6 + +// CHECK: vcvtph2w -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x7d,0x72,0x80] + vcvtph2w -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvtph2w 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2w 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtph2w (%ecx){1to16}, %ymm6 +// CHECK: encoding: 
[0x62,0xf5,0x7d,0x38,0x7d,0x31] + vcvtph2w (%ecx){1to16}, %ymm6 + +// CHECK: vcvtph2w 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x7d,0x71,0x7f] + vcvtph2w 4064(%ecx), %ymm6 + +// CHECK: vcvtph2w -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x7d,0x72,0x80] + vcvtph2w -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vcvtps2phx %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x1d,0xf5] + vcvtps2phx %xmm5, %xmm6 + +// CHECK: vcvtps2phx %ymm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x1d,0xf5] + vcvtps2phx %ymm5, %xmm6 + +// CHECK: vcvtps2phxx 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x1d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtps2phxx 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtps2phx (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x1d,0x31] + vcvtps2phx (%ecx){1to4}, %xmm6 + +// CHECK: vcvtps2phxx 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x1d,0x71,0x7f] + vcvtps2phxx 2032(%ecx), %xmm6 + +// CHECK: vcvtps2phx -512(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x1d,0x72,0x80] + vcvtps2phx -512(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvtps2phx (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x1d,0x31] + vcvtps2phx (%ecx){1to8}, %xmm6 + +// CHECK: vcvtps2phxy 4064(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x1d,0x71,0x7f] + vcvtps2phxy 4064(%ecx), %xmm6 + +// CHECK: vcvtps2phx -512(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x1d,0x72,0x80] + vcvtps2phx -512(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvtqq2ph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfc,0x08,0x5b,0xf5] + vcvtqq2ph %xmm5, %xmm6 + +// CHECK: vcvtqq2ph %ymm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfc,0x28,0x5b,0xf5] + vcvtqq2ph %ymm5, %xmm6 + +// CHECK: vcvtqq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0xfc,0x0f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtqq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtqq2ph (%ecx){1to2}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfc,0x18,0x5b,0x31] + vcvtqq2ph (%ecx){1to2}, %xmm6 + +// CHECK: vcvtqq2phx 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfc,0x08,0x5b,0x71,0x7f] + vcvtqq2phx 2032(%ecx), %xmm6 + +// CHECK: vcvtqq2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0xfc,0x9f,0x5b,0x72,0x80] + vcvtqq2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} + +// CHECK: vcvtqq2ph (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfc,0x38,0x5b,0x31] + vcvtqq2ph (%ecx){1to4}, %xmm6 + +// CHECK: vcvtqq2phy 4064(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0xfc,0x28,0x5b,0x71,0x7f] + vcvtqq2phy 4064(%ecx), %xmm6 + +// CHECK: vcvtqq2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0xfc,0xbf,0x5b,0x72,0x80] + vcvtqq2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvttph2dq %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x5b,0xf5] + vcvttph2dq %xmm5, %xmm6 + +// CHECK: vcvttph2dq %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x5b,0xf5] + vcvttph2dq %xmm5, %ymm6 + +// CHECK: vcvttph2dq 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2dq 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvttph2dq (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x5b,0x31] + vcvttph2dq (%ecx){1to4}, %xmm6 + +// CHECK: vcvttph2dq 1016(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x5b,0x71,0x7f] + vcvttph2dq 
1016(%ecx), %xmm6 + +// CHECK: vcvttph2dq -256(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x5b,0x72,0x80] + vcvttph2dq -256(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvttph2dq 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2dq 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvttph2dq (%ecx){1to8}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x5b,0x31] + vcvttph2dq (%ecx){1to8}, %ymm6 + +// CHECK: vcvttph2dq 2032(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x5b,0x71,0x7f] + vcvttph2dq 2032(%ecx), %ymm6 + +// CHECK: vcvttph2dq -256(%edx){1to8}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x5b,0x72,0x80] + vcvttph2dq -256(%edx){1to8}, %ymm6 {%k7} {z} + +// CHECK: vcvttph2qq %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7a,0xf5] + vcvttph2qq %xmm5, %xmm6 + +// CHECK: vcvttph2qq %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x7a,0xf5] + vcvttph2qq %xmm5, %ymm6 + +// CHECK: vcvttph2qq 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2qq 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvttph2qq (%ecx){1to2}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x7a,0x31] + vcvttph2qq (%ecx){1to2}, %xmm6 + +// CHECK: vcvttph2qq 508(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7a,0x71,0x7f] + vcvttph2qq 508(%ecx), %xmm6 + +// CHECK: vcvttph2qq -256(%edx){1to2}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x7a,0x72,0x80] + vcvttph2qq -256(%edx){1to2}, %xmm6 {%k7} {z} + +// CHECK: vcvttph2qq 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2qq 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvttph2qq (%ecx){1to4}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x7a,0x31] + vcvttph2qq (%ecx){1to4}, %ymm6 + +// CHECK: vcvttph2qq 1016(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x7a,0x71,0x7f] + vcvttph2qq 1016(%ecx), %ymm6 + +// CHECK: vcvttph2qq -256(%edx){1to4}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x7a,0x72,0x80] + vcvttph2qq -256(%edx){1to4}, %ymm6 {%k7} {z} + +// CHECK: vcvttph2udq %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x78,0xf5] + vcvttph2udq %xmm5, %xmm6 + +// CHECK: vcvttph2udq %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x78,0xf5] + vcvttph2udq %xmm5, %ymm6 + +// CHECK: vcvttph2udq 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x0f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2udq 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvttph2udq (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x78,0x31] + vcvttph2udq (%ecx){1to4}, %xmm6 + +// CHECK: vcvttph2udq 1016(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x78,0x71,0x7f] + vcvttph2udq 1016(%ecx), %xmm6 + +// CHECK: vcvttph2udq -256(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0x9f,0x78,0x72,0x80] + vcvttph2udq -256(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvttph2udq 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x2f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2udq 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvttph2udq (%ecx){1to8}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x38,0x78,0x31] + vcvttph2udq (%ecx){1to8}, %ymm6 + +// CHECK: vcvttph2udq 2032(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x78,0x71,0x7f] + vcvttph2udq 2032(%ecx), %ymm6 + 
+// CHECK: vcvttph2udq -256(%edx){1to8}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x78,0x72,0x80] + vcvttph2udq -256(%edx){1to8}, %ymm6 {%k7} {z} + +// CHECK: vcvttph2uqq %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x78,0xf5] + vcvttph2uqq %xmm5, %xmm6 + +// CHECK: vcvttph2uqq %xmm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x78,0xf5] + vcvttph2uqq %xmm5, %ymm6 + +// CHECK: vcvttph2uqq 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2uqq 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvttph2uqq (%ecx){1to2}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x78,0x31] + vcvttph2uqq (%ecx){1to2}, %xmm6 + +// CHECK: vcvttph2uqq 508(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x78,0x71,0x7f] + vcvttph2uqq 508(%ecx), %xmm6 + +// CHECK: vcvttph2uqq -256(%edx){1to2}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x78,0x72,0x80] + vcvttph2uqq -256(%edx){1to2}, %xmm6 {%k7} {z} + +// CHECK: vcvttph2uqq 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2uqq 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvttph2uqq (%ecx){1to4}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x78,0x31] + vcvttph2uqq (%ecx){1to4}, %ymm6 + +// CHECK: vcvttph2uqq 1016(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x78,0x71,0x7f] + vcvttph2uqq 1016(%ecx), %ymm6 + +// CHECK: vcvttph2uqq -256(%edx){1to4}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x78,0x72,0x80] + vcvttph2uqq -256(%edx){1to4}, %ymm6 {%k7} {z} + +// CHECK: vcvttph2uw %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x7c,0xf5] + vcvttph2uw %xmm5, %xmm6 + +// CHECK: vcvttph2uw %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x7c,0xf5] + vcvttph2uw %ymm5, %ymm6 + +// CHECK: vcvttph2uw 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x0f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2uw 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvttph2uw (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x7c,0x31] + vcvttph2uw (%ecx){1to8}, %xmm6 + +// CHECK: vcvttph2uw 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x7c,0x71,0x7f] + vcvttph2uw 2032(%ecx), %xmm6 + +// CHECK: vcvttph2uw -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0x9f,0x7c,0x72,0x80] + vcvttph2uw -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvttph2uw 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x2f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2uw 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvttph2uw (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x38,0x7c,0x31] + vcvttph2uw (%ecx){1to16}, %ymm6 + +// CHECK: vcvttph2uw 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x7c,0x71,0x7f] + vcvttph2uw 4064(%ecx), %ymm6 + +// CHECK: vcvttph2uw -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x7c,0x72,0x80] + vcvttph2uw -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vcvttph2w %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7c,0xf5] + vcvttph2w %xmm5, %xmm6 + +// CHECK: vcvttph2w %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x7c,0xf5] + vcvttph2w %ymm5, %ymm6 + +// CHECK: vcvttph2w 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x0f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2w 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvttph2w (%ecx){1to8}, %xmm6 +// 
CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x7c,0x31] + vcvttph2w (%ecx){1to8}, %xmm6 + +// CHECK: vcvttph2w 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7c,0x71,0x7f] + vcvttph2w 2032(%ecx), %xmm6 + +// CHECK: vcvttph2w -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0x9f,0x7c,0x72,0x80] + vcvttph2w -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvttph2w 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7d,0x2f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2w 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvttph2w (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x38,0x7c,0x31] + vcvttph2w (%ecx){1to16}, %ymm6 + +// CHECK: vcvttph2w 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x28,0x7c,0x71,0x7f] + vcvttph2w 4064(%ecx), %ymm6 + +// CHECK: vcvttph2w -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7d,0xbf,0x7c,0x72,0x80] + vcvttph2w -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vcvtudq2ph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x7a,0xf5] + vcvtudq2ph %xmm5, %xmm6 + +// CHECK: vcvtudq2ph %ymm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x7a,0xf5] + vcvtudq2ph %ymm5, %xmm6 + +// CHECK: vcvtudq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x0f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtudq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtudq2ph (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x7a,0x31] + vcvtudq2ph (%ecx){1to4}, %xmm6 + +// CHECK: vcvtudq2phx 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x7a,0x71,0x7f] + vcvtudq2phx 2032(%ecx), %xmm6 + +// CHECK: vcvtudq2ph -512(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0x9f,0x7a,0x72,0x80] + vcvtudq2ph -512(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvtudq2ph (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x38,0x7a,0x31] + vcvtudq2ph (%ecx){1to8}, %xmm6 + +// CHECK: vcvtudq2phy 4064(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x7a,0x71,0x7f] + vcvtudq2phy 4064(%ecx), %xmm6 + +// CHECK: vcvtudq2ph -512(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0xbf,0x7a,0x72,0x80] + vcvtudq2ph -512(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvtuqq2ph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xff,0x08,0x7a,0xf5] + vcvtuqq2ph %xmm5, %xmm6 + +// CHECK: vcvtuqq2ph %ymm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xff,0x28,0x7a,0xf5] + vcvtuqq2ph %ymm5, %xmm6 + +// CHECK: vcvtuqq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0xff,0x0f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtuqq2phx 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtuqq2ph (%ecx){1to2}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xff,0x18,0x7a,0x31] + vcvtuqq2ph (%ecx){1to2}, %xmm6 + +// CHECK: vcvtuqq2phx 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0xff,0x08,0x7a,0x71,0x7f] + vcvtuqq2phx 2032(%ecx), %xmm6 + +// CHECK: vcvtuqq2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0xff,0x9f,0x7a,0x72,0x80] + vcvtuqq2ph -1024(%edx){1to2}, %xmm6 {%k7} {z} + +// CHECK: vcvtuqq2ph (%ecx){1to4}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0xff,0x38,0x7a,0x31] + vcvtuqq2ph (%ecx){1to4}, %xmm6 + +// CHECK: vcvtuqq2phy 4064(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0xff,0x28,0x7a,0x71,0x7f] + vcvtuqq2phy 4064(%ecx), %xmm6 + +// CHECK: vcvtuqq2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0xff,0xbf,0x7a,0x72,0x80] + vcvtuqq2ph -1024(%edx){1to4}, %xmm6 {%k7} {z} + +// CHECK: vcvtuw2ph 
%xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x7d,0xf5] + vcvtuw2ph %xmm5, %xmm6 + +// CHECK: vcvtuw2ph %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x7d,0xf5] + vcvtuw2ph %ymm5, %ymm6 + +// CHECK: vcvtuw2ph 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x0f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtuw2ph 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtuw2ph (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x7d,0x31] + vcvtuw2ph (%ecx){1to8}, %xmm6 + +// CHECK: vcvtuw2ph 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x7d,0x71,0x7f] + vcvtuw2ph 2032(%ecx), %xmm6 + +// CHECK: vcvtuw2ph -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0x9f,0x7d,0x72,0x80] + vcvtuw2ph -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvtuw2ph 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x2f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtuw2ph 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtuw2ph (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x38,0x7d,0x31] + vcvtuw2ph (%ecx){1to16}, %ymm6 + +// CHECK: vcvtuw2ph 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x7d,0x71,0x7f] + vcvtuw2ph 4064(%ecx), %ymm6 + +// CHECK: vcvtuw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0xbf,0x7d,0x72,0x80] + vcvtuw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vcvtw2ph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x7d,0xf5] + vcvtw2ph %xmm5, %xmm6 + +// CHECK: vcvtw2ph %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x7d,0xf5] + vcvtw2ph %ymm5, %ymm6 + +// CHECK: vcvtw2ph 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtw2ph 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vcvtw2ph (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x7d,0x31] + vcvtw2ph (%ecx){1to8}, %xmm6 + +// CHECK: vcvtw2ph 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x7d,0x71,0x7f] + vcvtw2ph 2032(%ecx), %xmm6 + +// CHECK: vcvtw2ph -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x7d,0x72,0x80] + vcvtw2ph -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vcvtw2ph 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtw2ph 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vcvtw2ph (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x7d,0x31] + vcvtw2ph (%ecx){1to16}, %ymm6 + +// CHECK: vcvtw2ph 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x7d,0x71,0x7f] + vcvtw2ph 4064(%ecx), %ymm6 + +// CHECK: vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x7d,0x72,0x80] + vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s --- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -459,3 +459,771 @@ // CHECK: vucomish xmm6, word ptr [edx - 256] // CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0x72,0x80] vucomish xmm6, word ptr [edx - 256] + +// CHECK: vcvtdq2ph ymm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x5b,0xf5] + vcvtdq2ph ymm6, zmm5 + +// CHECK: vcvtdq2ph ymm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x5b,0xf5] + vcvtdq2ph ymm6, zmm5, {rn-sae} + +// CHECK: vcvtdq2ph ymm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0x62,0xf5,0x7c,0x4f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtdq2ph ymm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtdq2ph ymm6, dword ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7c,0x58,0x5b,0x31] + vcvtdq2ph ymm6, dword ptr [ecx]{1to16} + +// CHECK: vcvtdq2ph ymm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x5b,0x71,0x7f] + vcvtdq2ph ymm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtdq2ph ymm6 {k7} {z}, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7c,0xdf,0x5b,0x72,0x80] + vcvtdq2ph ymm6 {k7} {z}, dword ptr [edx - 512]{1to16} + +// CHECK: vcvtpd2ph xmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0xfd,0x48,0x5a,0xf5] + vcvtpd2ph xmm6, zmm5 + +// CHECK: vcvtpd2ph xmm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0xfd,0x18,0x5a,0xf5] + vcvtpd2ph xmm6, zmm5, {rn-sae} + +// CHECK: vcvtpd2ph xmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0xfd,0x4f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtpd2ph xmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtpd2ph xmm6, qword ptr [ecx]{1to8} +// CHECK: encoding: [0x62,0xf5,0xfd,0x58,0x5a,0x31] + vcvtpd2ph xmm6, qword ptr [ecx]{1to8} + +// CHECK: vcvtpd2ph xmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0xfd,0x48,0x5a,0x71,0x7f] + vcvtpd2ph xmm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtpd2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to8} +// CHECK: encoding: [0x62,0xf5,0xfd,0xdf,0x5a,0x72,0x80] + vcvtpd2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to8} + +// CHECK: vcvtph2dq zmm6, ymm5 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x5b,0xf5] + vcvtph2dq zmm6, ymm5 + +// CHECK: vcvtph2dq zmm6, ymm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x5b,0xf5] + vcvtph2dq zmm6, ymm5, {rn-sae} + +// CHECK: vcvtph2dq zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2dq zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtph2dq zmm6, word ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x5b,0x31] + vcvtph2dq zmm6, word ptr [ecx]{1to16} + +// CHECK: vcvtph2dq zmm6, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x5b,0x71,0x7f] + vcvtph2dq zmm6, ymmword ptr [ecx + 4064] + +// CHECK: vcvtph2dq zmm6 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x5b,0x72,0x80] + vcvtph2dq zmm6 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vcvtph2pd zmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x5a,0xf5] + vcvtph2pd zmm6, xmm5 + +// CHECK: vcvtph2pd zmm6, xmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x5a,0xf5] + vcvtph2pd zmm6, xmm5, {sae} + +// CHECK: vcvtph2pd zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x4f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2pd zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtph2pd zmm6, word ptr [ecx]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7c,0x58,0x5a,0x31] + vcvtph2pd zmm6, word ptr [ecx]{1to8} + +// CHECK: vcvtph2pd zmm6, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x5a,0x71,0x7f] + vcvtph2pd zmm6, xmmword ptr [ecx + 2032] + +// CHECK: vcvtph2pd zmm6 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7c,0xdf,0x5a,0x72,0x80] + vcvtph2pd zmm6 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vcvtph2psx zmm6, ymm5 +// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x13,0xf5] + vcvtph2psx zmm6, ymm5 + +// CHECK: vcvtph2psx zmm6, ymm5, {sae} +// CHECK: encoding: 
[0x62,0xf6,0x7d,0x18,0x13,0xf5] + vcvtph2psx zmm6, ymm5, {sae} + +// CHECK: vcvtph2psx zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7d,0x4f,0x13,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2psx zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtph2psx zmm6, word ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7d,0x58,0x13,0x31] + vcvtph2psx zmm6, word ptr [ecx]{1to16} + +// CHECK: vcvtph2psx zmm6, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x13,0x71,0x7f] + vcvtph2psx zmm6, ymmword ptr [ecx + 4064] + +// CHECK: vcvtph2psx zmm6 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf6,0x7d,0xdf,0x13,0x72,0x80] + vcvtph2psx zmm6 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vcvtph2qq zmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x7b,0xf5] + vcvtph2qq zmm6, xmm5 + +// CHECK: vcvtph2qq zmm6, xmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x7b,0xf5] + vcvtph2qq zmm6, xmm5, {rn-sae} + +// CHECK: vcvtph2qq zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x7b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2qq zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtph2qq zmm6, word ptr [ecx]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x7b,0x31] + vcvtph2qq zmm6, word ptr [ecx]{1to8} + +// CHECK: vcvtph2qq zmm6, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x7b,0x71,0x7f] + vcvtph2qq zmm6, xmmword ptr [ecx + 2032] + +// CHECK: vcvtph2qq zmm6 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x7b,0x72,0x80] + vcvtph2qq zmm6 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vcvtph2udq zmm6, ymm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x79,0xf5] + vcvtph2udq zmm6, ymm5 + +// CHECK: vcvtph2udq zmm6, ymm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x79,0xf5] + vcvtph2udq zmm6, ymm5, {rn-sae} + +// CHECK: vcvtph2udq zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x4f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2udq zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtph2udq zmm6, word ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7c,0x58,0x79,0x31] + vcvtph2udq zmm6, word ptr [ecx]{1to16} + +// CHECK: vcvtph2udq zmm6, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x79,0x71,0x7f] + vcvtph2udq zmm6, ymmword ptr [ecx + 4064] + +// CHECK: vcvtph2udq zmm6 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7c,0xdf,0x79,0x72,0x80] + vcvtph2udq zmm6 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vcvtph2uqq zmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x79,0xf5] + vcvtph2uqq zmm6, xmm5 + +// CHECK: vcvtph2uqq zmm6, xmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x79,0xf5] + vcvtph2uqq zmm6, xmm5, {rn-sae} + +// CHECK: vcvtph2uqq zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x79,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2uqq zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtph2uqq zmm6, word ptr [ecx]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x79,0x31] + vcvtph2uqq zmm6, word ptr [ecx]{1to8} + +// CHECK: vcvtph2uqq zmm6, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x79,0x71,0x7f] + vcvtph2uqq zmm6, xmmword ptr [ecx + 2032] + +// CHECK: vcvtph2uqq zmm6 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x79,0x72,0x80] + vcvtph2uqq zmm6 {k7} {z}, word ptr [edx - 
256]{1to8} + +// CHECK: vcvtph2uw zmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x7d,0xf5] + vcvtph2uw zmm6, zmm5 + +// CHECK: vcvtph2uw zmm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x7d,0xf5] + vcvtph2uw zmm6, zmm5, {rn-sae} + +// CHECK: vcvtph2uw zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x4f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2uw zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtph2uw zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7c,0x58,0x7d,0x31] + vcvtph2uw zmm6, word ptr [ecx]{1to32} + +// CHECK: vcvtph2uw zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x7d,0x71,0x7f] + vcvtph2uw zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtph2uw zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7c,0xdf,0x7d,0x72,0x80] + vcvtph2uw zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vcvtph2w zmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x7d,0xf5] + vcvtph2w zmm6, zmm5 + +// CHECK: vcvtph2w zmm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x7d,0xf5] + vcvtph2w zmm6, zmm5, {rn-sae} + +// CHECK: vcvtph2w zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtph2w zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtph2w zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x7d,0x31] + vcvtph2w zmm6, word ptr [ecx]{1to32} + +// CHECK: vcvtph2w zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x7d,0x71,0x7f] + vcvtph2w zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtph2w zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x7d,0x72,0x80] + vcvtph2w zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vcvtps2phx ymm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x1d,0xf5] + vcvtps2phx ymm6, zmm5 + +// CHECK: vcvtps2phx ymm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x1d,0xf5] + vcvtps2phx ymm6, zmm5, {rn-sae} + +// CHECK: vcvtps2phx ymm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x1d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtps2phx ymm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtps2phx ymm6, dword ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x1d,0x31] + vcvtps2phx ymm6, dword ptr [ecx]{1to16} + +// CHECK: vcvtps2phx ymm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x1d,0x71,0x7f] + vcvtps2phx ymm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtps2phx ymm6 {k7} {z}, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x1d,0x72,0x80] + vcvtps2phx ymm6 {k7} {z}, dword ptr [edx - 512]{1to16} + +// CHECK: vcvtqq2ph xmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0xfc,0x48,0x5b,0xf5] + vcvtqq2ph xmm6, zmm5 + +// CHECK: vcvtqq2ph xmm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0xfc,0x18,0x5b,0xf5] + vcvtqq2ph xmm6, zmm5, {rn-sae} + +// CHECK: vcvtqq2ph xmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0xfc,0x4f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtqq2ph xmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtqq2ph xmm6, qword ptr [ecx]{1to8} +// CHECK: encoding: [0x62,0xf5,0xfc,0x58,0x5b,0x31] + vcvtqq2ph xmm6, qword ptr [ecx]{1to8} + +// CHECK: vcvtqq2ph xmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0xfc,0x48,0x5b,0x71,0x7f] + vcvtqq2ph xmm6, zmmword ptr [ecx + 8128] + 
+// CHECK: vcvtqq2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to8} +// CHECK: encoding: [0x62,0xf5,0xfc,0xdf,0x5b,0x72,0x80] + vcvtqq2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to8} + +// CHECK: vcvtsd2sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0xd7,0x08,0x5a,0xf4] + vcvtsd2sh xmm6, xmm5, xmm4 + +// CHECK: vcvtsd2sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0xd7,0x18,0x5a,0xf4] + vcvtsd2sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vcvtsd2sh xmm6 {k7}, xmm5, qword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0xd7,0x0f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtsd2sh xmm6 {k7}, xmm5, qword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtsd2sh xmm6, xmm5, qword ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0xd7,0x08,0x5a,0x31] + vcvtsd2sh xmm6, xmm5, qword ptr [ecx] + +// CHECK: vcvtsd2sh xmm6, xmm5, qword ptr [ecx + 1016] +// CHECK: encoding: [0x62,0xf5,0xd7,0x08,0x5a,0x71,0x7f] + vcvtsd2sh xmm6, xmm5, qword ptr [ecx + 1016] + +// CHECK: vcvtsd2sh xmm6 {k7} {z}, xmm5, qword ptr [edx - 1024] +// CHECK: encoding: [0x62,0xf5,0xd7,0x8f,0x5a,0x72,0x80] + vcvtsd2sh xmm6 {k7} {z}, xmm5, qword ptr [edx - 1024] + +// CHECK: vcvtsh2sd xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5a,0xf4] + vcvtsh2sd xmm6, xmm5, xmm4 + +// CHECK: vcvtsh2sd xmm6, xmm5, xmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5a,0xf4] + vcvtsh2sd xmm6, xmm5, xmm4, {sae} + +// CHECK: vcvtsh2sd xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtsh2sd xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtsh2sd xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5a,0x31] + vcvtsh2sd xmm6, xmm5, word ptr [ecx] + +// CHECK: vcvtsh2sd xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5a,0x71,0x7f] + vcvtsh2sd xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vcvtsh2sd xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5a,0x72,0x80] + vcvtsh2sd xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vcvtsh2si edx, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2d,0xd6] + vcvtsh2si edx, xmm6 + +// CHECK: vcvtsh2si edx, xmm6, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x2d,0xd6] + vcvtsh2si edx, xmm6, {rn-sae} + +// CHECK: vcvtsh2si edx, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2d,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtsh2si edx, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtsh2si edx, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2d,0x11] + vcvtsh2si edx, word ptr [ecx] + +// CHECK: vcvtsh2si edx, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2d,0x51,0x7f] + vcvtsh2si edx, word ptr [ecx + 254] + +// CHECK: vcvtsh2si edx, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2d,0x52,0x80] + vcvtsh2si edx, word ptr [edx - 256] + +// CHECK: vcvtsh2ss xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x54,0x08,0x13,0xf4] + vcvtsh2ss xmm6, xmm5, xmm4 + +// CHECK: vcvtsh2ss xmm6, xmm5, xmm4, {sae} +// CHECK: encoding: [0x62,0xf6,0x54,0x18,0x13,0xf4] + vcvtsh2ss xmm6, xmm5, xmm4, {sae} + +// CHECK: vcvtsh2ss xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x54,0x0f,0x13,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtsh2ss xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtsh2ss xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x54,0x08,0x13,0x31] + vcvtsh2ss xmm6, xmm5, 
word ptr [ecx] + +// CHECK: vcvtsh2ss xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x54,0x08,0x13,0x71,0x7f] + vcvtsh2ss xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vcvtsh2ss xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x54,0x8f,0x13,0x72,0x80] + vcvtsh2ss xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vcvtsh2usi edx, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x79,0xd6] + vcvtsh2usi edx, xmm6 + +// CHECK: vcvtsh2usi edx, xmm6, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x79,0xd6] + vcvtsh2usi edx, xmm6, {rn-sae} + +// CHECK: vcvtsh2usi edx, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x79,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtsh2usi edx, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtsh2usi edx, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x79,0x11] + vcvtsh2usi edx, word ptr [ecx] + +// CHECK: vcvtsh2usi edx, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x79,0x51,0x7f] + vcvtsh2usi edx, word ptr [ecx + 254] + +// CHECK: vcvtsh2usi edx, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x79,0x52,0x80] + vcvtsh2usi edx, word ptr [edx - 256] + +// CHECK: vcvtsi2sh xmm6, xmm5, edx +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x2a,0xf2] + vcvtsi2sh xmm6, xmm5, edx + +// CHECK: vcvtsi2sh xmm6, xmm5, {rn-sae}, edx +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x2a,0xf2] + vcvtsi2sh xmm6, xmm5, {rn-sae}, edx + +// CHECK: vcvtsi2sh xmm6, xmm5, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x2a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtsi2sh xmm6, xmm5, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtsi2sh xmm6, xmm5, dword ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x2a,0x31] + vcvtsi2sh xmm6, xmm5, dword ptr [ecx] + +// CHECK: vcvtsi2sh xmm6, xmm5, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x2a,0x71,0x7f] + vcvtsi2sh xmm6, xmm5, dword ptr [ecx + 508] + +// CHECK: vcvtsi2sh xmm6, xmm5, dword ptr [edx - 512] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x2a,0x72,0x80] + vcvtsi2sh xmm6, xmm5, dword ptr [edx - 512] + +// CHECK: vcvtss2sh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x1d,0xf4] + vcvtss2sh xmm6, xmm5, xmm4 + +// CHECK: vcvtss2sh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x1d,0xf4] + vcvtss2sh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vcvtss2sh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x1d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtss2sh xmm6 {k7}, xmm5, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtss2sh xmm6, xmm5, dword ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x1d,0x31] + vcvtss2sh xmm6, xmm5, dword ptr [ecx] + +// CHECK: vcvtss2sh xmm6, xmm5, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x1d,0x71,0x7f] + vcvtss2sh xmm6, xmm5, dword ptr [ecx + 508] + +// CHECK: vcvtss2sh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] +// CHECK: encoding: [0x62,0xf5,0x54,0x8f,0x1d,0x72,0x80] + vcvtss2sh xmm6 {k7} {z}, xmm5, dword ptr [edx - 512] + +// CHECK: vcvttph2dq zmm6, ymm5 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x5b,0xf5] + vcvttph2dq zmm6, ymm5 + +// CHECK: vcvttph2dq zmm6, ymm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x5b,0xf5] + vcvttph2dq zmm6, ymm5, {sae} + +// CHECK: vcvttph2dq zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x5b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2dq zmm6 {k7}, ymmword ptr [esp + 8*esi + 
268435456] + +// CHECK: vcvttph2dq zmm6, word ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x5b,0x31] + vcvttph2dq zmm6, word ptr [ecx]{1to16} + +// CHECK: vcvttph2dq zmm6, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x5b,0x71,0x7f] + vcvttph2dq zmm6, ymmword ptr [ecx + 4064] + +// CHECK: vcvttph2dq zmm6 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x5b,0x72,0x80] + vcvttph2dq zmm6 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vcvttph2qq zmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x7a,0xf5] + vcvttph2qq zmm6, xmm5 + +// CHECK: vcvttph2qq zmm6, xmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x7a,0xf5] + vcvttph2qq zmm6, xmm5, {sae} + +// CHECK: vcvttph2qq zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2qq zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvttph2qq zmm6, word ptr [ecx]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x7a,0x31] + vcvttph2qq zmm6, word ptr [ecx]{1to8} + +// CHECK: vcvttph2qq zmm6, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x7a,0x71,0x7f] + vcvttph2qq zmm6, xmmword ptr [ecx + 2032] + +// CHECK: vcvttph2qq zmm6 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x7a,0x72,0x80] + vcvttph2qq zmm6 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vcvttph2udq zmm6, ymm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x78,0xf5] + vcvttph2udq zmm6, ymm5 + +// CHECK: vcvttph2udq zmm6, ymm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x78,0xf5] + vcvttph2udq zmm6, ymm5, {sae} + +// CHECK: vcvttph2udq zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x4f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2udq zmm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvttph2udq zmm6, word ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7c,0x58,0x78,0x31] + vcvttph2udq zmm6, word ptr [ecx]{1to16} + +// CHECK: vcvttph2udq zmm6, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x78,0x71,0x7f] + vcvttph2udq zmm6, ymmword ptr [ecx + 4064] + +// CHECK: vcvttph2udq zmm6 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7c,0xdf,0x78,0x72,0x80] + vcvttph2udq zmm6 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vcvttph2uqq zmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x78,0xf5] + vcvttph2uqq zmm6, xmm5 + +// CHECK: vcvttph2uqq zmm6, xmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x78,0xf5] + vcvttph2uqq zmm6, xmm5, {sae} + +// CHECK: vcvttph2uqq zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x78,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2uqq zmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvttph2uqq zmm6, word ptr [ecx]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x78,0x31] + vcvttph2uqq zmm6, word ptr [ecx]{1to8} + +// CHECK: vcvttph2uqq zmm6, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x78,0x71,0x7f] + vcvttph2uqq zmm6, xmmword ptr [ecx + 2032] + +// CHECK: vcvttph2uqq zmm6 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x78,0x72,0x80] + vcvttph2uqq zmm6 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vcvttph2uw zmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x7c,0xf5] + vcvttph2uw zmm6, zmm5 + +// CHECK: vcvttph2uw zmm6, zmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x7c,0xf5] + vcvttph2uw zmm6, zmm5, {sae} 
+ +// CHECK: vcvttph2uw zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x4f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2uw zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvttph2uw zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7c,0x58,0x7c,0x31] + vcvttph2uw zmm6, word ptr [ecx]{1to32} + +// CHECK: vcvttph2uw zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x7c,0x71,0x7f] + vcvttph2uw zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvttph2uw zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7c,0xdf,0x7c,0x72,0x80] + vcvttph2uw zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vcvttph2w zmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x7c,0xf5] + vcvttph2w zmm6, zmm5 + +// CHECK: vcvttph2w zmm6, zmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7d,0x18,0x7c,0xf5] + vcvttph2w zmm6, zmm5, {sae} + +// CHECK: vcvttph2w zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x4f,0x7c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvttph2w zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvttph2w zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0x58,0x7c,0x31] + vcvttph2w zmm6, word ptr [ecx]{1to32} + +// CHECK: vcvttph2w zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7d,0x48,0x7c,0x71,0x7f] + vcvttph2w zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvttph2w zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7d,0xdf,0x7c,0x72,0x80] + vcvttph2w zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vcvttsh2si edx, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2c,0xd6] + vcvttsh2si edx, xmm6 + +// CHECK: vcvttsh2si edx, xmm6, {sae} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x2c,0xd6] + vcvttsh2si edx, xmm6, {sae} + +// CHECK: vcvttsh2si edx, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2c,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvttsh2si edx, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcvttsh2si edx, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2c,0x11] + vcvttsh2si edx, word ptr [ecx] + +// CHECK: vcvttsh2si edx, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2c,0x51,0x7f] + vcvttsh2si edx, word ptr [ecx + 254] + +// CHECK: vcvttsh2si edx, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x2c,0x52,0x80] + vcvttsh2si edx, word ptr [edx - 256] + +// CHECK: vcvttsh2usi edx, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x78,0xd6] + vcvttsh2usi edx, xmm6 + +// CHECK: vcvttsh2usi edx, xmm6, {sae} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x78,0xd6] + vcvttsh2usi edx, xmm6, {sae} + +// CHECK: vcvttsh2usi edx, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x78,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvttsh2usi edx, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcvttsh2usi edx, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x78,0x11] + vcvttsh2usi edx, word ptr [ecx] + +// CHECK: vcvttsh2usi edx, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x78,0x51,0x7f] + vcvttsh2usi edx, word ptr [ecx + 254] + +// CHECK: vcvttsh2usi edx, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x78,0x52,0x80] + vcvttsh2usi edx, word ptr [edx - 256] + +// CHECK: vcvtudq2ph ymm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x7a,0xf5] + vcvtudq2ph ymm6, zmm5 + +// CHECK: vcvtudq2ph ymm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x7a,0xf5] + 
vcvtudq2ph ymm6, zmm5, {rn-sae} + +// CHECK: vcvtudq2ph ymm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7f,0x4f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtudq2ph ymm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtudq2ph ymm6, dword ptr [ecx]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7f,0x58,0x7a,0x31] + vcvtudq2ph ymm6, dword ptr [ecx]{1to16} + +// CHECK: vcvtudq2ph ymm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x7a,0x71,0x7f] + vcvtudq2ph ymm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtudq2ph ymm6 {k7} {z}, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7f,0xdf,0x7a,0x72,0x80] + vcvtudq2ph ymm6 {k7} {z}, dword ptr [edx - 512]{1to16} + +// CHECK: vcvtuqq2ph xmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0xff,0x48,0x7a,0xf5] + vcvtuqq2ph xmm6, zmm5 + +// CHECK: vcvtuqq2ph xmm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0xff,0x18,0x7a,0xf5] + vcvtuqq2ph xmm6, zmm5, {rn-sae} + +// CHECK: vcvtuqq2ph xmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0xff,0x4f,0x7a,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtuqq2ph xmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtuqq2ph xmm6, qword ptr [ecx]{1to8} +// CHECK: encoding: [0x62,0xf5,0xff,0x58,0x7a,0x31] + vcvtuqq2ph xmm6, qword ptr [ecx]{1to8} + +// CHECK: vcvtuqq2ph xmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0xff,0x48,0x7a,0x71,0x7f] + vcvtuqq2ph xmm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtuqq2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to8} +// CHECK: encoding: [0x62,0xf5,0xff,0xdf,0x7a,0x72,0x80] + vcvtuqq2ph xmm6 {k7} {z}, qword ptr [edx - 1024]{1to8} + +// CHECK: vcvtusi2sh xmm6, xmm5, edx +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x7b,0xf2] + vcvtusi2sh xmm6, xmm5, edx + +// CHECK: vcvtusi2sh xmm6, xmm5, {rn-sae}, edx +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x7b,0xf2] + vcvtusi2sh xmm6, xmm5, {rn-sae}, edx + +// CHECK: vcvtusi2sh xmm6, xmm5, dword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x7b,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtusi2sh xmm6, xmm5, dword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtusi2sh xmm6, xmm5, dword ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x7b,0x31] + vcvtusi2sh xmm6, xmm5, dword ptr [ecx] + +// CHECK: vcvtusi2sh xmm6, xmm5, dword ptr [ecx + 508] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x7b,0x71,0x7f] + vcvtusi2sh xmm6, xmm5, dword ptr [ecx + 508] + +// CHECK: vcvtusi2sh xmm6, xmm5, dword ptr [edx - 512] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x7b,0x72,0x80] + vcvtusi2sh xmm6, xmm5, dword ptr [edx - 512] + +// CHECK: vcvtuw2ph zmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x7d,0xf5] + vcvtuw2ph zmm6, zmm5 + +// CHECK: vcvtuw2ph zmm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7f,0x18,0x7d,0xf5] + vcvtuw2ph zmm6, zmm5, {rn-sae} + +// CHECK: vcvtuw2ph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7f,0x4f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtuw2ph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtuw2ph zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7f,0x58,0x7d,0x31] + vcvtuw2ph zmm6, word ptr [ecx]{1to32} + +// CHECK: vcvtuw2ph zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x7d,0x71,0x7f] + vcvtuw2ph zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtuw2ph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7f,0xdf,0x7d,0x72,0x80] + vcvtuw2ph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + 
+// CHECK: vcvtw2ph zmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x7d,0xf5] + vcvtw2ph zmm6, zmm5 + +// CHECK: vcvtw2ph zmm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x7d,0xf5] + vcvtw2ph zmm6, zmm5, {rn-sae} + +// CHECK: vcvtw2ph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x7d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcvtw2ph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtw2ph zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x7d,0x31] + vcvtw2ph zmm6, word ptr [ecx]{1to32} + +// CHECK: vcvtw2ph zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x7d,0x71,0x7f] + vcvtw2ph zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vcvtw2ph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x7d,0x72,0x80] + vcvtw2ph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s --- a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s @@ -279,3 +279,859 @@ // CHECK: vsubph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} // CHECK: encoding: [0x62,0x65,0x14,0x97,0x5c,0x72,0x80] vsubph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtdq2ph xmm30, xmm29 +// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x5b,0xf5] + vcvtdq2ph xmm30, xmm29 + +// CHECK: vcvtdq2ph xmm30, ymm29 +// CHECK: encoding: [0x62,0x05,0x7c,0x28,0x5b,0xf5] + vcvtdq2ph xmm30, ymm29 + +// CHECK: vcvtdq2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7c,0x0f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtdq2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtdq2ph xmm30, dword ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x45,0x7c,0x18,0x5b,0x31] + vcvtdq2ph xmm30, dword ptr [r9]{1to4} + +// CHECK: vcvtdq2ph xmm30, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x5b,0x71,0x7f] + vcvtdq2ph xmm30, xmmword ptr [rcx + 2032] + +// CHECK: vcvtdq2ph xmm30 {k7} {z}, dword ptr [rdx - 512]{1to4} +// CHECK: encoding: [0x62,0x65,0x7c,0x9f,0x5b,0x72,0x80] + vcvtdq2ph xmm30 {k7} {z}, dword ptr [rdx - 512]{1to4} + +// CHECK: vcvtdq2ph xmm30, dword ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x7c,0x38,0x5b,0x31] + vcvtdq2ph xmm30, dword ptr [r9]{1to8} + +// CHECK: vcvtdq2ph xmm30, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x7c,0x28,0x5b,0x71,0x7f] + vcvtdq2ph xmm30, ymmword ptr [rcx + 4064] + +// CHECK: vcvtdq2ph xmm30 {k7} {z}, dword ptr [rdx - 512]{1to8} +// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x5b,0x72,0x80] + vcvtdq2ph xmm30 {k7} {z}, dword ptr [rdx - 512]{1to8} + +// CHECK: vcvtpd2ph xmm30, xmm29 +// CHECK: encoding: [0x62,0x05,0xfd,0x08,0x5a,0xf5] + vcvtpd2ph xmm30, xmm29 + +// CHECK: vcvtpd2ph xmm30, ymm29 +// CHECK: encoding: [0x62,0x05,0xfd,0x28,0x5a,0xf5] + vcvtpd2ph xmm30, ymm29 + +// CHECK: vcvtpd2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0xfd,0x0f,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtpd2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtpd2ph xmm30, qword ptr [r9]{1to2} +// CHECK: encoding: [0x62,0x45,0xfd,0x18,0x5a,0x31] + vcvtpd2ph xmm30, qword ptr [r9]{1to2} + +// CHECK: vcvtpd2ph xmm30, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0xfd,0x08,0x5a,0x71,0x7f] + vcvtpd2ph xmm30, xmmword ptr [rcx + 2032] + +// CHECK: vcvtpd2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to2} +// CHECK: 
encoding: [0x62,0x65,0xfd,0x9f,0x5a,0x72,0x80] + vcvtpd2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to2} + +// CHECK: vcvtpd2ph xmm30, qword ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x45,0xfd,0x38,0x5a,0x31] + vcvtpd2ph xmm30, qword ptr [r9]{1to4} + +// CHECK: vcvtpd2ph xmm30, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0xfd,0x28,0x5a,0x71,0x7f] + vcvtpd2ph xmm30, ymmword ptr [rcx + 4064] + +// CHECK: vcvtpd2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to4} +// CHECK: encoding: [0x62,0x65,0xfd,0xbf,0x5a,0x72,0x80] + vcvtpd2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to4} + +// CHECK: vcvtph2dq xmm30, xmm29 +// CHECK: encoding: [0x62,0x05,0x7d,0x08,0x5b,0xf5] + vcvtph2dq xmm30, xmm29 + +// CHECK: vcvtph2dq ymm30, xmm29 +// CHECK: encoding: [0x62,0x05,0x7d,0x28,0x5b,0xf5] + vcvtph2dq ymm30, xmm29 + +// CHECK: vcvtph2dq xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7d,0x0f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2dq xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtph2dq xmm30, word ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x45,0x7d,0x18,0x5b,0x31] + vcvtph2dq xmm30, word ptr [r9]{1to4} + +// CHECK: vcvtph2dq xmm30, qword ptr [rcx + 1016] +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x5b,0x71,0x7f] + vcvtph2dq xmm30, qword ptr [rcx + 1016] + +// CHECK: vcvtph2dq xmm30 {k7} {z}, word ptr [rdx - 256]{1to4} +// CHECK: encoding: [0x62,0x65,0x7d,0x9f,0x5b,0x72,0x80] + vcvtph2dq xmm30 {k7} {z}, word ptr [rdx - 256]{1to4} + +// CHECK: vcvtph2dq ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7d,0x2f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2dq ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtph2dq ymm30, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x7d,0x38,0x5b,0x31] + vcvtph2dq ymm30, word ptr [r9]{1to8} + +// CHECK: vcvtph2dq ymm30, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x7d,0x28,0x5b,0x71,0x7f] + vcvtph2dq ymm30, xmmword ptr [rcx + 2032] + +// CHECK: vcvtph2dq ymm30 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x7d,0xbf,0x5b,0x72,0x80] + vcvtph2dq ymm30 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtph2pd xmm30, xmm29 +// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x5a,0xf5] + vcvtph2pd xmm30, xmm29 + +// CHECK: vcvtph2pd ymm30, xmm29 +// CHECK: encoding: [0x62,0x05,0x7c,0x28,0x5a,0xf5] + vcvtph2pd ymm30, xmm29 + +// CHECK: vcvtph2pd xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7c,0x0f,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2pd xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtph2pd xmm30, word ptr [r9]{1to2} +// CHECK: encoding: [0x62,0x45,0x7c,0x18,0x5a,0x31] + vcvtph2pd xmm30, word ptr [r9]{1to2} + +// CHECK: vcvtph2pd xmm30, dword ptr [rcx + 508] +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x5a,0x71,0x7f] + vcvtph2pd xmm30, dword ptr [rcx + 508] + +// CHECK: vcvtph2pd xmm30 {k7} {z}, word ptr [rdx - 256]{1to2} +// CHECK: encoding: [0x62,0x65,0x7c,0x9f,0x5a,0x72,0x80] + vcvtph2pd xmm30 {k7} {z}, word ptr [rdx - 256]{1to2} + +// CHECK: vcvtph2pd ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7c,0x2f,0x5a,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2pd ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtph2pd ymm30, word ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x45,0x7c,0x38,0x5a,0x31] + vcvtph2pd ymm30, word ptr [r9]{1to4} + +// CHECK: vcvtph2pd ymm30, qword ptr [rcx + 1016] +// CHECK: encoding: 
[0x62,0x65,0x7c,0x28,0x5a,0x71,0x7f] + vcvtph2pd ymm30, qword ptr [rcx + 1016] + +// CHECK: vcvtph2pd ymm30 {k7} {z}, word ptr [rdx - 256]{1to4} +// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x5a,0x72,0x80] + vcvtph2pd ymm30 {k7} {z}, word ptr [rdx - 256]{1to4} + +// CHECK: vcvtph2psx xmm30, xmm29 +// CHECK: encoding: [0x62,0x06,0x7d,0x08,0x13,0xf5] + vcvtph2psx xmm30, xmm29 + +// CHECK: vcvtph2psx ymm30, xmm29 +// CHECK: encoding: [0x62,0x06,0x7d,0x28,0x13,0xf5] + vcvtph2psx ymm30, xmm29 + +// CHECK: vcvtph2psx xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x7d,0x0f,0x13,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2psx xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtph2psx xmm30, word ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x46,0x7d,0x18,0x13,0x31] + vcvtph2psx xmm30, word ptr [r9]{1to4} + +// CHECK: vcvtph2psx xmm30, qword ptr [rcx + 1016] +// CHECK: encoding: [0x62,0x66,0x7d,0x08,0x13,0x71,0x7f] + vcvtph2psx xmm30, qword ptr [rcx + 1016] + +// CHECK: vcvtph2psx xmm30 {k7} {z}, word ptr [rdx - 256]{1to4} +// CHECK: encoding: [0x62,0x66,0x7d,0x9f,0x13,0x72,0x80] + vcvtph2psx xmm30 {k7} {z}, word ptr [rdx - 256]{1to4} + +// CHECK: vcvtph2psx ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x7d,0x2f,0x13,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2psx ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtph2psx ymm30, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x7d,0x38,0x13,0x31] + vcvtph2psx ymm30, word ptr [r9]{1to8} + +// CHECK: vcvtph2psx ymm30, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x7d,0x28,0x13,0x71,0x7f] + vcvtph2psx ymm30, xmmword ptr [rcx + 2032] + +// CHECK: vcvtph2psx ymm30 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x7d,0xbf,0x13,0x72,0x80] + vcvtph2psx ymm30 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtph2qq xmm30, xmm29 +// CHECK: encoding: [0x62,0x05,0x7d,0x08,0x7b,0xf5] + vcvtph2qq xmm30, xmm29 + +// CHECK: vcvtph2qq ymm30, xmm29 +// CHECK: encoding: [0x62,0x05,0x7d,0x28,0x7b,0xf5] + vcvtph2qq ymm30, xmm29 + +// CHECK: vcvtph2qq xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7d,0x0f,0x7b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2qq xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtph2qq xmm30, word ptr [r9]{1to2} +// CHECK: encoding: [0x62,0x45,0x7d,0x18,0x7b,0x31] + vcvtph2qq xmm30, word ptr [r9]{1to2} + +// CHECK: vcvtph2qq xmm30, dword ptr [rcx + 508] +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7b,0x71,0x7f] + vcvtph2qq xmm30, dword ptr [rcx + 508] + +// CHECK: vcvtph2qq xmm30 {k7} {z}, word ptr [rdx - 256]{1to2} +// CHECK: encoding: [0x62,0x65,0x7d,0x9f,0x7b,0x72,0x80] + vcvtph2qq xmm30 {k7} {z}, word ptr [rdx - 256]{1to2} + +// CHECK: vcvtph2qq ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7d,0x2f,0x7b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtph2qq ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtph2qq ymm30, word ptr [r9]{1to4} +// CHECK: encoding: [0x62,0x45,0x7d,0x38,0x7b,0x31] + vcvtph2qq ymm30, word ptr [r9]{1to4} + +// CHECK: vcvtph2qq ymm30, qword ptr [rcx + 1016] +// CHECK: encoding: [0x62,0x65,0x7d,0x28,0x7b,0x71,0x7f] + vcvtph2qq ymm30, qword ptr [rcx + 1016] + +// CHECK: vcvtph2qq ymm30 {k7} {z}, word ptr [rdx - 256]{1to4} +// CHECK: encoding: [0x62,0x65,0x7d,0xbf,0x7b,0x72,0x80] + vcvtph2qq ymm30 {k7} {z}, word ptr [rdx - 256]{1to4} + +// CHECK: vcvtph2udq xmm30, xmm29 +// CHECK: encoding: 
[0x62,0x05,0x7c,0x08,0x79,0xf5]
+          vcvtph2udq xmm30, xmm29
+
+// CHECK: vcvtph2udq ymm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x28,0x79,0xf5]
+          vcvtph2udq ymm30, xmm29
+
+// CHECK: vcvtph2udq xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x0f,0x79,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtph2udq xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtph2udq xmm30, word ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0x7c,0x18,0x79,0x31]
+          vcvtph2udq xmm30, word ptr [r9]{1to4}
+
+// CHECK: vcvtph2udq xmm30, qword ptr [rcx + 1016]
+// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x79,0x71,0x7f]
+          vcvtph2udq xmm30, qword ptr [rcx + 1016]
+
+// CHECK: vcvtph2udq xmm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+// CHECK: encoding: [0x62,0x65,0x7c,0x9f,0x79,0x72,0x80]
+          vcvtph2udq xmm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+
+// CHECK: vcvtph2udq ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x2f,0x79,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtph2udq ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtph2udq ymm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7c,0x38,0x79,0x31]
+          vcvtph2udq ymm30, word ptr [r9]{1to8}
+
+// CHECK: vcvtph2udq ymm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7c,0x28,0x79,0x71,0x7f]
+          vcvtph2udq ymm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtph2udq ymm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x79,0x72,0x80]
+          vcvtph2udq ymm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvtph2uqq xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x08,0x79,0xf5]
+          vcvtph2uqq xmm30, xmm29
+
+// CHECK: vcvtph2uqq ymm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x28,0x79,0xf5]
+          vcvtph2uqq ymm30, xmm29
+
+// CHECK: vcvtph2uqq xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x0f,0x79,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtph2uqq xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtph2uqq xmm30, word ptr [r9]{1to2}
+// CHECK: encoding: [0x62,0x45,0x7d,0x18,0x79,0x31]
+          vcvtph2uqq xmm30, word ptr [r9]{1to2}
+
+// CHECK: vcvtph2uqq xmm30, dword ptr [rcx + 508]
+// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x79,0x71,0x7f]
+          vcvtph2uqq xmm30, dword ptr [rcx + 508]
+
+// CHECK: vcvtph2uqq xmm30 {k7} {z}, word ptr [rdx - 256]{1to2}
+// CHECK: encoding: [0x62,0x65,0x7d,0x9f,0x79,0x72,0x80]
+          vcvtph2uqq xmm30 {k7} {z}, word ptr [rdx - 256]{1to2}
+
+// CHECK: vcvtph2uqq ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x2f,0x79,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtph2uqq ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtph2uqq ymm30, word ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0x7d,0x38,0x79,0x31]
+          vcvtph2uqq ymm30, word ptr [r9]{1to4}
+
+// CHECK: vcvtph2uqq ymm30, qword ptr [rcx + 1016]
+// CHECK: encoding: [0x62,0x65,0x7d,0x28,0x79,0x71,0x7f]
+          vcvtph2uqq ymm30, qword ptr [rcx + 1016]
+
+// CHECK: vcvtph2uqq ymm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+// CHECK: encoding: [0x62,0x65,0x7d,0xbf,0x79,0x72,0x80]
+          vcvtph2uqq ymm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+
+// CHECK: vcvtph2uw xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x7d,0xf5]
+          vcvtph2uw xmm30, xmm29
+
+// CHECK: vcvtph2uw ymm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x28,0x7d,0xf5]
+          vcvtph2uw ymm30, ymm29
+
+// CHECK: vcvtph2uw xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x0f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtph2uw xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtph2uw xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7c,0x18,0x7d,0x31]
+          vcvtph2uw xmm30, word ptr [r9]{1to8}
+
+// CHECK: vcvtph2uw xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x7d,0x71,0x7f]
+          vcvtph2uw xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtph2uw xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7c,0x9f,0x7d,0x72,0x80]
+          vcvtph2uw xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvtph2uw ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x2f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtph2uw ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtph2uw ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x45,0x7c,0x38,0x7d,0x31]
+          vcvtph2uw ymm30, word ptr [r9]{1to16}
+
+// CHECK: vcvtph2uw ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7c,0x28,0x7d,0x71,0x7f]
+          vcvtph2uw ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvtph2uw ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x7d,0x72,0x80]
+          vcvtph2uw ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vcvtph2w xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x08,0x7d,0xf5]
+          vcvtph2w xmm30, xmm29
+
+// CHECK: vcvtph2w ymm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x28,0x7d,0xf5]
+          vcvtph2w ymm30, ymm29
+
+// CHECK: vcvtph2w xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x0f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtph2w xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtph2w xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7d,0x18,0x7d,0x31]
+          vcvtph2w xmm30, word ptr [r9]{1to8}
+
+// CHECK: vcvtph2w xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7d,0x71,0x7f]
+          vcvtph2w xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtph2w xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7d,0x9f,0x7d,0x72,0x80]
+          vcvtph2w xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvtph2w ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x2f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtph2w ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtph2w ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x45,0x7d,0x38,0x7d,0x31]
+          vcvtph2w ymm30, word ptr [r9]{1to16}
+
+// CHECK: vcvtph2w ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7d,0x28,0x7d,0x71,0x7f]
+          vcvtph2w ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvtph2w ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x65,0x7d,0xbf,0x7d,0x72,0x80]
+          vcvtph2w ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vcvtps2phx xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x08,0x1d,0xf5]
+          vcvtps2phx xmm30, xmm29
+
+// CHECK: vcvtps2phx xmm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x28,0x1d,0xf5]
+          vcvtps2phx xmm30, ymm29
+
+// CHECK: vcvtps2phx xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x0f,0x1d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtps2phx xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtps2phx xmm30, dword ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0x7d,0x18,0x1d,0x31]
+          vcvtps2phx xmm30, dword ptr [r9]{1to4}
+
+// CHECK: vcvtps2phx xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x1d,0x71,0x7f]
+          vcvtps2phx xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtps2phx xmm30 {k7} {z}, dword ptr [rdx - 512]{1to4}
+// CHECK: encoding: [0x62,0x65,0x7d,0x9f,0x1d,0x72,0x80]
+          vcvtps2phx xmm30 {k7} {z}, dword ptr [rdx - 512]{1to4}
+
+// CHECK: vcvtps2phx xmm30, dword ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7d,0x38,0x1d,0x31]
+          vcvtps2phx xmm30, dword ptr [r9]{1to8}
+
+// CHECK: vcvtps2phx xmm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7d,0x28,0x1d,0x71,0x7f]
+          vcvtps2phx xmm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvtps2phx xmm30 {k7} {z}, dword ptr [rdx - 512]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7d,0xbf,0x1d,0x72,0x80]
+          vcvtps2phx xmm30 {k7} {z}, dword ptr [rdx - 512]{1to8}
+
+// CHECK: vcvtqq2ph xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0xfc,0x08,0x5b,0xf5]
+          vcvtqq2ph xmm30, xmm29
+
+// CHECK: vcvtqq2ph xmm30, ymm29
+// CHECK: encoding: [0x62,0x05,0xfc,0x28,0x5b,0xf5]
+          vcvtqq2ph xmm30, ymm29
+
+// CHECK: vcvtqq2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0xfc,0x0f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtqq2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtqq2ph xmm30, qword ptr [r9]{1to2}
+// CHECK: encoding: [0x62,0x45,0xfc,0x18,0x5b,0x31]
+          vcvtqq2ph xmm30, qword ptr [r9]{1to2}
+
+// CHECK: vcvtqq2ph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0xfc,0x08,0x5b,0x71,0x7f]
+          vcvtqq2ph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to2}
+// CHECK: encoding: [0x62,0x65,0xfc,0x9f,0x5b,0x72,0x80]
+          vcvtqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to2}
+
+// CHECK: vcvtqq2ph xmm30, qword ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0xfc,0x38,0x5b,0x31]
+          vcvtqq2ph xmm30, qword ptr [r9]{1to4}
+
+// CHECK: vcvtqq2ph xmm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0xfc,0x28,0x5b,0x71,0x7f]
+          vcvtqq2ph xmm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvtqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to4}
+// CHECK: encoding: [0x62,0x65,0xfc,0xbf,0x5b,0x72,0x80]
+          vcvtqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to4}
+
+// CHECK: vcvttph2dq xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7e,0x08,0x5b,0xf5]
+          vcvttph2dq xmm30, xmm29
+
+// CHECK: vcvttph2dq ymm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7e,0x28,0x5b,0xf5]
+          vcvttph2dq ymm30, xmm29
+
+// CHECK: vcvttph2dq xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7e,0x0f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2dq xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2dq xmm30, word ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0x7e,0x18,0x5b,0x31]
+          vcvttph2dq xmm30, word ptr [r9]{1to4}
+
+// CHECK: vcvttph2dq xmm30, qword ptr [rcx + 1016]
+// CHECK: encoding: [0x62,0x65,0x7e,0x08,0x5b,0x71,0x7f]
+          vcvttph2dq xmm30, qword ptr [rcx + 1016]
+
+// CHECK: vcvttph2dq xmm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+// CHECK: encoding: [0x62,0x65,0x7e,0x9f,0x5b,0x72,0x80]
+          vcvttph2dq xmm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+
+// CHECK: vcvttph2dq ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7e,0x2f,0x5b,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2dq ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2dq ymm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7e,0x38,0x5b,0x31]
+          vcvttph2dq ymm30, word ptr [r9]{1to8}
+
+// CHECK: vcvttph2dq ymm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7e,0x28,0x5b,0x71,0x7f]
+          vcvttph2dq ymm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvttph2dq ymm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7e,0xbf,0x5b,0x72,0x80]
+          vcvttph2dq ymm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvttph2qq xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x08,0x7a,0xf5]
+          vcvttph2qq xmm30, xmm29
+
+// CHECK: vcvttph2qq ymm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x28,0x7a,0xf5]
+          vcvttph2qq ymm30, xmm29
+
+// CHECK: vcvttph2qq xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x0f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2qq xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2qq xmm30, word ptr [r9]{1to2}
+// CHECK: encoding: [0x62,0x45,0x7d,0x18,0x7a,0x31]
+          vcvttph2qq xmm30, word ptr [r9]{1to2}
+
+// CHECK: vcvttph2qq xmm30, dword ptr [rcx + 508]
+// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7a,0x71,0x7f]
+          vcvttph2qq xmm30, dword ptr [rcx + 508]
+
+// CHECK: vcvttph2qq xmm30 {k7} {z}, word ptr [rdx - 256]{1to2}
+// CHECK: encoding: [0x62,0x65,0x7d,0x9f,0x7a,0x72,0x80]
+          vcvttph2qq xmm30 {k7} {z}, word ptr [rdx - 256]{1to2}
+
+// CHECK: vcvttph2qq ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x2f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2qq ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2qq ymm30, word ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0x7d,0x38,0x7a,0x31]
+          vcvttph2qq ymm30, word ptr [r9]{1to4}
+
+// CHECK: vcvttph2qq ymm30, qword ptr [rcx + 1016]
+// CHECK: encoding: [0x62,0x65,0x7d,0x28,0x7a,0x71,0x7f]
+          vcvttph2qq ymm30, qword ptr [rcx + 1016]
+
+// CHECK: vcvttph2qq ymm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+// CHECK: encoding: [0x62,0x65,0x7d,0xbf,0x7a,0x72,0x80]
+          vcvttph2qq ymm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+
+// CHECK: vcvttph2udq xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x78,0xf5]
+          vcvttph2udq xmm30, xmm29
+
+// CHECK: vcvttph2udq ymm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x28,0x78,0xf5]
+          vcvttph2udq ymm30, xmm29
+
+// CHECK: vcvttph2udq xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x0f,0x78,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2udq xmm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2udq xmm30, word ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0x7c,0x18,0x78,0x31]
+          vcvttph2udq xmm30, word ptr [r9]{1to4}
+
+// CHECK: vcvttph2udq xmm30, qword ptr [rcx + 1016]
+// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x78,0x71,0x7f]
+          vcvttph2udq xmm30, qword ptr [rcx + 1016]
+
+// CHECK: vcvttph2udq xmm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+// CHECK: encoding: [0x62,0x65,0x7c,0x9f,0x78,0x72,0x80]
+          vcvttph2udq xmm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+
+// CHECK: vcvttph2udq ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x2f,0x78,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2udq ymm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2udq ymm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7c,0x38,0x78,0x31]
+          vcvttph2udq ymm30, word ptr [r9]{1to8}
+
+// CHECK: vcvttph2udq ymm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7c,0x28,0x78,0x71,0x7f]
+          vcvttph2udq ymm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvttph2udq ymm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x78,0x72,0x80]
+          vcvttph2udq ymm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvttph2uqq xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x08,0x78,0xf5]
+          vcvttph2uqq xmm30, xmm29
+
+// CHECK: vcvttph2uqq ymm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x28,0x78,0xf5]
+          vcvttph2uqq ymm30, xmm29
+
+// CHECK: vcvttph2uqq xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x0f,0x78,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2uqq xmm30 {k7}, dword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2uqq xmm30, word ptr [r9]{1to2}
+// CHECK: encoding: [0x62,0x45,0x7d,0x18,0x78,0x31]
+          vcvttph2uqq xmm30, word ptr [r9]{1to2}
+
+// CHECK: vcvttph2uqq xmm30, dword ptr [rcx + 508]
+// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x78,0x71,0x7f]
+          vcvttph2uqq xmm30, dword ptr [rcx + 508]
+
+// CHECK: vcvttph2uqq xmm30 {k7} {z}, word ptr [rdx - 256]{1to2}
+// CHECK: encoding: [0x62,0x65,0x7d,0x9f,0x78,0x72,0x80]
+          vcvttph2uqq xmm30 {k7} {z}, word ptr [rdx - 256]{1to2}
+
+// CHECK: vcvttph2uqq ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x2f,0x78,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2uqq ymm30 {k7}, qword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2uqq ymm30, word ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0x7d,0x38,0x78,0x31]
+          vcvttph2uqq ymm30, word ptr [r9]{1to4}
+
+// CHECK: vcvttph2uqq ymm30, qword ptr [rcx + 1016]
+// CHECK: encoding: [0x62,0x65,0x7d,0x28,0x78,0x71,0x7f]
+          vcvttph2uqq ymm30, qword ptr [rcx + 1016]
+
+// CHECK: vcvttph2uqq ymm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+// CHECK: encoding: [0x62,0x65,0x7d,0xbf,0x78,0x72,0x80]
+          vcvttph2uqq ymm30 {k7} {z}, word ptr [rdx - 256]{1to4}
+
+// CHECK: vcvttph2uw xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x7c,0xf5]
+          vcvttph2uw xmm30, xmm29
+
+// CHECK: vcvttph2uw ymm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7c,0x28,0x7c,0xf5]
+          vcvttph2uw ymm30, ymm29
+
+// CHECK: vcvttph2uw xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x0f,0x7c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2uw xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2uw xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7c,0x18,0x7c,0x31]
+          vcvttph2uw xmm30, word ptr [r9]{1to8}
+
+// CHECK: vcvttph2uw xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x7c,0x71,0x7f]
+          vcvttph2uw xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvttph2uw xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7c,0x9f,0x7c,0x72,0x80]
+          vcvttph2uw xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvttph2uw ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7c,0x2f,0x7c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2uw ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2uw ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x45,0x7c,0x38,0x7c,0x31]
+          vcvttph2uw ymm30, word ptr [r9]{1to16}
+
+// CHECK: vcvttph2uw ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7c,0x28,0x7c,0x71,0x7f]
+          vcvttph2uw ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvttph2uw ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x7c,0x72,0x80]
+          vcvttph2uw ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vcvttph2w xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x08,0x7c,0xf5]
+          vcvttph2w xmm30, xmm29
+
+// CHECK: vcvttph2w ymm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7d,0x28,0x7c,0xf5]
+          vcvttph2w ymm30, ymm29
+
+// CHECK: vcvttph2w xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x0f,0x7c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2w xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2w xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7d,0x18,0x7c,0x31]
+          vcvttph2w xmm30, word ptr [r9]{1to8}
+
+// CHECK: vcvttph2w xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7c,0x71,0x7f]
+          vcvttph2w xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvttph2w xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7d,0x9f,0x7c,0x72,0x80]
+          vcvttph2w xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvttph2w ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7d,0x2f,0x7c,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvttph2w ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvttph2w ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x45,0x7d,0x38,0x7c,0x31]
+          vcvttph2w ymm30, word ptr [r9]{1to16}
+
+// CHECK: vcvttph2w ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7d,0x28,0x7c,0x71,0x7f]
+          vcvttph2w ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvttph2w ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x65,0x7d,0xbf,0x7c,0x72,0x80]
+          vcvttph2w ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vcvtudq2ph xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7f,0x08,0x7a,0xf5]
+          vcvtudq2ph xmm30, xmm29
+
+// CHECK: vcvtudq2ph xmm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7f,0x28,0x7a,0xf5]
+          vcvtudq2ph xmm30, ymm29
+
+// CHECK: vcvtudq2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7f,0x0f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtudq2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtudq2ph xmm30, dword ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0x7f,0x18,0x7a,0x31]
+          vcvtudq2ph xmm30, dword ptr [r9]{1to4}
+
+// CHECK: vcvtudq2ph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7f,0x08,0x7a,0x71,0x7f]
+          vcvtudq2ph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtudq2ph xmm30 {k7} {z}, dword ptr [rdx - 512]{1to4}
+// CHECK: encoding: [0x62,0x65,0x7f,0x9f,0x7a,0x72,0x80]
+          vcvtudq2ph xmm30 {k7} {z}, dword ptr [rdx - 512]{1to4}
+
+// CHECK: vcvtudq2ph xmm30, dword ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7f,0x38,0x7a,0x31]
+          vcvtudq2ph xmm30, dword ptr [r9]{1to8}
+
+// CHECK: vcvtudq2ph xmm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7f,0x28,0x7a,0x71,0x7f]
+          vcvtudq2ph xmm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvtudq2ph xmm30 {k7} {z}, dword ptr [rdx - 512]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7f,0xbf,0x7a,0x72,0x80]
+          vcvtudq2ph xmm30 {k7} {z}, dword ptr [rdx - 512]{1to8}
+
+// CHECK: vcvtuqq2ph xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0xff,0x08,0x7a,0xf5]
+          vcvtuqq2ph xmm30, xmm29
+
+// CHECK: vcvtuqq2ph xmm30, ymm29
+// CHECK: encoding: [0x62,0x05,0xff,0x28,0x7a,0xf5]
+          vcvtuqq2ph xmm30, ymm29
+
+// CHECK: vcvtuqq2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0xff,0x0f,0x7a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtuqq2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtuqq2ph xmm30, qword ptr [r9]{1to2}
+// CHECK: encoding: [0x62,0x45,0xff,0x18,0x7a,0x31]
+          vcvtuqq2ph xmm30, qword ptr [r9]{1to2}
+
+// CHECK: vcvtuqq2ph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0xff,0x08,0x7a,0x71,0x7f]
+          vcvtuqq2ph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtuqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to2}
+// CHECK: encoding: [0x62,0x65,0xff,0x9f,0x7a,0x72,0x80]
+          vcvtuqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to2}
+
+// CHECK: vcvtuqq2ph xmm30, qword ptr [r9]{1to4}
+// CHECK: encoding: [0x62,0x45,0xff,0x38,0x7a,0x31]
+          vcvtuqq2ph xmm30, qword ptr [r9]{1to4}
+
+// CHECK: vcvtuqq2ph xmm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0xff,0x28,0x7a,0x71,0x7f]
+          vcvtuqq2ph xmm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvtuqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to4}
+// CHECK: encoding: [0x62,0x65,0xff,0xbf,0x7a,0x72,0x80]
+          vcvtuqq2ph xmm30 {k7} {z}, qword ptr [rdx - 1024]{1to4}
+
+// CHECK: vcvtuw2ph xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7f,0x08,0x7d,0xf5]
+          vcvtuw2ph xmm30, xmm29
+
+// CHECK: vcvtuw2ph ymm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7f,0x28,0x7d,0xf5]
+          vcvtuw2ph ymm30, ymm29
+
+// CHECK: vcvtuw2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7f,0x0f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtuw2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtuw2ph xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7f,0x18,0x7d,0x31]
+          vcvtuw2ph xmm30, word ptr [r9]{1to8}
+
+// CHECK: vcvtuw2ph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7f,0x08,0x7d,0x71,0x7f]
+          vcvtuw2ph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtuw2ph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7f,0x9f,0x7d,0x72,0x80]
+          vcvtuw2ph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvtuw2ph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7f,0x2f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtuw2ph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtuw2ph ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x45,0x7f,0x38,0x7d,0x31]
+          vcvtuw2ph ymm30, word ptr [r9]{1to16}
+
+// CHECK: vcvtuw2ph ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7f,0x28,0x7d,0x71,0x7f]
+          vcvtuw2ph ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvtuw2ph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x65,0x7f,0xbf,0x7d,0x72,0x80]
+          vcvtuw2ph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+
+// CHECK: vcvtw2ph xmm30, xmm29
+// CHECK: encoding: [0x62,0x05,0x7e,0x08,0x7d,0xf5]
+          vcvtw2ph xmm30, xmm29
+
+// CHECK: vcvtw2ph ymm30, ymm29
+// CHECK: encoding: [0x62,0x05,0x7e,0x28,0x7d,0xf5]
+          vcvtw2ph ymm30, ymm29
+
+// CHECK: vcvtw2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7e,0x0f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtw2ph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtw2ph xmm30, word ptr [r9]{1to8}
+// CHECK: encoding: [0x62,0x45,0x7e,0x18,0x7d,0x31]
+          vcvtw2ph xmm30, word ptr [r9]{1to8}
+
+// CHECK: vcvtw2ph xmm30, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0x62,0x65,0x7e,0x08,0x7d,0x71,0x7f]
+          vcvtw2ph xmm30, xmmword ptr [rcx + 2032]
+
+// CHECK: vcvtw2ph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+// CHECK: encoding: [0x62,0x65,0x7e,0x9f,0x7d,0x72,0x80]
+          vcvtw2ph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}
+
+// CHECK: vcvtw2ph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0x62,0x25,0x7e,0x2f,0x7d,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vcvtw2ph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vcvtw2ph ymm30, word ptr [r9]{1to16}
+// CHECK: encoding: [0x62,0x45,0x7e,0x38,0x7d,0x31]
+          vcvtw2ph ymm30, word ptr [r9]{1to16}
+
+// CHECK: vcvtw2ph ymm30, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0x62,0x65,0x7e,0x28,0x7d,0x71,0x7f]
+          vcvtw2ph ymm30, ymmword ptr [rcx + 4064]
+
+// CHECK: vcvtw2ph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
+// CHECK: encoding: [0x62,0x65,0x7e,0xbf,0x7d,0x72,0x80]
+          vcvtw2ph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}